#!/usr/bin/env python
# coding: utf-8

# # k-Nearest Neighbors (kNN)

# ## 1. What Is kNN?

# - Reference: https://www.youtube.com/watch?v=UqYde-LULfs

# ## 2. Data Handling

# ### 1) Loading the Data

# - Load the Iris dataset

# In[1]:

from urllib.request import urlopen

import pandas as pd

path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
raw_csv = urlopen(path)

feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
all_names = feature_names + ('class',)

df = pd.read_csv(raw_csv, names=all_names)
print(df)

# ### 2) Splitting into Training and Test Sets

# In[2]:

import random

def splitDataset(split, df, training_set=None, test_set=None):
    # Create fresh lists on every call; mutable default arguments would
    # persist (and keep growing) across repeated calls.
    if training_set is None:
        training_set = []
    if test_set is None:
        test_set = []
    for i in range(len(df)):
        if random.random() < split:
            training_set.append(df.iloc[i])
        else:
            test_set.append(df.iloc[i])
    return training_set, test_set

split = 0.66
training_set, test_set = splitDataset(split, df)

print('Train: ' + str(len(training_set)) + ' - ratio: ' + str(len(training_set) / len(df)))
print('Test: ' + str(len(test_set)) + ' - ratio: ' + str(len(test_set) / len(df)))

# ## 3. Defining Similarity

# - Similarity measure: Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance)

# In[3]:

import math

num_feature = len(feature_names)

def euclideanDistance(instance1, instance2):
    # Sum of squared differences over the feature columns only
    # (the class column, if present, sits past num_feature and is excluded)
    distance = 0
    for x in range(num_feature):
        distance += pow(instance1.iloc[x] - instance2.iloc[x], 2)
    return math.sqrt(distance)

df_feature = df.drop('class', axis=1)
print(df_feature.head())
print()

distance = euclideanDistance(df_feature.iloc[0], df_feature.iloc[1])
print('Distance: ' + str(distance))

# ## 4. Finding the k Nearest Neighbors

# - For a given instance from the test set (test_instance), find the k most similar instances in the training set (training_set)

# In[4]:

import operator

def getNeighbors(training_set, test_instance, k):
    distances = []
    for i in range(len(training_set)):
        dist = euclideanDistance(training_set[i], test_instance)
        distances.append((training_set[i], dist))
    # Sort the (instance, distance) pairs by distance, ascending
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors

print(test_set[0])
print()

k = 1
neighbors = getNeighbors(training_set, test_set[0], k)
print(neighbors)

# In[5]:

print(neighbors[0])
print()
print(type(neighbors[0]))
print()
print(neighbors[0].iloc[-1])

# In[6]:

k = 3
neighbors = getNeighbors(training_set, test_set[0], k)
print(neighbors)

# ## 5. Classification

# - For each instance (test_instance) in the test set (test_set)...
# - Take the k most similar instances from the training set (training_set) and assign the most frequent class among them as the predicted class of that instance

# In[7]:

def classify(neighbors):
    class_frequency = {}
    for i in range(len(neighbors)):
        class_name = neighbors[i].iloc[-1]
        if class_name in class_frequency:
            class_frequency[class_name] += 1
        else:
            class_frequency[class_name] = 1
    # Sort the (class, count) pairs by count, descending, and return
    # the most frequent class
    sorted_class_frequency = sorted(class_frequency.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_frequency[0][0]

k = 3
neighbors = getNeighbors(training_set, test_set[0], k)
classified_class_name = classify(neighbors)
print('Classified: ' + classified_class_name + ' - Actual: ' + test_set[0]['class'])
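# - The frequency-counting loop in classify() can also be written with collections.Counter from the standard library. The cell below is an optional sketch (not in the original tutorial) of an equivalent vote; it assumes neighbors holds pandas rows whose last value is the class label.

# In[8]:

from collections import Counter

def classify_with_counter(neighbors):
    # Count each neighbor's class label and return the most common one;
    # most_common(1) returns a list like [(label, count)]
    votes = Counter(neighbor.iloc[-1] for neighbor in neighbors)
    return votes.most_common(1)[0][0]

print(classify_with_counter(neighbors))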
# ## 6. Classifying the Whole Test Set and Evaluating Accuracy

# In[9]:

k = 3
classified_class_names = []
for i in range(len(test_set)):
    neighbors = getNeighbors(training_set, test_set[i], k)
    result = classify(neighbors)
    classified_class_names.append(result)
    print('Classified: ' + result + ', Actual: ' + test_set[i].iloc[-1])

correct = 0.0
for i in range(len(test_set)):
    if classified_class_names[i] == test_set[i].iloc[-1]:
        correct += 1.0

print()
print('Accuracy: ' + str(correct / len(test_set) * 100.0) + '%')

# ## 7. Complete kNN Classification Code

# - The intermediate test code has been removed.
#
# - For a more reliable accuracy estimate, the whole experiment is repeated num_trials times and the mean accuracy is reported.

# In[10]:

import math
import operator
import random
from urllib.request import urlopen

import pandas as pd


def splitDataset(split, df, training_set=None, test_set=None):
    # Fresh lists on every call; mutable default arguments would
    # accumulate rows across the num_trials repetitions below
    if training_set is None:
        training_set = []
    if test_set is None:
        test_set = []
    for i in range(len(df)):
        if random.random() < split:
            training_set.append(df.iloc[i])
        else:
            test_set.append(df.iloc[i])
    return training_set, test_set


def euclideanDistance(instance1, instance2):
    distance = 0
    for x in range(num_feature):
        distance += pow(instance1.iloc[x] - instance2.iloc[x], 2)
    return math.sqrt(distance)


def getNeighbors(training_set, test_instance, k):
    distances = []
    for i in range(len(training_set)):
        dist = euclideanDistance(training_set[i], test_instance)
        distances.append((training_set[i], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors


def classify(neighbors):
    class_frequency = {}
    for i in range(len(neighbors)):
        class_name = neighbors[i].iloc[-1]
        if class_name in class_frequency:
            class_frequency[class_name] += 1
        else:
            class_frequency[class_name] = 1
    sorted_class_frequency = sorted(class_frequency.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_frequency[0][0]


if __name__ == '__main__':
    path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    raw_csv = urlopen(path)

    feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
    all_names = feature_names + ('class',)
    df = pd.read_csv(raw_csv, names=all_names)
    num_feature = len(feature_names)

    split = 0.66
    k = 3
    num_trials = 3

    accuracy_sum = 0.0
    for trial in range(num_trials):
        training_set, test_set = splitDataset(split, df)

        classified_class_names = []
        for i in range(len(test_set)):
            neighbors = getNeighbors(training_set, test_set[i], k)
            classified_class_names.append(classify(neighbors))

        correct = 0.0
        for i in range(len(test_set)):
            if test_set[i].iloc[-1] == classified_class_names[i]:
                correct += 1.0
        accuracy_sum += (correct / len(test_set)) * 100.0

    print('Mean Accuracy: ' + str(accuracy_sum / num_trials) + '%')
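# - The nested Python loops above compute one distance at a time. As a bridge to the library version in the next section, the cell below is a minimal vectorized sketch (not part of the original tutorial) of the same neighbor search using NumPy broadcasting; it assumes training_set and test_set still hold the pandas rows from the earlier cells, with the class label in the last position.

# In[11]:

from collections import Counter

import numpy as np

def knn_classify_vectorized(X_train, y_train, x, k):
    # Euclidean distances from x to every training row at once:
    # (X_train - x) broadcasts over rows, norm(..., axis=1) reduces each row
    dists = np.linalg.norm(X_train - x, axis=1)
    # Indices of the k smallest distances, then a majority vote
    nearest = np.argsort(dists)[:k]
    return Counter(y_train[i] for i in nearest).most_common(1)[0][0]

X_train = np.array([row.iloc[:num_feature] for row in training_set], dtype=float)
y_train = [row.iloc[-1] for row in training_set]
x = np.array(test_set[0].iloc[:num_feature], dtype=float)
print(knn_classify_vectorized(X_train, y_train, x, k=3))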
# ## 8. kNN with scikit-learn

# - sklearn.datasets.load_iris() is provided, so the iris data can be loaded conveniently

# In[12]:

import numpy as np
from sklearn import neighbors, datasets

iris = datasets.load_iris()

print(iris.data[0:5])
print(iris.target[0:5])

# - Split into training and test data according to the format returned by datasets.load_iris()

# In[13]:

import random

def splitDataset2(split, data, target):
    # Fresh lists on every call; mutable default arguments would
    # accumulate entries across calls
    training_feature_set, training_target_set = [], []
    test_feature_set, test_target_set = [], []
    for i in range(len(data)):
        if random.random() < split:
            training_feature_set.append(data[i])
            training_target_set.append(target[i])
        else:
            test_feature_set.append(data[i])
            test_target_set.append(target[i])
    return training_feature_set, training_target_set, test_feature_set, test_target_set

split = 0.66
training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset2(split, iris.data, iris.target)

print('Train: ' + str(len(training_feature_set)) + ' - ratio: ' + str(len(training_feature_set) / len(iris.data)))
print('Test: ' + str(len(test_feature_set)) + ' - ratio: ' + str(len(test_feature_set) / len(iris.data)))
print()
print(training_feature_set)
print(training_target_set)
print()
print(test_feature_set)
print(test_target_set)

# - knn.fit(training features, training targets) trains the kNN algorithm and builds the model
# - knn.predict(test features) returns the predicted classes of the test data

# In[14]:

iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')

k = 3
knn = neighbors.KNeighborsClassifier(n_neighbors=k)
knn.fit(training_feature_set, training_target_set)

# predict() expects a 2-D array of samples and returns an array of predictions
result_index = knn.predict([test_feature_set[0]])[0]
print('Classified: ' + iris_names[result_index] + ', Actual: ' + iris_names[test_target_set[0]])

# - Complete code

# In[15]:

import random

from sklearn import neighbors, datasets


def splitDataset2(split, data, target):
    training_feature_set, training_target_set = [], []
    test_feature_set, test_target_set = [], []
    for i in range(len(data)):
        if random.random() < split:
            training_feature_set.append(data[i])
            training_target_set.append(target[i])
        else:
            test_feature_set.append(data[i])
            test_target_set.append(target[i])
    return training_feature_set, training_target_set, test_feature_set, test_target_set


if __name__ == '__main__':
    iris = datasets.load_iris()

    split = 0.66
    k = 3
    num_trials = 3

    accuracy_sum = 0.0
    for trial in range(num_trials):
        training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset2(split, iris.data, iris.target)

        knn = neighbors.KNeighborsClassifier(n_neighbors=k)
        knn.fit(training_feature_set, training_target_set)

        # Predict the whole test set at once; predict() returns an array
        # of class indices, comparable directly against the target indices
        predictions = knn.predict(test_feature_set)

        correct = 0.0
        for i in range(len(test_feature_set)):
            if predictions[i] == test_target_set[i]:
                correct += 1.0
        accuracy_sum += (correct / len(test_feature_set)) * 100.0

    print('Mean Accuracy: ' + str(accuracy_sum / num_trials) + '%')

# ## 9. References

# - http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
# - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html