#!/usr/bin/env python
# coding: utf-8

# # k-means

# ## 1. What is k-means?
# - See: http://shabal.in/visuals/kmeans/2.html

# ## 2. Running k-means with scikit-learn
# - Uses the iris data set.
# - Clusters the entire iris data set with scikit-learn's KMeans.
# - Clustering uses only the feature values of each instance; the cluster
#   index assigned to every instance is stored as an ndarray in
#   kmeans.labels_.

# In[11]:

import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets

iris = datasets.load_iris()
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
num_feature = len(feature_names)

kmeans = KMeans(n_clusters=3)
kmeans.fit(iris.data)

# Single-argument print(...) calls (and print('') for a blank line) produce
# the same output under both Python 2 and Python 3.
print(kmeans.labels_)
print('')
print(iris.target)


# ## 3. Accuracy analysis
# - The actual target group indices and the group indices assigned by KMeans
#   are numbered independently, so a mapping between the two is built first.

# In[12]:

import math
import operator


def euclideanDistance(instance1, instance2):
    """Return the Euclidean distance between two instances.

    Only the first ``num_feature`` components (module-level constant) of each
    instance are compared.
    """
    squared = sum(
        pow(instance1[x] - instance2[x], 2) for x in range(num_feature)
    )
    return math.sqrt(squared)


def _groupIndices(group):
    """Map every distinct label in *group* to the positions where it occurs."""
    # Key the dictionary by the labels that are actually present instead of
    # assuming they are exactly 0..k-1; a label that never occurs (e.g. a
    # cluster that predict() never assigns) must not cause a KeyError.
    members = {}
    for label in np.unique(group).tolist():
        members[label] = []
    for index, label in enumerate(group):
        members[label].append(index)
    return members


def getGroupMatch(group1, group2):
    """Match every group of *group1* to the most similar group of *group2*.

    Args:
        group1: sequence of group labels, one per sample.
        group2: sequence of group labels for the same samples.

    Returns:
        A tuple ``(group1_dict, group2_dict, group_match)`` where the two
        dictionaries map each label to the list of sample positions carrying
        it, and ``group_match`` maps each *group1* label to the *group2*
        label whose members overlap with it the most (ties go to the
        smallest *group2* label). ``group_match`` is empty when *group2*
        is empty.
    """
    group1_dict = _groupIndices(group1)
    group2_dict = _groupIndices(group2)

    # Build the membership sets once so each overlap is a set intersection.
    group2_sets = [
        (label, set(group2_dict[label])) for label in sorted(group2_dict)
    ]

    group_match = {}  # group1 label -> best matching group2 label
    for label1 in sorted(group1_dict):
        members1 = set(group1_dict[label1])
        overlaps = [
            (label2, len(members1 & members2))
            for label2, members2 in group2_sets
        ]
        if overlaps:
            # max() returns the first maximal entry, so ties are resolved
            # deterministically in favour of the smallest label.
            group_match[label1] = max(overlaps, key=operator.itemgetter(1))[0]

    return group1_dict, group2_dict, group_match


group1_dict, group2_dict, group_match = getGroupMatch(kmeans.labels_, iris.target)
print(group1_dict)
print('')
print(group2_dict)
print('')
print(group_match)


# In[13]:

def getAccuracy(group1, group2, numData):
    """Return the percentage of samples whose matched groups agree.

    Every *group1* group is mapped onto a *group2* group with
    ``getGroupMatch``; a sample counts as correct when it belongs to the
    *group2* group its *group1* group was mapped to.

    Args:
        group1: sequence of group labels (e.g. k-means cluster indices).
        group2: sequence of reference group labels for the same samples.
        numData: number of samples used as the denominator.

    Returns:
        The accuracy as a float percentage in the range 0.0-100.0.
    """
    group1_dict, group2_dict, group_match = getGroupMatch(group1, group2)
    correct = 0.0
    for label, members in group1_dict.items():
        if label not in group_match:
            continue
        matched = set(group2_dict[group_match[label]])
        # Positions are unique within a group, so the size of the
        # intersection equals the number of correctly matched samples.
        correct += len(matched.intersection(members))
    return correct / float(numData) * 100.0


accuracy = getAccuracy(kmeans.labels_, iris.target, len(iris.data))
print('Accuracy: ' + str(accuracy) + '%')


# ## 4. Training on the training data and validating on the test data
# - Split the data into a training set and a test set.

# In[14]:

iris = datasets.load_iris()
split = 0.66
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')


def splitDataset(split, training_feature_set=None, training_target_set=None,
                 test_feature_set=None, test_target_set=None):
    """Randomly split the module-level ``iris`` data into train and test sets.

    Each sample goes to the training set when ``random.random() < split``
    and to the test set otherwise.

    Args:
        split: threshold compared against ``random.random()`` per sample.
        training_feature_set: optional list to append training features to.
        training_target_set: optional list to append training targets to.
        test_feature_set: optional list to append test features to.
        test_target_set: optional list to append test targets to.

    Returns:
        The four lists ``(training_feature_set, training_target_set,
        test_feature_set, test_target_set)``.
    """
    # Fresh lists per call: mutable default arguments are created once and
    # would otherwise be shared (and keep growing) across calls.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []

    for features, target in zip(iris.data, iris.target):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(target)
        else:
            test_feature_set.append(features)
            test_target_set.append(target)
    return training_feature_set, training_target_set, test_feature_set, test_target_set


training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)

print('Train: ' + str(len(training_feature_set)))
print('Test: ' + str(len(test_feature_set)))
print('')
print(training_feature_set)
print(training_target_set)
print('')
print(test_feature_set)
print(test_target_set)


# - Fit kmeans on the training data only.
# - Pass the test data to kmeans.predict() to obtain kmeans_target_set.
# - Compute the accuracy between kmeans_target_set and test_target_set.

# In[19]:

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(training_feature_set)
kmeans_target_set = kmeans.predict(test_feature_set).tolist()

print(kmeans_target_set)
print('')
print(test_target_set)
print('')

accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
print('Accuracy: ' + str(accuracy) + '%')


# - Full code

# In[20]:

import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets

iris = datasets.load_iris()
split = 0.66
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
num_feature = len(feature_names)


# Redefined here, as in the original notebook cell, so that this cell carries
# its own copy of the split helper.
def splitDataset(split, training_feature_set=None, training_target_set=None,
                 test_feature_set=None, test_target_set=None):
    """Randomly split the module-level ``iris`` data into train and test sets.

    Each sample goes to the training set when ``random.random() < split``
    and to the test set otherwise.

    Args:
        split: threshold compared against ``random.random()`` per sample.
        training_feature_set: optional list to append training features to.
        training_target_set: optional list to append training targets to.
        test_feature_set: optional list to append test features to.
        test_target_set: optional list to append test targets to.

    Returns:
        The four lists ``(training_feature_set, training_target_set,
        test_feature_set, test_target_set)``.
    """
    # Fresh lists per call; with shared default lists every trial in the loop
    # below would inherit the samples of all previous trials.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []

    for features, target in zip(iris.data, iris.target):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(target)
        else:
            test_feature_set.append(features)
            test_target_set.append(target)
    return training_feature_set, training_target_set, test_feature_set, test_target_set


if __name__ == '__main__':
    num_trials = 3
    accuracy_sum = 0.0
    for _ in range(num_trials):
        # Each trial draws an independent random split and refits k-means.
        training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)
        kmeans = KMeans(n_clusters=3)
        kmeans.fit(training_feature_set)
        kmeans_target_set = kmeans.predict(test_feature_set).tolist()
        accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
        accuracy_sum += accuracy

    print('Mean Accuracy: ' + str(accuracy_sum / float(num_trials)) + '%')


# ## 5. Reference
# - http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html