k-means

1. k-means 란?

2. scikit-learn을 활용한 k-means 수행

  • iris_data 사용
  • scikit-learn의 KMeans를 활용한 iris 전체 데이터에 대한 클러스터링
  • 각 인스턴스의 특징값만으로 클러스터링을 수행하며, 그 결과인 각 인스턴스의 클러스터 인덱스는 kmeans.labels_ 에 ndarray 자료구조로 저장됨
In [11]:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets

# Load the iris dataset; iris.data (feature rows) and iris.target (known
# class index per row) are both used below.
iris = datasets.load_iris()
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
# Number of features per instance; euclideanDistance() reads this global.
num_feature = len(feature_names)

# Cluster all samples into three groups using only their feature values.
kmeans = KMeans(n_clusters=3)
kmeans.fit(iris.data)

# Single-argument print(...) calls produce the same output on Python 2 and
# also run on Python 3; print('') emits the blank separator line.
print(kmeans.labels_)
print('')
print(iris.target)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

3. 정확도 (Accuracy) 분석

  • KMeans가 부여하는 클러스터 인덱스는 임의의 번호이므로 실제 타겟 그룹 인덱스와 번호가 일치하지 않을 수 있음. 따라서 정확도를 계산하기 전에 두 인덱스 사이의 매칭(Match) 관계를 만들어줌
In [12]:
import math
import operator

def euclideanDistance(instance1, instance2):
    """Return the Euclidean distance between two indexable sequences.

    Only the first ``num_feature`` components (a module-level global) are
    compared; any further components of either sequence are ignored.
    """
    squared_diffs = (
        (instance1[k] - instance2[k]) ** 2 for k in range(num_feature)
    )
    return math.sqrt(sum(squared_diffs))

def getGroupMatch(group1, group2):
    """Group sample indices by label and pair each group1 label with a group2 label.

    Both arguments are sequences of labels in which position ``i`` is the
    label of sample ``i``.  Each label of ``group1`` is matched to the
    ``group2`` label whose members overlap it the most; on a tie the
    smallest ``group2`` label wins.

    Returns a tuple ``(group1_dict, group2_dict, group_match)``:
      * ``group1_dict`` / ``group2_dict`` map every label that occurs in
        the respective sequence to the list of sample indices carrying it.
      * ``group_match`` maps each ``group1`` label to its matched
        ``group2`` label.  It is empty when ``group2`` has no labels.
    """
    # Key the dicts by the labels that actually occur, so a missing or
    # non-contiguous label cannot cause a KeyError.  .tolist() yields native
    # Python scalars, which keeps the dict keys plain ints for int labels.
    group1_dict = dict((label, []) for label in np.unique(group1).tolist())
    group2_dict = dict((label, []) for label in np.unique(group2).tolist())

    for index, label in enumerate(group1):
        group1_dict[label].append(index)
    for index, label in enumerate(group2):
        group2_dict[label].append(index)

    # Sets turn each overlap count into a single intersection.
    group2_members = dict(
        (label, set(members)) for label, members in group2_dict.items()
    )
    # Sorted so that ties are resolved deterministically (smallest label).
    candidates = sorted(group2_dict)

    group_match = {}  # group1 label : best-overlapping group2 label
    if candidates:
        for label1 in sorted(group1_dict):
            members1 = set(group1_dict[label1])
            best_label = candidates[0]
            best_overlap = -1
            for label2 in candidates:
                overlap = len(members1 & group2_members[label2])
                # Strict '>' keeps the first (smallest) label on ties.
                if overlap > best_overlap:
                    best_label = label2
                    best_overlap = overlap
            group_match[label1] = best_label

    return group1_dict, group2_dict, group_match

# Match the k-means cluster labels against the known iris targets.
group1_dict, group2_dict, group_match = getGroupMatch(kmeans.labels_, iris.target)

# Single-argument print(...) calls produce the same output on Python 2 and
# also run on Python 3; print('') emits the blank separator line.
print(group1_dict)
print('')
print(group2_dict)
print('')
print(group_match)
{0: [52, 77, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148], 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 2: [50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149]}

{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 1: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 2: [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}

{0: 2, 1: 0, 2: 1}
In [13]:
def getAccuracy(group1, group2, numData):
    """Return the percentage of samples whose group1 label agrees with group2.

    The labels of ``group1`` are first mapped onto labels of ``group2``
    with ``getGroupMatch``.  A sample counts as correct when it belongs to
    a ``group1`` group and also to the ``group2`` group matched to it.

    ``numData`` is the denominator of the percentage; passing 0 raises
    ``ZeroDivisionError``.
    """
    group1_dict, group2_dict, group_match = getGroupMatch(group1, group2)
    correct = 0
    # Iterate over the labels that are present instead of assuming 0..k-1.
    for label in group1_dict:
        # One set intersection replaces a linear list scan per sample.
        matched_members = set(group2_dict[group_match[label]])
        correct += len(matched_members.intersection(group1_dict[label]))
    return correct / float(numData) * 100.0

# Accuracy of the k-means clustering against the known iris targets; the
# variable is spelled `accuracy`, matching the later cells.
accuracy = getAccuracy(kmeans.labels_, iris.target, len(iris.data))
print('Accuracy: ' + str(accuracy) + '%')
Accuracy: 89.3333333333%

4. 훈련 데이터를 통한 학습과 테스트 데이터의 분류 검증

  • 훈련 데이터와 테스트 데이터의 분리
In [14]:
# Threshold compared against random.random() in splitDataset(): draws below
# it send a sample to the training set, all others to the test set.
split = 0.66
# Display names of the three iris species (not referenced by the code shown here).
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
# Dataset read by splitDataset() through this module-level name.
iris = datasets.load_iris()

def splitDataset(split, training_feature_set=None, training_target_set=None, test_feature_set=None, test_target_set=None):
    """Randomly split the module-level ``iris`` dataset into training and test sets.

    For every row of ``iris.data`` one ``random.random()`` draw is made:
    when the draw is below ``split`` the row and its target are appended to
    the training lists, otherwise to the test lists.

    The four list arguments are optional.  Lists passed in are appended to
    in place; any argument left as ``None`` gets a fresh empty list, so
    repeated calls never share or accumulate data.

    Returns ``(training_feature_set, training_target_set,
    test_feature_set, test_target_set)``.
    """
    # None sentinels instead of `[]` defaults: a default list is created once
    # at definition time and would be shared by every call.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []

    for i, features in enumerate(iris.data):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(iris.target[i])
        else:
            test_feature_set.append(features)
            test_target_set.append(iris.target[i])
    return training_feature_set, training_target_set, test_feature_set, test_target_set


training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)

# Single-argument print(...) calls produce the same output on Python 2 and
# also run on Python 3; print('') emits the blank separator line.
print('Train: ' + str(len(training_feature_set)))
print('Test: ' + str(len(test_feature_set)))
print('')
print(training_feature_set)
print(training_target_set)
print('')
print(test_feature_set)
print(test_target_set)
Train: 94
Test: 56

[array([ 5.1,  3.5,  1.4,  0.2]), array([ 4.7,  3.2,  1.3,  0.2]), array([ 4.6,  3.1,  1.5,  0.2]), array([ 5. ,  3.6,  1.4,  0.2]), array([ 5.4,  3.9,  1.7,  0.4]), array([ 4.6,  3.4,  1.4,  0.3]), array([ 5. ,  3.4,  1.5,  0.2]), array([ 4.4,  2.9,  1.4,  0.2]), array([ 4.9,  3.1,  1.5,  0.1]), array([ 4.8,  3.4,  1.6,  0.2]), array([ 4.8,  3. ,  1.4,  0.1]), array([ 4.3,  3. ,  1.1,  0.1]), array([ 5.8,  4. ,  1.2,  0.2]), array([ 5.4,  3.9,  1.3,  0.4]), array([ 5.1,  3.5,  1.4,  0.3]), array([ 5.1,  3.8,  1.5,  0.3]), array([ 5.4,  3.4,  1.7,  0.2]), array([ 5.1,  3.7,  1.5,  0.4]), array([ 4.6,  3.6,  1. ,  0.2]), array([ 5.1,  3.3,  1.7,  0.5]), array([ 5. ,  3. ,  1.6,  0.2]), array([ 5.2,  3.5,  1.5,  0.2]), array([ 5.5,  4.2,  1.4,  0.2]), array([ 5. ,  3.2,  1.2,  0.2]), array([ 5. ,  3.5,  1.3,  0.3]), array([ 5. ,  3.5,  1.6,  0.6]), array([ 5.1,  3.8,  1.9,  0.4]), array([ 4.8,  3. ,  1.4,  0.3]), array([ 5.1,  3.8,  1.6,  0.2]), array([ 5.3,  3.7,  1.5,  0.2]), array([ 7. ,  3.2,  4.7,  1.4]), array([ 6.9,  3.1,  4.9,  1.5]), array([ 6.5,  2.8,  4.6,  1.5]), array([ 5.7,  2.8,  4.5,  1.3]), array([ 6.6,  2.9,  4.6,  1.3]), array([ 5.2,  2.7,  3.9,  1.4]), array([ 5. ,  2. ,  3.5,  1. ]), array([ 6. ,  2.2,  4. ,  1. ]), array([ 6.1,  2.9,  4.7,  1.4]), array([ 5.6,  2.9,  3.6,  1.3]), array([ 6.7,  3.1,  4.4,  1.4]), array([ 5.6,  3. ,  4.5,  1.5]), array([ 5.8,  2.7,  4.1,  1. ]), array([ 5.9,  3.2,  4.8,  1.8]), array([ 6.3,  2.5,  4.9,  1.5]), array([ 6.1,  2.8,  4.7,  1.2]), array([ 6.6,  3. ,  4.4,  1.4]), array([ 6. ,  2.9,  4.5,  1.5]), array([ 5.5,  2.4,  3.8,  1.1]), array([ 5.5,  2.4,  3.7,  1. ]), array([ 5.8,  2.7,  3.9,  1.2]), array([ 5.4,  3. ,  4.5,  1.5]), array([ 6. ,  3.4,  4.5,  1.6]), array([ 6.7,  3.1,  4.7,  1.5]), array([ 5.5,  2.5,  4. ,  1.3]), array([ 5.5,  2.6,  4.4,  1.2]), array([ 5.8,  2.6,  4. 
,  1.2]), array([ 5.6,  2.7,  4.2,  1.3]), array([ 5.7,  2.9,  4.2,  1.3]), array([ 6.2,  2.9,  4.3,  1.3]), array([ 5.1,  2.5,  3. ,  1.1]), array([ 5.8,  2.7,  5.1,  1.9]), array([ 7.1,  3. ,  5.9,  2.1]), array([ 6.3,  2.9,  5.6,  1.8]), array([ 6.5,  3. ,  5.8,  2.2]), array([ 7.6,  3. ,  6.6,  2.1]), array([ 4.9,  2.5,  4.5,  1.7]), array([ 6.7,  2.5,  5.8,  1.8]), array([ 6.5,  3.2,  5.1,  2. ]), array([ 6.4,  2.7,  5.3,  1.9]), array([ 6.8,  3. ,  5.5,  2.1]), array([ 5.8,  2.8,  5.1,  2.4]), array([ 6.4,  3.2,  5.3,  2.3]), array([ 7.7,  2.6,  6.9,  2.3]), array([ 6. ,  2.2,  5. ,  1.5]), array([ 6.9,  3.2,  5.7,  2.3]), array([ 5.6,  2.8,  4.9,  2. ]), array([ 6.3,  2.7,  4.9,  1.8]), array([ 7.2,  3.2,  6. ,  1.8]), array([ 6.1,  3. ,  4.9,  1.8]), array([ 6.4,  2.8,  5.6,  2.1]), array([ 7.2,  3. ,  5.8,  1.6]), array([ 7.9,  3.8,  6.4,  2. ]), array([ 6.4,  2.8,  5.6,  2.2]), array([ 6.1,  2.6,  5.6,  1.4]), array([ 7.7,  3. ,  6.1,  2.3]), array([ 6.3,  3.4,  5.6,  2.4]), array([ 6. ,  3. ,  4.8,  1.8]), array([ 6.7,  3.1,  5.6,  2.4]), array([ 5.8,  2.7,  5.1,  1.9]), array([ 6.7,  3. ,  5.2,  2.3]), array([ 6.5,  3. ,  5.2,  2. ]), array([ 6.2,  3.4,  5.4,  2.3]), array([ 5.9,  3. ,  5.1,  1.8])]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

[array([ 4.9,  3. ,  1.4,  0.2]), array([ 5.4,  3.7,  1.5,  0.2]), array([ 5.7,  4.4,  1.5,  0.4]), array([ 5.7,  3.8,  1.7,  0.3]), array([ 4.8,  3.4,  1.9,  0.2]), array([ 5. ,  3.4,  1.6,  0.4]), array([ 5.2,  3.4,  1.4,  0.2]), array([ 4.7,  3.2,  1.6,  0.2]), array([ 4.8,  3.1,  1.6,  0.2]), array([ 5.4,  3.4,  1.5,  0.4]), array([ 5.2,  4.1,  1.5,  0.1]), array([ 4.9,  3.1,  1.5,  0.1]), array([ 5.5,  3.5,  1.3,  0.2]), array([ 4.9,  3.1,  1.5,  0.1]), array([ 4.4,  3. ,  1.3,  0.2]), array([ 5.1,  3.4,  1.5,  0.2]), array([ 4.5,  2.3,  1.3,  0.3]), array([ 4.4,  3.2,  1.3,  0.2]), array([ 4.6,  3.2,  1.4,  0.2]), array([ 5. ,  3.3,  1.4,  0.2]), array([ 6.4,  3.2,  4.5,  1.5]), array([ 5.5,  2.3,  4. ,  1.3]), array([ 6.3,  3.3,  4.7,  1.6]), array([ 4.9,  2.4,  3.3,  1. ]), array([ 5.9,  3. ,  4.2,  1.5]), array([ 6.2,  2.2,  4.5,  1.5]), array([ 5.6,  2.5,  3.9,  1.1]), array([ 6.1,  2.8,  4. ,  1.3]), array([ 6.4,  2.9,  4.3,  1.3]), array([ 6.8,  2.8,  4.8,  1.4]), array([ 6.7,  3. ,  5. ,  1.7]), array([ 5.7,  2.6,  3.5,  1. ]), array([ 6. ,  2.7,  5.1,  1.6]), array([ 6.3,  2.3,  4.4,  1.3]), array([ 5.6,  3. ,  4.1,  1.3]), array([ 6.1,  3. ,  4.6,  1.4]), array([ 5. ,  2.3,  3.3,  1. ]), array([ 5.7,  3. ,  4.2,  1.2]), array([ 5.7,  2.8,  4.1,  1.3]), array([ 6.3,  3.3,  6. ,  2.5]), array([ 7.3,  2.9,  6.3,  1.8]), array([ 7.2,  3.6,  6.1,  2.5]), array([ 5.7,  2.5,  5. ,  2. ]), array([ 6.5,  3. ,  5.5,  1.8]), array([ 7.7,  3.8,  6.7,  2.2]), array([ 7.7,  2.8,  6.7,  2. ]), array([ 6.7,  3.3,  5.7,  2.1]), array([ 6.2,  2.8,  4.8,  1.8]), array([ 7.4,  2.8,  6.1,  1.9]), array([ 6.3,  2.8,  5.1,  1.5]), array([ 6.4,  3.1,  5.5,  1.8]), array([ 6.9,  3.1,  5.4,  2.1]), array([ 6.9,  3.1,  5.1,  2.3]), array([ 6.8,  3.2,  5.9,  2.3]), array([ 6.7,  3.3,  5.7,  2.5]), array([ 6.3,  2.5,  5. ,  1.9])]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  • 훈련 데이터만 kmeans에 넣어 분류
  • 테스트 데이터를 kmeans.predict()에 넣어 kmeans_target_set 얻음
  • kmeans_target_set과 test_target_set과의 정확도 산출
In [19]:
from sklearn.cluster import KMeans

# Fit the clusters on the training features only, then assign each test
# sample to one of those clusters.
kmeans = KMeans(n_clusters=3)
kmeans.fit(training_feature_set)
kmeans_target_set = kmeans.predict(test_feature_set).tolist()

# Single-argument print(...) calls produce the same output on Python 2 and
# also run on Python 3; print('') emits the blank separator line.
print(kmeans_target_set)
print('')
print(test_target_set)
print('')

accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
print('Accuracy: ' + str(accuracy) + '%')
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

Accuracy: 83.6734693878%
  • 전체 코드
In [20]:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets

# Threshold compared against random.random() in splitDataset(): draws below
# it send a sample to the training set, all others to the test set.
split = 0.66
# Display names of the three iris species (not referenced by the code shown here).
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
# Number of features per instance.
num_feature = len(feature_names)
# Dataset read by splitDataset() through this module-level name.
iris = datasets.load_iris()

def splitDataset(split, training_feature_set=None, training_target_set=None, test_feature_set=None, test_target_set=None):
    """Randomly split the module-level ``iris`` dataset into training and test sets.

    For every row of ``iris.data`` one ``random.random()`` draw is made:
    when the draw is below ``split`` the row and its target are appended to
    the training lists, otherwise to the test lists.

    The four list arguments are optional.  Lists passed in are appended to
    in place; any argument left as ``None`` gets a fresh empty list, so
    repeated calls never share or accumulate data.

    Returns ``(training_feature_set, training_target_set,
    test_feature_set, test_target_set)``.
    """
    # None sentinels instead of `[]` defaults: a default list is created once
    # at definition time and would be shared by every call.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []

    for i, features in enumerate(iris.data):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(iris.target[i])
        else:
            test_feature_set.append(features)
            test_target_set.append(iris.target[i])
    return training_feature_set, training_target_set, test_feature_set, test_target_set

if __name__ == '__main__':
    # Repeat the random split / fit / evaluate cycle and report the mean
    # accuracy over all trials.  getAccuracy() is defined in an earlier cell.
    num_trials = 3
    accuracy_sum = 0.0

    for trial in range(num_trials):
        # Hand each trial its own empty lists so samples from earlier trials
        # can never leak into this split, regardless of how splitDataset
        # handles its default arguments.
        training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split, [], [], [], [])
        kmeans = KMeans(n_clusters=3)
        kmeans.fit(training_feature_set)
        kmeans_target_set = kmeans.predict(test_feature_set).tolist()

        accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
        accuracy_sum += accuracy

    print('Mean Accuracy: ' + str(accuracy_sum / float(num_trials)) + '%')
Mean Accuracy: 88.2586824684%

5. Reference