import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
# Number of features per sample; euclideanDistance reads this global.
num_feature = len(feature_names)
# Cluster every sample into three groups.
kmeans = KMeans(n_clusters=3)
kmeans.fit(iris.data)
# Print the cluster id assigned to each sample, a blank line, and then the
# dataset's own class labels so the two sequences can be compared by eye.
print kmeans.labels_
print
print iris.target
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0 0 2] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
import math
import operator
def euclideanDistance(instance1, instance2):
    """Return the Euclidean distance between two instances.

    Only the first ``num_feature`` positions of each instance are compared,
    where ``num_feature`` is a module-level global.
    """
    squared_total = sum(
        pow(instance1[k] - instance2[k], 2) for k in range(num_feature)
    )
    return math.sqrt(squared_total)
def _indicesByLabel(labels):
    """Map each distinct label to the list of positions where it occurs."""
    # Seed the mapping from the labels that are actually present (sorted and
    # converted to native Python values) so that non-contiguous labels such
    # as {0, 2} do not raise KeyError.
    positions = {label: [] for label in np.unique(labels).tolist()}
    for index, label in enumerate(labels):
        positions[label].append(index)
    return positions


def getGroupMatch(group1, group2):
    """Group sample indices by label and match each group1 label to a group2 label.

    Args:
        group1: sequence of labels, one per sample (e.g. k-means cluster ids).
        group2: sequence of labels for the same samples (e.g. true classes).

    Returns:
        A tuple ``(group1_dict, group2_dict, group_match)`` where
        ``group1_dict`` / ``group2_dict`` map each label to the list of sample
        indices carrying it, and ``group_match`` maps every group1 label to
        the group2 label whose members overlap it the most.

    Each group1 label is matched independently, so two group1 labels may map
    to the same group2 label. Ties go to the smallest group2 label. If
    ``group2`` is empty, ``group_match`` is empty.
    """
    group1_dict = _indicesByLabel(group1)
    group2_dict = _indicesByLabel(group2)

    # Sorted labels make tie-breaking deterministic; the sets turn every
    # overlap count into a single intersection.
    group2_labels = sorted(group2_dict)
    group2_sets = {label: set(group2_dict[label]) for label in group2_labels}

    group_match = {}  # group1 label : best-matching group2 label
    if group2_labels:
        for label1, members in group1_dict.items():
            members1 = set(members)
            # max() keeps the first maximum, i.e. the smallest label on ties.
            group_match[label1] = max(
                group2_labels,
                key=lambda label2: len(members1 & group2_sets[label2]),
            )
    return group1_dict, group2_dict, group_match
# Group sample indices by k-means cluster (group1) and by iris class label
# (group2), and obtain the cluster -> class label mapping.
group1_dict, group2_dict, group_match = getGroupMatch(kmeans.labels_, iris.target)
# Print the three results, separated by blank lines.
print group1_dict
print
print group2_dict
print
print group_match
{0: [52, 77, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148], 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 2: [50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149]} {0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 1: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 2: [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]} {0: 2, 1: 0, 2: 1}
def getAccuracy(group1, group2, numData):
    """Return the percentage of samples whose group1 label maps to their group2 label.

    Args:
        group1: sequence of labels, one per sample (e.g. predicted clusters).
        group2: sequence of reference labels for the same samples.
        numData: number of samples used as the denominator.

    Returns:
        The accuracy as a float percentage (correct / numData * 100).

    Raises:
        ZeroDivisionError: if ``numData`` is 0.
    """
    group1_dict, group2_dict, group_match = getGroupMatch(group1, group2)
    correct = 0.0
    for label, members in group1_dict.items():
        # A sample counts as correct when it also belongs to the group2 label
        # matched to its group1 label. Counting through a set intersection
        # avoids a linear list scan for every sample.
        matched_members = set(group2_dict[group_match[label]])
        correct += len(matched_members.intersection(members))
    return correct / float(numData) * 100.0
# Percentage of samples whose k-means cluster maps onto their iris class label.
# NOTE: the variable name `accuacy` (sic) is left as written because it is a
# module-level name.
accuacy = getAccuracy(kmeans.labels_, iris.target, len(iris.data))
print('Accuracy: ' + str(accuacy) + '%')
Accuracy: 89.3333333333%
# Reload the dataset for the train/test experiment.
iris = datasets.load_iris()
# Probability with which splitDataset places a sample in the training set.
split = 0.66
# Species names; presumably indexed by the integer class labels in iris.target
# (not referenced by the visible code -- TODO confirm).
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
def splitDataset(split, training_feature_set=None, training_target_set=None,
                 test_feature_set=None, test_target_set=None,
                 data=None, target=None):
    """Randomly partition samples into a training set and a test set.

    Each sample goes to the training set with probability ``split`` and to
    the test set otherwise.

    Args:
        split: probability (0..1) of assigning a sample to the training set.
        training_feature_set: optional list to append training features to.
        training_target_set: optional list to append training labels to.
        test_feature_set: optional list to append test features to.
        test_target_set: optional list to append test labels to.
        data: optional sequence of feature rows; defaults to ``iris.data``.
        target: optional sequence of labels; defaults to ``iris.target``.

    Returns:
        The tuple ``(training_feature_set, training_target_set,
        test_feature_set, test_target_set)``.
    """
    # Create fresh lists on every call. Mutable default arguments are built
    # once at definition time and would accumulate samples across calls.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []
    # Fall back to the module-level iris dataset when no data is supplied.
    if data is None:
        data = iris.data
    if target is None:
        target = iris.target

    for features, label in zip(data, target):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(label)
        else:
            test_feature_set.append(features)
            test_target_set.append(label)
    return training_feature_set, training_target_set, test_feature_set, test_target_set
# Randomly partition the iris samples into training and test sets.
training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(split)
# Report the size of each partition.
print 'Train: ' + str(len(training_feature_set))
print 'Test: ' + str(len(test_feature_set))
print
# Dump the training features and labels, then the test features and labels.
print training_feature_set
print training_target_set
print
print test_feature_set
print test_target_set
Train: 94 Test: 56 [array([ 5.1, 3.5, 1.4, 0.2]), array([ 4.7, 3.2, 1.3, 0.2]), array([ 4.6, 3.1, 1.5, 0.2]), array([ 5. , 3.6, 1.4, 0.2]), array([ 5.4, 3.9, 1.7, 0.4]), array([ 4.6, 3.4, 1.4, 0.3]), array([ 5. , 3.4, 1.5, 0.2]), array([ 4.4, 2.9, 1.4, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 4.8, 3.4, 1.6, 0.2]), array([ 4.8, 3. , 1.4, 0.1]), array([ 4.3, 3. , 1.1, 0.1]), array([ 5.8, 4. , 1.2, 0.2]), array([ 5.4, 3.9, 1.3, 0.4]), array([ 5.1, 3.5, 1.4, 0.3]), array([ 5.1, 3.8, 1.5, 0.3]), array([ 5.4, 3.4, 1.7, 0.2]), array([ 5.1, 3.7, 1.5, 0.4]), array([ 4.6, 3.6, 1. , 0.2]), array([ 5.1, 3.3, 1.7, 0.5]), array([ 5. , 3. , 1.6, 0.2]), array([ 5.2, 3.5, 1.5, 0.2]), array([ 5.5, 4.2, 1.4, 0.2]), array([ 5. , 3.2, 1.2, 0.2]), array([ 5. , 3.5, 1.3, 0.3]), array([ 5. , 3.5, 1.6, 0.6]), array([ 5.1, 3.8, 1.9, 0.4]), array([ 4.8, 3. , 1.4, 0.3]), array([ 5.1, 3.8, 1.6, 0.2]), array([ 5.3, 3.7, 1.5, 0.2]), array([ 7. , 3.2, 4.7, 1.4]), array([ 6.9, 3.1, 4.9, 1.5]), array([ 6.5, 2.8, 4.6, 1.5]), array([ 5.7, 2.8, 4.5, 1.3]), array([ 6.6, 2.9, 4.6, 1.3]), array([ 5.2, 2.7, 3.9, 1.4]), array([ 5. , 2. , 3.5, 1. ]), array([ 6. , 2.2, 4. , 1. ]), array([ 6.1, 2.9, 4.7, 1.4]), array([ 5.6, 2.9, 3.6, 1.3]), array([ 6.7, 3.1, 4.4, 1.4]), array([ 5.6, 3. , 4.5, 1.5]), array([ 5.8, 2.7, 4.1, 1. ]), array([ 5.9, 3.2, 4.8, 1.8]), array([ 6.3, 2.5, 4.9, 1.5]), array([ 6.1, 2.8, 4.7, 1.2]), array([ 6.6, 3. , 4.4, 1.4]), array([ 6. , 2.9, 4.5, 1.5]), array([ 5.5, 2.4, 3.8, 1.1]), array([ 5.5, 2.4, 3.7, 1. ]), array([ 5.8, 2.7, 3.9, 1.2]), array([ 5.4, 3. , 4.5, 1.5]), array([ 6. , 3.4, 4.5, 1.6]), array([ 6.7, 3.1, 4.7, 1.5]), array([ 5.5, 2.5, 4. , 1.3]), array([ 5.5, 2.6, 4.4, 1.2]), array([ 5.8, 2.6, 4. , 1.2]), array([ 5.6, 2.7, 4.2, 1.3]), array([ 5.7, 2.9, 4.2, 1.3]), array([ 6.2, 2.9, 4.3, 1.3]), array([ 5.1, 2.5, 3. , 1.1]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 7.1, 3. , 5.9, 2.1]), array([ 6.3, 2.9, 5.6, 1.8]), array([ 6.5, 3. , 5.8, 2.2]), array([ 7.6, 3. 
, 6.6, 2.1]), array([ 4.9, 2.5, 4.5, 1.7]), array([ 6.7, 2.5, 5.8, 1.8]), array([ 6.5, 3.2, 5.1, 2. ]), array([ 6.4, 2.7, 5.3, 1.9]), array([ 6.8, 3. , 5.5, 2.1]), array([ 5.8, 2.8, 5.1, 2.4]), array([ 6.4, 3.2, 5.3, 2.3]), array([ 7.7, 2.6, 6.9, 2.3]), array([ 6. , 2.2, 5. , 1.5]), array([ 6.9, 3.2, 5.7, 2.3]), array([ 5.6, 2.8, 4.9, 2. ]), array([ 6.3, 2.7, 4.9, 1.8]), array([ 7.2, 3.2, 6. , 1.8]), array([ 6.1, 3. , 4.9, 1.8]), array([ 6.4, 2.8, 5.6, 2.1]), array([ 7.2, 3. , 5.8, 1.6]), array([ 7.9, 3.8, 6.4, 2. ]), array([ 6.4, 2.8, 5.6, 2.2]), array([ 6.1, 2.6, 5.6, 1.4]), array([ 7.7, 3. , 6.1, 2.3]), array([ 6.3, 3.4, 5.6, 2.4]), array([ 6. , 3. , 4.8, 1.8]), array([ 6.7, 3.1, 5.6, 2.4]), array([ 5.8, 2.7, 5.1, 1.9]), array([ 6.7, 3. , 5.2, 2.3]), array([ 6.5, 3. , 5.2, 2. ]), array([ 6.2, 3.4, 5.4, 2.3]), array([ 5.9, 3. , 5.1, 1.8])] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] [array([ 4.9, 3. , 1.4, 0.2]), array([ 5.4, 3.7, 1.5, 0.2]), array([ 5.7, 4.4, 1.5, 0.4]), array([ 5.7, 3.8, 1.7, 0.3]), array([ 4.8, 3.4, 1.9, 0.2]), array([ 5. , 3.4, 1.6, 0.4]), array([ 5.2, 3.4, 1.4, 0.2]), array([ 4.7, 3.2, 1.6, 0.2]), array([ 4.8, 3.1, 1.6, 0.2]), array([ 5.4, 3.4, 1.5, 0.4]), array([ 5.2, 4.1, 1.5, 0.1]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 5.5, 3.5, 1.3, 0.2]), array([ 4.9, 3.1, 1.5, 0.1]), array([ 4.4, 3. , 1.3, 0.2]), array([ 5.1, 3.4, 1.5, 0.2]), array([ 4.5, 2.3, 1.3, 0.3]), array([ 4.4, 3.2, 1.3, 0.2]), array([ 4.6, 3.2, 1.4, 0.2]), array([ 5. , 3.3, 1.4, 0.2]), array([ 6.4, 3.2, 4.5, 1.5]), array([ 5.5, 2.3, 4. , 1.3]), array([ 6.3, 3.3, 4.7, 1.6]), array([ 4.9, 2.4, 3.3, 1. ]), array([ 5.9, 3. , 4.2, 1.5]), array([ 6.2, 2.2, 4.5, 1.5]), array([ 5.6, 2.5, 3.9, 1.1]), array([ 6.1, 2.8, 4. 
, 1.3]), array([ 6.4, 2.9, 4.3, 1.3]), array([ 6.8, 2.8, 4.8, 1.4]), array([ 6.7, 3. , 5. , 1.7]), array([ 5.7, 2.6, 3.5, 1. ]), array([ 6. , 2.7, 5.1, 1.6]), array([ 6.3, 2.3, 4.4, 1.3]), array([ 5.6, 3. , 4.1, 1.3]), array([ 6.1, 3. , 4.6, 1.4]), array([ 5. , 2.3, 3.3, 1. ]), array([ 5.7, 3. , 4.2, 1.2]), array([ 5.7, 2.8, 4.1, 1.3]), array([ 6.3, 3.3, 6. , 2.5]), array([ 7.3, 2.9, 6.3, 1.8]), array([ 7.2, 3.6, 6.1, 2.5]), array([ 5.7, 2.5, 5. , 2. ]), array([ 6.5, 3. , 5.5, 1.8]), array([ 7.7, 3.8, 6.7, 2.2]), array([ 7.7, 2.8, 6.7, 2. ]), array([ 6.7, 3.3, 5.7, 2.1]), array([ 6.2, 2.8, 4.8, 1.8]), array([ 7.4, 2.8, 6.1, 1.9]), array([ 6.3, 2.8, 5.1, 1.5]), array([ 6.4, 3.1, 5.5, 1.8]), array([ 6.9, 3.1, 5.4, 2.1]), array([ 6.9, 3.1, 5.1, 2.3]), array([ 6.8, 3.2, 5.9, 2.3]), array([ 6.7, 3.3, 5.7, 2.5]), array([ 6.3, 2.5, 5. , 1.9])] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
from sklearn.cluster import KMeans
# Fit k-means with three clusters on the training samples only.
kmeans = KMeans(n_clusters=3)
kmeans.fit(training_feature_set)
# Assign every held-out sample to a learned cluster and convert the result to a plain list.
kmeans_target_set = kmeans.predict(test_feature_set).tolist()
# Print predicted cluster ids and true labels, separated by blank lines.
print kmeans_target_set
print
print test_target_set
print
# Percentage of test samples whose cluster maps onto their true label.
accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
print('Accuracy: ' + str(accuracy) + '%')
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] Accuracy: 83.6734693878%
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn import datasets
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Probability with which splitDataset places a sample in the training set.
split = 0.66
# Species names; presumably indexed by the integer class labels in iris.target
# (not referenced by the visible code -- TODO confirm).
iris_names = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
feature_names = ('sepal length', 'sepal width', 'petal length', 'petal width')
# Number of features per sample.
num_feature = len(feature_names)
def splitDataset(split, training_feature_set=None, training_target_set=None,
                 test_feature_set=None, test_target_set=None,
                 data=None, target=None):
    """Randomly partition samples into a training set and a test set.

    Each sample goes to the training set with probability ``split`` and to
    the test set otherwise.

    Args:
        split: probability (0..1) of assigning a sample to the training set.
        training_feature_set: optional list to append training features to.
        training_target_set: optional list to append training labels to.
        test_feature_set: optional list to append test features to.
        test_target_set: optional list to append test labels to.
        data: optional sequence of feature rows; defaults to ``iris.data``.
        target: optional sequence of labels; defaults to ``iris.target``.

    Returns:
        The tuple ``(training_feature_set, training_target_set,
        test_feature_set, test_target_set)``.
    """
    # Create fresh lists on every call. Mutable default arguments are built
    # once at definition time and would accumulate samples across calls.
    if training_feature_set is None:
        training_feature_set = []
    if training_target_set is None:
        training_target_set = []
    if test_feature_set is None:
        test_feature_set = []
    if test_target_set is None:
        test_target_set = []
    # Fall back to the module-level iris dataset when no data is supplied.
    if data is None:
        data = iris.data
    if target is None:
        target = iris.target

    for features, label in zip(data, target):
        if random.random() < split:
            training_feature_set.append(features)
            training_target_set.append(label)
        else:
            test_feature_set.append(features)
            test_target_set.append(label)
    return training_feature_set, training_target_set, test_feature_set, test_target_set
if __name__ == '__main__':
    # Average the k-means accuracy over several independent random splits.
    num_trials = 3
    accuracy_sum = 0.0
    for i in range(num_trials):
        # splitDataset appends to the lists it is given, so each trial passes
        # its own empty lists. This keeps samples from one trial out of the
        # training and test sets of the next.
        training_feature_set, training_target_set, test_feature_set, test_target_set = splitDataset(
            split, [], [], [], [])
        # Fit on the training samples, then cluster the held-out samples.
        kmeans = KMeans(n_clusters=3)
        kmeans.fit(training_feature_set)
        kmeans_target_set = kmeans.predict(test_feature_set).tolist()
        accuracy = getAccuracy(kmeans_target_set, test_target_set, len(test_target_set))
        accuracy_sum += accuracy
    print('Mean Accuracy: ' + str(accuracy_sum / float(num_trials)) + '%')
Mean Accuracy: 88.2586824684%