!pip install graphviz

import queue
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from graphviz import Digraph
from sklearn.cluster import KMeans
from sklearn.tree import export_graphviz, DecisionTreeClassifier


# Find the optimal axis-aligned split by sorting and dynamic programming (k=2).
# Sorting each coordinate and sweeping with running prefix sums lets us compute
# every candidate split's cost incrementally.
def optimal_threshold_2means(X):
    best_split = {'cost': np.inf, 'coordinate': None, 'threshold': None}
    data_num = X.shape[0]
    data_dimensions = X.shape[1]
    u = np.sum(X * X)
    for i in range(data_dimensions):
        s = np.zeros(data_dimensions)          # running sum of the left side
        r = np.sum(X, axis=0)                  # running sum of the right side
        ith_sorted_X = X[X[:, i].argsort(), :]
        for j, data in enumerate(ith_sorted_X[:-1]):
            s += data
            r -= data
            cost = u - np.sum(s * s) / (j + 1) - np.sum(r * r) / (data_num - j - 1)
            # Skip thresholds that fall between identical coordinate values
            # (compare neighbours in the *sorted* order).
            if cost < best_split['cost'] and ith_sorted_X[j][i] != ith_sorted_X[j + 1][i]:
                best_split['cost'] = cost
                best_split['coordinate'] = i
                best_split['threshold'] = data[i]
    return best_split


# Cluster the dataset (k=2) according to the best split found above.
def clustering_2means_by_tree(best_split, X):
    cluster = np.ones(X.shape[0])
    for i, data in enumerate(X):
        if data[best_split['coordinate']] > best_split['threshold']:
            cluster[i] = 0
    return cluster


# Compute the center (mean) of each cluster in the obtained partition (k=2).
def get_mean(X, approx_labels):
    res = []
    for k in range(len(np.unique(approx_labels))):
        n = 0
        mean = np.zeros(X.shape[1])
        for i, data in enumerate(X):
            if approx_labels[i] == k:
                mean += data
                n += 1
        res.append(mean / n)
    return np.array(res)


# Compute the approximation ratio; the paper proves an upper bound of 4
# for k-means with k=2.
def approx_score(approx_labels, kmeans_model, X):
    kmeans_cost = 0
    kmeans_label = kmeans_model.labels_
    kmeans_centers = kmeans_model.cluster_centers_
    for i, data in enumerate(X):
        kmeans_cost += np.sum((data - kmeans_centers[kmeans_label[i]]) ** 2)
    approx_cost = 0
    mean = get_mean(X, approx_labels)
    for k in range(kmeans_model.n_clusters):
        for i, data in enumerate(X):
            if approx_labels[i] == k:
                approx_cost += np.sum((data - mean[k]) ** 2)
    print("k-means cost:", kmeans_cost)
    print("approximation cost:", approx_cost)
    return approx_cost / kmeans_cost


# First dataset: each row gives one student's scores in Japanese, math, and English.
X = np.array([
    [80, 85, 100], [96, 100, 100], [54, 83, 98], [80, 98, 98],
    [90, 92, 91], [84, 78, 82], [79, 100, 96], [88, 92, 92],
    [98, 73, 72], [75, 84, 85], [92, 100, 96], [96, 92, 90],
    [99, 76, 91], [75, 82, 88], [90, 94, 94], [54, 84, 87],
    [92, 89, 62], [88, 94, 97], [42, 99, 80], [70, 98, 70],
    [94, 78, 83], [52, 73, 87], [94, 88, 72], [70, 73, 80],
    [95, 84, 90], [95, 88, 84], [75, 97, 89], [49, 81, 86],
    [83, 72, 80], [75, 73, 88], [79, 82, 76], [100, 77, 89],
    [88, 63, 79], [100, 50, 86], [55, 96, 84], [92, 74, 77],
    [97, 50, 73],
])

# Second dataset: wholesale customer spending records.
cust_df = pd.read_csv("http://pythondatascience.plavox.info/wp-content/uploads/2016/05/Wholesale_customers_data.csv")
del cust_df['Channel']
del cust_df['Region']
cust_array = np.array([cust_df['Fresh'].tolist(),
                       cust_df['Milk'].tolist(),
                       cust_df['Grocery'].tolist(),
                       cust_df['Frozen'].tolist(),
                       cust_df['Detergents_Paper'].tolist(),
                       cust_df['Delicassen'].tolist()], np.int32)
cust_array = cust_array.T
cust_array.shape

# Run k-means clustering (k=2) on the score data.
kmeans_model = KMeans(n_clusters=2, random_state=10).fit(X)
# Labels assigned by k-means.
labels = kmeans_model.labels_
# Labels produced by the proposed approximation algorithm.
best_split = optimal_threshold_2means(X)
approx_labels = clustering_2means_by_tree(best_split, X)
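# Optional sanity check, a minimal sketch not in the original notebook:
# brute_force_threshold_2means is a hypothetical helper that tries every
# coordinate/threshold pair directly, in O(n^2 d) time. Its best cost should
# agree with optimal_threshold_2means up to floating-point rounding; the
# sweep above computes the same costs incrementally from prefix sums.
def brute_force_threshold_2means(X):
    best = {'cost': np.inf, 'coordinate': None, 'threshold': None}
    for i in range(X.shape[1]):
        # Every distinct value except the maximum is a valid threshold.
        for t in np.unique(X[:, i])[:-1]:
            left, right = X[X[:, i] <= t], X[X[:, i] > t]
            cost = (np.sum((left - left.mean(axis=0)) ** 2)
                    + np.sum((right - right.mean(axis=0)) ** 2))
            if cost < best['cost']:
                best = {'cost': cost, 'coordinate': i, 'threshold': t}
    return best

# e.g. brute_force_threshold_2means(X)['cost'] should match
# optimal_threshold_2means(X)['cost'].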
print(best_split)
print(approx_labels)
approx_score(approx_labels, kmeans_model, X)  # approximation cost / k-means cost

# Run k-means clustering (k=2) on the wholesale customer data.
kmeans_model = KMeans(n_clusters=2, random_state=10).fit(cust_array)
# Labels assigned by k-means.
labels = kmeans_model.labels_
# Labels produced by the proposed approximation algorithm.
best_split = optimal_threshold_2means(cust_array)
approx_labels = clustering_2means_by_tree(best_split, cust_array)
print(best_split)
approx_score(approx_labels, kmeans_model, cust_array)  # approximation cost / k-means cost


class TreeNode:
    def __init__(self, cluster=None, left=None, right=None, condition=(0, 0)):
        self.cluster = cluster
        self.left = left
        self.right = right
        # (i, threshold): x_i <= threshold goes left, x_i > threshold goes right.
        self.condition = condition


# Smallest i-th coordinate among the centers still present at this node.
def minimum_center(i, labels, centers):
    minimum = np.inf
    for j in labels:
        minimum = min(minimum, centers[j][i])
    return minimum


# Largest i-th coordinate among the centers still present at this node.
def maximum_center(i, labels, centers):
    maximum = -np.inf
    for j in labels:
        maximum = max(maximum, centers[j][i])
    return maximum


# A point is a "mistake" if the split (i, threshold) separates it from its center.
def mistake(x, center, i, threshold):
    return 0 if (x[i] <= threshold) == (center[i] <= threshold) else 1


# Remove the points that the chosen split separates from their centers.
def delete_mistakes_data(X, labels, centers, i, threshold):
    new_data = []
    new_labels = []
    for idx, x in enumerate(X):
        if mistake(x, centers[labels[idx]], i, threshold) == 0:
            new_data.append(x)
            new_labels.append(labels[idx])
    return np.array(new_data), np.array(new_labels)


# Partition the remaining points into the two children of the split.
def make_next_data(X, labels, i, threshold):
    l_data, l_labels = [], []
    r_data, r_labels = [], []
    for idx, x in enumerate(X):
        if x[i] <= threshold:
            l_data.append(x)
            l_labels.append(labels[idx])
        else:
            r_data.append(x)
            r_labels.append(labels[idx])
    return np.array(l_data), np.array(l_labels), np.array(r_data), np.array(r_labels)


# Count the mistakes made by splitting coordinate i at threshold t[i].
def count_mistakes(X, t, i, labels, centers):
    cnt = 0
    for idx, x in enumerate(X):
        if mistake(x, centers[labels[idx]], i, t[i]) == 1:
            cnt += 1
    return cnt


def get_best_splits(X, l, r, labels, centers):
    best_split = {'mistake': np.inf, 'coordinate': None, 'threshold': None}
    data_dimensions = X.shape[1]
    for i in range(data_dimensions):
        ith_sorted_X = X[X[:, i].argsort(), :]
        for j, x in enumerate(ith_sorted_X[:-1]):
            # Only thresholds that keep at least one center on each side are valid.
            if l[i] > x[i] or x[i] >= r[i]:
                continue
            # Ideally the mistake count would be maintained incrementally (by
            # dynamic programming) while sweeping the sorted data, but the
            # method is unclear, so we do it naively: for each candidate split
            # we check every data point for a mistake.
            cnt_mistakes = count_mistakes(X, x, i, labels, centers)
            if best_split['mistake'] > cnt_mistakes:
                best_split['mistake'] = cnt_mistakes
                best_split['coordinate'] = i
                best_split['threshold'] = x[i]
    print("num of mistakes at this node => {}".format(best_split['mistake']))
    return best_split['coordinate'], best_split['threshold']


def build_tree(X, labels, centers):
    node = TreeNode()
    l = []
    r = []
    # Paper pseudocode, lines 2-4: a node with a single cluster becomes a leaf.
    if len(np.unique(labels)) == 1:
        node.cluster = labels[0]
        return node
    # Paper pseudocode, lines 6-9: per-coordinate min/max over the centers.
    for i in range(X.shape[1]):
        l.append(minimum_center(i, labels, centers))
        r.append(maximum_center(i, labels, centers))
    # Paper pseudocode, lines 10-13: take the split with the fewest mistakes
    # and drop the mistaken points.
    i, threshold = get_best_splits(X, l, r, labels, centers)
    X, labels = delete_mistakes_data(X, labels, centers, i, threshold)
    left_data, left_labels, right_data, right_labels = make_next_data(X, labels, i, threshold)
    # Paper pseudocode, lines 14-16: recurse into both children.
    node.condition = (i, threshold)
    node.left = build_tree(left_data, left_labels, centers)
    node.right = build_tree(right_data, right_labels, centers)
    return node


# Render the threshold tree with graphviz. The renderer assumes the
# caterpillar-shaped trees these small-k demos produce: at each level at most
# one child is internal, and internal children are pre-labelled with the
# '<=' form of their condition.
def make_tree(root, n_clusters):
    G = Digraph(format='png')
    G.attr('node', shape='circle')
    N = 2 * n_clusters - 1  # number of nodes in a tree with n_clusters leaves
    q = queue.Queue()
    q.put(root)
    # Show the root condition as '>' only when the 'True' edge will point at
    # the right child, i.e. only the right child is a leaf.
    if root.right.cluster is not None and root.left.cluster is None:
        G.node(str(0), "X_{} > {}".format(root.condition[0], root.condition[1]))
    else:
        G.node(str(0), "X_{} <= {}".format(root.condition[0], root.condition[1]))
    i = 1
    while not q.empty():
        root = q.get()
        if root.left.cluster is not None and root.right.cluster is not None:
            # Both children are leaves.
            G.node(str(i), str(root.left.cluster))
            G.edge(str(i - 1), str(i), label='True')
            G.node(str(i + 1), str(root.right.cluster))
            G.edge(str(i - 1), str(i + 1), label='False')
        elif root.right.cluster is not None:
            # Right child is a leaf; keep descending on the left.
            G.node(str(i), str(root.right.cluster))
            G.edge(str(i - 1), str(i), label='True')
            G.node(str(i + 1), "X_{} <= {}".format(root.left.condition[0], root.left.condition[1]))
            G.edge(str(i - 1), str(i + 1), label='False')
            q.put(root.left)
        else:
            # Left child is a leaf; keep descending on the right.
            G.node(str(i), str(root.left.cluster))
            G.edge(str(i - 1), str(i), label='True')
            G.node(str(i + 1), "X_{} <= {}".format(root.right.condition[0], root.right.condition[1]))
            G.edge(str(i - 1), str(i + 1), label='False')
            q.put(root.right)
        i += 2
    return G


# IMM procedure on the score data (k=3).
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(X)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
root = build_tree(X, labels, centers)
make_tree(root, kmeans_model.n_clusters)

# Inspect the k-means labels on each side of the learned splits.
labels[X[:, 0] <= 75]
labels[X[:, 1] <= 82]
labels[X[:, 1] > 82]
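# Helper sketch, an assumption not taken from the paper's pseudocode:
# predict_tree is a hypothetical function that routes a point down the tree
# built by build_tree and reads off its leaf's cluster. Internal nodes test
# x[i] <= threshold (left branch) vs x[i] > threshold (right branch), matching
# the convention used by make_next_data and TreeNode.condition.
def predict_tree(node, x):
    while node.cluster is None:
        i, threshold = node.condition
        node = node.left if x[i] <= threshold else node.right
    return node.cluster

# e.g. compare the tree's assignments with the k-means labels:
# tree_labels = np.array([predict_tree(root, x) for x in X])
# np.mean(tree_labels == labels)  # fraction of points the tree agrees on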
# IMM procedure on the wholesale customer data (k=3).
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(cust_array)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
root = build_tree(cust_array, labels, centers)
make_tree(root, kmeans_model.n_clusters)

# Inspect the k-means labels on each side of the learned splits.
labels[cust_array[:, 0] > 20049]
labels[cust_array[:, 1] <= 12220]
labels[cust_array[:, 1] > 12220]


# Toy dataset: two Gaussian blobs centred at (+-2, 0) plus two outliers at height v.
def make_toydata(v):
    mean1 = np.array([2, 0])
    cov1 = np.array([[0.3, 0], [0, 0.3]])
    data_1 = np.random.multivariate_normal(mean1, cov1, size=200)
    mean2 = np.array([-2, 0])
    cov2 = np.array([[0.3, 0], [0, 0.3]])
    data_2 = np.random.multivariate_normal(mean2, cov2, size=200)
    data_3 = np.array([[-2, v], [2, v]])
    return data_1, data_2, data_3


data_1, data_2, data_3 = make_toydata(v=100)
plt.scatter(data_1[:, 0], data_1[:, 1])
plt.scatter(data_2[:, 0], data_2[:, 1])
plt.scatter(data_3[:, 0], data_3[:, 1])

toy_X = np.concatenate([data_1, data_2, data_3])
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(toy_X)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
labels

# Fit a standard (greedy) decision tree on the k-means labels and visualize it.
dt = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=3).fit(toy_X, labels)
dt.predict(toy_X)
dot_data = export_graphviz(dt, filled=True)
graph = graphviz.Source(dot_data)
graph

# Compare the k-means cost with the cost of the decision-tree clustering as
# the outliers move further away.
for v in range(1, 10000, 1000):
    data_1, data_2, data_3 = make_toydata(v=v)
    toy_X = np.concatenate([data_1, data_2, data_3])
    kmeans_model = KMeans(n_clusters=3, random_state=10).fit(toy_X)
    kmeans_labels = kmeans_model.labels_
    dt = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=3).fit(toy_X, kmeans_labels)
    cost = 0
    labels = dt.predict(toy_X)
    centers = get_mean(toy_X, labels)
    for i, data in enumerate(toy_X):
        cost += np.sum((data - centers[labels[i]]) ** 2)
    # KMeans.score returns the negative of the k-means objective, so negate it.
    print("Optimal Score:{} vs Decision Tree Score:{}".format(-kmeans_model.score(toy_X), cost))
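# Optional follow-up sketch, an assumption not in the original notebook: run
# the IMM tree on the same toy data and compare its cost with the greedy
# decision tree, using the hypothetical predict_tree helper defined earlier.
# The greedy entropy split tends to separate the two big blobs first (that is
# where the information gain is), stranding one outlier in a blob leaf at a
# cost of roughly v**2, while IMM picks the split with the fewest mistakes
# and should stay close to the k-means cost.
def clustering_cost(X, labels):
    # Sum of squared distances from each point to its own cluster's mean.
    return sum(np.sum((X[labels == k] - X[labels == k].mean(axis=0)) ** 2)
               for k in np.unique(labels))

data_1, data_2, data_3 = make_toydata(v=1000)
toy_X = np.concatenate([data_1, data_2, data_3])
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(toy_X)
imm_root = build_tree(toy_X, kmeans_model.labels_, kmeans_model.cluster_centers_)
imm_labels = np.array([predict_tree(imm_root, x) for x in toy_X])
dt = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=3).fit(toy_X, kmeans_model.labels_)
print("k-means cost:", -kmeans_model.score(toy_X))
print("IMM tree cost:", clustering_cost(toy_X, imm_labels))
print("greedy tree cost:", clustering_cost(toy_X, dt.predict(toy_X)))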