!pip install graphviz
Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (0.10.1)
import queue,graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from graphviz import Digraph
from sklearn.cluster import KMeans
from sklearn.tree import export_graphviz,DecisionTreeClassifier
I did not create Class of their algorithm, so this is a little experiment for me.
#ソートと動的計画法によって,最適な分割を求める (k=2)
#this func is main func and get best split condition using sort and dynamic programming
def optimal_threshold_2means(X):
bests_split = {'cost':np.inf,'coordinate':None,'threshold':None}
data_num = X.shape[0]
data_dimentions = X.shape[1]
u = np.sum(X*X)
for i in range(data_dimentions):
s = np.zeros(data_dimentions)
r = np.sum(X,axis=0)
ith_sorted_X = X[X[:,i].argsort(), :]
for j,data in enumerate(ith_sorted_X[:-1]):
s += data
r -= data
cost = u - np.sum(s*s)/(j+1) -np.sum(r*r)/(data_num-j-1)
#print(cost)
if cost < bests_split['cost'] and X[j][i]!=X[j+1][i]:
bests_split['cost'] = cost
bests_split['coordinate'] = i
bests_split['threshold'] = data[i]
return bests_split
#最適な分割に基づいてクラスタリング(k=2)
#this func is for clustering datasets based on best splits got from above func
def clustering_2means_by_tree(bests_split,X):
cluster = np.ones(X.shape[0])
for i,data in enumerate(X):
if(data[bests_split['coordinate']]>bests_split['threshold']):
cluster[i] = 0
return cluster
#得られた分割の中心座標を求める(k=2)
#this func is for calculating center points
def get_mean(X,approx_labels):
res=[]
for k in range(len(np.unique(approx_labels))):
n = 0
mean = np.zeros(X.shape[1])
for i,data in enumerate(X):
if(approx_labels[i]==k):
mean+=data
n+=1
res.append(mean/n)
return np.array(res)
#近似の比率を計算する,論文ではkmeansの場合,上界は4である(k=2).
#this func is for calculating approximation ratio
def approx_score(approx_labels,kmeans_model,X):
kmeans_cost = 0
kmeans_label = kmeans_model.labels_
kmeans_centers = kmeans_model.cluster_centers_
for i,data in enumerate(X):
kmeans_cost += np.sum((data-kmeans_centers[kmeans_label[i]])*(data-kmeans_centers[kmeans_label[i]]))
approx_cost = 0
mean = get_mean(X,approx_labels)
for k in range(kmeans_model.n_clusters):
for i,data in enumerate(X):
if(approx_labels[i]==k):
approx_cost += np.sum((data-mean[k])*(data-mean[k]))
print(kmeans_cost)
print(approx_cost)
return approx_cost/kmeans_cost
First is from this good article.
Second is from uci repo and I get the code from this good article.
# First dataset
# 生徒の国語・数学・英語の各得点を配列として与える
X = np.array([
[ 80, 85, 100 ],
[ 96, 100, 100 ],
[ 54, 83, 98 ],
[ 80, 98, 98 ],
[ 90, 92, 91 ],
[ 84, 78, 82 ],
[ 79, 100, 96 ],
[ 88, 92, 92 ],
[ 98, 73, 72 ],
[ 75, 84, 85 ],
[ 92, 100, 96 ],
[ 96, 92, 90 ],
[ 99, 76, 91 ],
[ 75, 82, 88 ],
[ 90, 94, 94 ],
[ 54, 84, 87 ],
[ 92, 89, 62 ],
[ 88, 94, 97 ],
[ 42, 99, 80 ],
[ 70, 98, 70 ],
[ 94, 78, 83 ],
[ 52, 73, 87 ],
[ 94, 88, 72 ],
[ 70, 73, 80 ],
[ 95, 84, 90 ],
[ 95, 88, 84 ],
[ 75, 97, 89 ],
[ 49, 81, 86 ],
[ 83, 72, 80 ],
[ 75, 73, 88 ],
[ 79, 82, 76 ],
[ 100, 77, 89 ],
[ 88, 63, 79 ],
[ 100, 50, 86 ],
[ 55, 96, 84 ],
[ 92, 74, 77 ],
[ 97, 50, 73 ],
])
#second datasets
cust_df = pd.read_csv("http://pythondatascience.plavox.info/wp-content/uploads/2016/05/Wholesale_customers_data.csv")
del(cust_df['Channel'])
del(cust_df['Region'])
cust_array = np.array([cust_df['Fresh'].tolist(),
cust_df['Milk'].tolist(),
cust_df['Grocery'].tolist(),
cust_df['Frozen'].tolist(),
cust_df['Milk'].tolist(),
cust_df['Detergents_Paper'].tolist(),
cust_df['Delicassen'].tolist()
], np.int32)
cust_array = cust_array.T
cust_array.shape
(440, 7)
#K-meansクラスタリングをおこなう
kmeans_model = KMeans(n_clusters=2, random_state=10).fit(X)
#分類先となったラベルを取得する
labels = kmeans_model.labels_
#提案手法による近似アルゴリズムで取得する
bests_split = optimal_threshold_2means(X)
approx_labels = clustering_2means_by_tree(bests_split,X)
print(bests_split)
print(approx_labels)
{'cost': 11316.030172413797, 'coordinate': 0, 'threshold': 70} [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
approx_score(approx_labels,kmeans_model,X) #近似アルゴリズムのコスト ➗ k-meansのコスト
11316.030172413793 11316.030172413793
1.0
#K-meansクラスタリングをおこなう
kmeans_model = KMeans(n_clusters=2, random_state=10).fit(cust_array)
#分類先となったラベルを取得する
labels = kmeans_model.labels_
#提案手法による近似アルゴリズムで取得する
bests_split = optimal_threshold_2means(cust_array)
approx_labels = clustering_2means_by_tree(bests_split,cust_array)
print(bests_split)
{'cost': 41288343652.5358, 'coordinate': 2, 'threshold': 16483}
approx_score(approx_labels,kmeans_model,cust_array) #近似アルゴリズムのコスト ➗ k-meansのコスト
132340344661.33641 135777624164.53575
1.0259730281948067
I did not create Class of their algorithm, so this is a little experiment for me.
class TreeNode:
def __init__(self, cluster=None, left=None, right=None,condition=None):
self.cluster = cluster
self.left = left
self.right = right
self.condition = (0,0) #(i,threshold) x_i <= threshold or x_i > threshold
def minimum_center(i,labels,centers):
minimum = np.inf
for j in labels:
minimum = min(minimum, centers[j][i])
return minimum
def maximum_center(i,labels,centers):
maximum = -np.inf
for j in labels:
maximum = max(maximum, centers[j][i])
return maximum
def mistake(x,center,i,threshold):
return 0 if ((x[i]<=threshold) == (center[i]<=threshold)) else 1
def delete_mistakes_data(X,labels,centers,i,threshold):
new_data = []
new_labels=[]
for idx,x in enumerate(X):
if(mistake(x,centers[labels[idx]],i,threshold)==0):
new_data.append(x)
new_labels.append(labels[idx])
return np.array(new_data),np.array(new_labels)
def make_next_data(X,labels,i,threshold):
l_data=[]
l_labels=[]
r_data=[]
r_labels=[]
for idx,x in enumerate(X):
if(x[i]<=threshold):
l_data.append(x)
l_labels.append(labels[idx])
else:
r_data.append(x)
r_labels.append(labels[idx])
return np.array(l_data),np.array(l_labels),np.array(r_data),np.array(r_labels)
def count_mistakes(X,l,i,labels,centers):
cnt=0
for idx,x in enumerate(X):
if(mistake(x,centers[labels[idx]],i,l[i])==1):
cnt+=1
return cnt
def get_best_splits(X,l,r,labels,centers):
bests_split = {'mistake':np.inf,'coordinate':None,'threshold':None}
data_dimentions = X.shape[1]
for i in range(data_dimentions):
ith_sorted_X = X[X[:,i].argsort(), :]
ith_sorted_centers = centers[centers[:,i].argsort(), :]
idx_center = 1
cnt_mistakes = count_mistakes(X,l,i,labels,centers)
for j,x in enumerate(ith_sorted_X[:-1]):
if(l[i]>x[i] or x[i]>=r[i]):
continue
cnt_mistakes = count_mistakes(X,x,i,labels,centers) #ここで本来はDPでより効率よく計算すべきだが,やり方がよくわからない.なのでナイーブなやり方でやっている.つまり,全データに対してその分割でmistakeとなるのか否かを調べている
if bests_split['mistake'] > cnt_mistakes:
bests_split['mistake'] = cnt_mistakes
bests_split['coordinate'] = i
bests_split['threshold'] = x[i]
print("num of mistakes at this node => {}".format(bests_split['mistake']))
return bests_split['coordinate'],bests_split['threshold']
def build_tree(X,labels,centers):
node = TreeNode()
l=[]
r=[]
#論文疑似コード 2〜4行目
if(len(np.unique(labels))==1):
node.cluster = labels[0]
return node
#論文疑似コード 6〜9行目
for i in range(X.shape[1]):
l.append(minimum_center(i,labels,centers))
r.append(maximum_center(i,labels,centers))
#論文疑似コード 10〜13行目
i,threshold = get_best_splits(X,l,r,labels,centers)
X,labels = delete_mistakes_data(X,labels,centers,i,threshold)
left_data,left_labels,right_data,right_labels = make_next_data(X,labels,i,threshold)
#論文疑似コード 14〜16行目
node.condition = (i,threshold)
node.left = build_tree(left_data,left_labels,centers)
node.right = build_tree(right_data,right_labels,centers)
return node
#IMM procedure
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(X)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
root = build_tree(X,labels,centers)
make_tree(root,kmeans_model.n_clusters)
num of mistakes at this node => 1 num of mistakes at this node => 2
⇩クラスタ0におけるmistakeは1つだけ
labels[X[:,0] <= 75]
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int32)
⇩クラスタ2におけるmistakeはない(0は考えなくて良い).
labels[X[:,1] <= 82]
array([2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2], dtype=int32)
⇩クラスタ1におけるmistakeは2つ
labels[X[:,1] > 82]
array([1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0], dtype=int32)
#IMM procedure
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(cust_array)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
root = build_tree(cust_array,labels,centers)
make_tree(root,kmeans_model.n_clusters)
num of mistakes at this node => 8 num of mistakes at this node => 11
labels[cust_array[:,0] > 20049]
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)
labels[cust_array[:,1] <= 12220]
array([2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2], dtype=int32)
labels[cust_array[:,1] > 12220]
array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 0, 1], dtype=int32)
func for visualization
sorry for dirty code...
def make_tree(root,n_clusters):
G = Digraph(format='png')
G.attr('node', shape='circle')
N = 2*n_clusters - 1 #ノード数
q = queue.Queue()
q.put(root)
if(root.right.cluster != None):
G.node(str(0),"X_{} > {}".format(root.condition[0],root.condition[1]))
else:
G.node(str(0),"X_{} <= {}".format(root.condition[0],root.condition[1]))
i=1
while not q.empty():
root = q.get()
if root.left.cluster != None and root.right.cluster != None:
G.node(str(i), str(root.left.cluster))
G.edge(str(i-1), str(i),label='True')
G.node(str(i+1), str(root.right.cluster))
G.edge(str(i-1), str(i+1),label='False')
elif root.right.cluster != None:
G.node(str(i), str(root.right.cluster))
G.edge(str(i-1), str(i),label='True')
G.node(str(i+1),"X_{} <= {}".format(root.left.condition[0],root.left.condition[1]))
G.edge(str(i-1), str(i+1),label='False')
q.put(root.left)
else:
G.node(str(i), str(root.left.cluster))
G.edge(str(i-1), str(i),label='True')
G.node(str(i+1),"X_{} <= {}".format(root.right.condition[0],root.right.condition[1]))
G.edge(str(i-1), str(i+1),label='False')
q.put(root.right)
i+=2
return G
def make_toydata(v):
mean1 = np.array([2, 0])
cov1 = np.array([[0.3, 0], [0, 0.3]])
data_1 = np.random.multivariate_normal(mean1, cov1, size=200)
mean2 = np.array([-2, 0])
cov2 = np.array([[0.3, 0], [0, 0.3]])
data_2 = np.random.multivariate_normal(mean2, cov2, size=200)
data_3 = np.array([[-2,v],[2,v]])
return data_1,data_2,data_3
data_1,data_2,data_3 = make_toydata(v=100)
plt.scatter(data_1[:,0],data_1[:,1])
plt.scatter(data_2[:,0],data_2[:,1])
<matplotlib.collections.PathCollection at 0x7f28e98d7eb8>
plt.scatter(data_1[:,0],data_1[:,1])
plt.scatter(data_2[:,0],data_2[:,1])
plt.scatter(data_3[:,0],data_3[:,1])
<matplotlib.collections.PathCollection at 0x7f28e9811e80>
toy_X = np.concatenate([data_1,data_2,data_3])
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(toy_X)
centers = kmeans_model.cluster_centers_
labels = kmeans_model.labels_
labels
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1], dtype=int32)
0クラスタが200データ,1クラスタが2データ,2クラスタが200データと狙い通りにクラスタリングできている
dt = DecisionTreeClassifier(criterion='entropy',max_leaf_nodes=3).fit(toy_X,labels)
dt.predict(toy_X)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0], dtype=int32)
やはり最初の分割が論文中のような分割になっている(data_3の片方が右へ,片方が左へ)
dot_data = export_graphviz(
dt,
filled=True,
)
graph = graphviz.Source(dot_data)
graph
for i in range(1,10000,1000):
data_1,data_2,data_3 = make_toydata(v=i)
toy_X = np.concatenate([data_1,data_2,data_3])
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(toy_X)
kmeans_labels = kmeans_model.labels_
dt = DecisionTreeClassifier(criterion='entropy',max_leaf_nodes=3).fit(toy_X,kmeans_labels)
cost = 0
labels = dt.predict(toy_X)
centers = get_mean(toy_X,labels)
for i,data in enumerate(toy_X):
cost += np.sum((data-kmeans_centers[labels[i]])*(data-kmeans_centers[labels[i]]))
print("Optimal Score:{} vs Decision Tree Score:{}".format(kmeans_model.score(toy_X),cost))
Optimal Score:-199.17982861954835 vs Decision Tree Score:16284591918.10414 Optimal Score:-246.18124015152924 vs Decision Tree Score:65002282.27015967 Optimal Score:-246.4869545232153 vs Decision Tree Score:53004322.696862355 Optimal Score:-219.56312850576407 vs Decision Tree Score:45006333.642052606 Optimal Score:-260.7027891214368 vs Decision Tree Score:41008413.387505375 Optimal Score:-244.1077337078409 vs Decision Tree Score:41010435.60921242 Optimal Score:-267.98712249271443 vs Decision Tree Score:45012495.40759582 Optimal Score:-254.49826816169144 vs Decision Tree Score:53014522.64540449 Optimal Score:-272.9091436380789 vs Decision Tree Score:65016576.71783102 Optimal Score:-239.80939207439016 vs Decision Tree Score:81024105.0891669
v→無限大につれて,近似スコアは無限大になってしまう(もちろん,葉をクラスタ数にしなければこれは起こらないが)
#sklearnのkmeansは目的関数のスコアのマイナスをreturnする