### k-means¶

In [5]:
function input_data()
parse_data1= Float64[]
parse_data2 = Float64[]
open("./dataset/data_2.txt","r") do fp
for i in 1:length(data)
push!(parse_data1,parse(Float64,split(data[i], " ")[1]))
push!(parse_data2 ,parse(Float64,split(data[i], " ")[2]))
end
end
X = [parse_data1 parse_data2]
return X
end

X = input_data()

using Plots
gr()

function dist(μx, μy, xx, xy)
# ユークリッド距離
return sqrt((xx-μx)^2+(μy-xy)^2)
end

# 重心の数
k = 9
# 重心の変数．
μx_old = Float64[]
μy_old = Float64[]
for i in 1:k
# データの端を乱数の範囲にする．
# 初期の重心を決める．
push!(μx_old, rand(minimum(X[:,1]):0.1:maximum(X[:,1])))
push!(μy_old, rand(minimum(X[:,2]):0.1:maximum(X[:,2])))
end

min = 1000000.0
cluster = 0
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
clusters = Int64[]
n = length(X[:,1])

μx_new = Float64[]
μy_new = Float64[]

# 収束判定用の変数
error = 100.0
error_arr = Float64[]

# プロットのgif用変数
plot_data_arr = Any[]
μx_arr = Any[]
μy_arr = Any[]

# 誤差関数が収束すれば終わり．
while error != 0.0
clusters = Int64[]

for i in 1:n
# 重心割り当て
# どの重心に一番近いかを決める．その距離の計算をしている．
# iには全データが入る．
min = 100000.0
for j in 1:k
distance = dist(μx_old[j], μy_old[j], X[i,1], X[i,2])
if min > distance
min = distance
cluster = j
end
end
push!(clusters, cluster)
end

plotdata = [X [string(i) for i=clusters];]
push!(plot_data_arr, plotdata)

# 主に重心の計算
for i in 1:k
cluster_len = length(findin(plotdata[:,3],["$i"])) for j in findin(plotdata[:,3],["$i"])
# 下二行が重心の計算
push!(center_of_gravity_x,((X[j,1]) / cluster_len))
push!(center_of_gravity_y,((X[j,2]) / cluster_len))
end
push!(μx_new, sum(center_of_gravity_x))
push!(μy_new, sum(center_of_gravity_y))
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
error = dist(μx_new[i], μy_new[i], μx_old[i], μy_old[i])
push!(error_arr, error)
end
push!(μx_arr, μx_old)
push!(μy_arr, μy_old)
μx_old = μx_new
μy_old = μy_new
μx_new = Float64[]
μy_new = Float64[]
end

tmp_x = Float64[]
tmp_y = Float64[]
x_arr = []
y_arr = []

function result(cluster_num)
tmp_x = Float64[]
tmp_y = Float64[]
for p in 1:cluster_num
result = findin(plotdata[:,3],["$p"]) # 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する． # 重心数が３だったら三つの配列で帰ってくる． for i in result push!(tmp_x, plotdata[i,1]) push!(tmp_y, plotdata[i,2]) end push!(x_arr, tmp_x) push!(y_arr, tmp_y) tmp_x = Float64[] tmp_y = Float64[] end end plotdata = [X [string(i) for i=clusters];] result(k) plotdata  Out[5]: 1000×3 Array{Any,2}: -3.93672 -3.5936 "3" -7.03037 -4.30673 "3" -5.61878 -6.48806 "3" -6.36537 -3.86141 "3" -3.42238 -4.81756 "2" -3.96087 -6.21365 "2" -5.2541 -4.79723 "3" -3.94911 -5.78381 "2" -5.35447 -4.52871 "3" -5.46655 -4.0682 "3" -4.74938 -5.46327 "3" -3.27678 -4.64628 "2" -4.15944 -3.41176 "3" ⋮ -1.04288 6.2303 "4" 1.50573 4.92199 "4" 0.99633 5.6953 "4" -0.255848 4.59955 "5" 0.0165902 4.76575 "5" -1.09301 6.4878 "4" 0.349573 5.30376 "4" 0.225971 3.7339 "5" 0.947491 7.34555 "4" -0.591647 6.59899 "4" 1.44914 4.65748 "5" -1.62489 6.74928 "4" In [8]: function result2(cluster_num) tmp_x = Float64[] tmp_y = Float64[] x_arr = [] y_arr = [] @gif for m in 1:length(plot_data_arr) plotdata = plot_data_arr[m] μx_old = μx_arr[m] μy_old = μy_arr[m] for p in 1:cluster_num result = findin(plotdata[:,3],["$p"])
# 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する．
# 重心数が３だったら三つの配列で帰ってくる．
for i in result
push!(tmp_x, plotdata[i,1])
push!(tmp_y, plotdata[i,2])
end
push!(x_arr, tmp_x)
push!(y_arr, tmp_y)
tmp_x = Float64[]
tmp_y = Float64[]
end
scatter(x_arr, y_arr)
scatter!(μx_old, μy_old,label="center")
x_arr = []
y_arr = []
end every 1
end
result2(k)

INFO: Saved animation to /Users/noriakioshita/Github/julia/ml_page/julia/tmp.gif

Out[8]:
In [7]:
length(plot_data_arr)

Out[7]:
16
In [6]:
scatter(x_arr, y_arr)
scatter!(μx_old, μy_old,label="center")

Out[6]:
In [9]:
plot(error_arr)

Out[9]:

### k-means++¶

In [116]:
function input_data()
parse_data1= Float64[]
parse_data2 = Float64[]
open("./dataset/data_2.txt","r") do fp
for i in 1:length(data)
push!(parse_data1,parse(Float64,split(data[i], " ")[1]))
push!(parse_data2 ,parse(Float64,split(data[i], " ")[2]))
end
end
X = [parse_data1 parse_data2]
return X
end

X = input_data()
n = length(X[:,1])

using Plots
gr()

function dist(μx, μy, xx, xy)
return sqrt((xx-μx)^2+(μy-xy)^2)
end

# 重心の数
k = 5
# 重心の変数．
μx_old = Float64[]
μy_old = Float64[]

DX = Float64[]
function centroid()
# データ点からランダムに選びそれを重心とする．
z1 = rand(1:1:length(X[:,1]))
push!(μx_old, X[z1,1])
push!(μy_old, X[z1,2])
DX = Float64[]
max = 0
max_num = 0

# データ点との距離を代入する．
for i in 1:n
Dx = dist(X[i,1], X[i,2], X[z1,1], X[z1,2])
push!(DX, Dx)
for i in 1:k
max = 0
for j in 1:n
Dx = dist(X[j,1], X[j,2], X[z1,1], X[z1,2])
if max < Dx
max = Dx^2 / sum(DX)^2
max_num = j
end
end
push!(μx_old, X[max_num,1])
push!(μy_old, X[max_num,2])
end
end

"    for i in 2:k
max = 0
for j in 1:n
Dx = dist(X[j,1], X[j,2], X[z1,1], X[z1,2])
if max < Dx
max = Dx^2 / sum(DX)^2
max_num = j
end
end
push!(μx_old, X[max_num,1])
push!(μy_old, X[max_num,2])
end"
end

centroid()

min = 1000000.0
cluster = 0
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
clusters = Int64[]
n = length(X[:,1])

μx_new = Float64[]
μy_new = Float64[]

# 収束判定用の変数
error = 100.0
error_arr = Float64[]

# 誤差関数が収束すれば終わり．
while error != 0.0
clusters = Int64[]

for i in 1:n
# 重心割り当て
# どの重心に一番近いかを決める．その距離の計算をしている．
# iには全データが入る．
min = 100000.0
for j in 1:k
distance = dist(μx_old[j], μy_old[j], X[i,1], X[i,2])
if min > distance
min = distance
cluster = j
end
end
push!(clusters, cluster)
end

plotdata = [X [string(i) for i=clusters];]

# 主に重心の計算
for i in 1:k
cluster_len = length(findin(plotdata[:,3],["$i"])) for j in findin(plotdata[:,3],["$i"])
# 下二行が重心の計算
push!(center_of_gravity_x,((X[j,1]) / cluster_len))
push!(center_of_gravity_y,((X[j,2]) / cluster_len))
end
push!(μx_new, sum(center_of_gravity_x))
push!(μy_new, sum(center_of_gravity_y))
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
error = dist(μx_new[i], μy_new[i], μx_old[i], μy_old[i])
push!(error_arr, error)
end
μx_old = μx_new
μy_old = μy_new
μx_new = Float64[]
μy_new = Float64[]
end

tmp_x = Float64[]
tmp_y = Float64[]
x_arr = []
y_arr = []

function result(cluster_num)
tmp_x = Float64[]
tmp_y = Float64[]
for p in 1:cluster_num
result = findin(plotdata[:,3],["\$p"])
# 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する．
# 重心数が３だったら三つの配列で帰ってくる．
for i in result
push!(tmp_x, plotdata[i,1])
push!(tmp_y, plotdata[i,2])
end
push!(x_arr, tmp_x)
push!(y_arr, tmp_y)
tmp_x = Float64[]
tmp_y = Float64[]
end
end

plotdata = [X [string(i) for i=clusters];]

result(k)
plotdata

Out[116]:
1000×3 Array{Any,2}:
-3.93672    -3.5936   "2"
-7.03037    -4.30673  "2"
-5.61878    -6.48806  "2"
-6.36537    -3.86141  "2"
-3.42238    -4.81756  "2"
-3.96087    -6.21365  "2"
-5.2541     -4.79723  "2"
-3.94911    -5.78381  "2"
-5.35447    -4.52871  "2"
-5.46655    -4.0682   "2"
-4.74938    -5.46327  "2"
-3.27678    -4.64628  "2"
-4.15944    -3.41176  "2"
⋮
-1.04288     6.2303   "2"
1.50573     4.92199  "3"
0.99633     5.6953   "3"
-0.255848    4.59955  "3"
0.0165902   4.76575  "3"
-1.09301     6.4878   "2"
0.349573    5.30376  "3"
0.225971    3.7339   "3"
0.947491    7.34555  "3"
-0.591647    6.59899  "3"
1.44914     4.65748  "3"
-1.62489     6.74928  "2"
In [117]:
scatter(x_arr, y_arr)
scatter!(μx_old, μy_old,label="center")

Out[117]:
In [118]:
μx_old

Out[118]:
5-element Array{Float64,1}:
10.2647
-4.64161
1.22032
0.0
0.0    
In [49]:
plot(error_arr)

Out[49]:
In [108]:
using PyCall
@pyimport sklearn.cluster as kcluster

In [109]:
# クラスタを3つでやる．
n_cluster = 10
y_km = kcluster.KMeans(n_clusters=n_cluster, init="k-means++", n_init=10, max_iter=300, tol=1e-04, random_state=0)[:fit_predict](X) +1
plotdata = [X [string(i) for i=y_km];]

Out[109]:
1000×3 Array{Any,2}:
-3.93672    -3.5936   "4"
-7.03037    -4.30673  "4"
-5.61878    -6.48806  "4"
-6.36537    -3.86141  "4"
-3.42238    -4.81756  "4"
-3.96087    -6.21365  "4"
-5.2541     -4.79723  "4"
-3.94911    -5.78381  "4"
-5.35447    -4.52871  "4"
-5.46655    -4.0682   "4"
-4.74938    -5.46327  "4"
-3.27678    -4.64628  "4"
-4.15944    -3.41176  "4"
⋮
-1.04288     6.2303   "3"
1.50573     4.92199  "3"
0.99633     5.6953   "3"
-0.255848    4.59955  "3"
0.0165902   4.76575  "3"
-1.09301     6.4878   "3"
0.349573    5.30376  "3"
0.225971    3.7339   "3"
0.947491    7.34555  "3"
-0.591647    6.59899  "3"
1.44914     4.65748  "3"
-1.62489     6.74928  "3"
In [110]:
x_arr = []
y_arr = []
result(n_cluster)
scatter(x_arr, y_arr)

Out[110]: