k-means

In [5]:
function input_data()
    parse_data1= Float64[]
    parse_data2 = Float64[]
    open("./dataset/data_2.txt","r") do fp
        data = readlines(fp)
        for i in 1:length(data)
            push!(parse_data1,parse(Float64,split(data[i], " ")[1]))
            push!(parse_data2 ,parse(Float64,split(data[i], " ")[2]))
        end
    end
    X = [parse_data1 parse_data2]
    return X
end

X = input_data()

using Plots
gr()

function dist(μx, μy, xx, xy)
    # ユークリッド距離
    return sqrt((xx-μx)^2+(μy-xy)^2)
end

# 重心の数
k = 9
# 重心の変数.
μx_old = Float64[]
μy_old = Float64[]
for i in 1:k
    # データの端を乱数の範囲にする.
    # 初期の重心を決める.
    push!(μx_old, rand(minimum(X[:,1]):0.1:maximum(X[:,1])))
    push!(μy_old, rand(minimum(X[:,2]):0.1:maximum(X[:,2])))
end

min = 1000000.0
cluster = 0
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
clusters = Int64[]
n = length(X[:,1])
    
μx_new = Float64[]
μy_new = Float64[]

# 収束判定用の変数
error = 100.0
error_arr = Float64[]

# プロットのgif用変数
plot_data_arr = Any[]
μx_arr = Any[]
μy_arr = Any[]

# 誤差関数が収束すれば終わり.
while error != 0.0  
    clusters = Int64[]
    
     
    for i in 1:n
        # 重心割り当て
        # どの重心に一番近いかを決める.その距離の計算をしている.
        # iには全データが入る.
        min = 100000.0
        for j in 1:k
            distance = dist(μx_old[j], μy_old[j], X[i,1], X[i,2])
            if min > distance
                min = distance
                cluster = j
            end
        end
        push!(clusters, cluster)
    end
    
    plotdata = [X [string(i) for i=clusters];]
    push!(plot_data_arr, plotdata)

    # 主に重心の計算
    for i in 1:k
        cluster_len = length(findin(plotdata[:,3],["$i"]))
        for j in findin(plotdata[:,3],["$i"])
            # 下二行が重心の計算
            push!(center_of_gravity_x,((X[j,1]) / cluster_len))
            push!(center_of_gravity_y,((X[j,2]) / cluster_len))
        end
        push!(μx_new, sum(center_of_gravity_x))
        push!(μy_new, sum(center_of_gravity_y))
        center_of_gravity_x = Float64[]
        center_of_gravity_y = Float64[]
        error = dist(μx_new[i], μy_new[i], μx_old[i], μy_old[i])
        push!(error_arr, error)
    end
    push!(μx_arr, μx_old)
    push!(μy_arr, μy_old)
    μx_old = μx_new
    μy_old = μy_new
    μx_new = Float64[]
    μy_new = Float64[]
end

tmp_x = Float64[]
tmp_y = Float64[]
x_arr = []
y_arr = []

function result(cluster_num)
    tmp_x = Float64[]
    tmp_y = Float64[]
    for p in 1:cluster_num
        result = findin(plotdata[:,3],["$p"])
        # 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する.
        # 重心数が3だったら三つの配列で帰ってくる.
        for i in result
            push!(tmp_x, plotdata[i,1])
            push!(tmp_y, plotdata[i,2])
        end
        push!(x_arr, tmp_x)
        push!(y_arr, tmp_y)
        tmp_x = Float64[]
        tmp_y = Float64[]
    end
end

plotdata = [X [string(i) for i=clusters];]
result(k)
plotdata
Out[5]:
1000×3 Array{Any,2}:
 -3.93672    -3.5936   "3"
 -7.03037    -4.30673  "3"
 -5.61878    -6.48806  "3"
 -6.36537    -3.86141  "3"
 -3.42238    -4.81756  "2"
 -3.96087    -6.21365  "2"
 -5.2541     -4.79723  "3"
 -3.94911    -5.78381  "2"
 -5.35447    -4.52871  "3"
 -5.46655    -4.0682   "3"
 -4.74938    -5.46327  "3"
 -3.27678    -4.64628  "2"
 -4.15944    -3.41176  "3"
  ⋮                       
 -1.04288     6.2303   "4"
  1.50573     4.92199  "4"
  0.99633     5.6953   "4"
 -0.255848    4.59955  "5"
  0.0165902   4.76575  "5"
 -1.09301     6.4878   "4"
  0.349573    5.30376  "4"
  0.225971    3.7339   "5"
  0.947491    7.34555  "4"
 -0.591647    6.59899  "4"
  1.44914     4.65748  "5"
 -1.62489     6.74928  "4"
In [8]:
function result2(cluster_num)
    tmp_x = Float64[]
    tmp_y = Float64[]
    x_arr = []
    y_arr = []
    @gif for m in 1:length(plot_data_arr)
        plotdata = plot_data_arr[m]
        μx_old = μx_arr[m]
        μy_old = μy_arr[m]
        for p in 1:cluster_num
            result = findin(plotdata[:,3],["$p"])
            # 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する.
            # 重心数が3だったら三つの配列で帰ってくる.
            for i in result
                push!(tmp_x, plotdata[i,1])
                push!(tmp_y, plotdata[i,2])
            end
            push!(x_arr, tmp_x)
            push!(y_arr, tmp_y)
            tmp_x = Float64[]
            tmp_y = Float64[]
        end
        scatter(x_arr, y_arr)
        scatter!(μx_old, μy_old,label="center")
        x_arr = []
        y_arr = []
        end every 1
end
result2(k)
INFO: Saved animation to /Users/noriakioshita/Github/julia/ml_page/julia/tmp.gif
Out[8]:
In [7]:
length(plot_data_arr)
Out[7]:
16
In [6]:
scatter(x_arr, y_arr)
scatter!(μx_old, μy_old,label="center")
Out[6]:
-5 0 5 10 15 20 -10 -5 0 5 y1 y2 y3 y4 y5 y6 y7 y8 y9 center
In [9]:
plot(error_arr)
Out[9]:
25 50 75 100 125 0 1 2 3 4 5 y1

k-means++

In [116]:
function input_data()
    parse_data1= Float64[]
    parse_data2 = Float64[]
    open("./dataset/data_2.txt","r") do fp
        data = readlines(fp)
        for i in 1:length(data)
            push!(parse_data1,parse(Float64,split(data[i], " ")[1]))
            push!(parse_data2 ,parse(Float64,split(data[i], " ")[2]))
        end
    end
    X = [parse_data1 parse_data2]
    return X
end

X = input_data()
n = length(X[:,1])

using Plots
gr()

function dist(μx, μy, xx, xy)
    return sqrt((xx-μx)^2+(μy-xy)^2)
end

# 重心の数
k = 5
# 重心の変数.
μx_old = Float64[]
μy_old = Float64[]

DX = Float64[]
function centroid()
    # データ点からランダムに選びそれを重心とする.
    z1 = rand(1:1:length(X[:,1]))
    push!(μx_old, X[z1,1])
    push!(μy_old, X[z1,2])
    DX = Float64[]
    max = 0
    max_num = 0
    
    # データ点との距離を代入する.
    for i in 1:n
        Dx = dist(X[i,1], X[i,2], X[z1,1], X[z1,2])
        push!(DX, Dx)
        for i in 1:k
            max = 0
            for j in 1:n
                Dx = dist(X[j,1], X[j,2], X[z1,1], X[z1,2])
                if max < Dx
                    max = Dx^2 / sum(DX)^2
                    max_num = j
                end
            end
            push!(μx_old, X[max_num,1])
            push!(μy_old, X[max_num,2])
        end
    end
    
"    for i in 2:k
        max = 0
        for j in 1:n
            Dx = dist(X[j,1], X[j,2], X[z1,1], X[z1,2])
            if max < Dx
                max = Dx^2 / sum(DX)^2
                max_num = j
            end
        end
        push!(μx_old, X[max_num,1])
        push!(μy_old, X[max_num,2])
    end"
end

centroid()


min = 1000000.0
cluster = 0
center_of_gravity_x = Float64[]
center_of_gravity_y = Float64[]
clusters = Int64[]
n = length(X[:,1])
    
μx_new = Float64[]
μy_new = Float64[]

# 収束判定用の変数
error = 100.0
error_arr = Float64[]

# 誤差関数が収束すれば終わり.
while error != 0.0  
    clusters = Int64[]
    
     
    for i in 1:n
        # 重心割り当て
        # どの重心に一番近いかを決める.その距離の計算をしている.
        # iには全データが入る.
        min = 100000.0
        for j in 1:k
            distance = dist(μx_old[j], μy_old[j], X[i,1], X[i,2])
            if min > distance
                min = distance
                cluster = j
            end
        end
        push!(clusters, cluster)
    end
    
    plotdata = [X [string(i) for i=clusters];]
    
    # 主に重心の計算
    for i in 1:k
        cluster_len = length(findin(plotdata[:,3],["$i"]))
        for j in findin(plotdata[:,3],["$i"])
            # 下二行が重心の計算
            push!(center_of_gravity_x,((X[j,1]) / cluster_len))
            push!(center_of_gravity_y,((X[j,2]) / cluster_len))
        end
        push!(μx_new, sum(center_of_gravity_x))
        push!(μy_new, sum(center_of_gravity_y))
        center_of_gravity_x = Float64[]
        center_of_gravity_y = Float64[]
        error = dist(μx_new[i], μy_new[i], μx_old[i], μy_old[i])
        push!(error_arr, error)
    end
    μx_old = μx_new
    μy_old = μy_new
    μx_new = Float64[]
    μy_new = Float64[]
end

tmp_x = Float64[]
tmp_y = Float64[]
x_arr = []
y_arr = []

function result(cluster_num)
    tmp_x = Float64[]
    tmp_y = Float64[]
    for p in 1:cluster_num
        result = findin(plotdata[:,3],["$p"])
        # 割り振られたデータをx座標をx_arr, y座標をy_arrにそれぞれ代入する.
        # 重心数が3だったら三つの配列で帰ってくる.
        for i in result
            push!(tmp_x, plotdata[i,1])
            push!(tmp_y, plotdata[i,2])
        end
        push!(x_arr, tmp_x)
        push!(y_arr, tmp_y)
        tmp_x = Float64[]
        tmp_y = Float64[]
    end
end

plotdata = [X [string(i) for i=clusters];]

result(k)
plotdata
Out[116]:
1000×3 Array{Any,2}:
 -3.93672    -3.5936   "2"
 -7.03037    -4.30673  "2"
 -5.61878    -6.48806  "2"
 -6.36537    -3.86141  "2"
 -3.42238    -4.81756  "2"
 -3.96087    -6.21365  "2"
 -5.2541     -4.79723  "2"
 -3.94911    -5.78381  "2"
 -5.35447    -4.52871  "2"
 -5.46655    -4.0682   "2"
 -4.74938    -5.46327  "2"
 -3.27678    -4.64628  "2"
 -4.15944    -3.41176  "2"
  ⋮                       
 -1.04288     6.2303   "2"
  1.50573     4.92199  "3"
  0.99633     5.6953   "3"
 -0.255848    4.59955  "3"
  0.0165902   4.76575  "3"
 -1.09301     6.4878   "2"
  0.349573    5.30376  "3"
  0.225971    3.7339   "3"
  0.947491    7.34555  "3"
 -0.591647    6.59899  "3"
  1.44914     4.65748  "3"
 -1.62489     6.74928  "2"
In [117]:
scatter(x_arr, y_arr)
scatter!(μx_old, μy_old,label="center")
Out[117]:
-5 0 5 10 15 20 -10 -5 0 5 y1 y2 y3 y4 y5 center
In [118]:
μx_old
Out[118]:
5-element Array{Float64,1}:
 10.2647 
 -4.64161
  1.22032
  0.0    
  0.0    
In [49]:
plot(error_arr)
Out[49]:
2 4 6 8 10 12 14 0.0 2.5 5.0 7.5 10.0 y1
In [108]:
using PyCall
@pyimport sklearn.cluster as kcluster
In [109]:
# クラスタを3つでやる.
n_cluster = 10
y_km = kcluster.KMeans(n_clusters=n_cluster, init="k-means++", n_init=10, max_iter=300, tol=1e-04, random_state=0)[:fit_predict](X) +1
plotdata = [X [string(i) for i=y_km];]
Out[109]:
1000×3 Array{Any,2}:
 -3.93672    -3.5936   "4"
 -7.03037    -4.30673  "4"
 -5.61878    -6.48806  "4"
 -6.36537    -3.86141  "4"
 -3.42238    -4.81756  "4"
 -3.96087    -6.21365  "4"
 -5.2541     -4.79723  "4"
 -3.94911    -5.78381  "4"
 -5.35447    -4.52871  "4"
 -5.46655    -4.0682   "4"
 -4.74938    -5.46327  "4"
 -3.27678    -4.64628  "4"
 -4.15944    -3.41176  "4"
  ⋮                       
 -1.04288     6.2303   "3"
  1.50573     4.92199  "3"
  0.99633     5.6953   "3"
 -0.255848    4.59955  "3"
  0.0165902   4.76575  "3"
 -1.09301     6.4878   "3"
  0.349573    5.30376  "3"
  0.225971    3.7339   "3"
  0.947491    7.34555  "3"
 -0.591647    6.59899  "3"
  1.44914     4.65748  "3"
 -1.62489     6.74928  "3"
In [110]:
x_arr = []
y_arr = []
result(n_cluster)
scatter(x_arr, y_arr)
Out[110]:
-5 0 5 10 15 20 -10 -5 0 5 y1 y2 y3 y4 y5 y6 y7 y8 y9 y10