import pandas as pd
from sklearn.cluster import KMeans

# Load the spreadsheet, skipping the first four rows of the sheet.
znaczki = pd.read_excel("../input/wykaz-zt-polski.xls", skiprows=4)

# Give the two coordinate columns explicit names and make them numeric.
znaczki = znaczki.rename(columns={"GPS": "lat", "Unnamed: 5": "lon"})
znaczki[["lat", "lon"]] = znaczki[["lat", "lon"]].astype(float)

# Cluster the points by their coordinates into a fixed number of groups.
n_clusters = 12
coordinates = znaczki[["lat", "lon"]]
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(coordinates)

# Store the cluster index assigned to every row.
znaczki["grupa"] = kmeans.labels_
znaczki.head()
LP. | Numer znaczka | Nazwa znaczka | Województwo | lat | lon | grupa | |
---|---|---|---|---|---|---|---|
0 | 1 | No. 001 | Rysy – najwyższy szczyt polskich Tatr | małopolskie | 49.179628 | 20.087987 | 1 |
1 | 2 | No. 002 | Schronisko "Murowaniec" na Hali Gąsienicowej | małopolskie | 49.244167 | 20.007222 | 1 |
2 | 3 | No. 003 | Babia Góra – najwyższy szczyt Beskidu Żywieckiego | małopolskie | 49.573055 | 19.529444 | 4 |
3 | 4 | No. 004 | Schronisko Morskie Oko | małopolskie | 49.201378 | 20.071276 | 1 |
4 | 5 | No. 005 | Schronisko Głodówka | małopolskie | 49.302124 | 20.116664 | 1 |
# Code 1: average silhouette score of the cluster labels currently stored
# in the "grupa" column.
from sklearn.metrics import silhouette_score

silhouette_score(coordinates, znaczki["grupa"])
0.450639136417191
# Code 2: sweep the number of clusters and record the average silhouette
# score obtained for each value.
#
# NOTE: every iteration overwrites `n_clusters`, `kmeans` and the "grupa"
# column of `znaczki`, so after the loop they describe the LAST (largest)
# cluster count tried, not any earlier clustering.
silhouette_averages = []

# silhouette_score accepts between 2 and n_samples - 1 distinct labels,
# which is why the sweep starts at 2 and stops before len(znaczki).
for ilosc_grup in range(2, len(znaczki)):
    n_clusters = ilosc_grup
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(coordinates)
    znaczki["grupa"] = kmeans.labels_

    sil_sco = silhouette_score(X=coordinates, labels=znaczki["grupa"])
    # Position i in the list corresponds to a cluster count of i + 2.
    silhouette_averages.append(sil_sco)

    # Progress report for every tenth cluster count.
    if ilosc_grup % 10 == 0:
        print(ilosc_grup, sil_sco)
10 0.4414132828753228 20 0.44058327835141947 30 0.4538771864033944 40 0.47266042687421567 50 0.48266687262597 60 0.4993816232672794 70 0.5044715377903637 80 0.5077644072529796 90 0.5158986440745873 100 0.5297141020047416 110 0.5277911607622169 120 0.5454616249331111 130 0.539129063779735 140 0.5536761609291103 150 0.5580628610213431 160 0.5645657410311796 170 0.5701974656932992 180 0.5783162478358502 190 0.5709873719579736 200 0.5723340580127944 210 0.5766784328328961 220 0.5881459330910309 230 0.588837942380066 240 0.5931334811769141 250 0.5893427059136105 260 0.6036059401757289 270 0.6092348723358586 280 0.6026894911310365 290 0.6037853342406011 300 0.6009959823794525 310 0.6009650894849635 320 0.6012345395044343 330 0.5988163799662259 340 0.5868131570508178 350 0.5866953958788685 360 0.5867281618603429 370 0.5822697667110571 380 0.5641549249693714 390 0.5620795793047825 400 0.5609647765412068 410 0.5608638526819211 420 0.5542612675262899 430 0.5411053873626841 440 0.5343564438353051 450 0.5303899402437036 460 0.5204653891791396 470 0.5056420751205427 480 0.4909855319320274 490 0.4770085241649209 500 0.46007987045913934 510 0.44916528309074744 520 0.4358432673463338 530 0.4152421037169779 540 0.4001538236102473 550 0.38452943342565277 560 0.35991073669130375 570 0.3451362652901065 580 0.3280827062777076 590 0.30820250629854945 600 0.2911908081623258 610 0.27462817041815657 620 0.2506229676946552 630 0.22270819074984513 640 0.2081454624564019 650 0.18855651774295545 660 0.1687476603764477 670 0.14983495012192982 680 0.13617200110919506 690 0.12143601425977638 700 0.10196364174718407 710 0.0852391034384813 720 0.06595986704659981 730 0.0435861415409139 740 0.02090973189214212
import pylab

# The x values start at 2 and run one-to-one with the recorded scores.
cluster_counts = range(2, len(silhouette_averages) + 2)
pylab.plot(cluster_counts, silhouette_averages)
[<matplotlib.lines.Line2D at 0x7f35e748d438>]
import operator

# Pair every score with its 0-based position, then pick the pairs with the
# lowest and highest score (the first occurrence wins on ties).
by_score = operator.itemgetter(1)
indexed_scores = list(enumerate(silhouette_averages))
min_index, min_value = min(indexed_scores, key=by_score)
max_index, max_value = max(indexed_scores, key=by_score)

# The printed index is shifted by 2 relative to the list position.
print(min_index + 2, min_value)
print(max_index + 2, max_value)
746 0.0053475935828877 266 0.6116557772535383
# Number of rows in the table.
znaczki.shape[0]
748