In [1]:
import pandas as pd

# Load the stamp list ("znaczki") from the Excel sheet, skipping the first
# four rows of the sheet. The two coordinate columns arrive as "GPS" and
# "Unnamed: 5"; they are renamed to lat / lon and cast to float so they can
# be used as numeric features.
znaczki = (
    pd.read_excel("../input/wykaz-zt-polski.xls", skiprows=4)
    .rename(columns={"GPS": "lat", "Unnamed: 5": "lon"})
    .astype({"lat": float, "lon": float})
)
In [2]:
from sklearn.cluster import KMeans

# Group the stamps into a fixed number of clusters using only their coordinates.
n_clusters = 12
coordinates = znaczki[["lat", "lon"]]

# random_state is pinned so the centroid initialisation is reproducible;
# fit() returns the estimator itself, so `kmeans` is the fitted model.
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(coordinates)

# Store each row's cluster label in the "grupa" (group) column.
znaczki["grupa"] = kmeans.labels_

# Preview the first rows, now including the assigned group.
znaczki.head()
Out[2]:
LP. Numer znaczka Nazwa znaczka Województwo lat lon grupa
0 1 No. 001 Rysy – najwyższy szczyt polskich Tatr małopolskie 49.179628 20.087987 1
1 2 No. 002 Schronisko "Murowaniec" na Hali Gąsienicowej małopolskie 49.244167 20.007222 1
2 3 No. 003 Babia Góra – najwyższy szczyt Beskidu Żywieckiego małopolskie 49.573055 19.529444 4
3 4 No. 004 Schronisko Morskie Oko małopolskie 49.201378 20.071276 1
4 5 No. 005 Schronisko Głodówka małopolskie 49.302124 20.116664 1
In [13]:
# code 1: mean silhouette coefficient of the grouping currently stored in
# znaczki["grupa"], computed on the lat / lon coordinates.
from sklearn.metrics import silhouette_score
silhouette_score(coordinates, znaczki["grupa"])
Out[13]:
0.450639136417191
In [50]:
# code 2: silhouette score for every cluster count k from 2 to len(znaczki) - 1.
# silhouette_score requires 2 <= number of labels <= n_samples - 1, which is
# why the range stops one short of the number of rows.
# silhouette_averages[i] holds the score for k = i + 2.
silhouette_averages = []
for ilosc_grup in range(2, len(znaczki)):  # ilosc_grup = number of groups (k)
    # Keep the sweep's model and labels in their own names and do NOT write
    # them into znaczki["grupa"] / kmeans / n_clusters: otherwise the
    # clustering chosen earlier in the notebook is silently replaced by the
    # result of the last iteration, and re-running earlier cells gives
    # different answers.
    sweep_model = KMeans(n_clusters=ilosc_grup, random_state=42)
    sweep_labels = sweep_model.fit(coordinates).labels_
    sil_sco = silhouette_score(X=coordinates, labels=sweep_labels)
    silhouette_averages.append(sil_sco)
    # Progress report on every tenth k only, to keep the output short.
    if ilosc_grup % 10 == 0:
        print(ilosc_grup, sil_sco)
10 0.4414132828753228
20 0.44058327835141947
30 0.4538771864033944
40 0.47266042687421567
50 0.48266687262597
60 0.4993816232672794
70 0.5044715377903637
80 0.5077644072529796
90 0.5158986440745873
100 0.5297141020047416
110 0.5277911607622169
120 0.5454616249331111
130 0.539129063779735
140 0.5536761609291103
150 0.5580628610213431
160 0.5645657410311796
170 0.5701974656932992
180 0.5783162478358502
190 0.5709873719579736
200 0.5723340580127944
210 0.5766784328328961
220 0.5881459330910309
230 0.588837942380066
240 0.5931334811769141
250 0.5893427059136105
260 0.6036059401757289
270 0.6092348723358586
280 0.6026894911310365
290 0.6037853342406011
300 0.6009959823794525
310 0.6009650894849635
320 0.6012345395044343
330 0.5988163799662259
340 0.5868131570508178
350 0.5866953958788685
360 0.5867281618603429
370 0.5822697667110571
380 0.5641549249693714
390 0.5620795793047825
400 0.5609647765412068
410 0.5608638526819211
420 0.5542612675262899
430 0.5411053873626841
440 0.5343564438353051
450 0.5303899402437036
460 0.5204653891791396
470 0.5056420751205427
480 0.4909855319320274
490 0.4770085241649209
500 0.46007987045913934
510 0.44916528309074744
520 0.4358432673463338
530 0.4152421037169779
540 0.4001538236102473
550 0.38452943342565277
560 0.35991073669130375
570 0.3451362652901065
580 0.3280827062777076
590 0.30820250629854945
600 0.2911908081623258
610 0.27462817041815657
620 0.2506229676946552
630 0.22270819074984513
640 0.2081454624564019
650 0.18855651774295545
660 0.1687476603764477
670 0.14983495012192982
680 0.13617200110919506
690 0.12143601425977638
700 0.10196364174718407
710 0.0852391034384813
720 0.06595986704659981
730 0.0435861415409139
740 0.02090973189214212
In [51]:
import pylab
In [52]:
# x axis: one value per stored score, starting at 2 (the offset the sweep used
# for its first cluster count).
cluster_counts = range(2, 2 + len(silhouette_averages))
pylab.plot(cluster_counts, silhouette_averages)
Out[52]:
[<matplotlib.lines.Line2D at 0x7f35e748d438>]
In [53]:
import operator
# Find the worst and best silhouette scores together with their list positions.
# Ties resolve to the earliest position, because min()/max() return the first
# extreme element they encounter.
by_score = operator.itemgetter(1)
indexed_scores = list(enumerate(silhouette_averages))
min_index, min_value = min(indexed_scores, key=by_score)
max_index, max_value = max(indexed_scores, key=by_score)

# Positions are 0-based; adding 2 converts them back to the cluster count,
# matching the offset used when the scores were plotted.
for position, score in ((min_index, min_value), (max_index, max_value)):
    print(position + 2, score)
746 0.0053475935828877
266 0.6116557772535383
In [54]:
# Number of rows (stamps) in the table; this is the upper bound used by the sweep.
znaczki.shape[0]
Out[54]:
748
In [ ]: