%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply the ggplot style first, then override the default figure size so the
# size setting is not affected by the style sheet.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})
/Users/andrey.shestakov/anaconda3/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88 return f(*args, **kwds)
Загрузите данные, в которых содержится описание интересов профилей учеников старшей школы США.
# Read the comma-separated profile data and preview the first five rows.
df_sns = pd.read_csv(
    'data/snsdata.csv',
    sep=',',
)
df_sns.head(5)
gradyear | gender | age | friends | basketball | football | soccer | softball | volleyball | swimming | ... | blonde | mall | shopping | clothes | hollister | abercrombie | die | death | drunk | drugs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2006 | M | 18.982 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2006 | F | 18.801 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2006 | M | 18.335 | 69 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 2006 | F | 18.875 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2006 | NaN | 18.995 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
5 rows × 40 columns
Данные устроены так:
# Names of every column from position 4 onward (the columns used as features).
df_sns.columns.values[4:]
array(['basketball', 'football', 'soccer', 'softball', 'volleyball', 'swimming', 'cheerleading', 'baseball', 'tennis', 'sports', 'cute', 'sex', 'sexy', 'hot', 'kissed', 'dance', 'band', 'marching', 'music', 'rock', 'god', 'church', 'jesus', 'bible', 'hair', 'dress', 'blonde', 'mall', 'shopping', 'clothes', 'hollister', 'abercrombie', 'die', 'death', 'drunk', 'drugs'], dtype=object)
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Feature matrix: every column from position 4 onward.
# Cast to float64 up front: the scaler computes in floating point anyway, and
# handing it int64 data triggers a DataConversionWarning in the scikit-learn
# version this notebook was run with.
X = df_sns.iloc[:, 4:].values.astype(np.float64)

# Normalise the data (StandardScaler with its default settings).
scaler = StandardScaler()
X_ = scaler.fit_transform(X)
/Users/andrey.shestakov/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:590: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning) /Users/andrey.shestakov/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:590: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler. warnings.warn(msg, DataConversionWarning)
# Apply k-means with k=9 to the scaled features.
# n_init is pinned to 10 (the value this run was originally produced with)
# because the library default is not the same in every scikit-learn release;
# together with random_state this keeps the clustering reproducible.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=123)
kmeans.fit(X_)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto', random_state=123, tol=0.0001, verbose=0)
# Pull the fitted results off the estimator in one go:
#   labels    - cluster label for each object in X
#   centroids - centroid coordinates
#   criterion - value of the k-means criterion for this partition
labels, centroids, criterion = (
    kmeans.labels_,
    kmeans.cluster_centers_,
    kmeans.inertia_,
)
criterion
861745.6454158238
# (number of clusters, number of features)
np.shape(centroids)
(9, 36)
# Store each row's cluster assignment next to the data, then show how many
# rows fell into each cluster.
df_sns['cluster_label'] = labels
df_sns['cluster_label'].value_counts()
4 20024 0 5036 1 1337 6 846 8 841 2 752 3 697 7 466 5 1 Name: cluster_label, dtype: int64
# For every cluster, print the ten feature columns with the highest mean value.
for k, group in df_sns.groupby('cluster_label'):
    print('='*10)
    print('Cluster {}'.format(k))
    # Columns 4..-1 are the features; the slice stops before the last column,
    # which is assumed to be the 'cluster_label' column added earlier.
    top_words = (
        group.iloc[:, 4:-1]
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )
    print(top_words)
========== Cluster 0 music 1.066521 dance 1.051033 shopping 0.890191 cute 0.828435 basketball 0.722597 hair 0.691223 mall 0.652502 football 0.617752 god 0.573272 church 0.496426 dtype: float64 ========== Cluster 1 drunk 1.409873 music 0.707554 hair 0.629020 god 0.522064 dance 0.439043 cute 0.384443 sex 0.380703 shopping 0.326103 mall 0.287210 die 0.275991 dtype: float64 ========== Cluster 2 band 4.105053 marching 1.418883 music 1.215426 god 0.505319 dance 0.464096 hair 0.371011 rock 0.344415 shopping 0.289894 football 0.275266 cute 0.275266 dtype: float64 ========== Cluster 3 soccer 4.901004 music 0.773314 shopping 0.499283 god 0.469154 hair 0.440459 basketball 0.428981 dance 0.398852 football 0.397418 cute 0.337159 church 0.321377 dtype: float64 ========== Cluster 4 music 0.554035 god 0.311626 dance 0.230423 hair 0.192419 shopping 0.181632 cute 0.162855 band 0.156962 rock 0.152867 football 0.136187 church 0.135238 dtype: float64 ========== Cluster 5 blonde 327.0 sex 22.0 hair 12.0 god 10.0 death 6.0 die 6.0 drunk 6.0 football 2.0 dress 2.0 sexy 1.0 dtype: float64 ========== Cluster 6 hair 3.475177 sex 2.760047 music 2.374704 kissed 1.874704 die 1.269504 rock 1.257683 drugs 1.076832 dance 1.005910 god 0.964539 clothes 0.812057 dtype: float64 ========== Cluster 7 god 4.725322 church 2.180258 jesus 2.049356 music 1.066524 bible 0.972103 hair 0.448498 dance 0.427039 band 0.407725 shopping 0.396996 die 0.371245 dtype: float64 ========== Cluster 8 hollister 1.512485 abercrombie 1.173603 shopping 0.932224 music 0.909631 hair 0.897741 dance 0.693222 mall 0.673008 cute 0.612366 god 0.474435 clothes 0.424495 dtype: float64
Загрузите файл food.txt. В нем содержится информация о пищевой ценности разных продуктов.
Почему перед применением кластеризации признаки необходимо нормализовать?
# Read the space-separated food table and preview the first five rows.
df = pd.read_csv(
    'data/food.txt',
    sep=' ',
)
df.head(5)
Name | Energy | Protein | Fat | Calcium | Iron | |
---|---|---|---|---|---|---|
0 | Braised beef | 340 | 20 | 28 | 9 | 2.6 |
1 | Hamburger | 245 | 21 | 17 | 9 | 2.7 |
2 | Roast beef | 420 | 15 | 39 | 7 | 2.0 |
3 | Beefsteak | 375 | 19 | 32 | 9 | 2.6 |
4 | Canned beef | 180 | 22 | 10 | 17 | 3.7 |
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# Take every column after the first one (Name) as features and scale them.
scaler = StandardScaler()
X = df.iloc[:, 1:].values
X_ = scaler.fit(X).transform(X)
X_.shape
(27, 5)
# Product names label the leaves of the tree.
names = df['Name'].values

# Average-linkage hierarchical clustering on Euclidean distances.
Z = linkage(X_, method='average', metric='euclidean')

# Draw the tree with the root on the right and all links in a single colour.
dend = dendrogram(
    Z,
    labels=names,
    orientation='left',
    color_threshold=0,
)
# Cut the hierarchy at distance threshold t to obtain flat cluster labels.
t = 2.3
labels = fcluster(Z, t=t, criterion='distance')
# Alternative: interpret t as the maximum number of clusters instead.
# labels = fcluster(Z, t, criterion='maxclust')

# Attach the labels to the table and preview the result.
df['label'] = labels
df.head(5)
Name | Energy | Protein | Fat | Calcium | Iron | label | |
---|---|---|---|---|---|---|---|
0 | Braised beef | 340 | 20 | 28 | 9 | 2.6 | 2 |
1 | Hamburger | 245 | 21 | 17 | 9 | 2.7 | 4 |
2 | Roast beef | 420 | 15 | 39 | 7 | 2.0 | 2 |
3 | Beefsteak | 375 | 19 | 32 | 9 | 2.6 | 2 |
4 | Canned beef | 180 | 22 | 10 | 17 | 3.7 | 4 |
# Print the rows that belong to each flat cluster, one cluster at a time.
for k, group in df.groupby('label'):
    print('='*10)
    print('Cluster {}'.format(k))
    print(group)
========== Cluster 1 Name Energy Protein Fat Calcium Iron label 16 Raw clams 70 11 1 82 6.0 1 17 Canned clams 45 7 1 74 5.4 1 ========== Cluster 2 Name Energy Protein Fat Calcium Iron label 0 Braised beef 340 20 28 9 2.6 2 2 Roast beef 420 15 39 7 2.0 2 3 Beefsteak 375 19 32 9 2.6 2 9 Roast lamb shoulder 300 18 25 9 2.3 2 10 Smoked ham 340 20 28 9 2.5 2 11 Pork roast 340 19 29 9 2.5 2 12 Pork simmered 355 19 30 9 2.4 2 ========== Cluster 3 Name Energy Protein Fat Calcium Iron label 21 Canned mackerel 155 16 9 157 1.8 3 23 Canned salmon 120 17 5 159 0.7 3 ========== Cluster 4 Name Energy Protein Fat Calcium Iron label 1 Hamburger 245 21 17 9 2.7 4 4 Canned beef 180 22 10 17 3.7 4 5 Broiled chicken 115 20 3 8 1.4 4 6 Canned chicken 170 25 7 12 1.5 4 8 Roast lamb leg 265 20 20 9 2.6 4 13 Beef tongue 205 18 14 7 2.5 4 14 Veal cutlet 185 23 9 9 2.7 4 15 Baked bluefish 135 22 4 25 0.6 4 18 Canned crabmeat 90 14 2 38 0.8 4 19 Fried haddock 135 16 5 15 0.5 4 20 Broiled mackerel 200 19 13 5 1.0 4 22 Fried perch 195 16 11 14 1.3 4 25 Canned tuna 170 25 7 7 1.2 4 26 Canned shrimp 110 23 1 98 2.6 4 ========== Cluster 5 Name Energy Protein Fat Calcium Iron label 7 Beef heart 160 26 5 14 5.9 5 ========== Cluster 6 Name Energy Protein Fat Calcium Iron label 24 Canned sardines 180 22 9 367 2.5 6