import pandas as pd
import hypertools as hyp
%matplotlib inline
# Read the mushroom dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/mushrooms.csv')
# Preview the first rows to check that the file parsed as expected.
data.head()
class | cap-shape | cap-surface | cap-color | bruises | odor | gill-attachment | gill-spacing | gill-size | gill-color | ... | stalk-surface-below-ring | stalk-color-above-ring | stalk-color-below-ring | veil-type | veil-color | ring-number | ring-type | spore-print-color | population | habitat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p | x | s | n | t | p | f | c | n | k | ... | s | w | w | p | w | o | p | k | s | u |
1 | e | x | s | y | t | a | f | c | b | k | ... | s | w | w | p | w | o | p | n | n | g |
2 | e | b | s | w | t | l | f | c | b | n | ... | s | w | w | p | w | o | p | n | n | m |
3 | p | x | y | w | t | p | f | c | n | n | ... | s | w | w | p | w | o | p | k | s | u |
4 | e | x | s | g | f | n | f | w | b | k | ... | s | w | w | p | w | o | e | n | a | g |
5 rows × 23 columns
# Split the target column off the feature table. `pop` removes 'class' from
# `data` in place, so the plots below are built from the feature columns only.
class_labels = data.pop('class')

# Legend entries must be listed in the same order hypertools assigns groups
# (order of first appearance in `group`). `Series.unique()` preserves that
# order, whereas `list(set(...))` iterates in arbitrary, hash-dependent order
# and can attach the wrong class name to a colour.
class_names = list(class_labels.unique())

geo = hyp.plot(data, '.') # if the number of features is greater than 3, the default is to plot in 3d
geo = hyp.plot(data, '.', group=class_labels, legend=class_names)
geo = hyp.plot(data, '.', n_clusters=23)
# you can also recover the cluster labels using the cluster tool
cluster_labels = hyp.cluster(data, n_clusters=23)
# hyp.plot(data, 'o', point_colors=cluster_labels, ndims=2)
geo = hyp.plot(data, '.', group=cluster_labels, palette="deep")

# Same class colouring, but embedded with ICA and t-SNE instead of the default
# reduction; keep the returned geometries so they can be re-plotted below.
geo_ica = hyp.plot(data, '.', group=class_labels, legend=class_names, reduce='FastICA', ndims=3)
geo_tsne = hyp.plot(data, '.', group=class_labels, legend=class_names, reduce='TSNE', ndims=3)

# Colour each embedding by the cluster labels computed above so the three
# reductions can be compared side by side.
geo = hyp.plot(data, '.', group=cluster_labels, reduce='PCA', title='PCA')
geo_ica.plot(group=cluster_labels, legend=None, title='ICA')
geo_tsne.plot(group=cluster_labels, legend=None, title='TSNE')
<hypertools.datageometry.DataGeometry at 0x10f9b1ed0>
# Baseline PCA embedding with no grouping; re-plotted below with different
# cluster counts.
geo = hyp.plot(data, '.', reduce='PCA', title='PCA')

# Cluster counts to sweep over.
ks = [3,6,9,12,15]

# Re-plot each embedding once per cluster count. The outer loop walks the
# embeddings (PCA, then ICA, then t-SNE) and the inner loop walks `ks`, so the
# figures appear in the same order as three separate per-embedding loops.
# `geo_ica` and `geo_tsne` are the geometries created earlier in the notebook.
for embedding in (geo, geo_ica, geo_tsne):
    for k in ks:
        embedding.plot(n_clusters=k, title='k=' + str(k))