#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,sklearn')


# # 2. Unsupervised Learning

# In[3]:

from __future__ import division
import warnings
warnings.simplefilter('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
plt.rcParams['figure.figsize'] = (12, 6)


# In[6]:

from sklearn import datasets

# Toy dataset: 1500 points in 2D, drawn around 4 centers.
X, y = datasets.make_blobs(centers=4, n_samples=1500, n_features=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c='gray')


# In[7]:

from sklearn.cluster import KMeans

model = KMeans(n_clusters=4).fit(X)
model.cluster_centers_


# In[8]:

# Color each point by its assigned cluster and mark the centroids in red.
plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.viridis, alpha=.2)
centroids_x = model.cluster_centers_[:, 0]
centroids_y = model.cluster_centers_[:, 1]
plt.scatter(centroids_x, centroids_y, marker='D', c='r', s=50)
plt.savefig('snapshot/centroid_clusters', bbox_inches='tight', dpi=100);


# In[23]:

# Three new points drawn uniformly from [-10, 10) x [-10, 10), to be assigned
# to clusters below.
X_test = np.random.random([3, 2]) * 20 - 10
plt.scatter(X[:, 0], X[:, 1], c='gray', alpha=.2)
plt.scatter(X_test[:, 0], X_test[:, 1], c='b');


# In[24]:

# Side-by-side comparison: k = 3 vs. k = 4.
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

model = KMeans(n_clusters=3).fit(X)
p_predict = model.predict(X_test)
ax[0].scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.viridis, alpha=.2)
ax[0].scatter(X_test[:, 0], X_test[:, 1], c=p_predict, cmap=plt.cm.viridis, marker='X', s=200)
ax[0].scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], marker='D', c='r', s=50)

model = KMeans(n_clusters=4).fit(X)
p_predict = model.predict(X_test)
ax[1].scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.viridis, alpha=.2)
ax[1].scatter(X_test[:, 0], X_test[:, 1], c=p_predict, cmap=plt.cm.viridis, marker='X', s=200)
ax[1].scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], marker='D', c='r', s=50)
plt.savefig('snapshot/kmeans_clusters', bbox_inches='tight', dpi=100);


# In[25]:

from sklearn.cluster import MeanShift, estimate_bandwidth

# MeanShift chooses the number of clusters by itself; the bandwidth controls
# the size of the kernel used to locate density peaks.
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=200)
model = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X)
p_predict = model.predict(X_test)
plt.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap=plt.cm.viridis, alpha=.2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=p_predict, cmap=plt.cm.viridis, marker='X', s=200)
plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], marker='D', c='r', s=20);


# In[26]:

from sklearn.cluster import MiniBatchKMeans

# MiniBatchKMeans trades a little accuracy for speed by updating centroids on
# small random batches; compare batch_size=100 vs. batch_size=200.
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

mbk = MiniBatchKMeans(init='k-means++', n_clusters=4, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0).fit(X)
p_predict = mbk.predict(X_test)
ax[0].scatter(X[:, 0], X[:, 1], c=mbk.labels_, cmap=plt.cm.viridis, alpha=.2)
ax[0].scatter(X_test[:, 0], X_test[:, 1], c=p_predict, cmap=plt.cm.viridis, marker='X', s=200)

mbk = MiniBatchKMeans(init='k-means++', n_clusters=4, batch_size=200,
                      n_init=10, max_no_improvement=10, verbose=0).fit(X)
p_predict = mbk.predict(X_test)
ax[1].scatter(X[:, 0], X[:, 1], c=mbk.labels_, cmap=plt.cm.viridis, alpha=.2)
ax[1].scatter(X_test[:, 0], X_test[:, 1], c=p_predict, cmap=plt.cm.viridis, marker='X', s=200)
plt.savefig('snapshot/mini_batch_kmeans_clusters', bbox_inches='tight', dpi=100);
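
# Since `make_blobs` returned the generating labels `y`, we can sanity-check the
# clusterings above. A minimal sketch (added here, not part of the original
# notebook) using `sklearn.metrics.adjusted_rand_score`: the ARI is 1.0 when a
# clustering recovers the true grouping exactly and close to 0.0 for a random one.

# In[ ]:

from sklearn.metrics import adjusted_rand_score

# Compare full-batch and mini-batch solutions against the generating labels.
kmeans_labels = KMeans(n_clusters=4).fit_predict(X)
mbk_labels = MiniBatchKMeans(n_clusters=4, batch_size=200).fit_predict(X)
print('KMeans ARI:          %.3f' % adjusted_rand_score(y, kmeans_labels))
print('MiniBatchKMeans ARI: %.3f' % adjusted_rand_score(y, mbk_labels))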

# In[27]:

from sklearn.decomposition import PCA

# Project the data with PCA, cluster in the projected space, and paint the
# K-Means decision regions on a fine mesh.
pca = PCA(n_components=2)
pca_transform_ = pca.fit_transform(X)
kmeans = KMeans(n_clusters=4)
kmeans.fit(pca_transform_)

h = .02  # mesh step size
x_min, x_max = pca_transform_[:, 0].min() - 1.5, pca_transform_[:, 0].max() + 1.5
y_min, y_max = pca_transform_[:, 1].min() - 1.5, pca_transform_[:, 1].max() + 1.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

plt.figure(1, figsize=(18, 6))
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.viridis, aspect='auto', origin='lower')
plt.plot(pca_transform_[:, 0], pca_transform_[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
            linewidths=3, color='w', zorder=10)

x_pca = pca.transform(X_test)
x_pca_z = kmeans.predict(x_pca)
print("Clusters for each point: %s" % x_pca_z)
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=x_pca_z, marker='*', s=300, edgecolors='r')
plt.savefig('snapshot/pca_clusters', bbox_inches='tight', dpi=100);


# In[28]:

# Elbow method: inertia (within-cluster sum of squares) always decreases as k
# grows, so look for the "elbow" where the improvement flattens out.
ks = range(1, 6)
inertias = []
for k in ks:
    model = KMeans(n_clusters=k).fit(X)
    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters')
plt.ylabel('model inertia')
plt.xticks(ks)
plt.savefig('snapshot/inertia_clusters', bbox_inches='tight', dpi=100);
plt.show()
print("Best choice is k = 4 clusters")
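
# Inertia alone never pinpoints k, since it keeps falling as k grows. As a
# complementary check (an added sketch, not part of the original notebook),
# the mean silhouette score from `sklearn.metrics.silhouette_score` peaks at
# the k whose clusters are both tight and well separated.

# In[ ]:

from sklearn.metrics import silhouette_score

for k in range(2, 6):  # the silhouette is only defined for 2+ clusters
    labels = KMeans(n_clusters=k).fit_predict(X)
    print('k=%d  mean silhouette: %.3f' % (k, silhouette_score(X, labels)))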

# In[50]:

from scipy.cluster.hierarchy import linkage, dendrogram

# Agglomerative clustering with complete linkage; truncate the dendrogram to
# the last 3 merges to keep it readable.
mergings = linkage(X, method='complete')
result = dendrogram(mergings, labels=y, leaf_rotation=90, leaf_font_size=8,
                    truncate_mode='lastp', p=3, show_leaf_counts=True)
# dcoord holds the merge heights: draw horizontal cut lines at the heights of
# the last merges.
plt.axhline(y=result['dcoord'][1][0])
plt.axhline(y=result['dcoord'][1][-1])
plt.savefig('snapshot/hierarchy_clusters', bbox_inches='tight', dpi=100);
print('Best n clusters:', 3)


# In[51]:

from sklearn.manifold import TSNE

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
ax1.scatter(X[:, 0], X[:, 1], c=y, alpha=.2)
ax1.set_title('Generated data')
model = TSNE(learning_rate=200)
tsne_features = model.fit_transform(X)
ax2.set_title('t-distributed Stochastic Neighbor Embedding')
ax2.scatter(tsne_features[:, 0], tsne_features[:, 1], c=y, alpha=.2)
plt.tight_layout()
plt.savefig('snapshot/tsne_clusters', bbox_inches='tight', dpi=100);


# In[52]:

from sklearn.manifold import Isomap

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
ax1.scatter(X[:, 0], X[:, 1], c=y)
ax1.set_title('Generated data')
model = Isomap(n_neighbors=4)
isomap_features = model.fit_transform(X)
ax2.set_title('Isomap')
ax2.scatter(isomap_features[:, 0], isomap_features[:, 1], c=y)
plt.tight_layout()
plt.savefig('snapshot/isomap_clusters', bbox_inches='tight', dpi=100);


# In[53]:

from sklearn.manifold import MDS

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
ax1.scatter(X[:, 0], X[:, 1], c=y)
ax1.set_title('Generated data')
model = MDS()
mds_features = model.fit_transform(X)
ax2.set_title('Multidimensional scaling')
ax2.scatter(mds_features[:, 0], mds_features[:, 1], c=y)
plt.tight_layout()
plt.savefig('snapshot/mds_clusters', bbox_inches='tight', dpi=100);


# In[54]:

from scipy.stats import pearsonr

# Seed dimensions dataset: width ('anchura') vs. length ('altura').
semillas = pd.read_csv('semillas_dimenciones.csv')
width = semillas['anchura']
length = semillas['altura']
plt.scatter(width, length, c='g', alpha=.2)
plt.xlabel('Width')
plt.ylabel('Length')
plt.axis('equal')
plt.title('Seed dimensions, corr: %.4f' % pearsonr(width, length)[0])
plt.show()


# In[55]:

from sklearn.decomposition import PCA

# PCA rotates the data so the new axes are uncorrelated: the Pearson
# correlation between the two principal components is ~0.
model = PCA(n_components=2)
pca_features = model.fit_transform(semillas)
plt.scatter(pca_features[:, 0], pca_features[:, 1], c='g', alpha=.2)
plt.axis('equal')
plt.title('PCA features corr: %.4f' % pearsonr(pca_features[:, 0], pca_features[:, 1])[0])
plt.show()


# In[56]:

# Draw the principal components as arrows from the data mean; the second
# component is scaled down so both arrows remain visible.
plt.scatter(width, length, c='g')
mean = model.mean_
first_pc = model.components_[0, :]
plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.1)
second_pc = model.components_[1, :]
plt.arrow(mean[0], mean[1], second_pc[0] / 100, second_pc[1] / 100, color='b', width=0.05)
plt.axis('equal')
plt.savefig('snapshot/pca_components', bbox_inches='tight', dpi=100);


# In[22]:

print('Shape of the original features: %s' % str(semillas.shape))


# In[23]:

print('Shape of the principal components: %s' % str(pca_features.shape))


# In[24]:

# Each row of digits.csv is a flattened 13x8 bitmap of an LCD-style digit.
digits = pd.read_csv('digits.csv', header=None).values
seven = digits[0, :]
print(seven)
print("The shape of each flattened digit: %s" % str(seven.shape))
bitmap_seven = seven.reshape(-1, 8)
print(bitmap_seven)
print("The shape of each digit bitmap: %s" % str(bitmap_seven.shape))
plt.imshow(bitmap_seven, cmap='gray', interpolation='nearest')
plt.xticks([])
plt.yticks([])
plt.show()


# In[25]:

from sklearn.decomposition import NMF

# NMF decomposes each digit into a non-negative combination of 7 parts; each
# learned component is itself a 13x8 bitmap (an LCD segment).
model = NMF(n_components=7)
features = model.fit_transform(digits)
print("Seven components for each LCD digit", features.shape)
for component in model.components_:
    bitmap = component.reshape((13, 8))
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.xticks([])
    plt.yticks([])
    plt.show()
digit_features = features[0, :]
print(digit_features)


# In[26]:

spotify_artists = pd.read_csv('spotify_artists.csv', header=None, names=['Name'])
artist_names = spotify_artists.iloc[:, 0]
spotify_artists.sample(5)


# In[27]:

spotify_artists['artista'] = spotify_artists.index
spotify_artists.head()


# In[28]:

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the artist names themselves: one sparse row per artist.
artistCV = CountVectorizer().fit_transform(artist_names)
artistCV


# In[29]:

from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# Scale, factorize with NMF into 20 latent features, then L2-normalize so the
# dot product between any two rows is a cosine similarity.
pipeline = make_pipeline(MaxAbsScaler(), NMF(n_components=20), Normalizer())
norm_features = pipeline.fit_transform(artistCV)
df_spotify_normalized = pd.DataFrame(norm_features, index=artist_names)


# In[30]:

artist = df_spotify_normalized.loc['AC/DC']
similarities = df_spotify_normalized.dot(artist)
print("The five most similar artists:")
for i_, score in similarities.nlargest(6)[1:].items():  # skip the artist itself
    print("Artist:\t%s, similarity: %.3f" % (i_, score))


# In[31]:

artist = df_spotify_normalized.loc['The Beach Boys']
similarities = df_spotify_normalized.dot(artist)
print("The five most similar artists:")
for i_, score in similarities.nlargest(6)[1:].items():
    print("Artist:\t%s,\t similarity: %.3f" % (i_, score))
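
# A small convenience wrapper (an added sketch, not part of the original
# notebook) around the lookup above, so any artist in the index can be queried
# the same way as AC/DC and The Beach Boys: a dot product of the L2-normalized
# NMF features, i.e. a cosine similarity.

# In[ ]:

def most_similar(name, n=5):
    """Return the n artists whose normalized NMF features are closest to `name`."""
    scores = df_spotify_normalized.dot(df_spotify_normalized.loc[name])
    return scores.nlargest(n + 1)[1:]  # drop the queried artist itself

print(most_similar('The Beach Boys'))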