#!/usr/bin/env python
# coding: utf-8
"""K Means Clustering in Python, including performance metric, confusion
matrix and visualization.

This file is a Jupyter notebook export; the ``# In[n]:`` markers are the
original cell boundaries. It downloads the UCI wine data set, clusters the 13
feature columns into three groups with k-means, compares the clusters with the
known ``winetype`` column (accuracy and confusion matrix) and draws two
Alcohol-vs-Ash scatter plots, coloured by true and by predicted wine type.

Bare expressions such as ``wine.head()`` only display a result when the cells
are executed interactively; they are no-ops when the file runs as a script.
"""

# In[1]:

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm

import pandas as pd
import numpy as np


# In[2]:

# The raw file has no header row, hence header=None.
wine = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)


# In[3]:

wine.head()


# From http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names we get the column names

# In[4]:

wine.columns = [
    'winetype',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]


# In[5]:

wine.head()


# In[6]:

wine.info()


# In[7]:

wine.describe()


# In[8]:

wine['winetype'].value_counts()


# R solution is https://rstudio-pubs-static.s3.amazonaws.com/33876_1d7794d9a86647ca90c4f182df93f0e8.html

# In[9]:

# Positional slicing: column 0 is the class label, columns 1-13 are features.
# (.ix was removed from pandas; .iloc is the positional equivalent.)
x = wine.iloc[:, 1:14]
y = wine.iloc[:, :1]  # kept as a one-column DataFrame so y['winetype'] works


# In[10]:

x.columns


# In[33]:

x.iloc[:, :1].head()


# In[12]:

y.columns


# In[13]:

x.head()


# In[14]:

y.head()


# In[15]:

y.info()


# In[16]:

# K Means Cluster
# n_init is spelled out because its default differs between scikit-learn
# releases; random_state makes the clustering reproducible between runs.
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(x)


# In[17]:

model.labels_


# In[18]:

pd.Series(model.labels_).value_counts()


# In[19]:

y['winetype'].value_counts()


# In[20]:

# K-means numbers its clusters arbitrarily, so cluster id k does not have to
# correspond to wine type k + 1. Map every cluster to the wine type that occurs
# most often inside it before comparing against the true labels. Two clusters
# may end up mapped to the same wine type if they share a majority class.
cluster_to_type = pd.crosstab(model.labels_, y['winetype']).idxmax(axis=1)
predY = cluster_to_type.loc[model.labels_].to_numpy().astype(np.int64)


# In[21]:

print(y['winetype'])
print(model.labels_)
print(predY)


# In[22]:

# Performance Metrics
sm.accuracy_score(y['winetype'], predY)


# In[23]:

# Confusion Matrix
sm.confusion_matrix(y['winetype'], predY)


# In[24]:

pd.unique(y.winetype)


# In[25]:

#!sudo pip install ggplot


# In[30]:

# Imported here rather than at the top of the file so that the clustering
# cells above still run when the optional ggplot package is not installed.
from ggplot import aes, geom_point, ggplot

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython only exists inside IPython/Jupyter; as a plain script there
    # is no inline backend to enable.
    pass


# In[31]:

# Scatter plot coloured by the true wine type.
p = ggplot(aes(x='Alcohol', y='Ash', color="winetype"), data=wine)
p + geom_point()


# In[32]:

# Scatter plot coloured by the predicted wine type. The predictions are not a
# column of `wine`, so attach them to a copy for plotting.
p2 = ggplot(aes(x='Alcohol', y='Ash', color="predY"), data=wine.assign(predY=predY))
p2 + geom_point()