#!/usr/bin/env python
# coding: utf-8

# K-Means clustering in Python, including performance metrics, a confusion matrix, and visualization.

# In[1]:


import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
 
import pandas as pd
import numpy as np


# In[2]:


# Download the UCI wine data set. header=None stops pandas from treating the
# first data row as column names; the real names are assigned further down.
wine_data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pd.read_csv(wine_data_url, header=None)


# In[3]:


# Preview the first rows of the raw, still unnamed, columns.
wine.head()


# From http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names we get the column names

# In[4]:


# Name the columns: the first one is the class label ('winetype'), the
# remaining 13 are the measured attributes used as features later on.
wine.columns = [
    'winetype',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]


# In[5]:


# Same preview as before, now with readable column names.
wine.head()


# In[6]:


# Column dtypes and non-null counts.
wine.info()


# In[7]:


# Summary statistics for every numeric column.
wine.describe()


# In[8]:


# Number of samples per wine type. Series.value_counts() replaces the
# top-level pd.value_counts(), which is deprecated in recent pandas.
wine['winetype'].value_counts()


# An equivalent solution in R: https://rstudio-pubs-static.s3.amazonaws.com/33876_1d7794d9a86647ca90c4f182df93f0e8.html

# In[9]:


# Split the frame by column position: columns 1..13 are the features, column 0
# ('winetype') is the target, kept as a one-column DataFrame.
# .iloc replaces .ix, which was removed in pandas 1.0; the slices were already
# positional because the column labels are strings.
x = wine.iloc[:, 1:14]
y = wine.iloc[:, :1]


# In[10]:


# Feature column names.
x.columns


# In[33]:


# First feature column only, still as a DataFrame.
x.iloc[:, :1].head()


# In[12]:


# Target column name.
y.columns


# In[13]:


x.head()


# In[14]:


y.head()


# In[15]:


# Call info() -- without the parentheses only the bound method is displayed.
y.info()


# In[16]:


# K Means Cluster
# Three clusters, one per wine type in the data.
# random_state makes the centroid initialisation (and so the cluster ids)
# reproducible between runs; n_init=10 pins the number of restarts instead of
# relying on a default that differs between scikit-learn releases.
# NOTE(review): the features are passed unscaled, so columns with large
# magnitudes dominate the distances -- consider standardising first.
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(x)


# In[17]:


# Cluster id (0, 1 or 2) assigned to each sample.
model.labels_


# In[18]:


# Cluster sizes. labels_ is a NumPy array, so wrap it in a Series to count;
# the top-level pd.value_counts() is deprecated in recent pandas.
pd.Series(model.labels_).value_counts()


# In[19]:


# True class sizes, for comparison with the cluster sizes above.
y['winetype'].value_counts()


# In[20]:


# K-Means numbers its clusters arbitrarily, so cluster id k does not have to
# correspond to wine type k + 1. Map every cluster to the wine type that is
# most frequent among its members before comparing with the true labels.
# (Two clusters can end up mapped to the same wine type if the clustering
# does not separate the classes well.)
cluster_vs_type = pd.crosstab(model.labels_, y['winetype'].to_numpy())
cluster_to_type = cluster_vs_type.idxmax(axis=1)  # cluster id -> majority wine type
predY = pd.Series(model.labels_).map(cluster_to_type).to_numpy().astype(np.int64)


# In[21]:


# True labels, raw cluster ids, and the cluster ids translated to wine types.
print(y['winetype'])
print(model.labels_)
print(predY)


# In[22]:


# Performance Metrics
# Share of samples whose predicted wine type equals the true one.
sm.accuracy_score(y_true=y, y_pred=predY)


# In[23]:


# Confusion Matrix
# Rows are the true wine types, columns the predicted ones.
sm.confusion_matrix(y_true=y, y_pred=predY)


# In[24]:


# Distinct wine types present in the target column.
pd.unique(y['winetype'])


# In[25]:


# Plotting uses matplotlib, already imported at the top of the file as plt.
# It replaces the unmaintained `ggplot` package and its star import, so no
# extra installation step is needed.


# In[30]:


# Inside IPython/Jupyter, render figures inline. get_ipython only exists
# there, so a plain `python` run skips the magic instead of raising NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[31]:


# Alcohol vs. Ash, coloured by the true wine type.
fig_actual, ax_actual = plt.subplots()
points_actual = ax_actual.scatter(wine['Alcohol'], wine['Ash'], c=wine['winetype'])
ax_actual.set_xlabel('Alcohol')
ax_actual.set_ylabel('Ash')
fig_actual.colorbar(points_actual, ax=ax_actual, label='winetype')


# In[32]:


# Same scatter, coloured by the K-Means prediction. predY is a separate array
# (not a column of `wine`), so it is passed to the plot directly.
fig_pred, ax_pred = plt.subplots()
points_pred = ax_pred.scatter(wine['Alcohol'], wine['Ash'], c=predY)
ax_pred.set_xlabel('Alcohol')
ax_pred.set_ylabel('Ash')
fig_pred.colorbar(points_pred, ax=ax_pred, label='predY')

# Needed to display the figures when run as a script; harmless when the
# inline backend is active.
plt.show()