#!/usr/bin/env python
# coding: utf-8

# # Email Analysis: K-Means Clustering
#
# Clustering is an 'unsupervised learning' algorithm with applications as diverse as computer graphics and economics. In Natural Language Processing, clustering has been used to determine the authorship of the Federalist Papers and to analyze hundreds of thousands of documents the moment they are released. Clustering is particularly useful when you want a quick snapshot of a large dataset.

# ### Pre-analysis Housekeeping - Imports and Defining Methods

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Basic imports
import re, os, logging, sys, random, json, ast, xml.etree.ElementTree
from time import time
from operator import itemgetter
from datetime import date
from datetime import timedelta
from dateutil.parser import parse
from pprint import pprint

# Scikit-learn machine learning toolkit imports
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import pairwise_distances

# Plotting imports
import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt, mpld3
from mpld3 import plugins
from click_info import ClickInfo
# mpld3.enable_notebook()

# Data object related imports
from peewee import *
from Email import Email


def make_histogram(innerDict):
    """Plot a bar chart from a dict mapping categories to counts."""
    x = np.arange(len(innerDict))
    y = list(innerDict.values())
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, y)
    ax.set_xticks(x)
    ax.set_xticklabels(list(innerDict.keys()), rotation=70)
    plt.show()


db = SqliteDatabase('emails.db')
db.connect()


# ### Prepare the tools we will use

# In[2]:

N_CLUSTERS = 3
reduce_dimensionality = True
color_by_party = True

k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True)
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.1, stop_words='english')
lsa = TruncatedSVD(2)


# ### Prepare the Data

# In[3]:

emails = np.array([email for email in Email.select()])
text = np.array([email.text for email in emails])
parties = np.array([email.sender.party if email and email.sender else "" for email in emails])

vectors = vectorizer.fit_transform(text)

if reduce_dimensionality:
    X = lsa.fit_transform(vectors)
else:
    X = vectors


# ### Run K-means Clustering

# In[4]:

km = k_means.fit(X)


# In[5]:

k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
terms = vectorizer.get_feature_names()


# In[6]:

# In the original TF-IDF space the cluster centers can be read directly as term weights,
# so print the 100 highest-weighted terms for each cluster.
if not reduce_dimensionality:
    order_centroids = k_means_cluster_centers.argsort()[:, ::-1]
    for i in range(N_CLUSTERS):
        print([(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]])
        print("")


# In[7]:

# For each cluster, sum the TF-IDF weight of every term over the cluster's emails
# and print the 100 highest-weighted terms.
for k in range(N_CLUSTERS):
    z = vectors.toarray()[k_means_labels == k]
    wordz_tfidf = [(terms[i], z[:, i].sum()) for i in range(z.shape[1])]
    wordz_tfidf = sorted(wordz_tfidf, key=lambda x: x[1], reverse=True)
    print(wordz_tfidf[:100])
    print("")

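# The notebook fixes N_CLUSTERS = 3 up front. As an optional sanity check (not part of
# the original analysis), scikit-learn's silhouette score can be used to compare a few
# candidate values of k on the same feature matrix before settling on one; higher is
# better. This is only a sketch and assumes X is the (possibly LSA-reduced) matrix
# built above.

from sklearn.metrics import silhouette_score

for candidate_k in range(2, 7):
    candidate_labels = KMeans(n_clusters=candidate_k, init='k-means++', n_init=10).fit_predict(X)
    print(candidate_k, silhouette_score(X, candidate_labels))
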
# In[8]:

if reduce_dimensionality:
    mpld3.enable_notebook()
    fig, ax = plt.subplots(figsize=(15, 10))

    ph = {}
    ph_counter = 0

    def add_labels(fig, boolmap, points):
        """Attach an HTML tooltip to each plotted point for the emails selected by boolmap."""
        global ph_counter
        labels = []
        for email in emails[boolmap]:
            ph[ph_counter] = email.politicalnewsbot_link()
            # labels.append(str(ph_counter) + " " + email.message_id + " " + email.message_labels + " " + email.sender.party)
            labels.append(email.sender.email_address)
            ph_counter += 1
        tooltip = plugins.PointHTMLTooltip(points[0], labels, voffset=10, hoffset=10)
        plugins.connect(fig, tooltip)
        urls = [email.politicalnewsbot_link() for email in emails[boolmap]]
        # plugins.connect(fig, ClickInfo(points[0], urls))

    if color_by_party:
        # Color points by sender party rather than by cluster assignment.
        for party, col in zip(['d', 'r'], ['blue', 'red']):
            my_members = parties == party
            points = ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                             markerfacecolor=col, marker='.', label='Party %s' % party)
            add_labels(fig, parties == party, points)
        for k in range(N_CLUSTERS):
            cluster_center = k_means_cluster_centers[k]
            centers = ax.plot(cluster_center[0], cluster_center[1], 'o',
                              markerfacecolor='black', markeredgecolor='k', markersize=6)
    else:
        # Color points by cluster assignment, one random color per cluster.
        colors = [(random.random(), random.random(), random.random()) for x in range(N_CLUSTERS)]
        for k, col in zip(range(N_CLUSTERS), colors):
            my_members = k_means_labels == k
            cluster_center = k_means_cluster_centers[k]
            points = ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                             markerfacecolor=col, marker='.', label='Cluster %i' % k)
            centers = ax.plot(cluster_center[0], cluster_center[1], 'o',
                              markerfacecolor=col, markeredgecolor='k', markersize=6)
            add_labels(fig, k_means_labels == k, points)

    ax.set_title('KMeans')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.legend()
    ph_counter = 0


# In[9]:

mpld3.disable_notebook()


# In[10]:

labels = ['CATEGORY_PROMOTIONS', 'CATEGORY_UPDATES', 'IMPORTANT', 'INBOX', 'UNREAD']
label_histogram = []
for k in range(N_CLUSTERS):
    innerDict = dict.fromkeys(labels, 0)
    for email in emails[k_means_labels == k]:
        # message_labels is stored as the string form of a list; parse it safely
        # with ast.literal_eval instead of eval().
        current_labels = ast.literal_eval(email.message_labels)
        for label in current_labels:
            if label in innerDict:
                innerDict[label] = innerDict[label] + 1
            else:
                innerDict[label] = 1
    label_histogram.append(innerDict)

label_histogram


# In[11]:

make_histogram(label_histogram[0])


# In[12]:

make_histogram(label_histogram[1])


# In[13]:

print("\n".join([email.politicalnewsbot_link() for email in emails[k_means_labels == 1]]))


# In[ ]:
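
# A possible follow-up (not in the original notebook): quantify how well the unsupervised
# clusters line up with the party metadata that the plot above colors by. The sketch below
# maps each cluster to its most common party ("cluster purity") and also reports the
# adjusted Rand index via the already imported sklearn.metrics module. It assumes
# k_means_labels, parties, and N_CLUSTERS are defined as in the cells above.

from collections import Counter

cluster_to_party = {}
for k in range(N_CLUSTERS):
    cluster_parties = parties[k_means_labels == k]
    if len(cluster_parties):
        # Most frequent party among the emails assigned to cluster k
        cluster_to_party[k] = Counter(cluster_parties).most_common(1)[0][0]

predicted_party = np.array([cluster_to_party.get(k, "") for k in k_means_labels])
print("cluster purity (fraction matching majority party):", np.mean(predicted_party == parties))
print("adjusted Rand index:", metrics.adjusted_rand_score(parties, k_means_labels))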