#!/usr/bin/env python
# coding: utf-8

# # Email Analysis: K-Means Clustering
#
# Clustering is an 'unsupervised learning' algorithm with applications as diverse as computer graphics and economics. In Natural Language Processing, clustering has been used to determine the authorship of the Federalist Papers and to analyze hundreds of thousands of documents the moment they are released. Clustering is particularly useful when you want a quick snapshot of a large dataset.

# ### Pre-analysis Housekeeping - Imports and Defining Methods

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Basic imports
import re, os, logging, sys, random, json, ast, xml.etree.ElementTree
from time import time
from operator import itemgetter
from datetime import date
from datetime import timedelta
from dateutil.parser import parse
from pprint import pprint

# Scikit-learn machine learning toolkit imports
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import pairwise_distances

# Plotting imports
import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt, mpld3
from mpld3 import plugins
from click_info import ClickInfo
# mpld3.enable_notebook()

# Data object related imports
from peewee import *
from Email import Email


def make_histogram(innerDict):
    """Plot a bar chart from a dict mapping categories to counts."""
    x = np.arange(len(innerDict))
    y = list(innerDict.values())
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, y)
    ax.set_xticks(x)
    ax.set_xticklabels(list(innerDict.keys()), rotation=70)
    plt.show()


db = SqliteDatabase('emails.db')
db.connect()


# ### Prepare the tools we will use

# In[2]:

N_CLUSTERS = 3
reduce_dimensionality = True
color_by_party = True

k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True)
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.1, stop_words='english')
lsa = TruncatedSVD(2)


# ### Prepare the Data

# In[3]:

emails = np.array([email for email in Email.select()])
text = np.array([email.text for email in emails])
parties = np.array([email.sender.party if email and email.sender else "" for email in emails])

vectors = vectorizer.fit_transform(text)

if reduce_dimensionality:
    X = lsa.fit_transform(vectors)
else:
    X = vectors


# ### Run K-means Clustering

# In[4]:

km = k_means.fit(X)


# In[5]:

k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
terms = vectorizer.get_feature_names()


# In[6]:

# In the original TF-IDF space the cluster centers can be read directly as term weights,
# so print the 100 highest-weighted terms for each cluster.
if not reduce_dimensionality:
    order_centroids = k_means_cluster_centers.argsort()[:, ::-1]
    for i in range(N_CLUSTERS):
        print([(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]])
        print("")


# In[7]:

# For each cluster, sum the TF-IDF weight of every term over the cluster's emails
# and print the 100 highest-weighted terms.
for k in range(N_CLUSTERS):
    z = vectors.toarray()[k_means_labels == k]
    wordz_tfidf = [(terms[i], z[:, i].sum()) for i in range(z.shape[1])]
    wordz_tfidf = sorted(wordz_tfidf, key=lambda x: x[1], reverse=True)
    print(wordz_tfidf[:100])
    print("")

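# The notebook fixes N_CLUSTERS = 3 up front. As an optional sanity check (not part of
# the original analysis), scikit-learn's silhouette score can be used to compare a few
# candidate values of k on the same feature matrix before settling on one; higher is
# better. This is only a sketch and assumes X is the (possibly LSA-reduced) matrix
# built above.

from sklearn.metrics import silhouette_score

for candidate_k in range(2, 7):
    candidate_labels = KMeans(n_clusters=candidate_k, init='k-means++', n_init=10).fit_predict(X)
    print(candidate_k, silhouette_score(X, candidate_labels))
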
# In[8]:

if reduce_dimensionality:
    mpld3.enable_notebook()
    fig, ax = plt.subplots(figsize=(15, 10))

    ph = {}
    ph_counter = 0

    def add_labels(fig, boolmap, points):
        """Attach an HTML tooltip to each plotted point for the emails selected by boolmap."""
        global ph_counter
        labels = []
        for email in emails[boolmap]:
            ph[ph_counter] = email.politicalnewsbot_link()
            # labels.append(str(ph_counter) + " " + email.message_id + " " + email.message_labels + " " + email.sender.party)
            labels.append(email.sender.email_address)
            ph_counter += 1
        tooltip = plugins.PointHTMLTooltip(points[0], labels, voffset=10, hoffset=10)
        plugins.connect(fig, tooltip)
        urls = [email.politicalnewsbot_link() for email in emails[boolmap]]
        # plugins.connect(fig, ClickInfo(points[0], urls))

    if color_by_party:
        # Color points by sender party rather than by cluster assignment.
        for party, col in zip(['d', 'r'], ['blue', 'red']):
            my_members = parties == party
            points = ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                             markerfacecolor=col, marker='.', label='Party %s' % party)
            add_labels(fig, parties == party, points)
        for k in range(N_CLUSTERS):
            cluster_center = k_means_cluster_centers[k]
            centers = ax.plot(cluster_center[0], cluster_center[1], 'o',
                              markerfacecolor='black', markeredgecolor='k', markersize=6)
    else:
        # Color points by cluster assignment, one random color per cluster.
        colors = [(random.random(), random.random(), random.random()) for x in range(N_CLUSTERS)]
        for k, col in zip(range(N_CLUSTERS), colors):
            my_members = k_means_labels == k
            cluster_center = k_means_cluster_centers[k]
            points = ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                             markerfacecolor=col, marker='.', label='Cluster %i' % k)
            centers = ax.plot(cluster_center[0], cluster_center[1], 'o',
                              markerfacecolor=col, markeredgecolor='k', markersize=6)
            add_labels(fig, k_means_labels == k, points)

    ax.set_title('KMeans')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.legend()
    ph_counter = 0


# In[9]:

mpld3.disable_notebook()


# In[10]:

labels = ['CATEGORY_PROMOTIONS', 'CATEGORY_UPDATES', 'IMPORTANT', 'INBOX', 'UNREAD']
label_histogram = []
for k in range(N_CLUSTERS):
    innerDict = dict.fromkeys(labels, 0)
    for email in emails[k_means_labels == k]:
        # message_labels is stored as the string form of a list; parse it safely
        # with ast.literal_eval instead of eval().
        current_labels = ast.literal_eval(email.message_labels)
        for label in current_labels:
            if label in innerDict:
                innerDict[label] = innerDict[label] + 1
            else:
                innerDict[label] = 1
    label_histogram.append(innerDict)

label_histogram


# In[11]:

make_histogram(label_histogram[0])


# In[12]:

make_histogram(label_histogram[1])


# In[13]:

print("\n".join([email.politicalnewsbot_link() for email in emails[k_means_labels == 1]]))


# In[ ]:
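
# A possible follow-up (not in the original notebook): quantify how well the unsupervised
# clusters line up with the party metadata that the plot above colors by. The sketch below
# maps each cluster to its most common party ("cluster purity") and also reports the
# adjusted Rand index via the already imported sklearn.metrics module. It assumes
# k_means_labels, parties, and N_CLUSTERS are defined as in the cells above.

from collections import Counter

cluster_to_party = {}
for k in range(N_CLUSTERS):
    cluster_parties = parties[k_means_labels == k]
    if len(cluster_parties):
        # Most frequent party among the emails assigned to cluster k
        cluster_to_party[k] = Counter(cluster_parties).most_common(1)[0][0]

predicted_party = np.array([cluster_to_party.get(k, "") for k in k_means_labels])
print("cluster purity (fraction matching majority party):", np.mean(predicted_party == parties))
print("adjusted Rand index:", metrics.adjusted_rand_score(parties, k_means_labels))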