#!/usr/bin/env python
# coding: utf-8

# In[2]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import sys, os
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
# sys.path.append('D:\Documents\GitHub\getRedditDataset')
# from celebReddit import countWords
import redditDB
import datetime
import praw
from matplotlib import pyplot as pp
from matplotlib import colorbar as cb
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as PD
import scipy
from scipy import spatial
from sklearn import cluster, decomposition
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

def addCopyright(ax, xOffset=0, yOffset=0):
    # stamp a copyright notice below the axes; 3D axes need an extra z argument
    year = datetime.date.today().year
    if '_axis3don' in dir(ax):
        ax.text(0 + xOffset, -.05 + yOffset, 0,
                r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)
    else:
        ax.text(0 + xOffset, -.05 + yOffset,
                r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)


# In[4]:

""" Establish database connection """
db = redditDB.RedditDB('March2-8_Top200')


# In[5]:

""" Get comments from subreddits """
if not os.path.isfile('popSubs.npy'):
    subList = praw.Reddit(user_agent='pop').get_popular_subreddits(limit=200)
    subList = [sub.display_name for sub in subList]
    np.save('popSubs.npy', subList)
else:
    subList = list(np.load('popSubs.npy'))

comments = db.getSubredditCommentText(subList[0])


# In[6]:

""" Strip formatting: keep only printable ASCII characters """
comments = [''.join(s for s in comment if ord(s) > 31 and ord(s) < 126)
            for comment in comments]


# In[7]:

""" Count occurrences of each word """
def countNWords(allWords):
    wordCounts = {}
    for word in allWords:
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
    return wordCounts


# In[8]:

""" Concatenate all comments together and split into individual words """
allComments = " ".join(comments)
allWords = allComments.split()
allWords = [word.lower() for word in allWords]
wordCounts = countNWords(allWords)


# In[9]:

""" Sort by counts and keep the 100 most frequent words """
sortedWords = sorted(wordCounts, key=wordCounts.get, reverse=True)
mostFreqWords = sortedWords[0:100]


# In[10]:

freqWordCounts = [wordCounts[word] for word in mostFreqWords]


# In[11]:

""" Build a words x subreddits matrix of word frequencies """
reload = False
usedSubs = subList[0:50]

if not os.path.isfile('allCounts.npy') or reload:
    allCounts = np.zeros(shape=(len(mostFreqWords), len(usedSubs)))
    for subInd, sub in enumerate(usedSubs):
        # get comments and strip formatting
        comments = db.getSubredditCommentText(sub)
        comments = [''.join(s for s in comment if ord(s) > 31 and ord(s) < 126)
                    for comment in comments]

        # concatenate and split into lowercase words
        allComments = " ".join(comments)
        allWords = allComments.split()
        allWords = [word.lower() for word in allWords]

        # count words
        wordCounts = countNWords(allWords)

        # get total words for normalization
        totWords = float(len(allWords))

        # get counts of the globally most frequent words as a fraction of all words
        freqWordCounts = [wordCounts[word] if word in wordCounts else 0
                          for word in mostFreqWords]
        freqWordCounts = [i/totWords for i in freqWordCounts]

        # store this subreddit's column
        allCounts[:, subInd] = freqWordCounts

    # save
    np.save('allCounts', allCounts)
else:
    allCounts = np.load('allCounts.npy')


# In[12]:

""" Get pairwise distances between subreddits in word-frequency space """
normMax = np.max(allCounts, axis=1)
normCounts = np.transpose(np.divide(np.transpose(allCounts), normMax))
distVec = spatial.distance.pdist(np.transpose(normCounts), 'euclidean')
distMat = spatial.distance.squareform(distVec)


# In[13]:

""" Print top words """
print('100 Most Frequent Words')
for ind, word in enumerate(mostFreqWords):
    print(str(ind + 1) + ': ' + word)
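# In[ ]:

# Aside (not in the original notebook): countNWords is equivalent to
# collections.Counter from the standard library, whose most_common() also
# replaces the manual sort-and-slice above. A minimal sketch, assuming the
# allWords list built from the first subreddit's comments; ties may order
# differently than the dictionary sort.
from collections import Counter

mostFreqAlt = [word for word, count in Counter(allWords).most_common(100)]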
# In[47]:

get_ipython().run_line_magic('matplotlib', 'inline')

fig, ax = pp.subplots()
fig.set_figheight(15)
fig.set_figwidth(15)
axMat = ax.matshow(distMat)
ax.set_xticks(range(len(distMat)))
ax.set_xticklabels(usedSubs, rotation=90);  # distMat covers the 50 analyzed subreddits
ax.set_yticks(range(len(distMat)))
ax.set_yticklabels(usedSubs);
cbar = fig.colorbar(axMat, shrink=0.8)
cbar.set_label('Euclidean distance')
addCopyright(ax)


# In[15]:

""" Print the 30 most similar pairs """
x = distMat.copy()
np.fill_diagonal(x, np.inf)
for ind in range(30):
    minInd = np.argmin(x)
    inds = np.unravel_index(minInd, distMat.shape)
    print(usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds]))
    x[inds] = np.inf
    x[inds[1], inds[0]] = np.inf


# In[16]:

""" Print the 30 most dissimilar pairs """
x = distMat.copy()
np.fill_diagonal(x, -np.inf)
for ind in range(30):
    maxInd = np.argmax(x)
    inds = np.unravel_index(maxInd, distMat.shape)
    print(usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds]))
    x[inds] = -np.inf
    x[inds[1], inds[0]] = -np.inf


# In[17]:

""" Perform PCA and keep the first three components """
pcaObj = decomposition.PCA()
reducedCounts = pcaObj.fit_transform(normCounts.transpose())[:, 0:3]


# In[48]:

""" Plot cumulative variance explained """
varExp = np.cumsum(pcaObj.explained_variance_ratio_)

fig = pp.figure(figsize=(15, 15))
ax = pp.subplot()
ax.plot(varExp, marker='o');
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative variance explained")
ax.set_ylim(bottom=0, top=1)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(20)
addCopyright(ax)


# In[19]:

""" Cluster points with affinity propagation """
apObj = cluster.AffinityPropagation(damping=0.75)
apObj.fit(reducedCounts);
len(set(apObj.labels_))  # number of clusters found


# In[20]:

# plot 3d
# %matplotlib qt
get_ipython().run_line_magic('matplotlib', 'inline')
fig = pp.figure(figsize=(15, 15))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:, 0], reducedCounts[:, 1], reducedCounts[:, 2],
           s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)


# In[22]:

get_ipython().run_line_magic('matplotlib', 'inline')

# plot 2d
fig = pp.figure(figsize=(15, 15))
ax2 = pp.subplot()
ax2.scatter(reducedCounts[:, 0], reducedCounts[:, 1], s=50, c=apObj.labels_)
ax2.set_xlabel('PC1'); ax2.set_ylabel('PC2')

# get distance between all points in the PC1-PC2 plane
dists = spatial.distance.squareform(
    spatial.distance.pdist(reducedCounts[:, 0:2], 'euclidean'))
np.fill_diagonal(dists, np.inf)

# label points, nudging labels apart when two points nearly overlap
for point in range(len(usedSubs)):
    yOffset = 0
    closePoints = np.argwhere(dists[point, :] < 0.05)
    if closePoints.size:
        yOffset = 0.01
        if reducedCounts[point, 1] < reducedCounts[closePoints[0, 0], 1]:
            yOffset = -yOffset
    if point == 37 or point == 23:  # ALL THE KLUGE
        yOffset = -0.02
    elif point == 27:
        yOffset = 0.02
    ax2.text(reducedCounts[point, 0] + 0.02, reducedCounts[point, 1] + yOffset,
             usedSubs[point], size=10, rotation=0, va='center', ha='left')

for item in ([ax2.title, ax2.xaxis.label, ax2.yaxis.label] +
             ax2.get_xticklabels() + ax2.get_yticklabels()):
    item.set_fontsize(20)
ax2.set_xlim(left=-1.6, right=1.6)
addCopyright(ax2)


# In[ ]:

""" Animate the 3D scatter by rotating the camera azimuth """
shouldSave = False

def animate(i, ax):
    newVal = i*5
    if newVal > 360:
        newVal = 0
    ax.azim = newVal

import matplotlib.animation as animation
get_ipython().run_line_magic('matplotlib', 'qt')
get_ipython().run_line_magic('pylab', 'qt')

fig = pp.figure(figsize=(10, 10))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:, 0], reducedCounts[:, 1], reducedCounts[:, 2],
           s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)
ax.elev = 20
ani = animation.FuncAnimation(fig, animate, fargs=[ax],
                              frames=range(72), interval=175)

if shouldSave:
    pp.rcParams['animation.ffmpeg_path'] = 'C:\\FFMPEG\\bin\\ffmpeg.exe'
    pp.rcParams['animation.mencoder_path'] = 'C:\\Mencoder\\mencoder.exe'
    Writer = animation.FFMpegWriter()
    ani.save('D:\\Dropbox\\Website\\Blog Posts\\201503\\Clustering subreddits\\pc3DAnimation.avi',
             writer=Writer, dpi=300, savefig_kwargs={'transparent': True})
    print('Saved file: complete!')
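# In[ ]:

# Aside (not in the original notebook): setting ax.azim directly works, but
# Axes3D.view_init is the documented way to move the camera, and a modulo keeps
# the angle in [0, 360) instead of snapping back to 0. A minimal sketch of the
# same rotation, assuming the fig/ax built in the animation cell above:
def animateAlt(i, ax):
    ax.view_init(elev=20, azim=(i * 5) % 360)

aniAlt = animation.FuncAnimation(fig, animateAlt, fargs=[ax],
                                 frames=72, interval=175)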
# In[50]:

""" Plot normalized word distribution """
def plotBarNormFreq(normCounts, barSubs, usedSubs):
    # get mean normalized counts across all subreddits
    meanNormCounts = np.mean(normCounts, axis=1)

    # generate colors
    colors = 'brgmy'

    fig, ax = pp.subplots(figsize=(15, 15))
    handles = []
    width = 1./len(barSubs)
    for sub, ind in zip(barSubs, range(len(barSubs))):
        # normalize this subreddit's frequencies by the cross-subreddit mean
        tempNorm = normCounts[:, usedSubs.index(barSubs[ind])]/meanNormCounts

        # sort in descending order
        sortedNorm = sorted(tempNorm, reverse=True)
        sortOrder = [i[0] for i in sorted(enumerate(tempNorm),
                                          key=lambda x: x[1], reverse=True)]

        # plot bar
        handles.append(ax.bar([i + -0.5*width + width*ind
                               for i in range(len(mostFreqWords))],
                              sortedNorm, width=width,
                              color=colors[ind], align='center'))

    # plot chance line
    ax.plot([0, len(mostFreqWords)], [1, 1], ls='--', color='k')

    # label axes (note: x labels follow the last subreddit's sort order)
    ax.set_ylabel('Normalized word frequency')
    ax.set_xticks(range(len(mostFreqWords)));
    ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
    ax.set_xlim(left=0 - 0.5*width, right=len(mostFreqWords) - 0.5*width);
    ax.set_title(", ".join(barSubs) + ' Normalized Word Frequency')
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_yticklabels()):
        item.set_fontsize(20)
    if len(barSubs) > 1:
        ax.legend(handles, barSubs, loc='upper right');
    addCopyright(ax, yOffset=-0.01)


# In[51]:

get_ipython().run_line_magic('matplotlib', 'inline')
plotBarNormFreq(normCounts, ['gentlemanboners', 'Celebs'], usedSubs)


# In[40]:

plotBarNormFreq(normCounts, ['nfl'], usedSubs)


# In[53]:

plotBarNormFreq(normCounts, ['TrollXChromosomes'], usedSubs)


# In[42]:

plotBarNormFreq(normCounts, ['pcmasterrace'], usedSubs)


# In[43]:

plotBarNormFreq(normCounts, ['circlejerk'], usedSubs)


# In[45]:

plotBarNormFreq(normCounts, ['science'], usedSubs)


# In[44]:

plotBarNormFreq(normCounts, ['movies'], usedSubs)
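# In[ ]:

# Aside (not in the original notebook): the pandas import (PD) above is never
# used, but it would make the per-subreddit normalization a one-liner. A
# hypothetical sketch, assuming normCounts, mostFreqWords, and usedSubs from above:
df = PD.DataFrame(normCounts, index=mostFreqWords, columns=usedSubs)
ratios = df['nfl'] / df.mean(axis=1)  # frequency relative to the cross-subreddit mean
print(ratios.sort_values(ascending=False).head(10))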
# In[49]:

""" Plot the principal components themselves """
get_ipython().run_line_magic('matplotlib', 'inline')

barSubs = ['PC 1', 'PC 2', 'PC 3']
barSubs = ['PC 1']  # only one bar series is actually plotted

# generate colors
colors = 'brgmy'

fig, ax = pp.subplots(figsize=(15, 15))
handles = []
width = 1
for sub, ind in zip(barSubs, range(len(barSubs))):
    # sum the absolute loadings of the first two PCs for each word
    tempNorm = np.sum(np.abs(pcaObj.components_[0:2, :]), axis=0)

    # sort in descending order
    sortedNorm = sorted(tempNorm, reverse=True)
    sortOrder = [i[0] for i in sorted(enumerate(tempNorm),
                                      key=lambda x: x[1], reverse=True)]

    # plot bar
    handles.append(ax.bar([i + ind for i in range(len(mostFreqWords))],
                          sortedNorm, width=width,
                          color=colors[ind], align='center'))

# label axes
ax.set_ylabel('Summed principal component contribution (a.u.)')
ax.set_xticks(range(len(mostFreqWords)));
ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
ax.set_xlim(left=0 - 0.5*width, right=len(mostFreqWords) - 0.5*width);
# ax.set_title('PC contributions')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_yticklabels()):
    item.set_fontsize(20)
addCopyright(ax, yOffset=-0.02)
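# In[ ]:

# Aside (not in the original notebook): the bar plot above sums the absolute
# loadings of the first two PCs; to see which words drive each component on
# its own, the rows of pcaObj.components_ can be inspected directly. A minimal
# sketch, assuming pcaObj and mostFreqWords from above:
for pc in range(3):
    loading = np.abs(pcaObj.components_[pc, :])
    topWords = [mostFreqWords[i] for i in np.argsort(loading)[::-1][:10]]
    print('PC ' + str(pc + 1) + ': ' + ', '.join(topWords))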