#!/usr/bin/env python
# coding: utf-8

# In[2]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import sys, os
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
# sys.path.append('D:\Documents\GitHub\getRedditDataset')
# from celebReddit import countWords
import redditDB
import datetime
import praw
from matplotlib import pyplot as pp
from matplotlib import colorbar as cb
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as PD
import scipy
from scipy import spatial
from sklearn import cluster, decomposition
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

def addCopyright(ax, xOffset=0, yOffset=0):
    # stamp a copyright notice below the axes; 3D axes need an extra z argument
    year = datetime.date.today().year
    if '_axis3don' in dir(ax):
        ax.text(0 + xOffset, -.05 + yOffset, 0,
                r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)
    else:
        ax.text(0 + xOffset, -.05 + yOffset,
                r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)


# In[4]:

""" Establish database connection """
db = redditDB.RedditDB('March2-8_Top200')


# In[5]:

""" Get comments from subreddits """
if not os.path.isfile('popSubs.npy'):
    subList = praw.Reddit(user_agent='pop').get_popular_subreddits(limit=200)
    subList = [sub.display_name for sub in subList]
    np.save('popSubs.npy', subList)
else:
    subList = list(np.load('popSubs.npy'))

comments = db.getSubredditCommentText(subList[0])


# In[6]:

""" Strip formatting: keep only printable ASCII characters """
comments = [''.join(s for s in comment if ord(s) > 31 and ord(s) < 126)
            for comment in comments]


# In[7]:

""" Count occurrences of each word """
def countNWords(allWords):
    wordCounts = {}
    for word in allWords:
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
    return wordCounts


# In[8]:

""" Concatenate all comments together and split into individual words """
allComments = " ".join(comments)
allWords = allComments.split()
allWords = [word.lower() for word in allWords]
wordCounts = countNWords(allWords)


# In[9]:

""" Sort by counts and keep the 100 most frequent words """
sortedWords = sorted(wordCounts, key=wordCounts.get, reverse=True)
mostFreqWords = sortedWords[0:100]


# In[10]:

freqWordCounts = [wordCounts[word] for word in mostFreqWords]


# In[11]:

""" Build a words x subreddits matrix of word frequencies """
reload = False
usedSubs = subList[0:50]

if not os.path.isfile('allCounts.npy') or reload:
    allCounts = np.zeros(shape=(len(mostFreqWords), len(usedSubs)))
    for subInd, sub in enumerate(usedSubs):
        # get comments and strip formatting
        comments = db.getSubredditCommentText(sub)
        comments = [''.join(s for s in comment if ord(s) > 31 and ord(s) < 126)
                    for comment in comments]

        # concatenate and split into lowercase words
        allComments = " ".join(comments)
        allWords = allComments.split()
        allWords = [word.lower() for word in allWords]

        # count words
        wordCounts = countNWords(allWords)

        # get total words for normalization
        totWords = float(len(allWords))

        # get counts of the globally most frequent words as a fraction of all words
        freqWordCounts = [wordCounts[word] if word in wordCounts else 0
                          for word in mostFreqWords]
        freqWordCounts = [i/totWords for i in freqWordCounts]

        # store this subreddit's column
        allCounts[:, subInd] = freqWordCounts

    # save
    np.save('allCounts', allCounts)
else:
    allCounts = np.load('allCounts.npy')


# In[12]:

""" Get pairwise distances between subreddits in word-frequency space """
normMax = np.max(allCounts, axis=1)
normCounts = np.transpose(np.divide(np.transpose(allCounts), normMax))
distVec = spatial.distance.pdist(np.transpose(normCounts), 'euclidean')
distMat = spatial.distance.squareform(distVec)


# In[13]:

""" Print top words """
print('100 Most Frequent Words')
for ind, word in enumerate(mostFreqWords):
    print(str(ind + 1) + ': ' + word)
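# In[ ]:

# Aside (not in the original notebook): countNWords is equivalent to
# collections.Counter from the standard library, whose most_common() also
# replaces the manual sort-and-slice above. A minimal sketch, assuming the
# allWords list built from the first subreddit's comments; ties may order
# differently than the dictionary sort.
from collections import Counter

mostFreqAlt = [word for word, count in Counter(allWords).most_common(100)]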
# In[47]:

get_ipython().run_line_magic('matplotlib', 'inline')

fig, ax = pp.subplots()
fig.set_figheight(15)
fig.set_figwidth(15)
axMat = ax.matshow(distMat)
ax.set_xticks(range(len(distMat)))
ax.set_xticklabels(usedSubs, rotation=90);  # distMat covers the 50 analyzed subreddits
ax.set_yticks(range(len(distMat)))
ax.set_yticklabels(usedSubs);
cbar = fig.colorbar(axMat, shrink=0.8)
cbar.set_label('Euclidean distance')
addCopyright(ax)


# In[15]:

""" Print the 30 most similar pairs """
x = distMat.copy()
np.fill_diagonal(x, np.inf)
for ind in range(30):
    minInd = np.argmin(x)
    inds = np.unravel_index(minInd, distMat.shape)
    print(usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds]))
    x[inds] = np.inf
    x[inds[1], inds[0]] = np.inf


# In[16]:

""" Print the 30 most dissimilar pairs """
x = distMat.copy()
np.fill_diagonal(x, -np.inf)
for ind in range(30):
    maxInd = np.argmax(x)
    inds = np.unravel_index(maxInd, distMat.shape)
    print(usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds]))
    x[inds] = -np.inf
    x[inds[1], inds[0]] = -np.inf


# In[17]:

""" Perform PCA and keep the first three components """
pcaObj = decomposition.PCA()
reducedCounts = pcaObj.fit_transform(normCounts.transpose())[:, 0:3]


# In[48]:

""" Plot cumulative variance explained """
varExp = np.cumsum(pcaObj.explained_variance_ratio_)

fig = pp.figure(figsize=(15, 15))
ax = pp.subplot()
ax.plot(varExp, marker='o');
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative variance explained")
ax.set_ylim(bottom=0, top=1)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(20)
addCopyright(ax)


# In[19]:

""" Cluster points with affinity propagation """
apObj = cluster.AffinityPropagation(damping=0.75)
apObj.fit(reducedCounts);
len(set(apObj.labels_))  # number of clusters found


# In[20]:

# plot 3d
# %matplotlib qt
get_ipython().run_line_magic('matplotlib', 'inline')
fig = pp.figure(figsize=(15, 15))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:, 0], reducedCounts[:, 1], reducedCounts[:, 2],
           s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)


# In[22]:

get_ipython().run_line_magic('matplotlib', 'inline')

# plot 2d
fig = pp.figure(figsize=(15, 15))
ax2 = pp.subplot()
ax2.scatter(reducedCounts[:, 0], reducedCounts[:, 1], s=50, c=apObj.labels_)
ax2.set_xlabel('PC1'); ax2.set_ylabel('PC2')

# get distance between all points in the PC1-PC2 plane
dists = spatial.distance.squareform(
    spatial.distance.pdist(reducedCounts[:, 0:2], 'euclidean'))
np.fill_diagonal(dists, np.inf)

# label points, nudging labels apart when two points nearly overlap
for point in range(len(usedSubs)):
    yOffset = 0
    closePoints = np.argwhere(dists[point, :] < 0.05)
    if closePoints.size:
        yOffset = 0.01
        if reducedCounts[point, 1] < reducedCounts[closePoints[0, 0], 1]:
            yOffset = -yOffset
    if point == 37 or point == 23:  # ALL THE KLUGE
        yOffset = -0.02
    elif point == 27:
        yOffset = 0.02
    ax2.text(reducedCounts[point, 0] + 0.02, reducedCounts[point, 1] + yOffset,
             usedSubs[point], size=10, rotation=0, va='center', ha='left')

for item in ([ax2.title, ax2.xaxis.label, ax2.yaxis.label] +
             ax2.get_xticklabels() + ax2.get_yticklabels()):
    item.set_fontsize(20)
ax2.set_xlim(left=-1.6, right=1.6)
addCopyright(ax2)


# In[ ]:

""" Animate the 3D scatter by rotating the camera azimuth """
shouldSave = False

def animate(i, ax):
    newVal = i*5
    if newVal > 360:
        newVal = 0
    ax.azim = newVal

import matplotlib.animation as animation
get_ipython().run_line_magic('matplotlib', 'qt')
get_ipython().run_line_magic('pylab', 'qt')

fig = pp.figure(figsize=(10, 10))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:, 0], reducedCounts[:, 1], reducedCounts[:, 2],
           s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)
ax.elev = 20
ani = animation.FuncAnimation(fig, animate, fargs=[ax],
                              frames=range(72), interval=175)

if shouldSave:
    pp.rcParams['animation.ffmpeg_path'] = 'C:\\FFMPEG\\bin\\ffmpeg.exe'
    pp.rcParams['animation.mencoder_path'] = 'C:\\Mencoder\\mencoder.exe'
    Writer = animation.FFMpegWriter()
    ani.save('D:\\Dropbox\\Website\\Blog Posts\\201503\\Clustering subreddits\\pc3DAnimation.avi',
             writer=Writer, dpi=300, savefig_kwargs={'transparent': True})
    print('Saved file: complete!')
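# In[ ]:

# Aside (not in the original notebook): setting ax.azim directly works, but
# Axes3D.view_init is the documented way to move the camera, and a modulo keeps
# the angle in [0, 360) instead of snapping back to 0. A minimal sketch of the
# same rotation, assuming the fig/ax built in the animation cell above:
def animateAlt(i, ax):
    ax.view_init(elev=20, azim=(i * 5) % 360)

aniAlt = animation.FuncAnimation(fig, animateAlt, fargs=[ax],
                                 frames=72, interval=175)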
# In[50]:

""" Plot normalized word distribution """
def plotBarNormFreq(normCounts, barSubs, usedSubs):
    # get mean normalized counts across all subreddits
    meanNormCounts = np.mean(normCounts, axis=1)

    # generate colors
    colors = 'brgmy'

    fig, ax = pp.subplots(figsize=(15, 15))
    handles = []
    width = 1./len(barSubs)
    for sub, ind in zip(barSubs, range(len(barSubs))):
        # normalize this subreddit's frequencies by the cross-subreddit mean
        tempNorm = normCounts[:, usedSubs.index(barSubs[ind])]/meanNormCounts

        # sort in descending order
        sortedNorm = sorted(tempNorm, reverse=True)
        sortOrder = [i[0] for i in sorted(enumerate(tempNorm),
                                          key=lambda x: x[1], reverse=True)]

        # plot bar
        handles.append(ax.bar([i + -0.5*width + width*ind
                               for i in range(len(mostFreqWords))],
                              sortedNorm, width=width,
                              color=colors[ind], align='center'))

    # plot chance line
    ax.plot([0, len(mostFreqWords)], [1, 1], ls='--', color='k')

    # label axes (note: x labels follow the last subreddit's sort order)
    ax.set_ylabel('Normalized word frequency')
    ax.set_xticks(range(len(mostFreqWords)));
    ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
    ax.set_xlim(left=0 - 0.5*width, right=len(mostFreqWords) - 0.5*width);
    ax.set_title(", ".join(barSubs) + ' Normalized Word Frequency')
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_yticklabels()):
        item.set_fontsize(20)
    if len(barSubs) > 1:
        ax.legend(handles, barSubs, loc='upper right');
    addCopyright(ax, yOffset=-0.01)


# In[51]:

get_ipython().run_line_magic('matplotlib', 'inline')
plotBarNormFreq(normCounts, ['gentlemanboners', 'Celebs'], usedSubs)


# In[40]:

plotBarNormFreq(normCounts, ['nfl'], usedSubs)


# In[53]:

plotBarNormFreq(normCounts, ['TrollXChromosomes'], usedSubs)


# In[42]:

plotBarNormFreq(normCounts, ['pcmasterrace'], usedSubs)


# In[43]:

plotBarNormFreq(normCounts, ['circlejerk'], usedSubs)


# In[45]:

plotBarNormFreq(normCounts, ['science'], usedSubs)


# In[44]:

plotBarNormFreq(normCounts, ['movies'], usedSubs)
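# In[ ]:

# Aside (not in the original notebook): the pandas import (PD) above is never
# used, but it would make the per-subreddit normalization a one-liner. A
# hypothetical sketch, assuming normCounts, mostFreqWords, and usedSubs from above:
df = PD.DataFrame(normCounts, index=mostFreqWords, columns=usedSubs)
ratios = df['nfl'] / df.mean(axis=1)  # frequency relative to the cross-subreddit mean
print(ratios.sort_values(ascending=False).head(10))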
# In[49]:

""" Plot the principal components themselves """
get_ipython().run_line_magic('matplotlib', 'inline')

barSubs = ['PC 1', 'PC 2', 'PC 3']
barSubs = ['PC 1']  # only one bar series is actually plotted

# generate colors
colors = 'brgmy'

fig, ax = pp.subplots(figsize=(15, 15))
handles = []
width = 1
for sub, ind in zip(barSubs, range(len(barSubs))):
    # sum the absolute loadings of the first two PCs for each word
    tempNorm = np.sum(np.abs(pcaObj.components_[0:2, :]), axis=0)

    # sort in descending order
    sortedNorm = sorted(tempNorm, reverse=True)
    sortOrder = [i[0] for i in sorted(enumerate(tempNorm),
                                      key=lambda x: x[1], reverse=True)]

    # plot bar
    handles.append(ax.bar([i + ind for i in range(len(mostFreqWords))],
                          sortedNorm, width=width,
                          color=colors[ind], align='center'))

# label axes
ax.set_ylabel('Summed principal component contribution (a.u.)')
ax.set_xticks(range(len(mostFreqWords)));
ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
ax.set_xlim(left=0 - 0.5*width, right=len(mostFreqWords) - 0.5*width);
# ax.set_title('PC contributions')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] + ax.get_yticklabels()):
    item.set_fontsize(20)
addCopyright(ax, yOffset=-0.02)
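# In[ ]:

# Aside (not in the original notebook): the bar plot above sums the absolute
# loadings of the first two PCs; to see which words drive each component on
# its own, the rows of pcaObj.components_ can be inspected directly. A minimal
# sketch, assuming pcaObj and mostFreqWords from above:
for pc in range(3):
    loading = np.abs(pcaObj.components_[pc, :])
    topWords = [mostFreqWords[i] for i in np.argsort(loading)[::-1][:10]]
    print('PC ' + str(pc + 1) + ': ' + ', '.join(topWords))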