# -*- coding: utf-8 -*-
"""Predict Reddit post scores from post titles.

Converted from an IPython notebook (the ``%matplotlib inline`` magic is not
valid Python and was dropped) and modernised for Python 3 and current
library APIs:

  * ``print`` statements -> ``print()`` calls,
  * ``sklearn.cross_validation`` -> ``sklearn.model_selection``
    (the old module was removed in scikit-learn 0.20),
  * ``DataFrame.sort`` -> ``DataFrame.sort_values`` (removed in pandas 0.20),
  * ``rcParams['axes.color_cycle']`` -> ``'axes.prop_cycle'``
    (removed in matplotlib 2.0),
  * ``Perceptron/PassiveAggressiveClassifier(n_iter=...)`` ->
    ``max_iter=...`` (``n_iter`` was removed in scikit-learn 0.21),
  * integer division made explicit with ``//``.

Pipeline
--------
1. Load the scraped posts, de-duplicate, and stem the titles.
2. Compute a baseline R^2 for a constant (median-score) predictor.
3. Train bag-of-words Naive Bayes models that classify posts into
   score-quantile bins (raw titles, stemmed titles, 1-5-grams, tf-idf).
4. Train one binary high/low-score classifier per subreddit, use its
   "high" probability as a feature, and fit a linear regression from that
   probability to the raw score (this model is pickled for the website).
5. Repeat the per-subreddit experiment for several classifier families,
   on the raw titles and on the AlchemyAPI keyword column.
"""

import copy
import os
import pickle
import string
from datetime import datetime

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import scipy.stats
from matplotlib import rcParams
from scipy.stats import pearsonr
from sklearn import metrics, svm, tree
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import (CountVectorizer,
                                             HashingVectorizer,
                                             TfidfVectorizer)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import (PassiveAggressiveClassifier, Perceptron,
                                  RidgeClassifier, SGDClassifier)
# sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# Set some nicer defaults for matplotlib.
# These colors come from colorbrewer2.org.  Each is an RGB triplet.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was removed from matplotlib; prop_cycle replaces it.
rcParams['axes.prop_cycle'] = plt.cycler(color=dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'


# Not our code here - credit to the CS109 psets.
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and
    axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding
    plot border is drawn.
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # Turn off all ticks...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ...then re-enable the visible ones.
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


st = nltk.stem.lancaster.LancasterStemmer()


def stem_title(title):
    """Return *title* with every token replaced by its Lancaster stem."""
    tokens = nltk.word_tokenize(title)
    return " ".join(st.stem(word) for word in tokens)


def make_xy(titles, scores, vectorizer=None):
    """Vectorize *titles* into a sparse bag-of-words matrix.

    Parameters
    ----------
    titles : list of str
        Documents to vectorize.
    scores : sequence
        Target values, returned unchanged as a numpy array.
    vectorizer : sklearn vectorizer, optional
        Pre-configured vectorizer; defaults to
        ``CountVectorizer(min_df=0.001)``.

    Returns
    -------
    (x, y, vectorizer) where ``x`` is a CSC sparse matrix.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer(min_df=0.001)
    # Build the vocabulary by fitting the vectorizer to the titles.
    vectorizer.fit(titles)
    # Convert into a bag of words; keep it sparse to save memory.
    x = vectorizer.transform(titles).tocsc()
    y = np.array(scores)
    return x, y, vectorizer


def assign_bins(sorted_df, num_bins):
    """Label each row of score-sorted *sorted_df* with a quantile bin.

    Writes a 'category' column of labels 1..num_bins in place; the first
    ``len // num_bins`` rows get bin 1, and any remainder lands in the
    last bin.
    """
    size = len(sorted_df)
    blocksize = size // num_bins
    blocks = [blocksize * i for i in range(num_bins)]
    blocks.append(size)
    labels = np.empty(size, dtype=int)
    for i in range(num_bins):
        labels[blocks[i]:blocks[i + 1]] = i + 1
    # Assign as a whole column; the notebook's chained slice assignment
    # (df['category'][a:b] = ...) trips pandas' SettingWithCopy warning.
    sorted_df['category'] = labels


def clean_alchemy_titles(sorted_small_df):
    """Build one cleaned AlchemyAPI-keyword string per post.

    For each title, the matching rows of the 'alchemy' column are
    concatenated, stripped of brackets and of every non-lowercase-letter
    character, whitespace-collapsed, and re-joined.  The first
    whitespace-separated token is dropped (presumably a leading artifact
    of the Alchemy output — TODO confirm against the scraper).
    """
    cleaned = []
    for title in list(sorted_small_df['title']):
        parts = sorted_small_df[sorted_small_df['title'] == title]['alchemy']
        text = "".join(s.replace('(', '').replace(')', '')
                        .replace('[', '').replace(']', '') for s in parts)
        text = "".join(ch for ch in text
                       if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        text = text.replace('  ', ' ')
        words = text.split(' ')
        cleaned.append(" ".join(words[1:]))
    return cleaned


def train_per_subreddit(data, subreddits, make_clf, use_alchemy=False):
    """Train one binary high/low-score classifier per subreddit.

    Each subreddit's posts are sorted by score and split into two
    equal-size bins (1 = low half, 2 = high half); a 1-3-gram
    bag-of-words model over either the raw titles or the cleaned
    Alchemy keywords is fit on a 50/50 train/test split, and train/test
    accuracies are printed.

    Parameters
    ----------
    data : DataFrame with 'subreddit', 'title', 'score' (and 'alchemy').
    subreddits : iterable of subreddit names.
    make_clf : zero-argument callable returning a fresh classifier.
    use_alchemy : bool
        Train on the Alchemy keyword text instead of the titles.

    Returns
    -------
    dict mapping subreddit -> [fitted classifier, fitted vectorizer].
    """
    models = {}
    for subreddit in subreddits:
        smalldf = data[data['subreddit'] == subreddit]
        sortedsmalldf = smalldf.sort_values('score').copy()
        assign_bins(sortedsmalldf, 2)
        if use_alchemy:
            titles = clean_alchemy_titles(sortedsmalldf)
        else:
            titles = list(sortedsmalldf['title'])
        n_grams = CountVectorizer(ngram_range=(1, 3))
        n_grams.fit(titles)
        X = n_grams.transform(titles)
        Y = np.array(sortedsmalldf['category'])
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, train_size=0.5)
        clf = make_clf()
        clf.fit(x_train, y_train)
        models[subreddit] = [clf, n_grams]
        print("For", subreddit, "subreddit:")
        print("Training accuracy is", clf.score(x_train, y_train))
        print("Test accuracy is", clf.score(x_test, y_test))
        print("---------------------------------")
    return models


# ---------------------------------------------------------------------------
# Load and de-duplicate the scraped posts.
# ---------------------------------------------------------------------------
df = pd.read_csv('Data/full.csv', encoding='utf-8')
subs = list(df['subreddit'].unique())
types = list(df['type'].unique())

print("Original size of data set is", len(df))
df = df.drop_duplicates('id')
print("Size of data set with only unique posts is", len(df))

dfmean = np.mean(df['score'])
df = df.sort_values('score')
df = df.reset_index(level=0, drop=True)
# Median score of the sorted frame (len/2 must be an int index in Py3).
md = df['score'].iloc[len(df) // 2]

# Quick sanity check of the stemmer.
stem_title("Thinking historically is, first, an attitude acknowledging that every event can be meaningfully \
understood only in relation to previous events, and, second, the methodical application of this attitude, \
which entails both analyzing events contextually--as having occurred in the midst of pre-existing circumstances--and \
comprehending them from historical actors.")

print(len(df))
df = df.drop('type', axis=1)
df = df.drop_duplicates()
print(len(df))
df['stems'] = df['title'].map(stem_title)
print(len(df))

# ---------------------------------------------------------------------------
# Baseline: R^2 of a constant (median) predictor.
# ---------------------------------------------------------------------------
sse = ((df['score'] - md) ** 2).sum()      # residual SS of the median model
sst = ((df['score'] - dfmean) ** 2).sum()  # total SS about the mean
# R^2 = 1 - SS_res / SS_tot.  (The notebook had the ratio inverted,
# computing 1 - sst/sse.)
rsq = 1 - (sse / sst)
print(sse)
print(sst)
print(rsq)

# ---------------------------------------------------------------------------
# Naive Bayes on the raw scores (each distinct score is its own class).
# ---------------------------------------------------------------------------
X, Y, vectorizer = make_xy(list(df['title']), df['score'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
clf = MultinomialNB(alpha=50)
clf.fit(x_train, y_train)
print("Training accuracy is", clf.score(x_train, y_train))
print("Test accuracy is", clf.score(x_test, y_test))

# ---------------------------------------------------------------------------
# Bin posts into 2..10 score quantiles and keep whichever bin count gives
# the best test accuracy (raw titles).
# ---------------------------------------------------------------------------
sorteddf = df.sort_values('score').copy()
size = len(df)

best_test = 0
best_vect = None
best_Ysort = None
best_clf = None
for num in range(2, 11):
    assign_bins(sorteddf, num)
    Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']),
                                        sorteddf['category'])
    x_train3, x_test3, y_train3, y_test3 = train_test_split(
        Xsort, Ysort, train_size=0.5)
    clf3 = MultinomialNB(alpha=50)
    clf3.fit(x_train3, y_train3)
    train_acc = clf3.score(x_train3, y_train3)
    test_acc = clf3.score(x_test3, y_test3)
    if best_test < test_acc:
        best_test = test_acc
        best_vect = copy.deepcopy(vectorizer2)
        best_Ysort = copy.deepcopy(Ysort)
        best_clf = copy.deepcopy(clf3)
    print("For", num, "bins:")
    print("Training accuracy is", train_acc)
    print("Test accuracy is", test_acc)
    print("---------------------------------")

# Same sweep on the stemmed titles.  (The notebook compared against
# best_test instead of best_test2 here, so this tracker never updated
# correctly — fixed.)
best_test2 = 0
best_vect2 = None
best_Ysort2 = None
best_clf2 = None
for num in range(2, 11):
    assign_bins(sorteddf, num)
    Xstem, Ystem, vectorizer3 = make_xy(list(sorteddf['stems']),
                                        sorteddf['category'])
    x_train4, x_test4, y_train4, y_test4 = train_test_split(
        Xstem, Ystem, train_size=0.5)
    clf4 = MultinomialNB(alpha=1)
    clf4.fit(x_train4, y_train4)
    train_acc = clf4.score(x_train4, y_train4)
    test_acc = clf4.score(x_test4, y_test4)
    if best_test2 < test_acc:
        best_test2 = test_acc
        best_vect2 = copy.deepcopy(vectorizer3)
        best_Ysort2 = copy.deepcopy(sorteddf['category'])
        best_clf2 = copy.deepcopy(clf4)
    print("For", num, "bins:")
    print("Training accuracy is", train_acc)
    print("Test accuracy is", test_acc)
    print("---------------------------------")

# ---------------------------------------------------------------------------
# 1-5-gram counts and tf-idf features on the best bin labelling.
# ---------------------------------------------------------------------------
n_grams = CountVectorizer(ngram_range=(1, 5), analyzer='word')
n_grams.fit(list(sorteddf['title']))
Xngram = n_grams.transform(list(sorteddf['title']))
x_train4, x_test4, y_train4, y_test4 = train_test_split(
    Xngram, best_Ysort, train_size=0.5)
clf4 = MultinomialNB(alpha=1)
clf4.fit(x_train4, y_train4)
print("Training accuracy is", clf4.score(x_train4, y_train4))
print("Test accuracy is", clf4.score(x_test4, y_test4))

tdidf = TfidfVectorizer(ngram_range=(1, 5), sublinear_tf=True)
tdidf.fit(list(sorteddf['title']))
Xtdidf = tdidf.transform(list(sorteddf['title']))
x_train5, x_test5, y_train5, y_test5 = train_test_split(
    Xtdidf, best_Ysort, train_size=0.5)
clf5 = MultinomialNB(alpha=1)
clf5.fit(x_train5, y_train5)
print("Training accuracy is", clf5.score(x_train5, y_train5))
print("Test accuracy is", clf5.score(x_test5, y_test5))

# ---------------------------------------------------------------------------
# Per-subreddit Naive Bayes; each post's P(high-score bin) under its own
# subreddit's model becomes a feature for a linear regression on score.
# ---------------------------------------------------------------------------
subreddit_ngrams = train_per_subreddit(df, subs,
                                       lambda: MultinomialNB(alpha=50))

spec_probs = []
for i in df.index:
    title = df.title[i]
    subreddit = df.subreddit[i]
    clf = subreddit_ngrams[subreddit][0]
    n_grams_spec = subreddit_ngrams[subreddit][1]
    # Column 1 of predict_proba = P(post is in the high-score bin).
    spec_probs.append(
        clf.predict_proba(n_grams_spec.transform([title]))[0][1])
df['spec_probs'] = spec_probs
df.to_csv("Data/new_full.csv", index=False, encoding='utf-8')

# Linear map from predicted probability to raw score.
m, b, r, p, std = scipy.stats.linregress(np.array(df['spec_probs']),
                                         np.array(df['score']))
print(m)
print(b)
print(r ** 2)
print(p)
print(std)


def predict(title):
    """Predict a score for *title* via the probability->score regression.

    NOTE(review): ``clf`` and ``n_grams_spec`` are whatever the loop
    above left behind, i.e. the model of the *last* post's subreddit.
    This mirrors the notebook's behaviour — confirm it is intentional.
    """
    x = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    return m * x + b


# Pickle the (classifier, vectorizer) pair for the website.
tup = (clf, n_grams_spec)
with open('clf.pickle', 'wb') as handle:
    pickle.dump(tup, handle)

print(predict("If the Big Bang happened 13.7 Billion years ago, how is the edge of the observable universe 16 Billion light years away? Did the universe expand faster than the speed of light?"))

# ---------------------------------------------------------------------------
# Per-subreddit comparison of other classifier families, on the raw
# titles ("svm" dict) and on the Alchemy keywords ("alchemy" dict).
# As in the notebook, each round overwrites the previous dicts.
# ---------------------------------------------------------------------------
subreddit_svm = train_per_subreddit(
    df, subs, lambda: RidgeClassifier(tol=1e-2, solver="lsqr"))
subreddit_alchemy = train_per_subreddit(
    df, subs, lambda: RidgeClassifier(tol=1e-2, solver="lsqr"),
    use_alchemy=True)

# n_iter was removed from scikit-learn; max_iter is the modern spelling.
subreddit_svm = train_per_subreddit(
    df, subs, lambda: Perceptron(max_iter=50))
subreddit_alchemy = train_per_subreddit(
    df, subs, lambda: Perceptron(max_iter=50), use_alchemy=True)

subreddit_svm = train_per_subreddit(
    df, subs, lambda: PassiveAggressiveClassifier(max_iter=50))
subreddit_alchemy = train_per_subreddit(
    df, subs, lambda: PassiveAggressiveClassifier(max_iter=50),
    use_alchemy=True)

subreddit_svm = train_per_subreddit(
    df, subs, lambda: KNeighborsClassifier(n_neighbors=10))
subreddit_alchemy = train_per_subreddit(
    df, subs, lambda: KNeighborsClassifier(n_neighbors=10),
    use_alchemy=True)

# ---------------------------------------------------------------------------
# Head-to-head: every classifier family, with and without the Alchemy
# keywords.  In the Alchemy case each keyword is exploded into its own
# single-word sample, inheriting the bin label of the post it came from.
# ---------------------------------------------------------------------------
for alch_flag, d in enumerate(['Not alchemy', 'Alchemy']):
    for make_clf, name in (
            (lambda: RidgeClassifier(tol=1e-2, solver="lsqr"),
             "Ridge Classifier"),
            (lambda: Perceptron(max_iter=50), "Perceptron"),
            (lambda: PassiveAggressiveClassifier(max_iter=50),
             "Passive-Aggressive"),
            (lambda: KNeighborsClassifier(n_neighbors=10), "kNN")):
        subreddit_svm = {}
        for subreddit in subs:
            smalldf = df[df['subreddit'] == subreddit]
            sortedsmalldf = smalldf.sort_values('score').copy()
            assign_bins(sortedsmalldf, 2)
            titles = list(sortedsmalldf['title'])
            bins = list(sortedsmalldf['category'])
            # NOTE: the notebook reused the loop variable ``i`` both for
            # this flag and for two inner loops, so its ``i == 1`` test
            # was clobbered and the alchemy branch ran in every pass;
            # distinct names fix that.
            if alch_flag == 1:
                word_lists = []
                for title in list(sortedsmalldf['title']):
                    parts = sortedsmalldf[
                        sortedsmalldf['title'] == title]['alchemy']
                    text = "".join(s.replace('(', '').replace(')', '')
                                    .replace('[', '').replace(']', '')
                                   for s in parts)
                    text = "".join(ch for ch in text
                                   if ch in 'qwertyuiopasdfghjklzxcvbnm ')
                    text = text.replace('  ', ' ')
                    word_lists.append(text.split(' ')[1:])
                # One sample per keyword, labelled with its post's bin.
                categories = np.array(sortedsmalldf['category'])
                alch_bins = []
                for row, words in enumerate(word_lists):
                    alch_bins.extend([categories[row]] * len(words))
                titles = [word for words in word_lists for word in words]
                bins = alch_bins
            n_grams = CountVectorizer(ngram_range=(1, 3))
            n_grams.fit(titles)
            X = n_grams.transform(titles)
            Y = np.array(bins)
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, train_size=0.5)
            clf2 = make_clf()
            clf2.fit(x_train, y_train)
            subreddit_svm[subreddit] = [clf2, n_grams]
            train_acc = clf2.score(x_train, y_train)
            test_acc = clf2.score(x_test, y_test)
            print("For", d, "and", subreddit, "subreddit and", name,
                  "classifier:")
            print("Training accuracy is", train_acc)
            print("Test accuracy is", test_acc)
            print("---------------------------------")