%matplotlib inline
import pandas as pd
import string
import copy
import matplotlib.pyplot as plt
import os
import numpy as np
import scipy
from scipy.stats import pearsonr
from datetime import datetime
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
# Widen pandas console output so wide DataFrames print on one screen.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
# set some nicer defaults for matplotlib
from matplotlib import rcParams
#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# NOTE(review): 'axes.color_cycle' was removed in matplotlib 2.0 in favor of
# 'axes.prop_cycle' -- this line only works on the old matplotlib this
# notebook was written against.
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
# Not our code here - credit to the CS109 psets.
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
# Load the scraped reddit posts; titles are unicode, hence the explicit encoding.
df = pd.read_csv('Data/full.csv', encoding='utf-8')
subs = list(df['subreddit'].unique())
types = list(df['type'].unique())
print "Original size of data set is", len(df)
# The same post can be scraped more than once (e.g. from 'hot' and 'top');
# keep one row per reddit post id.
df = df.drop_duplicates('id')
print "Size of data set with only unique posts is", len(df)
dfmean = np.mean(df['score'])
# Sort by score so the middle row of the re-indexed frame is the median post.
# NOTE(review): DataFrame.sort is the old pandas API (sort_values today).
df = df.sort('score')
df = df.reset_index(level=0, drop=True)
median = len(df)/2  # Python 2 integer division: index of the middle row
md = df['score'][median]  # median score, used as a baseline predictor below
Original size of data set is 44261 Size of data set with only unique posts is 25992
Stemming the titles
We want to define a function with which we can stem the titles
# Lancaster stemmer shared by every stemming pass below.
st = nltk.stem.lancaster.LancasterStemmer()

def stem_title(title):
    """Tokenize a post title and return it with every token Lancaster-stemmed,
    re-joined with single spaces."""
    return " ".join(st.stem(token) for token in nltk.word_tokenize(title))
stem_title("Thinking historically is, first, an attitude acknowledging that every event can be meaningfully \
understood only in relation to previous events, and, second, the methodical application of this attitude, \
which entails both analyzing events contextually--as having occurred in the midst of pre-existing circumstances--and \
comprehending them from historical actors.")
'think hist is , first , an attitud acknowledg that every ev can be mean understood on in rel to prevy ev , and , second , the method apply of thi attitud , which entail both analys ev context -- as hav occur in the midst of pre-existing circumst -- and comprehend them from hist act .'
R squared
We were trying to find the $r^2$ for the data. This code isn't used anymore
print len(df)
# Drop the 'type' column so otherwise-identical rows collapse in drop_duplicates.
df = df.drop('type',1)
df = df.drop_duplicates()
print len(df)
# Pre-compute the stemmed version of every title for the experiments below.
df['stems'] = df['title'].map(lambda x: stem_title(x))
print len(df)#dfavgs = [485.13011] *len(df)
# Sum of squared deviations of each score from the median score...
sse = 0
dfidlist = list(df.index)
for i in dfidlist:
    sse += (df['score'][i]-md)**2
# ...and from the mean score.
sst = 0
dfidlist = list(df.index)
for i in dfidlist:
    sst += (df['score'][i]-dfmean)**2
# NOTE(review): the conventional definition is r^2 = 1 - SSE/SST; the ratio
# here is inverted relative to the variable names. The surrounding prose says
# this code is no longer used, so it is left as-is -- confirm before reusing.
rsq = 1 - (sst/sse)
print sse
print sst
print rsq
#print pearsonr(dfavgs, df['score'])
#df['score']
25992 25992 25992 27016697811 21238960370.3 0.213858017776
Define Make XY function and test it
This function uses a countVectorizer to create a bag of words and then runs a regression using the titles on it.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
def make_xy(titles, scores, vectorizer=None):
    """Turn titles into a sparse bag-of-words matrix plus a score array.

    Returns (x, y, vectorizer): x is the token-count matrix in CSC form
    (sparse, to save memory), y is `scores` as a numpy array, and
    vectorizer is the fitted CountVectorizer so callers can reuse it on
    unseen titles.
    """
    # Default vectorizer drops tokens that appear in < 0.1% of titles.
    if not vectorizer:
        vectorizer = CountVectorizer(min_df=0.001)
    # Learn the vocabulary, then encode every title as sparse counts.
    vectorizer.fit(titles)
    counts = vectorizer.transform(titles).tocsc()
    return counts, np.array(scores), vectorizer
# Baseline: bag-of-words over all titles, 50/50 train/test split.
X,Y,vectorizer = make_xy(list(df['title']), df['score'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
# Heavily-smoothed Naive Bayes; note the raw 'score' is used as the class
# label here, so accuracy is low (thousands of distinct classes).
clf = MultinomialNB(alpha=50)
clf.fit(x_train, y_train)
print "Training accuracy is", clf.score(x_train, y_train)
print "Test accuracy is", clf.score(x_test, y_test)
Training accuracy is 0.0800246229609 Test accuracy is 0.0826408125577
Test for the best num bins (We get 2 as the best btw)
We tried binning the data based on the score of the post and then run the regression again in order to find the optimal number of bins
# Work on a score-sorted copy so contiguous row ranges correspond to score bins.
sorteddf = df.sort('score')
sorteddf['category'] = df['score']
size = len(df)
# Track the best-scoring number of bins so later cells can reuse the artifacts.
best_test = 0
best_vect = None
best_Ysort = None
best_clf = None
for num in range(2, 11):
    # Split the sorted rows into `num` equal-sized bins (Python 2 int division).
    blocksize = size/num
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        # Label every post in bin i with class i+1 (pandas chained assignment).
        sorteddf['category'][blocks[i]:blocks[i+1]] = i+1
    Xsort, Ysort, vectorizer2 = make_xy(list(sorteddf['title']), sorteddf['category'])
    x_train3, x_test3, y_train3, y_test3 = train_test_split(Xsort, Ysort, train_size=0.5)
    clf3 = MultinomialNB(alpha=50)
    clf3.fit(x_train3, y_train3)
    train_acc = clf3.score(x_train3, y_train3)
    test_acc = clf3.score(x_test3, y_test3)
    # Deep-copy the winners so the next iteration can't mutate them.
    if best_test < test_acc:
        best_test = test_acc
        best_vect = copy.deepcopy(vectorizer2)
        best_Ysort = copy.deepcopy(Ysort)
        best_clf = copy.deepcopy(clf3)
    print "For", num, "bins:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For 2 bins: Training accuracy is 0.617343798092 Test accuracy is 0.595183133272 --------------------------------- For 3 bins: Training accuracy is 0.4905355494 Test accuracy is 0.466143428747 --------------------------------- For 4 bins: Training accuracy is 0.39242843952 Test accuracy is 0.366035703293 --------------------------------- For 5 bins: Training accuracy is 0.32971683595 Test accuracy is 0.293551862111 --------------------------------- For 6 bins: Training accuracy is 0.292859341336 Test accuracy is 0.260772545399 --------------------------------- For 7 bins: Training accuracy is 0.265927977839 Test accuracy is 0.233379501385 --------------------------------- For 8 bins: Training accuracy is 0.243459526008 Test accuracy is 0.212296091105 --------------------------------- For 9 bins: Training accuracy is 0.216528162512 Test accuracy is 0.188904278239 --------------------------------- For 10 bins: Training accuracy is 0.20729455217 Test accuracy is 0.18297937827 ---------------------------------
** Test with stemming (though we get the same answer)**
We run the same tests using the stemming from earlier.
best_test2 = 0
best_vect2 = None
best_Ysort2 = None
best_clf2 = None
for num in range(2, 11):
blocksize = size/num
blocks = [blocksize * i for i in range(num)]
blocks.append(size)
for i in range(num):
sorteddf['category'][blocks[i]:blocks[i+1]] = i+1
Xstem, Ystem, vectorizer3 = make_xy(list(sorteddf['stems']), sorteddf['category'])
x_train4, x_test4, y_train4, y_test4 = train_test_split(Xstem, Ystem, train_size=0.5)
clf4 = MultinomialNB(alpha=1)
clf4.fit(x_train4, y_train4)
train_acc = clf4.score(x_train4, y_train4)
test_acc = clf4.score(x_test4, y_test4)
if best_test < test_acc:
best_test2 = test_acc
best_vect2 = copy.deepcopy(vectorizer3)
best_category2 = copy.deepcopy(sorteddf['category'])
best_clf2 = copy.deepcopy(clf4)
print "For", num, "bins:"
print "Training accuracy is", train_acc
print "Test accuracy is", test_acc
print "---------------------------------"
For 2 bins: Training accuracy is 0.659664512158 Test accuracy is 0.601261926747 --------------------------------- For 3 bins: Training accuracy is 0.557863958141 Test accuracy is 0.482686980609 --------------------------------- For 4 bins: Training accuracy is 0.479301323484 Test accuracy is 0.388888888889 --------------------------------- For 5 bins: Training accuracy is 0.441982148353 Test accuracy is 0.326100338566 --------------------------------- For 6 bins: Training accuracy is 0.411280393967 Test accuracy is 0.287934749154 --------------------------------- For 7 bins: Training accuracy is 0.39296706679 Test accuracy is 0.257463835026 --------------------------------- For 8 bins: Training accuracy is 0.376808248692 Test accuracy is 0.240150815636 --------------------------------- For 9 bins: Training accuracy is 0.359572176054 Test accuracy is 0.210834102801 --------------------------------- For 10 bins: Training accuracy is 0.350954139735 Test accuracy is 0.199753770391 ---------------------------------
N-grams
We now try to run the regression using n_grams and the optimal number of bins.
# Word n-grams (unigrams through 5-grams) over raw titles, with the 2-bin
# labels (best_Ysort) found above.
n_grams = CountVectorizer(ngram_range=[1, 5], analyzer='word')
n_grams.fit(list(sorteddf['title']))
Xngram = n_grams.transform(list(sorteddf['title']))
x_train4, x_test4, y_train4, y_test4 = train_test_split(Xngram, best_Ysort, train_size=0.5)
clf4 = MultinomialNB(alpha=1)
clf4.fit(x_train4, y_train4)
print "Training accuracy is", clf4.score(x_train4, y_train4)
print "Test accuracy is", clf4.score(x_test4, y_test4)
Training accuracy is 0.876192674669 Test accuracy is 0.614112034472
TF-IDF
Uses a TF-IDF vectorizer instead of the plain count vectorizer
# Same experiment with TF-IDF weighting (sublinear tf) instead of raw counts.
tdidf = TfidfVectorizer(ngram_range=[1, 5], sublinear_tf=True)
tdidf.fit(list(sorteddf['title']))
Xtdidf = tdidf.transform(list(sorteddf['title']))
x_train5, x_test5, y_train5, y_test5 = train_test_split(Xtdidf, best_Ysort, train_size=0.5)
clf5 = MultinomialNB(alpha=1)
clf5.fit(x_train5, y_train5)
print "Training accuracy is", clf5.score(x_train5, y_train5)
print "Test accuracy is", clf5.score(x_test5, y_test5)
Training accuracy is 0.896968297938 Test accuracy is 0.614112034472
Splitting by subreddits and examining title n-grams
Maybe looking at distinct subreddits improves the model
# Train one Naive Bayes model per subreddit: median-split each subreddit's
# posts into bottom half (1) / top half (2) by score, classify from 1-3-grams.
subreddit_ngrams = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # two bins = below / above the median score
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        # label bin i with class i+1 via chained assignment on the sorted copy
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = MultinomialNB(alpha=50)
    clf.fit(x_train, y_train)
    # Keep (model, vectorizer) per subreddit for the probability pass below.
    subreddit_ngrams[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.768292682927 Test accuracy is 0.587242026266 --------------------------------- For politics subreddit: Training accuracy is 0.637393767705 Test accuracy is 0.605288007554 --------------------------------- For nosleep subreddit: Training accuracy is 0.714851485149 Test accuracy is 0.546983184965 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.703743315508 Test accuracy is 0.519230769231 --------------------------------- For jokes subreddit: Training accuracy is 0.709800190295 Test accuracy is 0.55946717412 --------------------------------- For askhistorians subreddit: Training accuracy is 0.612244897959 Test accuracy is 0.551020408163 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.755037115589 Test accuracy is 0.522799575822 --------------------------------- For AskReddit subreddit: Training accuracy is 0.692571428571 Test accuracy is 0.558285714286 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.684656084656 Test accuracy is 0.554497354497 --------------------------------- For askscience subreddit: Training accuracy is 0.6119257087 Test accuracy is 0.5390625 --------------------------------- For tifu subreddit: Training accuracy is 0.717659137577 Test accuracy is 0.550308008214 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.673290937997 Test accuracy is 0.544876886418 ---------------------------------
Getting the probability of each title being successful (both specific and generic)
We are calculating the probability of a post being successful for the whole data set and for each subreddit and add it to our dataframe in order to use the data later.
# For every post, compute the probability that its title falls in the top
# score bin according to its own subreddit's model, and persist the result.
gen_probs = []
spec_probs = []
for i in df.index:
    title = df.title[i]
    subreddit = df.subreddit[i]
    clf = subreddit_ngrams[subreddit][0]
    n_grams_spec = subreddit_ngrams[subreddit][1]
    #prob_gen = clf4.predict_proba(n_grams.transform([title]))[0][1]
    # [0][1] = probability of the second class (the higher-score bin)
    prob_spec = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    #gen_probs.append(prob_gen)
    spec_probs.append(prob_spec)
#df['gen_probs'] = gen_probs
df['spec_probs'] = spec_probs
df.to_csv("Data/new_full.csv", index=False, encoding='utf-8')
**Prediction Function**
# Linear fit of raw score against the subreddit-specific top-bin probability.
m, b, r, p, std = scipy.stats.linregress(np.array(df['spec_probs']), np.array(df['score']))
print m
print b
print r**2
print p
# NOTE(review): linregress's fifth return value is the standard error of the
# slope estimate, not a standard deviation -- the name `std` is misleading.
print std
def predict(title):
    """Predict a raw score for `title` using module globals: the last
    per-subreddit model (clf, n_grams_spec) from the loop above and the
    linear fit (m, b)."""
    x = clf.predict_proba(n_grams_spec.transform([title]))[0][1]
    y = m*x + b
    return y
1961.81582836 -517.566973807 0.137196851471 0.0 30.5167923553
Testing the prediction function
import pickle
#for the website
# Persist the last-trained (classifier, vectorizer) pair so the web app can
# load them without retraining. Note this is whatever subreddit's model the
# training loop finished on, not a combined model.
tup = (clf, n_grams_spec)
with open('clf.pickle', 'wb') as handle:
    pickle.dump(tup, handle)
print predict("If the Big Bang happened 13.7 Billion years ago, how is the edge of the observable universe 16 Billion light years away? Did the universe expand faster than the speed of light?")
1131.34965136
sklearn classification models
Ridge classifier on the regular dataset
# Per-subreddit median-split classification again, now with RidgeClassifier
# on title 1-3-grams. (The dict name 'svm' is historical.)
subreddit_svm = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split: bottom half vs top half
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.992495309568 Test accuracy is 0.602251407129 --------------------------------- For politics subreddit: Training accuracy is 1.0 Test accuracy is 0.67044381492 --------------------------------- For nosleep subreddit: Training accuracy is 0.925742574257 Test accuracy is 0.627101879327 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.974331550802 Test accuracy is 0.544871794872 --------------------------------- For jokes subreddit: Training accuracy is 0.966698382493 Test accuracy is 0.549000951475 --------------------------------- For askhistorians subreddit: Training accuracy is 1.0 Test accuracy is 0.586734693878 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.985153764581 Test accuracy is 0.519618239661 --------------------------------- For AskReddit subreddit: Training accuracy is 0.998857142857 Test accuracy is 0.556571428571 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.98835978836 Test accuracy is 0.577777777778 --------------------------------- For askscience subreddit: Training accuracy is 0.99706744868 Test accuracy is 0.595703125 --------------------------------- For tifu subreddit: Training accuracy is 0.992813141684 Test accuracy is 0.517453798768 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.994435612083 Test accuracy is 0.560762509929 ---------------------------------
** Now we apply the ridge classifier to the newly constructed alchemy stuff**
# RidgeClassifier per subreddit, but on cleaned 'alchemy' text (presumably
# AlchemyAPI keyword/entity output stored per title -- confirm upstream)
# instead of the raw titles.
subreddit_alchemy = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    alch_titles = []
    for title in list(sortedsmalldf['title']):
        # Strip brackets/parens from this title's alchemy strings...
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        # ...keep only lowercase letters and spaces...
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        # NOTE(review): this replace looks like a no-op (space -> space);
        # presumably it was meant to collapse double spaces -- confirm
        # against the original notebook.
        titles = titles.replace(' ', ' ')
        titles = titles.split(' ')
        # ...and drop the first token before re-joining.
        alch_titles.append(" ".join(titles[1:]))
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = RidgeClassifier(tol=1e-2, solver="lsqr")
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.997185741088 Test accuracy is 0.71200750469 --------------------------------- For politics subreddit: Training accuracy is 0.998111425873 Test accuracy is 0.745986779981 --------------------------------- For nosleep subreddit: Training accuracy is 0.99504950495 Test accuracy is 0.824925816024 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.998930481283 Test accuracy is 0.653846153846 --------------------------------- For jokes subreddit: Training accuracy is 0.997145575642 Test accuracy is 0.649857278782 --------------------------------- For askhistorians subreddit: Training accuracy is 0.998979591837 Test accuracy is 0.630612244898 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.996818663839 Test accuracy is 0.688229056204 --------------------------------- For AskReddit subreddit: Training accuracy is 0.998857142857 Test accuracy is 0.717142857143 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.996825396825 Test accuracy is 0.668783068783 --------------------------------- For askscience subreddit: Training accuracy is 1.0 Test accuracy is 0.6484375 --------------------------------- For tifu subreddit: Training accuracy is 0.99794661191 Test accuracy is 0.686858316222 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.99920508744 Test accuracy is 0.626687847498 ---------------------------------
** Now same as above but with the Perceptron algorithm**
# Same per-subreddit median-split setup, with a Perceptron on title 1-3-grams.
subreddit_svm = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = Perceptron(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 1.0 Test accuracy is 0.575984990619 --------------------------------- For politics subreddit: Training accuracy is 0.996222851747 Test accuracy is 0.690273843248 --------------------------------- For nosleep subreddit: Training accuracy is 1.0 Test accuracy is 0.603363006924 --------------------------------- For pettyrevenge subreddit: Training accuracy is 1.0 Test accuracy is 0.523504273504 --------------------------------- For jokes subreddit: Training accuracy is 0.989533777355 Test accuracy is 0.562321598478 --------------------------------- For askhistorians subreddit: Training accuracy is 1.0 Test accuracy is 0.592857142857 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.998939554613 Test accuracy is 0.566277836691 --------------------------------- For AskReddit subreddit: Training accuracy is 1.0 Test accuracy is 0.581714285714 --------------------------------- For talesFromRetail subreddit: Training accuracy is 1.0 Test accuracy is 0.550264550265 --------------------------------- For askscience subreddit: Training accuracy is 1.0 Test accuracy is 0.595703125 --------------------------------- For tifu subreddit: Training accuracy is 0.998973305955 Test accuracy is 0.550308008214 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 1.0 Test accuracy is 0.550436854647 ---------------------------------
# Perceptron per subreddit on the cleaned 'alchemy' text (same cleaning
# pipeline as the Ridge alchemy cell above).
subreddit_alchemy = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    alch_titles = []
    for title in list(sortedsmalldf['title']):
        # Strip brackets/parens, keep lowercase letters + spaces, drop the
        # first token.
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        # NOTE(review): apparent no-op (space -> space); likely meant to
        # collapse double spaces -- confirm against the original notebook.
        titles = titles.replace(' ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = Perceptron(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.996247654784 Test accuracy is 0.711069418386 --------------------------------- For politics subreddit: Training accuracy is 0.996222851747 Test accuracy is 0.780925401322 --------------------------------- For nosleep subreddit: Training accuracy is 0.99504950495 Test accuracy is 0.854599406528 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.995721925134 Test accuracy is 0.662393162393 --------------------------------- For jokes subreddit: Training accuracy is 0.985727878211 Test accuracy is 0.693625118934 --------------------------------- For askhistorians subreddit: Training accuracy is 0.997959183673 Test accuracy is 0.638775510204 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.994697773065 Test accuracy is 0.677624602333 --------------------------------- For AskReddit subreddit: Training accuracy is 0.999428571429 Test accuracy is 0.717142857143 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.998941798942 Test accuracy is 0.674074074074 --------------------------------- For askscience subreddit: Training accuracy is 1.0 Test accuracy is 0.654296875 --------------------------------- For tifu subreddit: Training accuracy is 0.990759753593 Test accuracy is 0.724845995893 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.998410174881 Test accuracy is 0.636219221604 ---------------------------------
** Now the Passive Aggressive Classifier**
# Same per-subreddit setup with a PassiveAggressiveClassifier on 1-3-grams.
subreddit_svm = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = PassiveAggressiveClassifier(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.999061913696 Test accuracy is 0.613508442777 --------------------------------- For politics subreddit: Training accuracy is 1.0 Test accuracy is 0.712936732767 --------------------------------- For nosleep subreddit: Training accuracy is 0.99702970297 Test accuracy is 0.612265084075 --------------------------------- For pettyrevenge subreddit: Training accuracy is 1.0 Test accuracy is 0.530982905983 --------------------------------- For jokes subreddit: Training accuracy is 0.998097050428 Test accuracy is 0.577545195052 --------------------------------- For askhistorians subreddit: Training accuracy is 1.0 Test accuracy is 0.571428571429 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 1.0 Test accuracy is 0.522799575822 --------------------------------- For AskReddit subreddit: Training accuracy is 1.0 Test accuracy is 0.583428571429 --------------------------------- For talesFromRetail subreddit: Training accuracy is 1.0 Test accuracy is 0.57671957672 --------------------------------- For askscience subreddit: Training accuracy is 1.0 Test accuracy is 0.6005859375 --------------------------------- For tifu subreddit: Training accuracy is 0.998973305955 Test accuracy is 0.532854209446 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 1.0 Test accuracy is 0.561556791104 ---------------------------------
# PassiveAggressiveClassifier per subreddit on the cleaned 'alchemy' text.
subreddit_alchemy = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    alch_titles = []
    for title in list(sortedsmalldf['title']):
        # Strip brackets/parens, keep lowercase letters + spaces, drop the
        # first token (same cleaning as the other alchemy cells).
        titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
        titles = [lst.replace(')', '') for lst in titles]
        titles = [lst.replace('[', '') for lst in titles]
        titles = [lst.replace(']', '') for lst in titles]
        titles = "".join(titles)
        titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
        # NOTE(review): apparent no-op (space -> space); likely meant to
        # collapse double spaces -- confirm against the original notebook.
        titles = titles.replace(' ', ' ')
        titles = titles.split(' ')
        alch_titles.append(" ".join(titles[1:]))
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(alch_titles))
    X = n_grams.transform(alch_titles)
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = PassiveAggressiveClassifier(n_iter=50)
    clf.fit(x_train, y_train)
    subreddit_alchemy[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.996247654784 Test accuracy is 0.757973733583 --------------------------------- For politics subreddit: Training accuracy is 0.99716713881 Test accuracy is 0.807365439093 --------------------------------- For nosleep subreddit: Training accuracy is 0.99504950495 Test accuracy is 0.848664688427 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.99679144385 Test accuracy is 0.692307692308 --------------------------------- For jokes subreddit: Training accuracy is 0.996194100856 Test accuracy is 0.742150333016 --------------------------------- For askhistorians subreddit: Training accuracy is 0.997959183673 Test accuracy is 0.65612244898 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.998939554613 Test accuracy is 0.673382820785 --------------------------------- For AskReddit subreddit: Training accuracy is 0.999428571429 Test accuracy is 0.724 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.998941798942 Test accuracy is 0.67619047619 --------------------------------- For askscience subreddit: Training accuracy is 1.0 Test accuracy is 0.6728515625 --------------------------------- For tifu subreddit: Training accuracy is 0.994866529774 Test accuracy is 0.715605749487 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.99920508744 Test accuracy is 0.669579030977 ---------------------------------
** And finally.... K-neighbors... **
# Final variant: per-subreddit median-split with 10-nearest-neighbors on
# title 1-3-grams.
subreddit_svm = {}
for subreddit in subs:
    smalldf = df[df['subreddit'] == subreddit]
    sortedsmalldf = smalldf.sort('score')
    sortedsmalldf['category'] = smalldf['score']
    size = len(smalldf)
    num = 2  # median split
    blocksize = size/num  # Python 2 integer division
    blocks = [blocksize * i for i in range(num)]
    blocks.append(size)
    for i in range(num):
        sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
    n_grams = CountVectorizer(ngram_range=[1, 3])
    n_grams.fit(list(sortedsmalldf['title']))
    X = n_grams.transform(list(sortedsmalldf['title']))
    Y = np.array(sortedsmalldf['category'])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
    clf = KNeighborsClassifier(n_neighbors=10)
    clf.fit(x_train, y_train)
    subreddit_svm[subreddit] = [clf, n_grams]
    train_acc = clf.score(x_train, y_train)
    test_acc = clf.score(x_test, y_test)
    print "For", subreddit, "subreddit:"
    print "Training accuracy is", train_acc
    print "Test accuracy is", test_acc
    print "---------------------------------"
For atheism subreddit: Training accuracy is 0.68574108818 Test accuracy is 0.540337711069 --------------------------------- For politics subreddit: Training accuracy is 0.478753541076 Test accuracy is 0.522190745987 --------------------------------- For nosleep subreddit: Training accuracy is 0.714851485149 Test accuracy is 0.495548961424 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.626737967914 Test accuracy is 0.520299145299 --------------------------------- For jokes subreddit: Training accuracy is 0.603235014272 Test accuracy is 0.50808753568 --------------------------------- For askhistorians subreddit: Training accuracy is 0.521428571429 Test accuracy is 0.488775510204 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.71474019088 Test accuracy is 0.520678685048 --------------------------------- For AskReddit subreddit: Training accuracy is 0.553142857143 Test accuracy is 0.504571428571 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.627513227513 Test accuracy is 0.51746031746 --------------------------------- For askscience subreddit: Training accuracy is 0.492668621701 Test accuracy is 0.5107421875 --------------------------------- For tifu subreddit: Training accuracy is 0.519507186858 Test accuracy is 0.525667351129 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.523847376789 Test accuracy is 0.513105639396 ---------------------------------
subreddit_alchemy = {}
for subreddit in subs:
smalldf = df[df['subreddit'] == subreddit]
sortedsmalldf = smalldf.sort('score')
sortedsmalldf['category'] = smalldf['score']
size = len(smalldf)
num = 2
blocksize = size/num
blocks = [blocksize * i for i in range(num)]
blocks.append(size)
for i in range(num):
sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
alch_titles = []
for title in list(sortedsmalldf['title']):
titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
titles = [lst.replace(')', '') for lst in titles]
titles = [lst.replace('[', '') for lst in titles]
titles = [lst.replace(']', '') for lst in titles]
titles = "".join(titles)
titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
titles = titles.replace(' ', ' ')
titles = titles.split(' ')
alch_titles.append(" ".join(titles[1:]))
n_grams = CountVectorizer(ngram_range=[1, 3])
n_grams.fit(list(alch_titles))
X = n_grams.transform(alch_titles)
Y = np.array(sortedsmalldf['category'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(x_train, y_train)
subreddit_alchemy[subreddit] = [clf, n_grams]
train_acc = clf.score(x_train, y_train)
test_acc = clf.score(x_test, y_test)
print "For", subreddit, "subreddit:"
print "Training accuracy is", train_acc
print "Test accuracy is", test_acc
print "---------------------------------"
For atheism subreddit: Training accuracy is 0.506566604128 Test accuracy is 0.496247654784 --------------------------------- For politics subreddit: Training accuracy is 0.503305004721 Test accuracy is 0.496694995279 --------------------------------- For nosleep subreddit: Training accuracy is 0.619801980198 Test accuracy is 0.525222551929 --------------------------------- For pettyrevenge subreddit: Training accuracy is 0.500534759358 Test accuracy is 0.498931623932 --------------------------------- For jokes subreddit: Training accuracy is 0.492863939106 Test accuracy is 0.507136060894 --------------------------------- For askhistorians subreddit: Training accuracy is 0.504081632653 Test accuracy is 0.495918367347 --------------------------------- For TalesFromTechsupport subreddit: Training accuracy is 0.496288441145 Test accuracy is 0.503711558855 --------------------------------- For AskReddit subreddit: Training accuracy is 0.500571428571 Test accuracy is 0.499428571429 --------------------------------- For talesFromRetail subreddit: Training accuracy is 0.502645502646 Test accuracy is 0.497354497354 --------------------------------- For askscience subreddit: Training accuracy is 0.502443792766 Test accuracy is 0.498046875 --------------------------------- For tifu subreddit: Training accuracy is 0.496919917864 Test accuracy is 0.504106776181 --------------------------------- For explainlikeimfive subreddit: Training accuracy is 0.515103338633 Test accuracy is 0.486894360604 ---------------------------------
**Now putting it all together so we don't have to keep re-running the same code all the time...**
for i, d in enumerate(['Not alchemy', 'Alchemy']):
for clf, name in (
(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
(Perceptron(n_iter=50), "Perceptron"),
(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
(KNeighborsClassifier(n_neighbors=10), "kNN")):
subreddit_svm = {}
for subreddit in subs:
smalldf = df[df['subreddit'] == subreddit]
sortedsmalldf = smalldf.sort('score')
sortedsmalldf['category'] = smalldf['score']
size = len(smalldf)
num = 2
blocksize = size/num
blocks = [blocksize * i for i in range(num)]
blocks.append(size)
for i in range(num):
sortedsmalldf['category'][blocks[i]:blocks[i+1]] = i+1
titles = list(sortedsmalldf['title'])
bins = list(sortedsmalldf['category'])
if (i==1):
alch_titles = []
for title in list(sortedsmalldf['title']):
titles = [lst.replace('(', '') for lst in sortedsmalldf[sortedsmalldf['title'] == title]['alchemy']]
titles = [lst.replace(')', '') for lst in titles]
titles = [lst.replace('[', '') for lst in titles]
titles = [lst.replace(']', '') for lst in titles]
titles = "".join(titles)
titles = "".join(ch for ch in titles if ch in 'qwertyuiopasdfghjklzxcvbnm ')
titles = titles.replace(' ', ' ')
titles = titles.split(' ')[1:]
alch_titles.append(titles)
alch_bins = []
categories = np.array(sortedsmalldf['category'])
for i, lst in enumerate(alch_titles):
b = categories[i]
for j in range(len(lst)):
alch_bins.append(b)
alch_titles = [word for words in alch_titles for word in words]
titles = alch_titles
bins = alch_bins
n_grams = CountVectorizer(ngram_range=[1, 3])
n_grams.fit(titles)
X = n_grams.transform(titles)
Y = np.array(bins)
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.5)
clf2 = clf
clf2.fit(x_train, y_train)
subreddit_svm[subreddit] = [clf, n_grams]
train_acc = clf.score(x_train, y_train)
test_acc = clf.score(x_test, y_test)
print "For", d, "and", subreddit, "subreddit and", name, "classifier:"
print "Training accuracy is", train_acc
print "Test accuracy is", test_acc
print "---------------------------------"
For Not alchemy and atheism subreddit and Ridge Classifier classifier: Training accuracy is 0.781820835532 Test accuracy is 0.709438749733 --------------------------------- For Not alchemy and politics subreddit and Ridge Classifier classifier: Training accuracy is 0.829282780411 Test accuracy is 0.811248025276 --------------------------------- For Not alchemy and nosleep subreddit and Ridge Classifier classifier: Training accuracy is 0.734472447672 Test accuracy is 0.65041733945 --------------------------------- For Not alchemy and pettyrevenge subreddit and Ridge Classifier classifier: Training accuracy is 0.714860702307 Test accuracy is 0.613887139611 --------------------------------- For Not alchemy and jokes subreddit and Ridge Classifier classifier: Training accuracy is 0.859851018196 Test accuracy is 0.801388344365 --------------------------------- For Not alchemy and askhistorians subreddit and Ridge Classifier classifier: Training accuracy is 0.761180962572 Test accuracy is 0.667481365289 --------------------------------- For Not alchemy and TalesFromTechsupport subreddit and Ridge Classifier classifier: Training accuracy is 0.687174322372 Test accuracy is 0.578304943732 --------------------------------- For Not alchemy and AskReddit subreddit and Ridge Classifier classifier: Training accuracy is 0.773876452466 Test accuracy is 0.705906864587 --------------------------------- For Not alchemy and talesFromRetail subreddit and Ridge Classifier classifier: Training accuracy is 0.670503987414 Test accuracy is 0.565479677972 --------------------------------- For Not alchemy and askscience subreddit and Ridge Classifier classifier: Training accuracy is 0.751232014912 Test accuracy is 0.668620026796 --------------------------------- For Not alchemy and tifu subreddit and Ridge Classifier classifier: Training accuracy is 0.735181174345 Test accuracy is 0.648525112942 --------------------------------- For Not alchemy and explainlikeimfive subreddit and Ridge 
Classifier classifier: Training accuracy is 0.772828252276 Test accuracy is 0.701780369007 --------------------------------- For Not alchemy and atheism subreddit and Perceptron classifier: Training accuracy is 0.73519406303 Test accuracy is 0.671659050848 --------------------------------- For Not alchemy and politics subreddit and Perceptron classifier: Training accuracy is 0.813952606635 Test accuracy is 0.773219589258 --------------------------------- For Not alchemy and nosleep subreddit and Perceptron classifier: Training accuracy is 0.669542930372 Test accuracy is 0.607008910646 --------------------------------- For Not alchemy and pettyrevenge subreddit and Perceptron classifier: Training accuracy is 0.52497775344 Test accuracy is 0.449702926923 --------------------------------- For Not alchemy and jokes subreddit and Perceptron classifier: Training accuracy is 0.822167385439 Test accuracy is 0.757547104541 --------------------------------- For Not alchemy and askhistorians subreddit and Perceptron classifier: Training accuracy is 0.528490478854 Test accuracy is 0.437286445604 --------------------------------- For Not alchemy and TalesFromTechsupport subreddit and Perceptron classifier: Training accuracy is 0.63752900994 Test accuracy is 0.569218811578 --------------------------------- For Not alchemy and AskReddit subreddit and Perceptron classifier: Training accuracy is 0.723346699547 Test accuracy is 0.664537636025 --------------------------------- For Not alchemy and talesFromRetail subreddit and Perceptron classifier: Training accuracy is 0.616372809635 Test accuracy is 0.551993142808 --------------------------------- For Not alchemy and askscience subreddit and Perceptron classifier: Training accuracy is 0.707566843362 Test accuracy is 0.642290440962 --------------------------------- For Not alchemy and tifu subreddit and Perceptron classifier: Training accuracy is 0.693020103908 Test accuracy is 0.622800956683 --------------------------------- For Not 
alchemy and explainlikeimfive subreddit and Perceptron classifier: Training accuracy is 0.528207428113 Test accuracy is 0.445582452883 --------------------------------- For Not alchemy and atheism subreddit and Passive-Aggressive classifier: Training accuracy is 0.744467772177 Test accuracy is 0.666550651742 --------------------------------- For Not alchemy and politics subreddit and Passive-Aggressive classifier: Training accuracy is 0.828044233807 Test accuracy is 0.781838862559 --------------------------------- For Not alchemy and nosleep subreddit and Passive-Aggressive classifier: Training accuracy is 0.703494233234 Test accuracy is 0.626171497894 --------------------------------- For Not alchemy and pettyrevenge subreddit and Passive-Aggressive classifier: Training accuracy is 0.672160996646 Test accuracy is 0.574035539249 --------------------------------- For Not alchemy and jokes subreddit and Passive-Aggressive classifier: Training accuracy is 0.820875902309 Test accuracy is 0.745370263601 --------------------------------- For Not alchemy and askhistorians subreddit and Passive-Aggressive classifier: Training accuracy is 0.71066903378 Test accuracy is 0.61296951568 --------------------------------- For Not alchemy and TalesFromTechsupport subreddit and Passive-Aggressive classifier: Training accuracy is 0.641864080221 Test accuracy is 0.546142225336 --------------------------------- For Not alchemy and AskReddit subreddit and Passive-Aggressive classifier: Training accuracy is 0.748901898367 Test accuracy is 0.686820731066 --------------------------------- For Not alchemy and talesFromRetail subreddit and Passive-Aggressive classifier: Training accuracy is 0.638821678511 Test accuracy is 0.553305991363 --------------------------------- For Not alchemy and askscience subreddit and Passive-Aggressive classifier: Training accuracy is 0.725822799557 Test accuracy is 0.650620376303 --------------------------------- For Not alchemy and tifu subreddit and 
Passive-Aggressive classifier: Training accuracy is 0.712233752774 Test accuracy is 0.630826468243 --------------------------------- For
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <ipython-input-42-5656c2102915> in <module>() 51 clf2.fit(x_train, y_train) 52 subreddit_svm[subreddit] = [clf, n_grams] ---> 53 train_acc = clf.score(x_train, y_train) 54 test_acc = clf.score(x_test, y_test) 55 print "For", d, "and", subreddit, "subreddit and", name, "classifier:" C:\Anaconda\lib\site-packages\sklearn\base.pyc in score(self, X, y) 292 """ 293 from .metrics import accuracy_score --> 294 return accuracy_score(y, self.predict(X)) 295 296 C:\Anaconda\lib\site-packages\sklearn\neighbors\classification.pyc in predict(self, X) 144 X = atleast2d_or_csr(X) 145 --> 146 neigh_dist, neigh_ind = self.kneighbors(X) 147 148 classes_ = self.classes_ C:\Anaconda\lib\site-packages\sklearn\neighbors\base.pyc in kneighbors(self, X, n_neighbors, return_distance) 292 if self.effective_metric_ == 'euclidean': 293 dist = pairwise_distances(X, self._fit_X, 'euclidean', --> 294 squared=True) 295 else: 296 dist = pairwise_distances(X, self._fit_X, C:\Anaconda\lib\site-packages\sklearn\metrics\pairwise.pyc in pairwise_distances(X, Y, metric, n_jobs, **kwds) 655 func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 656 if n_jobs == 1: --> 657 return func(X, Y, **kwds) 658 else: 659 return _parallel_pairwise(X, Y, func, n_jobs, **kwds) C:\Anaconda\lib\site-packages\sklearn\metrics\pairwise.pyc in euclidean_distances(X, Y, Y_norm_squared, squared) 174 "Incompatible dimensions for Y and Y_norm_squared") 175 --> 176 distances = safe_sparse_dot(X, Y.T, dense_output=True) 177 distances *= -2 178 distances += XX C:\Anaconda\lib\site-packages\sklearn\utils\extmath.pyc in safe_sparse_dot(a, b, dense_output) 78 ret = a * b 79 if dense_output and hasattr(ret, "toarray"): ---> 80 ret = ret.toarray() 81 return ret 82 else: C:\Anaconda\lib\site-packages\scipy\sparse\compressed.pyc in toarray(self, order, out) 559 def toarray(self, order=None, out=None): 560 """See the docstring 
for `spmatrix.toarray`.""" --> 561 return self.tocoo(copy=False).toarray(order=order, out=out) 562 563 ############################################################## C:\Anaconda\lib\site-packages\scipy\sparse\coo.pyc in toarray(self, order, out) 236 def toarray(self, order=None, out=None): 237 """See the docstring for `spmatrix.toarray`.""" --> 238 B = self._process_toarray_args(order, out) 239 fortran = int(B.flags.f_contiguous) 240 if not fortran and not B.flags.c_contiguous: C:\Anaconda\lib\site-packages\scipy\sparse\base.pyc in _process_toarray_args(self, order, out) 633 return out 634 else: --> 635 return np.zeros(self.shape, dtype=self.dtype, order=order) 636 637 MemoryError:
Not alchemy and explainlikeimfive subreddit and Passive-Aggressive classifier: Training accuracy is 0.69543904296 Test accuracy is 0.618268859709 ---------------------------------