In [1]:

import matplotlib.pyplot as plt
import numpy as np

In [2]:

from sklearn.datasets import make_moons
X, y = make_moons(noise=0.2, n_samples=200)

In [30]:

X.shape

Out[30]:

(200, 2)

In [31]:

y.shape

Out[31]:

(200,)

In [32]:

print(y)

[0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0
 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0
 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0
 1 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1
 1 0 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0
 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0]

In [33]:

plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
plt.figsize(14,10)

In [34]:

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

In [35]:

X_train.shape

Out[35]:

(120, 2)

In [36]:

X_test.shape

Out[36]:

(80, 2)

In [50]:

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
print(knn)

KNeighborsClassifier(algorithm=auto, leaf_size=30, n_neighbors=5, p=2,
           warn_on_equidistant=True, weights=uniform)

In [54]:

knn.fit(X_train, y_train)

Out[54]:

KNeighborsClassifier(algorithm='auto', leaf_size=30, n_neighbors=5, p=2,
           warn_on_equidistant=True, weights='uniform')

In [55]:

y_predict = knn.predict(X_test)

In [56]:

plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
#plt.scatter(X_test[:, 0], X_test[:, 1], c='w', alpha=0.3, s=100)
plt.figsize(14,10)

In [57]:

y_predict

Out[57]:

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0])

In [58]:

y_test

Out[58]:

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0])

In [59]:

knn.score(X_test, y_test)

Out[59]:

0.9375

In [102]:

from sklearn.linear_model import LogisticRegression
svm = LogisticRegression()

In [103]:

svm.fit(X_train, y_train)

Out[103]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [104]:

y_predict = svm.predict(X_test)

In [105]:

plt.figsize(14,10)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
w = svm.coef_.ravel()
a = -w[0] / w[1]
xx = np.linspace(-1, 2)
yy = a * xx - svm.intercept_ / w[1]
plt.plot(xx, yy)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-105-3427b9cdffe3> in <module>()
      1 plt.figsize(14,10)
----> 2 plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
      3 plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
      4 w = svm.coef_.ravel()
      5 a = -w[0] / w[1]

TypeError: 'coo_matrix' object is not subscriptable

In [106]:

import pandas as pd
train_data = pd.read_csv("../kaggle_insults/train.csv")
test_data = pd.read_csv("../kaggle_insults/test_with_solutions.csv")

In [107]:

y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)
print(y_train.shape)

(3947,)
(3947,)

In [108]:

comments_train[8], y_train[8]

Out[108]:

('"Either you are fake or extremely stupid...maybe both..."', 1)

In [109]:

comments_train[10], y_train[5]

Out[109]:

('"@jdstorm dont wish him injury but it happened on its OWN and i DOUBT he\'s injured, he looked embarrassed to me"',
 0)

In [110]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=1)
cv.fit(comments_train)
X_train = cv.transform(comments_train)

In [111]:

svm.fit(X_train, y_train)

Out[111]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [112]:

comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)
X_test = cv.transform(comments_test)
svm.score(X_test, y_test)

Out[112]:

0.84548545523233853

In [113]:

index = 8
comments_test[index], y_test[index], svm.predict(X_test.tocsr()[index])[0]

Out[113]:

('"To engage in an intelligent debate with you is like debating to a retarded person.  It\'s useless.  It looks like you\'re bent on disregarding the efforts of the government."',
 1,
 1)

In [114]:

np.where(y_train==1)[0][:10]

Out[114]:

array([ 0,  7,  8,  9, 15, 16, 18, 19, 34, 37])

In [115]:

x = X_train.tocsr()[8].toarray()

In [116]:

nonzero = np.where(x.ravel())[0]

In [117]:

np.array(cv.get_feature_names())[nonzero]

Out[117]:

array([u'are', u'both', u'either', u'extremely', u'fake', u'maybe', u'or',
       u'stupid', u'you'], 
      dtype='<U95')

In [118]:

nonzero

Out[118]:

array([  983,  1887,  4396,  4834,  4892,  8287, 10070, 13525, 16397])

In [119]:

X_test.shape

Out[119]:

(2647, 16469)

In [120]:

feats = cv.get_feature_names()

In [120]:

In [121]:

inds = [ 352, 983, 5000, 10004, 13525, 16397, 16440, 16468]

In [122]:

np.array(feats)[inds]

Out[122]:

array([u'aaaah', u'are', u'feathers', u'olympic', u'stupid', u'you',
       u'zealot', u'zuckerberg'], 
      dtype='<U95')

In [123]:

x.ravel()[inds]

Out[123]:

array([0, 1, 0, 0, 1, 1, 0, 0], dtype=int64)

In [138]:

coef_ = svm.coef_
inds = np.argsort(coef_.ravel())
important = np.hstack([inds[:10], inds[-20:]])
feature_names = np.array(cv.get_feature_names())
f_imp = feature_names[important]
coef = coef_.ravel()[important]
inds = np.argsort(coef)
f_imp = f_imp[inds]
coef = coef[inds]

In [181]:

plt.figsize(10, 3)
plt.bar(np.arange(len(coef)), np.abs(coef), width=.6, color=np.array(["green", "red"])[(coef>0).astype(np.int)])
ax = plt.gca()
ax.set_xticks(np.arange(len(coef)) + .4)
labels = ax.set_xticklabels(f_imp, rotation=45, rotation_mode="anchor", va="baseline", ha="right")
#for label in labels:
#    label.set_rotation(45)
plt.savefig("presentation/logreg-pics/bow_coef.pdf", bbox_inches="tight")
plt.show()

In [131]:

In [155]:

import matplotlib.ticker

In [ ]:

matplotlib.ticker.TickHelper