import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
X, y = make_moons(noise=0.2, n_samples=200)
X.shape
(200, 2)
y.shape
(200,)
print(y)
[0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0]
plt.scatter(X[:, 0], X[:, 1], c=y, s=100)
plt.figsize(14,10)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
X_train.shape
(120, 2)
X_test.shape
(80, 2)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
print(knn)
KNeighborsClassifier(algorithm=auto, leaf_size=30, n_neighbors=5, p=2, warn_on_equidistant=True, weights=uniform)
knn.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, n_neighbors=5, p=2, warn_on_equidistant=True, weights='uniform')
y_predict = knn.predict(X_test)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
#plt.scatter(X_test[:, 0], X_test[:, 1], c='w', alpha=0.3, s=100)
plt.figsize(14,10)
y_predict
array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0])
y_test
array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0])
knn.score(X_test, y_test)
0.9375
from sklearn.linear_model import LogisticRegression
svm = LogisticRegression()
svm.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
y_predict = svm.predict(X_test)
plt.figsize(14,10)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100)
w = svm.coef_.ravel()
a = -w[0] / w[1]
xx = np.linspace(-1, 2)
yy = a * xx - svm.intercept_ / w[1]
plt.plot(xx, yy)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-105-3427b9cdffe3> in <module>() 1 plt.figsize(14,10) ----> 2 plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100) 3 plt.scatter(X_test[:, 0], X_test[:, 1], c=y_predict, alpha=0.3, s=100) 4 w = svm.coef_.ravel() 5 a = -w[0] / w[1] TypeError: 'coo_matrix' object is not subscriptable
import pandas as pd
train_data = pd.read_csv("../kaggle_insults/train.csv")
test_data = pd.read_csv("../kaggle_insults/test_with_solutions.csv")
y_train = np.array(train_data.Insult)
comments_train = np.array(train_data.Comment)
print(comments_train.shape)
print(y_train.shape)
(3947,) (3947,)
comments_train[8], y_train[8]
('"Either you are fake or extremely stupid...maybe both..."', 1)
comments_train[10], y_train[5]
('"@jdstorm dont wish him injury but it happened on its OWN and i DOUBT he\'s injured, he looked embarrassed to me"', 0)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=1)
cv.fit(comments_train)
X_train = cv.transform(comments_train)
svm.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)
X_test = cv.transform(comments_test)
svm.score(X_test, y_test)
0.84548545523233853
index = 8
comments_test[index], y_test[index], svm.predict(X_test.tocsr()[index])[0]
('"To engage in an intelligent debate with you is like debating to a retarded person. It\'s useless. It looks like you\'re bent on disregarding the efforts of the government."', 1, 1)
np.where(y_train==1)[0][:10]
array([ 0, 7, 8, 9, 15, 16, 18, 19, 34, 37])
x = X_train.tocsr()[8].toarray()
nonzero = np.where(x.ravel())[0]
np.array(cv.get_feature_names())[nonzero]
array([u'are', u'both', u'either', u'extremely', u'fake', u'maybe', u'or', u'stupid', u'you'], dtype='<U95')
nonzero
array([ 983, 1887, 4396, 4834, 4892, 8287, 10070, 13525, 16397])
X_test.shape
(2647, 16469)
feats = cv.get_feature_names()
inds = [ 352, 983, 5000, 10004, 13525, 16397, 16440, 16468]
np.array(feats)[inds]
array([u'aaaah', u'are', u'feathers', u'olympic', u'stupid', u'you', u'zealot', u'zuckerberg'], dtype='<U95')
x.ravel()[inds]
array([0, 1, 0, 0, 1, 1, 0, 0], dtype=int64)
coef_ = svm.coef_
inds = np.argsort(coef_.ravel())
important = np.hstack([inds[:10], inds[-20:]])
feature_names = np.array(cv.get_feature_names())
f_imp = feature_names[important]
coef = coef_.ravel()[important]
inds = np.argsort(coef)
f_imp = f_imp[inds]
coef = coef[inds]
plt.figsize(10, 3)
plt.bar(np.arange(len(coef)), np.abs(coef), width=.6, color=np.array(["green", "red"])[(coef>0).astype(np.int)])
ax = plt.gca()
ax.set_xticks(np.arange(len(coef)) + .4)
labels = ax.set_xticklabels(f_imp, rotation=45, rotation_mode="anchor", va="baseline", ha="right")
#for label in labels:
# label.set_rotation(45)
plt.savefig("presentation/logreg-pics/bow_coef.pdf", bbox_inches="tight")
plt.show()
import matplotlib.ticker
matplotlib.ticker.TickHelper