In [ ]:
import pandas as pd
data = pd.read_csv("train.csv")
In [ ]:
len(data)
In [ ]:
data.columns
In [ ]:
data.Insult.value_counts()
In [ ]:
 
In [ ]:
import numpy as np
y_train = np.array(data.Insult)
In [ ]:
y_train
In [ ]:
text_train = data.Comment.tolist()
In [ ]:
data_test = pd.read_csv("test_with_solutions.csv")
In [ ]:
data_test
In [ ]:
text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
# Learn the token vocabulary from the training comments only.
cv = CountVectorizer()
cv.fit(text_train)
In [ ]:
# Size of the learned vocabulary.
len(cv.vocabulary_)
In [ ]:
# Mapping from each token to its column index (note: this displays the whole dict).
cv.vocabulary_
In [ ]:
# Encode the training comments as a document-term count matrix.
X_train = cv.transform(text_train)
In [ ]:
X_train.shape
In [ ]:
# Inspect a single example: first the raw comment ...
text_train[6]
In [ ]:
# ... then the indices of the non-zero entries in its row ...
X_train[6, :].nonzero()
In [ ]:
# ... and finally the row itself.
X_train[6]
In [ ]:
# Encode the test comments with the vocabulary that was fitted on the training set.
X_test = cv.transform(text_test)
In [ ]:
from sklearn.svm import LinearSVC
svm = LinearSVC(C=.01)
In [ ]:
svm.fit(X_train, y_train)
In [ ]:
svm.score(X_train, y_train)
In [ ]:
svm.score(X_test, y_test)
In [ ]:
y_test_pred = svm.predict(X_test)
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
print(classification_report(y_test, y_test_pred))
In [ ]:
coef = svm.coef_.ravel()
positive_coefficients = np.argsort(coef)[-25:]
negative_coefficients = np.argsort(coef)[:25]
interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 5))
plt.bar(np.arange(50), coef[interesting_coefficients], color=["red" if c < 0 else "blue" for c in coef[interesting_coefficients]])
feature_names = np.array(cv.get_feature_names())
plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha="right");
In [ ]:
from sklearn.pipeline import Pipeline
In [ ]:
pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])
In [ ]:
pipeline.fit(text_train, y_train)
In [ ]:
pipeline.score(text_train, y_train)
In [ ]:
pipeline.score(text_test, y_test)
In [ ]:
from sklearn.grid_search import GridSearchCV
In [ ]:
param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
In [ ]:
grid_search.fit(text_train, y_train)
In [ ]:
grid_search.best_score_
In [ ]:
grid_search.best_params_
In [ ]:
param_grid = {'classifier__C': 10. ** np.arange(-3, 3), "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)]}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)
In [ ]:
grid_search.fit(text_train, y_train)
In [ ]:
grid_search.best_params_
In [ ]:
grid_search.best_score_

Tasks

  1. Remove the above visualization code for the coefficients and try to recreate it.
  2. Can you think of any other useful features for this task?