# Imports
import os
import lime
import sklearn
import sklearn.ensemble
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
print pd.__version__
print sklearn.__version__
print np.__version__
0.20.3 0.19.1 1.14.2
# lists out all the classes possible in the newsgroup dataset
newsgroups_train = fetch_20newsgroups(subset='train')
print newsgroups_train.target_names
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
# we will be exploring "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"
categories = ["comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
newsgroups_train.keys()
['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']
# Sample
print newsgroups_test.get('data')[0]
print "++++++++++++++++++"
print newsgroups_test.get('target_names')[0]
From: blakey@ug.cs.dal.ca (Jason "Fish" Blakey) Subject: Newlife 25 and hard drives Nntp-Posting-Host: ug.cs.dal.ca Organization: Math, Stats & CS, Dalhousie University, Halifax, NS, Canada Lines: 12 Giday netters! Just got a used Newlife 25 accelerator, with FPU, and i was wondering about a few points. -Anyone know the current driver version for it?? -Can it handle the 16-bit grayscale card, if i get the video option -Why would it be hating my hard drive?(can't use the accelerator and hard drive at the same time). Do i need a new driver on my drive? What make? -Thanks, Jason -- ............................................................................ blakey@ug.cs.dal.ca -> He's big! He's purple! He's your best friend! ++++++++++++++++++ comp.sys.ibm.pc.hardware
# we will be vectorizing the text using TF-IDF vectorization technique
# we will discuss this TF-IDF in future as part of this challenge itself;
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)
train_vectors.shape, test_vectors.shape
((1168, 21486), (777, 21486))
# model 1
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train_vectors, newsgroups_train.target)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# do prediction
pred = rf.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)
0.8854568854568855
# model 2
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_vectors, newsgroups_train.target)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# do prediction
pred = nb.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)
0.9124839124839125
# model 3
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_vectors, newsgroups_train.target)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# do prediction
pred = lr.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)
0.8867438867438867
from lime import lime_text
from sklearn.pipeline import make_pipeline
crf = make_pipeline(vectorizer, rf)
cnb = make_pipeline(vectorizer, nb)
clr = make_pipeline(vectorizer, lr)
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=['ibm', 'mac'])
# picking on random example from the test dataset; and seeing the top 6 features learnt by each different
# classifier for predicting the actual class of the example data point
idx = np.random.randint(1, len(newsgroups_test.data))
exp_crf = explainer.explain_instance(newsgroups_test.data[idx], crf.predict_proba, num_features=6)
exp_clr = explainer.explain_instance(newsgroups_test.data[idx], clr.predict_proba, num_features=6)
exp_cnb = explainer.explain_instance(newsgroups_test.data[idx], cnb.predict_proba, num_features=6)
exp_crf.show_in_notebook(text=True)
exp_cnb.show_in_notebook(text=True)
exp_clr.show_in_notebook(text=True)