In [1]:

# Imports

import os
import lime
import sklearn
import sklearn.ensemble
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

print pd.__version__
print sklearn.__version__
print np.__version__

0.20.3
0.19.1
1.14.2

In [2]:

# lists out all the classes possible in the newsgroup dataset
newsgroups_train = fetch_20newsgroups(subset='train')
print newsgroups_train.target_names

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

In [3]:

# we will be exploring "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"
categories = ["comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

newsgroups_train.keys()

Out[3]:

['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']

In [4]:

# Sample
print newsgroups_test.get('data')[0]
print "++++++++++++++++++"
print newsgroups_test.get('target_names')[0]

From: blakey@ug.cs.dal.ca (Jason "Fish" Blakey)
Subject: Newlife 25 and hard drives
Nntp-Posting-Host: ug.cs.dal.ca
Organization: Math, Stats & CS, Dalhousie University, Halifax, NS, Canada
Lines: 12

  Giday netters!  Just got a used Newlife 25 accelerator, with FPU, and i 
was wondering about a few points.  
-Anyone know the current driver version for it??
-Can it handle the 16-bit grayscale card, if i get the video option
-Why would it be hating my hard drive?(can't use the accelerator and 
	hard drive at the same time). Do i need a new driver on my drive?
	What make?
-Thanks,
	Jason
-- 
 ............................................................................ 
     blakey@ug.cs.dal.ca -> He's big! He's purple! He's your best friend!

++++++++++++++++++
comp.sys.ibm.pc.hardware

In [5]:

# we will be vectorizing the text using TF-IDF vectorization technique
# we will discuss this TF-IDF in future as part of this challenge itself;
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)

In [6]:

train_vectors.shape, test_vectors.shape

Out[6]:

((1168, 21486), (777, 21486))

Random Forest¶

In [7]:

# model 1
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train_vectors, newsgroups_train.target)

Out[7]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:

# do prediction
pred = rf.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)

Out[8]:

0.8854568854568855

Naive Bayes¶

In [9]:

# model 2
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_vectors, newsgroups_train.target)

Out[9]:

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:

# do prediction
pred = nb.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)

Out[10]:

0.9124839124839125

Logistic Classifier¶

In [11]:

# model 3
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_vectors, newsgroups_train.target)

Out[11]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:

# do prediction
pred = lr.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)

Out[12]:

0.8867438867438867

Lime in Action¶

In [13]:

from lime import lime_text
from sklearn.pipeline import make_pipeline

crf = make_pipeline(vectorizer, rf)
cnb = make_pipeline(vectorizer, nb)
clr = make_pipeline(vectorizer, lr)

In [14]:

from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=['ibm', 'mac'])

In [15]:

# picking on random example from the test dataset; and seeing the top 6 features learnt by each different
# classifier for predicting the actual class of the example data point
idx = np.random.randint(1, len(newsgroups_test.data))
exp_crf = explainer.explain_instance(newsgroups_test.data[idx], crf.predict_proba, num_features=6)
exp_clr = explainer.explain_instance(newsgroups_test.data[idx], clr.predict_proba, num_features=6)
exp_cnb = explainer.explain_instance(newsgroups_test.data[idx], cnb.predict_proba, num_features=6)

In [16]:

exp_crf.show_in_notebook(text=True)

Text with highlighted words

From: woody@praxis.co.uk (Paul Woodman) Subject: HELP: FDHD failure in IIcx Organization: Praxis, Bath, U.K. Lines: 60 REPOST - DUE TO NET PROBLEMS Hi, I have problems with the FDHD on a IICX that EKG reports is ROM revision 376. I had reports that this Mac was having trouble formatting disks and by the time I got to it, the FDHD just would not respond although all other functions were fine. I checked the voltages on the DB19 external drive connector and pin 6 showed no volts with 7 and 8 giving 25volts unloaded. The power supply is an Astec and I sent it away for repair. With the power supply reinstalled, I now get the following error: With the internal FDHD connected, the Mac boots fine but on completion of boot cycle an alert displays "The disk is unreadable - do you want to eject it?". For every combination of response, the alert reappears within approx 2 seconds and this error is continuous. The FDHD does spin but there is no head seeking and the FDHD is incapable of ejecting the disk. This error sequence occurs regardless of whether there is a floppy in the drive or not. Pin 6 on the DB19 external connector shows 1.2V, pins 7 and 8 show 10.75V. With the internal FDHD disconnected, the Mac boots fine and works great (MacEKG reports no errors). The DB19 external connector now shows no voltage on pin 6 but pins 7 and 8 show 10.75V. Now, the internal FDHD may be fubar, but I have an apple 800K external drive that gives exactly the same symptoms as the internal drive when connected to either the external DB19 or the internal connector at J16 (after suitable surgery to the external drive). The voltages at J16 (with the internal FDHD disconnected) are as follows: PIN Volts PIN Volts 1 - 2 5 3 - 4 5 5 - 6 5 7 - 8 - 9 - 10 5 11 - 12 - 13 10.75 14 5 15 10.75 16 5 17 10.75 18 - 19 10.75 20 5 This machine has a SWIM which I don't think EKG tests. I have tested all diodes and pica fuses and can find no problems. Does anybody have any ideas ? Best wishes, Woody. Paul Woodman Praxis plc, \ / | the software engineering company of Touche Ross, \ / ___ ___ _| 20 Manvers Street, Bath, BA1 1PX, UK. \ /\ / / / / / / | \ / Tel +44 225 444700 xt228 \/ \/ /__/ /__/ /__| \/ Fax +44 225 465205. _________________________/ woody@praxis.co.uk

In [17]:

exp_cnb.show_in_notebook(text=True)

Text with highlighted words

From: woody@praxis.co.uk (Paul Woodman) Subject: HELP: FDHD failure in IIcx Organization: Praxis, Bath, U.K. Lines: 60 REPOST - DUE TO NET PROBLEMS Hi, I have problems with the FDHD on a IICX that EKG reports is ROM revision 376. I had reports that this Mac was having trouble formatting disks and by the time I got to it, the FDHD just would not respond although all other functions were fine. I checked the voltages on the DB19 external drive connector and pin 6 showed no volts with 7 and 8 giving 25volts unloaded. The power supply is an Astec and I sent it away for repair. With the power supply reinstalled, I now get the following error: With the internal FDHD connected, the Mac boots fine but on completion of boot cycle an alert displays "The disk is unreadable - do you want to eject it?". For every combination of response, the alert reappears within approx 2 seconds and this error is continuous. The FDHD does spin but there is no head seeking and the FDHD is incapable of ejecting the disk. This error sequence occurs regardless of whether there is a floppy in the drive or not. Pin 6 on the DB19 external connector shows 1.2V, pins 7 and 8 show 10.75V. With the internal FDHD disconnected, the Mac boots fine and works great (MacEKG reports no errors). The DB19 external connector now shows no voltage on pin 6 but pins 7 and 8 show 10.75V. Now, the internal FDHD may be fubar, but I have an apple 800K external drive that gives exactly the same symptoms as the internal drive when connected to either the external DB19 or the internal connector at J16 (after suitable surgery to the external drive). The voltages at J16 (with the internal FDHD disconnected) are as follows: PIN Volts PIN Volts 1 - 2 5 3 - 4 5 5 - 6 5 7 - 8 - 9 - 10 5 11 - 12 - 13 10.75 14 5 15 10.75 16 5 17 10.75 18 - 19 10.75 20 5 This machine has a SWIM which I don't think EKG tests. I have tested all diodes and pica fuses and can find no problems. Does anybody have any ideas ? Best wishes, Woody. Paul Woodman Praxis plc, \ / | the software engineering company of Touche Ross, \ / ___ ___ _| 20 Manvers Street, Bath, BA1 1PX, UK. \ /\ / / / / / / | \ / Tel +44 225 444700 xt228 \/ \/ /__/ /__/ /__| \/ Fax +44 225 465205. _________________________/ woody@praxis.co.uk

In [18]:

exp_clr.show_in_notebook(text=True)

Text with highlighted words

From: woody@praxis.co.uk (Paul Woodman) Subject: HELP: FDHD failure in IIcx Organization: Praxis, Bath, U.K. Lines: 60 REPOST - DUE TO NET PROBLEMS Hi, I have problems with the FDHD on a IICX that EKG reports is ROM revision 376. I had reports that this Mac was having trouble formatting disks and by the time I got to it, the FDHD just would not respond although all other functions were fine. I checked the voltages on the DB19 external drive connector and pin 6 showed no volts with 7 and 8 giving 25volts unloaded. The power supply is an Astec and I sent it away for repair. With the power supply reinstalled, I now get the following error: With the internal FDHD connected, the Mac boots fine but on completion of boot cycle an alert displays "The disk is unreadable - do you want to eject it?". For every combination of response, the alert reappears within approx 2 seconds and this error is continuous. The FDHD does spin but there is no head seeking and the FDHD is incapable of ejecting the disk. This error sequence occurs regardless of whether there is a floppy in the drive or not. Pin 6 on the DB19 external connector shows 1.2V, pins 7 and 8 show 10.75V. With the internal FDHD disconnected, the Mac boots fine and works great (MacEKG reports no errors). The DB19 external connector now shows no voltage on pin 6 but pins 7 and 8 show 10.75V. Now, the internal FDHD may be fubar, but I have an apple 800K external drive that gives exactly the same symptoms as the internal drive when connected to either the external DB19 or the internal connector at J16 (after suitable surgery to the external drive). The voltages at J16 (with the internal FDHD disconnected) are as follows: PIN Volts PIN Volts 1 - 2 5 3 - 4 5 5 - 6 5 7 - 8 - 9 - 10 5 11 - 12 - 13 10.75 14 5 15 10.75 16 5 17 10.75 18 - 19 10.75 20 5 This machine has a SWIM which I don't think EKG tests. I have tested all diodes and pica fuses and can find no problems. Does anybody have any ideas ? Best wishes, Woody. Paul Woodman Praxis plc, \ / | the software engineering company of Touche Ross, \ / ___ ___ _| 20 Manvers Street, Bath, BA1 1PX, UK. \ /\ / / / / / / | \ / Tel +44 225 444700 xt228 \/ \/ /__/ /__/ /__| \/ Fax +44 225 465205. _________________________/ woody@praxis.co.uk