Classifying News Headlines and Explaining the Result

Reference: the "Classifying News Headlines and Explaining the Result" notebook on Kaggle

In [1]:
import pandas as pd
# using Kaggle API https://github.com/Kaggle/kaggle-api
DATA_FILE = "~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv"
news = pd.read_csv(DATA_FILE).sample(frac=0.1)
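
Note that .sample(frac=0.1) draws a random 10% of the full ~422k-row dataset, so the exact rows (and the counts and scores below) vary from run to run. A minimal sketch for a reproducible subsample (the seed value is arbitrary):

news = pd.read_csv(DATA_FILE).sample(frac=0.1, random_state=42)
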
In [2]:
len(news)
Out[2]:
42242
In [3]:
news.head(3)
Out[3]:
|        | ID     | TITLE                                             | URL                                                | PUBLISHER           | CATEGORY | STORY                         | HOSTNAME                  | TIMESTAMP     |
| 13529  | 13530  | Robotic fish designed to perform escape maneuv... | http://www.ecnmag.com/news/2014/03/robotic-fis...  | ECNmag.com          | t        | dSmJK-WR4xv2inMKmnmxaRfd6cf1M | www.ecnmag.com            | 1395059947658 |
| 254251 | 254697 | Faces & names: 'X-Men' climbs to $302 million ... | http://www.duluthnewstribune.com/content/faces...  | Duluth News Tribune | e        | d5poaO2w8Yffx6MDgPRQSF5POXCXM | www.duluthnewstribune.com | 1401174011596 |
| 27785  | 27786  | Which 'Divergent' Starlet Skipped Underwear fo... | http://www.cambio.com/2014/03/19/which-diverge...  | Cambio              | e        | d55mX4D4wN3d5vMMYF9GgviF21QlM | www.cambio.com            | 1395333837043 |
In [4]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# features are the raw headline strings; labels are the category letters
# encoded as integers
X = news['TITLE']
y = encoder.fit_transform(news['CATEGORY'])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
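
With no test_size argument, train_test_split holds out 25% of the rows by default, which is where the 31,681 / 10,561 split below comes from. For a repeatable split that also preserves the class balance, the call accepts random_state and stratify (a sketch; the seed is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)
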
In [5]:
# You can also encode the categories as a one-hot matrix.
# (Recent scikit-learn versions of OneHotEncoder accept the string labels directly.)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
OneHotEncoder().fit_transform(
    LabelEncoder().fit_transform(news['CATEGORY']).reshape(-1,1)
).toarray()
Out[5]:
array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])
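
The column order follows encoder.classes_, which LabelEncoder sorts alphabetically. For this dataset the categories are b (business), e (entertainment), m (health) and t (science and technology). A quick sketch for mapping back and forth:

list(encoder.classes_)          # ['b', 'e', 'm', 't']
encoder.inverse_transform([1])  # integer label back to its category, here 'e'
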
In [6]:
len(X_train)
Out[6]:
31681
In [7]:
len(X_test)
Out[7]:
10561
In [8]:
type(X_train)
Out[8]:
pandas.core.series.Series
In [9]:
X_train.head(3)
Out[9]:
155319                       Facebook: Mobile Powers Growth
230004    Stocks rise ahead of Fed minutes; Dow jumps 10...
91687     GM Expected to Announce Major Investment in 20...
Name: TITLE, dtype: object
In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# keep only tokens that appear in at least three training headlines
vectorizer = CountVectorizer(min_df=3)

# fit the vocabulary on the training split only, then reuse it on the test split
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
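
The min_df=3 cutoff is why the vocabulary stops at the 9,886 terms seen in the next cell. A sketch for inspecting the fitted vocabulary (the query term is just an example):

len(vectorizer.vocabulary_)           # 9886 retained terms
vectorizer.vocabulary_.get('stocks')  # column index of a term, or None if dropped
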
In [11]:
train_vectors
Out[11]:
<31681x9886 sparse matrix of type '<class 'numpy.int64'>'
	with 267205 stored elements in Compressed Sparse Row format>
In [12]:
X_train.iloc[1]
Out[12]:
'Stocks rise ahead of Fed minutes; Dow jumps 100 points'
In [13]:
train_vectors[1]
Out[13]:
<1x9886 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>
In [14]:
type(train_vectors)
Out[14]:
scipy.sparse.csr.csr_matrix
In [15]:
# count vector for a single headline (see the note below)
train_vectors[1].toarray()
Out[15]:
array([[0, 0, 0, ..., 0, 0, 0]])
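
Strictly speaking each row is a bag-of-words count vector rather than a one-hot vector: a column holds the number of times a vocabulary term occurs in the headline. A sketch for reading the nonzero entries back as tokens (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names):

row = train_vectors[1]
terms = vectorizer.get_feature_names_out()
for idx, count in zip(row.indices, row.data):
    print(terms[idx], count)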

Decision Tree

In [16]:
from sklearn.metrics import accuracy_score
%load_ext autotime
In [17]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt.fit(train_vectors, y_train)
Out[17]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
time: 5.64 s
In [18]:
pred = dt.predict(test_vectors)
accuracy_score(y_test, pred)
Out[18]:
0.8092983618975476
time: 13.9 ms
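
With the default parameters a single tree typically fits the training headlines almost perfectly, yet lands at ~81% on held-out data, a hint that it overfits the sparse counts. The size of the fitted tree makes that concrete (a sketch; tree_ is scikit-learn's fitted-tree accessor):

dt.tree_.node_count  # number of nodes in the fitted tree
dt.tree_.max_depth   # depth actually reached with max_depth=None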

Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=20)
rf.fit(train_vectors, y_train)
Out[19]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
time: 8.03 s
In [20]:
pred = rf.predict(test_vectors)
accuracy_score(y_test, pred)
Out[20]:
0.8496354511883344
time: 247 ms
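
Averaging 20 trees lifts accuracy to ~85%. Because the features are vocabulary counts, the per-term importances are directly interpretable (a sketch; the top-10 cutoff is arbitrary, and get_feature_names_out needs scikit-learn >= 1.0):

import numpy as np
terms = vectorizer.get_feature_names_out()
top = np.argsort(rf.feature_importances_)[::-1][:10]
print([terms[i] for i in top])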

Multinomial Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_vectors, y_train)
Out[21]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
time: 20.6 ms
In [22]:
pred = nb.predict(test_vectors)
accuracy_score(y_test, pred)
Out[22]:
0.9051226209639238
time: 7.33 ms
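
Multinomial naive Bayes suits term counts well and is by far the fastest of the three models here. Accuracy alone hides per-class behavior; a per-class breakdown costs one extra call (a sketch using scikit-learn's classification_report):

from sklearn.metrics import classification_report
print(classification_report(y_test, pred, target_names=encoder.classes_))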

Explaining the Result

In [23]:
%unload_ext autotime
In [24]:
from sklearn.pipeline import make_pipeline
# chaining the vectorizer and the classifier gives a model whose
# predict_proba accepts raw text, which is what LIME needs
c = make_pipeline(vectorizer, nb)

from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=list(encoder.classes_))
In [25]:
# .sample(1) picks one headline from the test set at random
example = X_test.sample(1).iloc[0]
example
Out[25]:
'Henry Cavill Is Still Super Handsome (But Way More Serious) in the First Official  ...'
In [26]:
c.predict_proba([example])
Out[26]:
array([[8.54676048e-04, 9.00036923e-01, 4.36873332e-02, 5.54210682e-02]])
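
The columns follow encoder.classes_, so the ~0.90 here is the probability of e (entertainment). A quick sketch to label the probabilities:

dict(zip(encoder.classes_, c.predict_proba([example])[0]))
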
In [27]:
%%capture
exp = explainer.explain_instance(example, c.predict_proba, top_labels=2)
In [28]:
exp.show_in_notebook()
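
show_in_notebook renders the LIME explanation inline as HTML. The same information is available programmatically as (token, weight) pairs, one list per explained label (a sketch; top_labels=2 above means two labels are available):

for label in exp.available_labels():
    print(encoder.classes_[label], exp.as_list(label=label))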