Classifying News Headlines with Naive Bayes

Reference: "Classifying News Headlines and Explaining the Result" from Kaggle

In [1]:
import pandas as pd

# Sample 10% of the UCI News Aggregator dataset for speed. Pin random_state
# so the sample -- and every metric computed downstream -- is reproducible
# across kernel restarts (the original unseeded sample changed on every run).
news = pd.read_csv('uci-news-aggregator.csv').sample(frac=0.1, random_state=42)
In [2]:
len(news)
Out[2]:
42242
In [3]:
news.head()
Out[3]:
ID TITLE URL PUBLISHER CATEGORY STORY HOSTNAME TIMESTAMP
24589 24590 Wal-Mart takes aim at $2B used video game mark... http://www.wncn.com/story/25002497/wal-mart-ta... WNCN b dxrStbpfnH5zjIM-2HS6xpt7arezM www.wncn.com 1395317817464
114446 114782 You'll likely recognize this 'Fargo,' you betcha! http://www.usatoday.com/story/life/tv/2014/04/... USA TODAY e dHALsQo-hgLH_0MDzDRorxcN9BVnM www.usatoday.com 1397520248385
357516 357976 Sentence for sex abuser Rolf Harris 'too lenient' http://www.scotsman.com/news/sentence-for-sex-... Scotsman \(blog\) e dDGfw8WpQ5GpyLMGzdY_Xi99sjNPM www.scotsman.com 1404528720056
241734 242180 Google (GOOGL) Developing Tablet with Advanced... http://wallstreetpit.com/104126-google-googl-d... Wall Street Pit t duaEdk4hebuJCDM4aiSvC1Nv3ThrM wallstreetpit.com 1400868306109
206956 207392 FDA : Advanced prosthetic arm is approved for ... http://canadajournal.net/health/fda-advanced-p... Canada News m dz-jKxOMO7HMK0M7xdtCAX5E1-UOM canadajournal.net 1399927161820
In [4]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Features are the raw headline strings; labels are the category codes
# (visible values: 'b', 'e', 'm', 't') encoded to integers by LabelEncoder.
X = news['TITLE']
y = encoder.fit_transform(news['CATEGORY'])

from sklearn.model_selection import train_test_split
# random_state makes the split reproducible across runs; stratify=y keeps the
# per-category class balance identical in train and test (default 75/25 split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y)
In [5]:
len(X_train)
Out[5]:
31681
In [6]:
len(X_test)
Out[6]:
10561
In [7]:
type(X_train)
Out[7]:
pandas.core.series.Series
In [8]:
X_train.head()
Out[8]:
288020    Oil prices rises to highest level in 9 month h...
382781    Gold gains on bargain hunting, though Yellen c...
339760    LG G3 Beat vs. Galaxy S5 Mini vs. Moto X: Came...
334230    Bulgarian depositors withdraw more savings as ...
39177     New Motorola Phablet to Hit the Market Later T...
Name: TITLE, dtype: object
In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# min_df=3 drops tokens appearing in fewer than 3 training headlines,
# pruning rare/noisy vocabulary.
vectorizer = CountVectorizer(min_df=3)

# Fit the vocabulary on the training titles only, then reuse it for the
# test set -- no test-set information leaks into the feature space.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
In [10]:
train_vectors
Out[10]:
<31681x9915 sparse matrix of type '<class 'numpy.int64'>'
	with 267651 stored elements in Compressed Sparse Row format>
In [11]:
X_train.iloc[1]
Out[11]:
'Gold gains on bargain hunting, though Yellen comments still weigh'
In [12]:
train_vectors[1]
Out[12]:
<1x9915 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>
In [13]:
type(train_vectors)
Out[13]:
scipy.sparse.csr.csr_matrix
In [14]:
# Bag-of-words count vector for one headline (not a one-hot vector:
# entries are token counts, and Out[12] shows 10 non-zero positions).
train_vectors[1].toarray()
Out[14]:
array([[0, 0, 0, ..., 0, 0, 0]])
In [15]:
from sklearn.metrics import accuracy_score
In [16]:
train_vectors.toarray()
Out[16]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Gaussian Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
# GaussianNB does not accept scipy sparse input, so the count matrix must be
# densified with .toarray() before fitting -- memory-hungry for large corpora.
clf.fit(train_vectors.toarray(), y_train)
Out[17]:
GaussianNB(priors=None)
In [18]:
# The test matrix must be densified too (GaussianNB cannot consume sparse input).
pred = clf.predict(test_vectors.toarray())
accuracy_score(y_test, pred)
Out[18]:
0.82056623425811948

Multinomial Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# MultinomialNB models term counts directly and accepts the scipy sparse
# matrix as-is -- no .toarray() needed, unlike GaussianNB above.
clf.fit(train_vectors, y_train)
Out[19]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [20]:
# Sparse input works for prediction as well.
pred = clf.predict(test_vectors)
accuracy_score(y_test, pred)
Out[20]:
0.89792633273364264

Bernoulli Naive Bayes

In [21]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
# binarize=0.0 (the default, per Out[21]) turns counts into presence/absence
# features, the input the Bernoulli event model expects. Sparse input is fine.
clf.fit(train_vectors, y_train)
Out[21]:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
In [22]:
# BernoulliNB accepts scipy sparse matrices directly (it was fit on the sparse
# train_vectors above), so densifying the 10561x9915 test matrix with
# .toarray() only wasted memory. Predict on the sparse matrix, consistent
# with the MultinomialNB cell.
pred = clf.predict(test_vectors)
accuracy_score(y_test, pred)
Out[22]:
0.89660070069122244