import pandas as pd
# Work on a 10% random sample of the dataset. The fixed seed makes the same
# rows get drawn on every run, so the sample itself is repeatable.
news = pd.read_csv('uci-news-aggregator.csv').sample(frac=0.1, random_state=42)
len(news)
42242
news.head()
| | ID | TITLE | URL | PUBLISHER | CATEGORY | STORY | HOSTNAME | TIMESTAMP |
---|---|---|---|---|---|---|---|---|
24589 | 24590 | Wal-Mart takes aim at $2B used video game mark... | http://www.wncn.com/story/25002497/wal-mart-ta... | WNCN | b | dxrStbpfnH5zjIM-2HS6xpt7arezM | www.wncn.com | 1395317817464 |
114446 | 114782 | You'll likely recognize this 'Fargo,' you betcha! | http://www.usatoday.com/story/life/tv/2014/04/... | USA TODAY | e | dHALsQo-hgLH_0MDzDRorxcN9BVnM | www.usatoday.com | 1397520248385 |
357516 | 357976 | Sentence for sex abuser Rolf Harris 'too lenient' | http://www.scotsman.com/news/sentence-for-sex-... | Scotsman \(blog\) | e | dDGfw8WpQ5GpyLMGzdY_Xi99sjNPM | www.scotsman.com | 1404528720056 |
241734 | 242180 | Google (GOOGL) Developing Tablet with Advanced... | http://wallstreetpit.com/104126-google-googl-d... | Wall Street Pit | t | duaEdk4hebuJCDM4aiSvC1Nv3ThrM | wallstreetpit.com | 1400868306109 |
206956 | 207392 | FDA : Advanced prosthetic arm is approved for ... | http://canadajournal.net/health/fda-advanced-p... | Canada News | m | dz-jKxOMO7HMK0M7xdtCAX5E1-UOM | canadajournal.net | 1399927161820 |
from sklearn.preprocessing import LabelEncoder
# Features are the raw headline strings; targets are the CATEGORY labels
# mapped to integer codes by a LabelEncoder fitted on that same column.
X = news['TITLE']
encoder = LabelEncoder()
encoder.fit(news['CATEGORY'])
y = encoder.transform(news['CATEGORY'])
from sklearn.model_selection import train_test_split
# Split titles and labels with train_test_split's default proportions.
# The seed pins the shuffle so the same rows land in each partition whenever
# the input rows are the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
len(X_train)
31681
len(X_test)
10561
type(X_train)
pandas.core.series.Series
X_train.head()
288020 Oil prices rises to highest level in 9 month h... 382781 Gold gains on bargain hunting, though Yellen c... 339760 LG G3 Beat vs. Galaxy S5 Mini vs. Moto X: Came... 334230 Bulgarian depositors withdraw more savings as ... 39177 New Motorola Phablet to Hit the Market Later T... Name: TITLE, dtype: object
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features. min_df=3 is passed so that rarely occurring terms are
# left out of the vocabulary (see the CountVectorizer docs for the exact rule).
vectorizer = CountVectorizer(min_df=3)
# Learn the vocabulary from the training titles only ...
train_vectors = vectorizer.fit_transform(X_train)
# ... and reuse that same vocabulary, unchanged, for the test titles.
test_vectors = vectorizer.transform(X_test)
train_vectors
<31681x9915 sparse matrix of type '<class 'numpy.int64'>' with 267651 stored elements in Compressed Sparse Row format>
X_train.iloc[1]
'Gold gains on bargain hunting, though Yellen comments still weigh'
train_vectors[1]
<1x9915 sparse matrix of type '<class 'numpy.int64'>' with 10 stored elements in Compressed Sparse Row format>
type(train_vectors)
scipy.sparse.csr.csr_matrix
# dense bag-of-words count vector for this headline
# (per-term counts produced by CountVectorizer, not a one-hot vector)
train_vectors[1].toarray()
array([[0, 0, 0, ..., 0, 0, 0]])
from sklearn.metrics import accuracy_score
# Display only: toarray() materialises a full dense copy of the sparse
# training matrix just to show its values.
train_vectors.toarray()
array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes baseline, fitted on a dense copy of the count matrix
# (the sparse matrix is converted with toarray() before being passed in).
clf = GaussianNB()
clf.fit(train_vectors.toarray(), y_train)
GaussianNB(priors=None)
# Predict labels for the held-out titles from a dense copy of the test
# matrix, then report the fraction of labels predicted correctly.
pred = clf.predict(test_vectors.toarray())
accuracy_score(y_true=y_test, y_pred=pred)
0.82056623425811948
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on the term counts; the sparse matrix is passed
# directly, with no dense conversion.
clf = MultinomialNB()
clf.fit(train_vectors, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Predict labels for the held-out titles straight from the sparse test
# matrix, then report the fraction of labels predicted correctly.
pred = clf.predict(test_vectors)
accuracy_score(y_true=y_test, y_pred=pred)
0.89792633273364264
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes fitted on the same sparse count matrix.
# NOTE(review): the echoed repr shows binarize=0.0, so the counts are
# presumably thresholded to presence/absence inside the model; confirm.
clf = BernoulliNB()
clf.fit(train_vectors, y_train)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
# The model was fitted on the sparse matrix, so predict on the sparse test
# matrix as well instead of materialising a dense copy with toarray().
pred = clf.predict(test_vectors)
accuracy_score(y_test, pred)
0.89660070069122244