%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')  # downloads and caches the corpus on first call
# We'll use only the first 3000 documents
n_samples = 3000
X = news.data[:n_samples]    # raw message text (list of str)
y = news.target[:n_samples]  # integer class labels in 0..19
news.target_names
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
### Each document is a message posted on a newsgroup. Below is the text of the first two messages in the data.
X[:2]
["From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game. PENS RULE!!!\n\n", 'From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)\nSubject: Which high-performance VLB video card?\nSummary: Seek recommendations for VLB video card\nNntp-Posting-Host: midway.ecn.uoknor.edu\nOrganization: Engineering Computer Network, University of Oklahoma, Norman, OK, USA\nKeywords: orchid, stealth, vlb\nLines: 21\n\n My brother is in the market for a high-performance video card that supports\nVESA local bus with 1-2MB RAM. Does anyone have suggestions/ideas on:\n\n - Diamond Stealth Pro Local Bus\n\n - Orchid Farenheit 1280\n\n - ATI Graphics Ultra Pro\n\n - Any other high-performance VLB card\n\n\nPlease post or email. Thank you!\n\n - Matt\n\n-- \n | Matthew B. Lawson <------------> (mblawson@essex.ecn.uoknor.edu) | \n --+-- "Now I, Nebuchadnezzar, praise and exalt and glorify the King --+-- \n | of heaven, because everything he does is right and all his ways | \n | are just." - Nebuchadnezzar, king of Babylon, 562 B.C. | \n']
### Class labels for the newsgroups
print(set(y))
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for final evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
### As part of preprocessing, we will remove the stop words
# Load the stop-word list: one word per line, no header row.
stopwords = pd.read_csv('stopwords_en.txt', header=None)
stopwords
0 | |
---|---|
0 | a |
1 | about |
2 | above |
3 | across |
4 | after |
... | ... |
313 | you |
314 | your |
315 | yours |
316 | yourself |
317 | yourselves |
318 rows × 1 columns
# Flatten the single-column DataFrame into a plain Python list of words.
stop_words = list(stopwords.values.ravel())
# TfidfVectorizer performs tokenization, removes stop words, and performs tfxidf transformation
# norm=None keeps the raw tf-idf weights (no per-document length normalization).
# The token pattern keeps lowercase word-like tokens of 3+ characters that may
# contain digits, '_', '-', or '.' and contain at least one letter.
tfidf = TfidfVectorizer(stop_words=stop_words, token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b", norm=None)
X_tfidf = tfidf.fit_transform(X_train)  # vocabulary and idf are learned from the training split only
X_tfidf.shape
(2400, 50884)
# Transform the test split using the vocabulary/idf learned from the training split.
X_test_tfidf = tfidf.transform(X_test)
X_test_tfidf.shape
(600, 50884)
X_tfidf
<2400x50884 sparse matrix of type '<class 'numpy.float64'>' with 240415 stored elements in Compressed Sparse Row format>
print(X_tfidf[:2])
(0, 49332) 7.397346235101363 (0, 43052) 5.564764771353053 (0, 18167) 5.648146380292103 (0, 7498) 8.090493415661307 (0, 39007) 4.12968024606373 (0, 4094) 4.178470410233162 (0, 44717) 2.870137590582983 (0, 23250) 3.820795965961346 (0, 38015) 5.287133034754773 (0, 35158) 4.609253326325616 (0, 7542) 5.605586765873308 (0, 30286) 5.287133034754773 (0, 35471) 3.5632847711429285 (0, 11875) 4.912439585313362 (0, 23280) 3.3326021426555528 (0, 7577) 3.5687048386122675 (0, 5435) 4.832396877639826 (0, 17725) 8.090493415661307 (0, 28190) 5.73911815849783 (0, 15253) 7.786582935998999 (0, 3192) 8.090493415661307 (0, 21073) 3.5741544433798325 (0, 23657) 3.8636596703931283 (0, 33982) 5.094761142107317 (0, 43410) 6.075590395119043 : : (1, 29378) 3.8419981736119486 (1, 20292) 8.090493415661307 (1, 49558) 7.685028307553144 (1, 43458) 16.180986831322613 (1, 16773) 6.385745323422883 (1, 38287) 3.7273947908729452 (1, 13582) 6.218691238759717 (1, 2841) 6.8377304471659395 (1, 11801) 7.397346235101363 (1, 31865) 1.8443866501797452 (1, 24072) 7.397346235101363 (1, 10040) 7.397346235101363 (1, 35009) 5.525544058199771 (1, 44668) 4.5209607191799375 (1, 11802) 7.397346235101363 (1, 14055) 26.81679621816567 (1, 14054) 12.28916653321199 (1, 37089) 18.65607371627915 (1, 8705) 14.348405367574305 (1, 42707) 19.15723597026865 (1, 15227) 15.370056615106288 (1, 39913) 15.370056615106288 (1, 26987) 1.0016673617868688 (1, 33192) 1.0412385744054713 (1, 43323) 1.0
# It's possible (though not usually necessary) to convert the matrix into a "dense" matrix
# NOTE(review): .todense() returns np.matrix, which NumPy has deprecated;
# .toarray() (a plain ndarray) is the usual modern choice — confirm before changing,
# since the matrix type affects the shape of .sum(axis=1) printed below.
newX = X_tfidf.todense()
newX.shape
(2400, 50884)
np.set_printoptions(linewidth=120, edgeitems=12)
print(newX[:10])
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ... 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
print(newX.sum(axis=1))
[[ 353.9656672 ] [ 615.80709319] [1509.21571478] [ 312.42228218] [ 497.40865428] [ 296.22891235] [ 394.92406169] [ 79.0181996 ] [ 434.75655621] [ 604.78797848] [ 194.77220879] [ 649.8057449 ] ... [1081.70559415] [ 318.71902192] [ 458.42934382] [1428.55046974] [ 672.76200713] [1823.29391322] [ 372.98326499] [ 181.37967281] [ 436.71258582] [ 468.22025205] [ 468.7423809 ] [ 174.00737792]]
### Let's set up a pipeline to perform preprocessing of the newsgroup data and
### classification of the documents using Multinomial Naive Bayes
clf = Pipeline([
# The vectorizer here uses its default norm='l2' (unlike the standalone `tfidf` above).
('vect', TfidfVectorizer(
stop_words=stop_words,
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
# Small Laplace-smoothing parameter; tuned below with calc_params.
('nb', MultinomialNB(alpha=0.01)),
])
### We can use this function to do cross-validation
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
def evaluate_cross_validation(clf, X, y, K):
    """Run shuffled K-fold cross-validation for `clf` on (X, y) and report accuracy.

    Prints the per-fold scores followed by their mean and standard error.
    The score used is whatever the estimator's `score` method returns
    (accuracy for classifiers).
    """
    # Fixed seed keeps the folds reproducible across calls.
    folds = KFold(n_splits=K, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, X, y, cv=folds)
    print(fold_scores)
    print("Mean score: %.3f (+/-%.3f)" % (np.mean(fold_scores), sem(fold_scores)))
evaluate_cross_validation(clf, X_train, y_train, 5)
[0.80833333 0.81458333 0.81666667 0.79583333 0.8 ] Mean score: 0.807 (+/-0.004)
def calc_params(X, y, clf, param_values, param_name, K):
    """Sweep one hyper-parameter of `clf`, cross-validating at each value.

    For every value in `param_values`, sets `param_name` on the estimator
    (pipeline parameter syntax such as 'nb__alpha' is supported via
    set_params), runs shuffled K-fold cross-validation, and records the mean
    train / test accuracy.  Both curves are plotted on a log-x scale.

    Returns
    -------
    (train_scores, test_scores) : two np.ndarray, one entry per parameter value.
    """
    # Work on NumPy arrays so fold index arrays can be used directly.
    X = np.array(X)
    y = np.array(y)
    n_values = len(param_values)
    train_scores = np.zeros(n_values)
    test_scores = np.zeros(n_values)
    for i, value in enumerate(param_values):
        # Point the estimator at the current candidate value.
        clf.set_params(**{param_name: value})
        fold_train = np.zeros(K)
        fold_test = np.zeros(K)
        # Fixed seed: every parameter value is evaluated on identical folds.
        splitter = KFold(n_splits=K, shuffle=True, random_state=0)
        for j, (train_idx, test_idx) in enumerate(splitter.split(X)):
            # Fit on the fold's training portion, score on both portions.
            clf.fit(X[train_idx], y[train_idx])
            fold_train[j] = clf.score(X[train_idx], y[train_idx])
            fold_test[j] = clf.score(X[test_idx], y[test_idx])
        # Average over the K folds for this parameter value.
        train_scores[i] = np.mean(fold_train)
        test_scores[i] = np.mean(fold_test)
        print(param_name, '=', value, "Train =", train_scores[i], "Test =", test_scores[i])
    # Overlay training vs. cross-validation accuracy on a log-x axis.
    plt.semilogx(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")
    return train_scores, test_scores
alphas = np.logspace(-7, 0, 10)
print(alphas)
[1.00000000e-07 5.99484250e-07 3.59381366e-06 2.15443469e-05 1.29154967e-04 7.74263683e-04 4.64158883e-03 2.78255940e-02 1.66810054e-01 1.00000000e+00]
train_scores, test_scores = calc_params(X_train, y_train, clf, alphas, 'nb__alpha', 5)
nb__alpha = 1e-07 Train = 1.0 Test = 0.7670833333333332 nb__alpha = 5.994842503189409e-07 Train = 1.0 Test = 0.7708333333333334 nb__alpha = 3.5938136638046257e-06 Train = 1.0 Test = 0.77625 nb__alpha = 2.1544346900318867e-05 Train = 1.0 Test = 0.7845833333333334 nb__alpha = 0.0001291549665014884 Train = 1.0 Test = 0.7908333333333333 nb__alpha = 0.000774263682681127 Train = 1.0 Test = 0.7987500000000001 nb__alpha = 0.004641588833612782 Train = 1.0 Test = 0.8066666666666666 nb__alpha = 0.02782559402207126 Train = 0.9992708333333333 Test = 0.8016666666666665 nb__alpha = 0.1668100537200059 Train = 0.9945833333333335 Test = 0.7858333333333333 nb__alpha = 1.0 Train = 0.9701041666666667 Test = 0.7145833333333333
### Let's use alpha = 0.04 for the final model
# Fit on the pre-computed (un-normalized) tf-idf training matrix from above.
mnb = MultinomialNB(alpha=0.04)
mnb.fit(X_tfidf, y_train)
MultinomialNB(alpha=0.04, class_prior=None, fit_prior=True)
### Performance on the test data
mnb_pred = mnb.predict(X_test_tfidf)  # predicted class indices for the 600 held-out documents
print(mnb_pred)
[15 14 3 19 6 19 3 13 1 19 0 12 8 4 12 3 11 11 18 4 12 12 18 12 5 10 5 12 4 3 1 13 9 15 12 17 7 8 12 12 1 3 18 11 3 3 6 5 17 8 10 18 19 10 9 12 1 12 16 5 7 13 18 3 10 14 8 7 7 12 4 3 18 12 5 13 15 18 15 14 16 9 16 14 11 9 15 4 3 14 14 8 9 17 15 4 11 12 7 9 4 17 17 12 1 4 11 6 13 0 1 5 12 13 6 6 11 18 13 12 17 3 8 12 19 14 0 10 5 14 14 9 12 4 12 11 4 9 7 12 11 9 7 16 16 14 12 9 15 8 9 18 7 18 5 10 15 8 1 5 17 12 3 7 4 16 11 4 6 12 11 13 18 5 8 15 3 15 5 15 3 19 14 5 10 2 7 13 4 13 17 6 8 12 14 14 14 15 12 3 5 17 18 19 4 3 16 9 1 7 15 10 13 10 3 10 9 15 17 14 14 1 3 8 11 18 16 4 15 5 13 5 0 16 14 10 7 3 11 6 0 4 6 15 18 2 4 14 5 5 10 15 3 17 8 18 8 7 6 19 10 4 10 8 18 1 8 15 11 12 11 3 18 11 11 18 11 0 12 8 13 11 0 0 12 19 16 13 1 13 3 17 11 5 12 0 0 12 7 2 15 3 13 3 15 6 4 11 5 10 15 16 16 17 3 18 16 8 17 4 2 4 16 17 16 5 15 2 7 14 6 19 3 6 18 15 10 12 19 8 13 18 15 14 1 15 5 11 6 12 4 14 13 18 11 4 1 16 10 16 19 5 9 6 10 14 3 12 6 14 16 3 19 5 10 15 18 5 8 10 7 17 11 1 3 9 7 17 5 4 17 8 9 5 0 16 2 11 6 7 0 12 3 12 1 19 1 16 9 8 17 12 9 5 3 9 17 8 17 4 8 17 3 4 4 11 16 17 8 13 15 9 1 14 4 6 0 16 8 18 16 18 9 12 2 3 13 10 18 10 2 4 9 18 1 11 0 5 9 0 15 11 14 6 18 10 16 1 7 16 16 8 8 15 14 3 10 1 12 4 4 12 0 5 14 15 4 3 4 7 10 12 12 12 14 6 3 8 9 1 8 0 5 16 14 0 0 18 11 7 11 10 1 11 9 3 9 0 4 5 12 18 5 8 5 1 17 19 14 18 5 15 4 3 18 16 1 1 0 18 18 10 3 13 4 6 13 11 6 3 10 12 0 1 4 18 2 10 1 4 16 6 17 4 13 7 5 19 1 12 8 13 1 3 5 11 10 4 15 16 0 13 4 0 2 18 12 9 10 5 14 5 1 10 0 18 10 14 9 8]
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred): ground truth first, then
# predictions.  The original call had them swapped, which transposes the
# precision and recall columns and reports support per *predicted* class
# instead of per true class.
print(classification_report(y_test, mnb_pred,
                            target_names=news.target_names))
precision recall f1-score support alt.atheism 0.85 0.92 0.88 25 comp.graphics 0.69 0.83 0.76 30 comp.os.ms-windows.misc 0.26 1.00 0.41 10 comp.sys.ibm.pc.hardware 0.67 0.49 0.56 41 comp.sys.mac.hardware 0.77 0.56 0.65 41 comp.windows.x 0.81 0.68 0.74 38 misc.forsale 0.55 0.74 0.63 23 rec.autos 0.81 0.77 0.79 22 rec.motorcycles 0.86 0.94 0.90 32 rec.sport.baseball 0.96 0.93 0.95 28 rec.sport.hockey 1.00 0.94 0.97 33 sci.crypt 0.83 0.76 0.79 33 sci.electronics 0.79 0.72 0.76 47 sci.med 0.88 0.88 0.88 25 sci.space 0.84 0.84 0.84 32 soc.religion.christian 0.85 0.69 0.76 32 talk.politics.guns 0.89 0.83 0.86 30 talk.politics.mideast 0.88 0.92 0.90 25 talk.politics.misc 0.93 0.70 0.80 37 talk.religion.misc 0.61 0.88 0.72 16 accuracy 0.78 600 macro avg 0.79 0.80 0.78 600 weighted avg 0.81 0.78 0.78 600
from sklearn.svm import SVC
### Let's again create a pipeline, this time with a linear SVM
clf = Pipeline([
# Same tokenization/stop-word setup as the Naive Bayes pipeline above.
('vect', TfidfVectorizer(
stop_words=stop_words,
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
# Linear kernel; the C parameter is tuned below with calc_params.
('svc', SVC(kernel='linear')),
])
evaluate_cross_validation(clf, X_train, y_train, 5)
[0.79791667 0.80208333 0.83125 0.79583333 0.80208333] Mean score: 0.806 (+/-0.006)
# Sweep the SVM regularization strength C (larger C = less regularization).
c_vals = [1, 5, 10, 50, 100]
train_scores, test_scores = calc_params(X_train, y_train, clf, c_vals, 'svc__C', 5)
svc__C = 1 Train = 0.9971874999999999 Test = 0.8058333333333334 svc__C = 5 Train = 1.0 Test = 0.8150000000000001 svc__C = 10 Train = 1.0 Test = 0.8150000000000001 svc__C = 50 Train = 1.0 Test = 0.8150000000000001 svc__C = 100 Train = 1.0 Test = 0.8150000000000001
from sklearn.model_selection import GridSearchCV
# Grid over the RBF-kernel hyper-parameters: 4 gammas x 5 Cs = 20 candidates.
parameters = {
'svc__gamma': np.logspace(-3, 0, 4),
'svc__C': [1, 5, 10, 50, 100],
}
clf = Pipeline([
('vect', TfidfVectorizer(
stop_words=stop_words,
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('svc', SVC(kernel='rbf')),
])
# 3-fold CV over every candidate; verbose=2 logs each individual fit.
gs = GridSearchCV(clf, parameters, verbose=2, cv=3)
# NOTE(review): this searches over the full 3000-document set (X, y), not just
# the training split — the selected hyper-parameters have "seen" the test
# documents, which mildly inflates the later test-set score. Consider
# gs.fit(X_train, y_train) instead.
%time _ = gs.fit(X, y)
gs.best_params_, gs.best_score_
Fitting 3 folds for each of 20 candidates, totalling 60 fits [CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 8.4s [CV] svc__C=1, svc__gamma=0.001 ......................................
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 8.3s remaining: 0.0s
[CV] ....................... svc__C=1, svc__gamma=0.001, total= 8.4s [CV] svc__C=1, svc__gamma=0.001 ...................................... [CV] ....................... svc__C=1, svc__gamma=0.001, total= 8.4s [CV] svc__C=1, svc__gamma=0.01 ....................................... [CV] ........................ svc__C=1, svc__gamma=0.01, total= 8.2s [CV] svc__C=1, svc__gamma=0.01 ....................................... [CV] ........................ svc__C=1, svc__gamma=0.01, total= 8.5s [CV] svc__C=1, svc__gamma=0.01 ....................................... [CV] ........................ svc__C=1, svc__gamma=0.01, total= 8.4s [CV] svc__C=1, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=1, svc__gamma=0.1, total= 8.3s [CV] svc__C=1, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=1, svc__gamma=0.1, total= 8.4s [CV] svc__C=1, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=1, svc__gamma=0.1, total= 8.8s [CV] svc__C=1, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=1, svc__gamma=1.0, total= 8.7s [CV] svc__C=1, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=1, svc__gamma=1.0, total= 9.1s [CV] svc__C=1, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=1, svc__gamma=1.0, total= 8.6s [CV] svc__C=5, svc__gamma=0.001 ...................................... [CV] ....................... svc__C=5, svc__gamma=0.001, total= 8.3s [CV] svc__C=5, svc__gamma=0.001 ...................................... [CV] ....................... svc__C=5, svc__gamma=0.001, total= 8.3s [CV] svc__C=5, svc__gamma=0.001 ...................................... [CV] ....................... svc__C=5, svc__gamma=0.001, total= 8.4s [CV] svc__C=5, svc__gamma=0.01 ....................................... [CV] ........................ 
svc__C=5, svc__gamma=0.01, total= 8.3s [CV] svc__C=5, svc__gamma=0.01 ....................................... [CV] ........................ svc__C=5, svc__gamma=0.01, total= 8.4s [CV] svc__C=5, svc__gamma=0.01 ....................................... [CV] ........................ svc__C=5, svc__gamma=0.01, total= 8.3s [CV] svc__C=5, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=5, svc__gamma=0.1, total= 8.7s [CV] svc__C=5, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=5, svc__gamma=0.1, total= 8.5s [CV] svc__C=5, svc__gamma=0.1 ........................................ [CV] ......................... svc__C=5, svc__gamma=0.1, total= 8.6s [CV] svc__C=5, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=5, svc__gamma=1.0, total= 8.8s [CV] svc__C=5, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=5, svc__gamma=1.0, total= 8.6s [CV] svc__C=5, svc__gamma=1.0 ........................................ [CV] ......................... svc__C=5, svc__gamma=1.0, total= 8.8s [CV] svc__C=10, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=10, svc__gamma=0.001, total= 8.3s [CV] svc__C=10, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=10, svc__gamma=0.001, total= 8.6s [CV] svc__C=10, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=10, svc__gamma=0.001, total= 8.4s [CV] svc__C=10, svc__gamma=0.01 ...................................... [CV] ....................... svc__C=10, svc__gamma=0.01, total= 8.4s [CV] svc__C=10, svc__gamma=0.01 ...................................... [CV] ....................... svc__C=10, svc__gamma=0.01, total= 8.3s [CV] svc__C=10, svc__gamma=0.01 ...................................... [CV] ....................... 
svc__C=10, svc__gamma=0.01, total= 8.3s [CV] svc__C=10, svc__gamma=0.1 ....................................... [CV] ........................ svc__C=10, svc__gamma=0.1, total= 8.3s [CV] svc__C=10, svc__gamma=0.1 ....................................... [CV] ........................ svc__C=10, svc__gamma=0.1, total= 8.4s [CV] svc__C=10, svc__gamma=0.1 ....................................... [CV] ........................ svc__C=10, svc__gamma=0.1, total= 8.8s [CV] svc__C=10, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=10, svc__gamma=1.0, total= 9.7s [CV] svc__C=10, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=10, svc__gamma=1.0, total= 9.5s [CV] svc__C=10, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=10, svc__gamma=1.0, total= 9.7s [CV] svc__C=50, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=50, svc__gamma=0.001, total= 8.3s [CV] svc__C=50, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=50, svc__gamma=0.001, total= 8.3s [CV] svc__C=50, svc__gamma=0.001 ..................................... [CV] ...................... svc__C=50, svc__gamma=0.001, total= 8.3s [CV] svc__C=50, svc__gamma=0.01 ...................................... [CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.3s [CV] svc__C=50, svc__gamma=0.01 ...................................... [CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.2s [CV] svc__C=50, svc__gamma=0.01 ...................................... [CV] ....................... svc__C=50, svc__gamma=0.01, total= 8.4s [CV] svc__C=50, svc__gamma=0.1 ....................................... [CV] ........................ svc__C=50, svc__gamma=0.1, total= 9.6s [CV] svc__C=50, svc__gamma=0.1 ....................................... [CV] ........................ 
svc__C=50, svc__gamma=0.1, total= 8.3s [CV] svc__C=50, svc__gamma=0.1 ....................................... [CV] ........................ svc__C=50, svc__gamma=0.1, total= 8.3s [CV] svc__C=50, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=50, svc__gamma=1.0, total= 8.4s [CV] svc__C=50, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=50, svc__gamma=1.0, total= 9.5s [CV] svc__C=50, svc__gamma=1.0 ....................................... [CV] ........................ svc__C=50, svc__gamma=1.0, total= 8.9s [CV] svc__C=100, svc__gamma=0.001 .................................... [CV] ..................... svc__C=100, svc__gamma=0.001, total= 8.4s [CV] svc__C=100, svc__gamma=0.001 .................................... [CV] ..................... svc__C=100, svc__gamma=0.001, total= 8.4s [CV] svc__C=100, svc__gamma=0.001 .................................... [CV] ..................... svc__C=100, svc__gamma=0.001, total= 8.2s [CV] svc__C=100, svc__gamma=0.01 ..................................... [CV] ...................... svc__C=100, svc__gamma=0.01, total= 8.3s [CV] svc__C=100, svc__gamma=0.01 ..................................... [CV] ...................... svc__C=100, svc__gamma=0.01, total= 8.3s [CV] svc__C=100, svc__gamma=0.01 ..................................... [CV] ...................... svc__C=100, svc__gamma=0.01, total= 8.3s [CV] svc__C=100, svc__gamma=0.1 ...................................... [CV] ....................... svc__C=100, svc__gamma=0.1, total= 8.2s [CV] svc__C=100, svc__gamma=0.1 ...................................... [CV] ....................... svc__C=100, svc__gamma=0.1, total= 8.3s [CV] svc__C=100, svc__gamma=0.1 ...................................... [CV] ....................... svc__C=100, svc__gamma=0.1, total= 8.4s [CV] svc__C=100, svc__gamma=1.0 ...................................... [CV] ....................... 
svc__C=100, svc__gamma=1.0, total= 8.4s [CV] svc__C=100, svc__gamma=1.0 ...................................... [CV] ....................... svc__C=100, svc__gamma=1.0, total= 8.4s [CV] svc__C=100, svc__gamma=1.0 ...................................... [CV] ....................... svc__C=100, svc__gamma=1.0, total= 8.5s
[Parallel(n_jobs=1)]: Done 60 out of 60 | elapsed: 8.5min finished C:\Users\bmobashe\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal. DeprecationWarning)
Wall time: 8min 46s
({'svc__C': 100, 'svc__gamma': 0.01}, 0.8293333333333334)
# Final model, rebuilt with the best parameters found by the grid search
# (C=100, gamma=0.01) and fit on the training split only.
clf = Pipeline([
('vect', TfidfVectorizer(
stop_words=stop_words,
token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('svc', SVC(kernel='rbf', C=100, gamma=0.01)),
])
clf.fit(X_train, y_train)
Pipeline(memory=None, steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.float64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, stop_words=['a', 'about', 'above', 'across', 'after', 'afterwards', 'again',... strip_accents=None, sublinear_tf=False, token_pattern='\\b[a-z0-9_\\-\\.]+[a-z][a-z0-9_\\-\\.]+\\b', tokenizer=None, use_idf=True, vocabulary=None)), ('svc', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False))], verbose=False)
svc_pred = clf.predict(X_test)
print(clf.score(X_test, y_test))  # accuracy on the 600 held-out documents
0.8433333333333334
# classification_report expects (y_true, y_pred) — ground truth first.  The
# original call passed predictions first, transposing precision/recall and
# reporting support per predicted class.
print(classification_report(y_test, svc_pred, target_names=news.target_names))
precision recall f1-score support alt.atheism 0.85 0.92 0.88 25 comp.graphics 0.89 0.78 0.83 41 comp.os.ms-windows.misc 0.85 0.85 0.85 39 comp.sys.ibm.pc.hardware 0.60 0.82 0.69 22 comp.sys.mac.hardware 0.90 0.77 0.83 35 comp.windows.x 0.88 0.82 0.85 34 misc.forsale 0.77 0.92 0.84 26 rec.autos 0.76 0.73 0.74 22 rec.motorcycles 0.91 0.91 0.91 35 rec.sport.baseball 0.81 0.96 0.88 23 rec.sport.hockey 0.97 0.94 0.95 32 sci.crypt 0.80 0.96 0.87 25 sci.electronics 0.84 0.77 0.80 47 sci.med 0.92 0.74 0.82 31 sci.space 0.88 0.97 0.92 29 soc.religion.christian 0.92 0.73 0.81 33 talk.politics.guns 0.93 0.74 0.83 35 talk.politics.mideast 0.92 1.00 0.96 24 talk.politics.misc 0.79 0.85 0.81 26 talk.religion.misc 0.61 0.88 0.72 16 accuracy 0.84 600 macro avg 0.84 0.85 0.84 600 weighted avg 0.85 0.84 0.84 600
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Rows of `mat` are true labels, columns are predictions; transposing it below
# puts predictions on the heatmap's y-axis, matching the axis labels at the end.
mat = confusion_matrix(y_test, svc_pred)
fig, ax = plt.subplots(figsize=(8,8))
ax = sns.heatmap(mat.T, square=True, linecolor='grey', linewidths=1, annot=True,
fmt='d', cbar=True, cmap='Reds', ax=ax, annot_kws={"fontsize":12, "weight":"bold"},
xticklabels=news.target_names,
yticklabels=news.target_names)
# Workaround for the matplotlib 3.1.1 regression that clips the top and bottom
# heatmap rows in half; expands the y-limits by half a cell on each side.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.xlabel('true label')
plt.ylabel('predicted label');