# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
%matplotlib inline
Let's revise
# Load the fertility-diagnosis data. header=None means the file's first row
# is treated as data and the columns get integer labels.
df = pd.read_csv("data/fertility_Diagnosis.txt", sep=",", header=None)
# Preview the first four rows of the nine leading columns.
df.iloc[:4, :9]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | -0.33 | 0.69 | 0 | 1 | 1 | 0 | 0.8 | 0 | 0.88 |
1 | -0.33 | 0.94 | 1 | 0 | 1 | 0 | 0.8 | 1 | 0.31 |
2 | -0.33 | 0.50 | 1 | 0 | 0 | 0 | 1.0 | -1 | 0.50 |
3 | -0.33 | 0.75 | 0 | 1 | 1 | 0 | 1.0 | -1 | 0.38 |
# Hold out 10% of the rows for testing: columns 0-8 are used as features and
# column 9 as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :9],
    df[9],
    test_size=0.1,
)

# Single-step pipeline; grid parameters address the step via the "clf__" prefix.
pipeline = Pipeline([("clf", LogisticRegression())])

# Candidate settings: 2 penalties x 4 values of C = 8 combinations.
parameters = {
    "clf__penalty": ("l1", "l2"),
    "clf__C": (0.01, 0.1, 1, 10),
}

# Exhaustive search scored by accuracy with cv=5, run on 5 parallel jobs.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=5,
    verbose=True,
    scoring="accuracy",
    cv=5,
)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=5)]: Done 31 out of 40 | elapsed: 5.8s remaining: 1.6s [Parallel(n_jobs=5)]: Done 40 out of 40 | elapsed: 5.8s finished
GridSearchCV(cv=5, error_score='raise', estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]), fit_params={}, iid=True, n_jobs=5, param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')}, pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=True)
# Report the best cross-validated score, then the selected value of every
# hyper-parameter that was part of the search grid (in sorted key order).
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
Best score: 0.933 Best parameters set: clf__C: 0.01 clf__penalty: 'l1'
# Evaluate the tuned model on the held-out split.
y_pred = grid_search.predict(X_test)

# Encode the labels as integers: 'N' -> 2, anything else -> 1.
# NOTE: y_test is rebound here, so re-running this cell without re-running the
# train/test split would map every (already integer) label to 1.
y_test = [2 if x == 'N' else 1 for x in y_test]
y_pred = [2 if x == 'N' else 1 for x in y_pred]

# Label 1 (every class other than 'N') is scored as the positive class;
# pos_label is spelled out so the choice does not hinge on the default.
# recall_score must be imported from sklearn.metrics alongside the other metrics.
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label=1))
print('Recall:', recall_score(y_test, y_pred, pos_label=1))
Accuracy: 0.8 Precision: 0.0 Recall: 0.0
C:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for)
The goal of multi-class classification is to assign an instance to one of a set of classes. scikit-learn uses a strategy called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. One-vs.-all classification uses one binary classifier for each of the possible classes. The class that is predicted with the greatest confidence is assigned to the instance.
# Load the tab-separated movie-review training data and preview ten rows.
movie = pd.read_csv("data/movie_train.tsv", sep="\t")
movie.head(10)
PhraseId | SentenceId | Phrase | Sentiment | |
---|---|---|---|---|
0 | 1 | 1 | A series of escapades demonstrating the adage ... | 1 |
1 | 2 | 1 | A series of escapades demonstrating the adage ... | 2 |
2 | 3 | 1 | A series | 2 |
3 | 4 | 1 | A | 2 |
4 | 5 | 1 | series | 2 |
5 | 6 | 1 | of escapades demonstrating the adage that what... | 2 |
6 | 7 | 1 | of | 2 |
7 | 8 | 1 | escapades demonstrating the adage that what is... | 2 |
8 | 9 | 1 | escapades | 2 |
9 | 10 | 1 | demonstrating the adage that what is good for ... | 2 |
# Summary statistics of the Sentiment column.
print(movie["Sentiment"].describe())
count 156060.000000 mean 2.063578 std 0.893832 min 0.000000 25% 2.000000 50% 2.000000 75% 3.000000 max 4.000000 Name: Sentiment, dtype: float64
# Number of rows per Sentiment value, most frequent first.
print(movie["Sentiment"].value_counts())
2 79582 3 32927 1 27273 4 9206 0 7072 Name: Sentiment, dtype: int64
def movie_rank(data_path='data/movie_train.tsv'):
    """Grid-search a TF-IDF + logistic-regression sentiment classifier.

    Reads the tab-separated file at ``data_path``, uses its ``Phrase`` column
    as the input text and its ``Sentiment`` column as the target, holds out
    30% of the rows for testing, tunes the pipeline with ``GridSearchCV`` on
    the remaining rows, and prints the best cross-validated score, the
    selected parameters, and the accuracy, confusion matrix and
    classification report measured on the held-out rows.

    Args:
        data_path: Path of the TSV file to load. Defaults to the location
            this function previously had hard-coded.

    Returns:
        None. All results are printed.
    """
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ])
    # 2 x 2 x 2 x 3 = 24 candidate settings.
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }

    reviews = pd.read_csv(data_path, header=0, delimiter='\t')
    # Series.as_matrix() has been removed from pandas; .values returns the
    # same NumPy array and is available in old and new releases alike.
    X, y = reviews['Phrase'], reviews['Sentiment'].values

    # Fixed random_state keeps the train/test partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=19)

    grid_search = GridSearchCV(
        pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    # Score the refitted best estimator on the held-out rows.
    predictions = grid_search.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
# Run the movie-review grid search and print its evaluation report.
movie_rank()
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 2.5min [Parallel(n_jobs=2)]: Done 72 out of 72 | elapsed: 5.7min finished
Best score: 0.631 Best parameters set: clf__C: 10 vect__max_df: 0.25 vect__ngram_range: (1, 2) vect__use_idf: False Accuracy: 0.651159810329 Confusion Matrix: [[ 740 1022 287 24 2] [ 526 3854 3568 289 10] [ 120 1869 19712 2057 82] [ 10 248 3740 5096 780] [ 4 11 248 1435 1084]] Classification Report: precision recall f1-score support 0 0.53 0.36 0.43 2075 1 0.55 0.47 0.51 8247 2 0.72 0.83 0.77 23840 3 0.57 0.52 0.54 9874 4 0.55 0.39 0.46 2782 avg / total 0.64 0.65 0.64 46818