In [15]:

# import
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import  confusion_matrix, accuracy_score, classification_report, precision_score

%matplotlib inline

Lets revise

In [2]:

# reading the data
df = pd.read_csv("data/fertility_Diagnosis.txt", delimiter=',', header=None)
df.iloc[:4,0:9]

Out[2]:

	0	1	2	3	4	6	7	8
0	-0.33	0.69	0	1	1	0.8	0	0.88
1	-0.33	0.94	1	0	1	0.8	1	0.31
2	-0.33	0.50	1	0	0	1.0	-1	0.50
3	-0.33	0.75	0	1	1	1.0	-1	0.38

In [3]:

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:9], df[9], test_size=0.1)

In [4]:

pipeline = Pipeline([('clf', LogisticRegression())])

parameters = {
 'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10)    
    }

grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv = 5)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits

[Parallel(n_jobs=5)]: Done  31 out of  40 | elapsed:    5.8s remaining:    1.6s
[Parallel(n_jobs=5)]: Done  40 out of  40 | elapsed:    5.8s finished

Out[4]:

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy',
       verbose=True)

In [5]:

print( 'Best score: %0.3f' % grid_search.best_score_)
print( 'Best parameters set:')

best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print( '\t%s: %r' % (param_name, best_parameters[param_name]))

Best score: 0.933
Best parameters set:
	clf__C: 0.01
	clf__penalty: 'l1'

In [6]:

y_pred = grid_search.predict(X_test)

#print((y_pred), (y_test))

y_test = [2 if x=='N' else 1 for x in y_test]
y_pred = [2 if x=='N' else 1 for x in y_pred]

#print((y_pred), (y_test))

print( 'Accuracy:', accuracy_score(y_test, y_pred))
print( 'Precision:', precision_score(y_test, y_pred))
print( 'Recall:', recall_score(y_test, y_pred))

Accuracy: 0.8
Precision: 0.0
Recall: 0.0

C:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)

Multi-class classification¶

The goal of multi-class classification is to assign an instance to one of the set of classes. scikit-learn uses a strategy called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. Onevs.- all classification uses one binary classifier for each of the possible classes. The class that is predicted with the greatest confidence is assigned to the instance.

In [7]:

movie = pd.read_csv("data/movie_train.tsv", delimiter="\t")
movie[:10]

Out[7]:

	PhraseId	SentenceId	Phrase	Sentiment
0	1	1	A series of escapades demonstrating the adage ...	1
1	2	1	A series of escapades demonstrating the adage ...	2
2	3	1	A series	2
3	4	1	A	2
4	5	1	series	2
5	6	1	of escapades demonstrating the adage that what...	2
6	7	1	of	2
7	8	1	escapades demonstrating the adage that what is...	2
8	9	1	escapades	2
9	10	1	demonstrating the adage that what is good for ...	2

In [8]:

print(movie['Sentiment'].describe())

count    156060.000000
mean          2.063578
std           0.893832
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64

In [9]:

print(movie['Sentiment'].value_counts())

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [16]:

def movie_rank():
    
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression()) 
                 ])
    
    parameters = {'vect__max_df': (0.25, 0.5),
                    'vect__ngram_range': ((1, 1), (1, 2)),
                    'vect__use_idf': (True, False),
                    'clf__C': (0.1, 1, 10),}
    
    movie=pd.read_csv('data/movie_train.tsv', header=0, delimiter='\t')
    X, y = movie['Phrase'], movie['Sentiment'].as_matrix()
    #print(X[:3])
    #print(y[:3])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 19)
    #print(X_train[:3])
    #print(y_train[:3])

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
   
    print( 'Best score: %0.3f' % grid_search.best_score_)
    print( 'Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print( '\t%s: %r' % (param_name, best_parameters[param_name]))
        
    predictions = grid_search.predict(X_test)

    print ('Accuracy:', accuracy_score(y_test, predictions))
    print ('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print ('Classification Report:', classification_report(y_test, predictions))

movie_rank()

Fitting 3 folds for each of 24 candidates, totalling 72 fits

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done  72 out of  72 | elapsed:  5.7min finished

Best score: 0.631
Best parameters set:
	clf__C: 10
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
	vect__use_idf: False
Accuracy: 0.651159810329
Confusion Matrix: [[  740  1022   287    24     2]
 [  526  3854  3568   289    10]
 [  120  1869 19712  2057    82]
 [   10   248  3740  5096   780]
 [    4    11   248  1435  1084]]
Classification Report:              precision    recall  f1-score   support

          0       0.53      0.36      0.43      2075
          1       0.55      0.47      0.51      8247
          2       0.72      0.83      0.77     23840
          3       0.57      0.52      0.54      9874
          4       0.55      0.39      0.46      2782

avg / total       0.64      0.65      0.64     46818

In [ ]: