%matplotlib inline import datetime import nltk import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy import interp from sklearn import dummy from sklearn import ensemble from sklearn import grid_search from sklearn import linear_model from sklearn import cross_validation from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin from sklearn.metrics import roc_curve, auc from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer store = pd.HDFStore("/Users/thead/git/arxiv-experiments/hep-ex.h5") #store['df'] = df df = store['df'] store.close() end_of_2014 = datetime.date(2014,12,31) df['cites_per_day'] = (df.citation_count / ((end_of_2014 - df.created).astype(int) / 1000000000 / (3600*24))) df.cites_per_day.hist(bins=100, range=(0,1.5), normed=True, log=True, histtype='step') plt.xlabel("Citations per day") plt.ylabel("Arbitrary units") threshold = df.cites_per_day.quantile(0.9) df['Y'] = df.cites_per_day>threshold df.Y.value_counts(normalize=True) class ColumnExtractor(BaseEstimator, TransformerMixin): def __init__(self, column=0): """Extract a column from a pandas.DataFrame Use this transformer at the beginning of a pipeline to extract the column of interest from a data frame. """ self.column = column def fit_transform(self, X, y=None, **kwargs): self.fit(X, y, **kwargs) return self.transform(X) def transform(self, X, **kwargs): return X[:,self.column] def fit(self, X, y=None, **kwargs): return self X_dev, X_eval, y_dev, y_eval = cross_validation.train_test_split(df, df.Y, test_size=0.33, random_state=78534) train = ["hello world world tiger bunny hopping", "tiger world"] test = ["hello world world tiger tiger", "bunny hopping camel"] c = CountVectorizer() # learn this vocabulary, with 5 unique tokens # the vectorizer will produce a 5D vector when # transforming text c.fit(train) # this is the mapping of token to dimension print c.vocabulary_ # tokenise new text and display the 5D vector c.transform(test).toarray() t = TfidfTransformer() t.fit(c.transform(train)) print t.transform(c.transform(test)).toarray() # pretty names for the dataframe column indices TITLE = 0 ABSTRACT = 1 # sub-pipeline for extracting the abstract vect = CountVectorizer(stop_words='english', token_pattern=u'(?u)\\b[A-z][A-z]+\\b') abstract_pipe = Pipeline([('abstracts', ColumnExtractor(ABSTRACT)), ('vect', vect), ('tfidf', TfidfTransformer()) ]) # sub-pipeline for extracting the title title_pipe = Pipeline([('title', ColumnExtractor(TITLE)), ('vect', CountVectorizer(stop_words='english', token_pattern=u'(?u)\\b[A-z][A-z]+\\b')), ('tfidf', TfidfTransformer()) ]) # combined pipeline text_clf = Pipeline([('features', FeatureUnion([('abstract', abstract_pipe), ('title', title_pipe)])), ('clf', linear_model.SGDClassifier()) # use DummyClassifier to convince yourself we do better # than pure luck #('clf', dummy.DummyClassifier()) ]) scores = cross_validation.cross_val_score(text_clf, X_dev, y_dev, cv=3, scoring='roc_auc') scores parameters = { 'features__title__vect__max_df': (0.25, 0.5, 0.75, 1.0), #1.0 is best it seems #'features__title__vect__min_df': (0.0, 0.25, 0.5, 0.75), #0.0 seems best ##'vect__max_features': (None, 5000, 10000, 50000), #no limit wins #'features__abstract__vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams, bigrams win 'features__title__vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams, ##'tfidf__use_idf': (True, False), ##'tfidf__norm': ('l1', 'l2'), #'clf__loss': ('hinge', 'log',), #log wins #'clf__alpha': (1e-4, 1e-5, 1e-6, 1e-7), #alpha 1e-5 ##'clf__penalty': ('l1', 'l2', 'elasticnet'), #l2 ##'clf__n_iter': (10, 50, 80), } grid = grid_search.GridSearchCV(text_clf, parameters, n_jobs=-1, #verbose=1, scoring='roc_auc') grid.fit(X_dev, y_dev) print("Best score: %0.3f"%grid.best_score_) print("Best parameters set:") best_parameters = grid.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print("\t%s: %r"%(param_name, best_parameters[param_name])) best_parameters = {'clf__alpha': 1e-05, 'clf__class_weight': 'auto', 'clf__penalty': 'l2', 'clf__loss': 'log', 'features__abstract__vect__ngram_range': (1, 2), 'features__title__vect__ngram_range': (1, 2), 'features__title__vect__max_df': 0.25, } text_clf.set_params(**best_parameters) scores = cross_validation.cross_val_score(text_clf, X_eval, y_eval, cv=3, scoring='roc_auc') scores cv = cross_validation.StratifiedKFold(y_dev, n_folds=3) for i, (train, test) in enumerate(cv): probas_ = text_clf.fit(X_dev[train], y_dev[train]).predict_proba(X_dev[test]) # Compute ROC curve and area under the curve fpr, tpr, thresholds = roc_curve(y_dev[test], probas_[:, 1]) roc_auc = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)'%(i, roc_auc)) plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.legend(loc="lower right") plt.grid() plt.show() fake = 90*0.2 real = 10*0.7 print "true positives: {} false positives: {}".format(real, fake) print "Chance of reading something interesting: {}".format(real/(fake+real))