#!/usr/bin/env python
# coding: utf-8

# # Scikit-learn tutorial
# 
# In this tutorial, we demonstrate a complete machine learning workflow, starting with the data and ending with predictions and a proper evaluation. We focus on textual data.
# 
# Technically, we use Python and, specifically, the scikit-learn library.

# Parts of this tutorial are motivated by http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html.

# ## Data
# 
# We work with one of the classic machine learning text datasets: the 20 Newsgroups dataset. It is directly available in scikit-learn (which downloads it internally on first use).
# 
# The dataset consists of approximately 20,000 newsgroup posts, partitioned across 20 different newsgroups.

# In[1]:

from sklearn.datasets import fetch_20newsgroups


# In[2]:

train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)


# In[3]:

print(type(train))


# In[4]:

print(train.keys())


# In[5]:

# the raw text of the first posting
print(train.data[:1])


# In[6]:

# the numeric label of the first posting
print(train.target[:1])


# In[7]:

# the human-readable name of the first category
print(train.target_names[:1])


# In[8]:

# all 20 category names
print(set(train.target_names))


# In[9]:

print(len(train.data))


# In[10]:

x_train = train.data
y_train = train.target


# In[11]:

test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)


# In[12]:

print(len(test.data))


# In[13]:

x_test = test.data
y_test = test.target


# ## Goal
# 
# Classify newsgroup postings into their respective newsgroup based solely on their text.

# ## Feature engineering

# In[14]:

from sklearn.feature_extraction.text import CountVectorizer


# In[15]:

# bag-of-words features: one column per token in the training vocabulary
vec = CountVectorizer()
x_train_f = vec.fit_transform(x_train)


# In[16]:

# (number of documents, vocabulary size)
print(x_train_f.shape)


# In[17]:

# nonzero counts among the first 20,000 columns of the first document
print(x_train_f[0, 0:20000])


# In[18]:

# transform only: the vocabulary is fixed by the training data
x_test_f = vec.transform(x_test)


# In[19]:

print(x_test_f.shape)


# In[20]:

# column index of the token 'apple' in the feature matrix
print(vec.vocabulary_.get('apple'))


# ## Naive Bayes classifier

# In[21]:

from sklearn.naive_bayes import MultinomialNB


# In[22]:

clf = MultinomialNB().fit(x_train_f, y_train)


# In[23]:

docs = ["Where is the start menu?",
        "Most homeruns in a game",
        "Who was the first man on the moon?"]


# In[24]:

predicted = clf.predict(vec.transform(docs))


# In[25]:

print(predicted)


# In[26]:

for doc, category in zip(docs, predicted):
    print('%r => %s' % (doc, train.target_names[category]))


# In[27]:

predicted = clf.predict(x_test_f)


# In[28]:

import numpy as np


# In[29]:

# accuracy: fraction of test documents classified correctly
print(np.mean(predicted == y_test))


# In[30]:

from sklearn.metrics import f1_score


# In[31]:

print(f1_score(y_test, predicted, average="weighted"))


# ## Pipeline

# In[32]:

from sklearn.pipeline import Pipeline


# In[33]:

# chain vectorizer and classifier: raw text goes in, predictions come out
clf = Pipeline([('vect', CountVectorizer()),
                ('clf', MultinomialNB()),
                ])


# In[34]:

clf.fit(x_train, y_train)


# In[35]:

predicted = clf.predict(x_test)


# In[36]:

print(np.mean(predicted == y_test))


# In[37]:

from sklearn.metrics import classification_report


# In[38]:

print(classification_report(y_test, predicted))


# ## Parameter tuning

# In[39]:

from sklearn.model_selection import GridSearchCV


# In[40]:

# candidate values per pipeline step, addressed as <step name>__<parameter>
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'clf__alpha': (1., 2.),
              }


# In[41]:

gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)


# In[42]:

gs_clf.fit(x_train, y_train)


# In[43]:

predicted = gs_clf.predict(x_test)


# In[44]:

print(classification_report(y_test, predicted))


# In[45]:

# mean cross-validated score of each parameter combination
print(gs_clf.cv_results_['mean_test_score'])


# In[46]:

print(gs_clf.best_estimator_)


# ## Reddit data

# In[48]:

import pandas as pd


# In[49]:

train = pd.read_csv("reddit_train_top20")


# In[50]:

test = pd.read_csv("reddit_test_top20")


# In[51]:

print(len(train))


# In[52]:

print(len(test))


# In[53]:

print(train.head())
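# As a next step, we can reuse the exact same pipeline on the Reddit data. The
# cell below is a minimal sketch, assuming the CSVs contain a text column named
# 'text' and a label column named 'subreddit'; both names are hypothetical, so
# adjust them to the actual schema shown by train.head() above.

# In[ ]:

# sketch: train and evaluate the text pipeline on the Reddit data
# NOTE: 'text' and 'subreddit' are assumed column names (hypothetical)
reddit_clf = Pipeline([('vect', CountVectorizer()),
                       ('clf', MultinomialNB()),
                       ])
reddit_clf.fit(train['text'], train['subreddit'])

reddit_predicted = reddit_clf.predict(test['text'])
print(classification_report(test['subreddit'], reddit_predicted))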
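# A common refinement of the bag-of-words pipelines above is TF-IDF weighting,
# which downweights tokens that occur in many documents. The sketch below swaps
# CountVectorizer for scikit-learn's TfidfVectorizer (a drop-in replacement) and
# evaluates on the newsgroups test split; whether it helps depends on the data.

# In[ ]:

# sketch: the same newsgroups pipeline with TF-IDF weighting instead of raw counts
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_clf = Pipeline([('vect', TfidfVectorizer()),
                      ('clf', MultinomialNB()),
                      ])
tfidf_clf.fit(x_train, y_train)
print(np.mean(tfidf_clf.predict(x_test) == y_test))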