#!/usr/bin/env python
# coding: utf-8
"""Classify UCI News Aggregator headlines and explain predictions with LIME.

Notebook export. Reference: "Classifying News Headlines and Explaining the
Result" from Kaggle:
http://nbviewer.jupyter.org/github/dreamgonfly/lime-examples/blob/master/Classifying%20News%20Headlines%20and%20Explaining%20the%20Result.ipynb
"""

import pandas as pd
from lime.lime_text import LimeTextExplainer
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Dataset fetched via the Kaggle API: https://github.com/Kaggle/kaggle-api
DATA_FILE = "~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv"


def _run_line_magic(magic, line):
    """Run an IPython line magic when available; no-op under plain `python`.

    The original export called get_ipython() unconditionally, which raises
    NameError outside an IPython/Jupyter session.
    """
    try:
        shell = get_ipython()  # noqa: F821 -- only defined inside IPython
    except NameError:
        return
    shell.run_line_magic(magic, line)


# Sample 10% of the rows to keep training fast.
news = pd.read_csv(DATA_FILE).sample(frac=0.1)
print(len(news))
print(news.head(3))

# Integer-encode the CATEGORY labels; the raw TITLE strings are the input.
encoder = LabelEncoder()
X = news['TITLE']
y = encoder.fit_transform(news['CATEGORY'])

X_train, X_test, y_train, y_test = train_test_split(X, y)

# The labels can also be turned into a categorical one-hot matrix.
print(
    OneHotEncoder().fit_transform(
        LabelEncoder().fit_transform(news['CATEGORY']).reshape(-1, 1)
    ).toarray()
)

print(len(X_train))
print(len(X_test))
print(type(X_train))
print(X_train.head(3))

# Bag-of-words features; min_df=3 drops tokens seen in fewer than 3 titles.
vectorizer = CountVectorizer(min_df=3)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

print(train_vectors)
print(X_train.iloc[1])
print(train_vectors[1])
print(type(train_vectors))
# One row of the sparse count matrix, densified.
print(train_vectors[1].toarray())

# ### Decision Tree
_run_line_magic('load_ext', 'autotime')

dt = tree.DecisionTreeClassifier()
dt.fit(train_vectors, y_train)
pred = dt.predict(test_vectors)
print(accuracy_score(y_test, pred))

# ### Random Forest
rf = RandomForestClassifier(n_estimators=20)
rf.fit(train_vectors, y_train)
pred = rf.predict(test_vectors)
print(accuracy_score(y_test, pred))

# ### Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(train_vectors, y_train)
pred = nb.predict(test_vectors)
print(accuracy_score(y_test, pred))

# ## Explaining the result
_run_line_magic('unload_ext', 'autotime')

# Chain raw-text vectorization with the NB classifier so LIME can feed
# perturbed strings straight through predict_proba.
c = make_pipeline(vectorizer, nb)
explainer = LimeTextExplainer(class_names=list(encoder.classes_))

# .sample picks one random test headline to explain.
example = X_test.sample(1).iloc[0]
print(example)
print(c.predict_proba([example]))

exp = explainer.explain_instance(example, c.predict_proba, top_labels=2)
# NOTE(review): renders only inside a notebook front-end; harmless elsewhere.
exp.show_in_notebook()