#!/usr/bin/env python
# coding: utf-8

# # TF-IDF and N-Grams
# The goal of this project was to predict the sentiment of an IMDB movie review using a binary classification system. The dataset was part of the [Bag of Words Meets Bags of Popcorn Competition](https://www.kaggle.com/c/word2vec-nlp-tutorial).
#
# Model Accuracy: 0.89532
#
# ## Bag of Words & TF-IDF
#
# A Bag of Words (BoW) model is a simple algorithm used in Natural Language Processing. It simply counts the number of times a word appears in a document.
#
# TF-IDF (or Term Frequency-Inverse Document Frequency), on the other hand, reflects how important a word is to a document within a corpus. With TF-IDF, words are weighted by relevance rather than by raw frequency.
#
#
# It is the product of two statistics:
# 1. Term Frequency (TF): The number of times a word appears in a given document.
# 2. Inverse Document Frequency (IDF): The more documents a word appears in, the less valuable that word is as a signal. Very common words, such as “a” or “the”, thereby receive heavily discounted tf-idf scores, in contrast to words that are very specific to the document in question.
#
#
#
# In the project, I used two separate TF-IDF vectorizers and merged them into a single bag of words.
# * The first vectorizer (word_vectorizer) analyzed complete words.
# * The second vectorizer (char_vectorizer) analyzed the frequency of character n-grams. An n-gram is a continuous sequence of n items from a document. Using character n-grams of up to three characters (maximum n-gram size = 3) yielded a high predictive score.
#
# Lastly, we used a Logistic Regression to predict the sentiment attached to each review. The hyperparameters of the model were tuned using a validation dataset prior to training the model.
#
# #### Interestingly, our model performed worse if we cleaned the text data using the usual methods. This includes removing html, removing unwanted punctuation, removing stopwords, stemming, or tokenizing.
# ## Loading Required Libraries and Reading the Data into Python
# First, we need to load the required libraries and read the data into Python.

# In[1]:


from time import time

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV


# In[2]:


# Both files are tab-separated with a header row.
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t")
test = pd.read_csv("testData.tsv", header=0, delimiter="\t")

train_text = train['review']
test_text = test['review']
y = train['sentiment']

# Labeled and unlabeled reviews stacked into a single Series.
all_text = pd.concat([train_text, test_text])


# ## TF-IDF Vectorizers
# First, we convert the reviews into a Bag of Words using the TF-IDF vectorizer
# for words and for character n-grams of length 1 to 3.

# In[3]:


word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',   # keep single-character tokens as well
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
# fit_transform learns the vocabulary / IDF weights and vectorizes the
# training reviews in a single pass.
train_word_features = word_vectorizer.fit_transform(train_text)


# In[4]:


# No stop_words here: that option only applies to analyzer='word'.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    sublinear_tf=True,
    strip_accents='unicode',
    ngram_range=(1, 3),
    max_features=50000)
train_char_features = char_vectorizer.fit_transform(train_text)


# In[5]:


# Word columns first, then character columns. CSR is requested explicitly
# because the matrix is row-sliced afterwards (hstack returns COO by default).
train_features = hstack([train_word_features, train_char_features],
                        format='csr')


# ## Hyperparameter Tuning of Logistic Regression
# Since there are multiple hyperparameters to tune in the Logistic Regression model, we will use the [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) function of Sklearn to determine the optimal hyperparameter values.
# Next, I used the [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function to generate a validation set and find the best parameters.

# In[6]:


# NOTE(review): train_features appears to be vectorized from all training
# reviews before this split, so the hold-out rows already influenced the
# TF-IDF vocabulary / IDF weights; the scores below may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, y, test_size=0.3, random_state=1234)

lr_model = LogisticRegression(random_state=1234)
param_dict = {'C': [0.001, 0.01, 0.1, 1, 10],
              'solver': ['sag', 'lbfgs', 'saga']}

start = time()
grid_search = GridSearchCV(lr_model, param_dict)
grid_search.fit(X_train, y_train)
print("GridSearch took %.2f seconds to complete." % (time()-start))
# print() instead of IPython's display() so this also runs as a plain script.
print(grid_search.best_params_)
print("Cross-Validated Score of the Best Estimator: %.3f" % grid_search.best_score_)


# Let's see how well our model does on the validation dataset and where any misclassifications occur.
#
# We have several metrics available for classification accuracy, including a confusion matrix and a classification report.

# In[7]:


# C and solver are hard-coded rather than read from grid_search.best_params_;
# presumably they are the values the grid search selected in the original
# run -- verify against the printed best parameters.
# random_state is fixed (as for lr_model) because 'saga' shuffles the data.
lr = LogisticRegression(C=1, solver='saga', random_state=1234)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

print(confusion_matrix(y_test, lr_preds))
print(classification_report(y_test, lr_preds))
print("Accuracy Score: %.3f" % accuracy_score(y_test, lr_preds))


# In the original run, the number of false positives (FP = 366) was similar to the number of false negatives (FN = 399), suggesting that our model is not biased towards either specificity or sensitivity.

# ## Modelling Sentiment from Reviews
# We will redo the steps taken above, this time using both the train and test datasets.
#
# 1. Create a TF-IDF BoW for both words and character n-grams.
# 2. Train the Logistic Regression model using the tuned hyperparameters.
# 3. Format predictions for submission to Kaggle Competition.
# In[8]:


# Final word-level vectorizer: vocabulary and IDF weights are learned from the
# labeled and unlabeled reviews together (all_text), then each set is
# transformed separately.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',   # keep single-character tokens as well
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)


# In[9]:


# Character 1- to 3-grams. No stop_words here: that option only applies to
# analyzer='word'.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    sublinear_tf=True,
    strip_accents='unicode',
    ngram_range=(1, 3),
    max_features=50000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)


# In[10]:


# Word columns first, then character columns -- the same layout as the
# validation experiment. Train and test must share one column order.
train_features = hstack([train_word_features, train_char_features],
                        format='csr')
test_features = hstack([test_word_features, test_char_features],
                       format='csr')


# In[11]:


# random_state is fixed because the 'saga' solver shuffles the data; without
# it the submitted predictions can differ from run to run.
lr = LogisticRegression(C=1, solver='saga', random_state=1234)
lr.fit(train_features, y)
final_preds = lr.predict(test_features)


# The predictions are then formatted in an appropriate layout for submission to Kaggle.

# In[12]:


test['sentiment'] = final_preds
test = test[['id', 'sentiment']]
test.to_csv('Submission.csv', index=False)


# ### Logistic Regression Sentiment Accuracy = 0.89532