##############################################################################
# Taposh Dutta Roy
# Sentiment Analysis
##############################################################################
import os
import re

import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score


## Stemming functionality
class stemmerUtility(object):
    """Stemming functionality"""

    @staticmethod
    def stemPorter(review_text):
        porter = PorterStemmer()
        preprocessed_docs = []
        for doc in review_text:
            final_doc = []
            for word in doc:
                final_doc.append(porter.stem(word))
                # final_doc.append(wordnet.lemmatize(word))
                # note that lemmatize() can also take a part of speech as an argument!
            preprocessed_docs.append(final_doc)
        return preprocessed_docs


## Originally provided by Google
## Modified by Taposh
class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text
    into segments for further learning"""

    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # 1. Remove HTML
        review_text = BeautifulSoup(review, "html.parser").get_text()
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        # 3. Convert words to lower case, split them, and remove single
        #    letters and other very short tokens (keep words longer than
        #    two characters)
        newwords = []
        for word in review_text.lower().split():
            if len(word) > 2:
                newwords.append(word)
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            newwords = [w for w in newwords if not w in stops]
        # 5. Return a list of words
        return newwords
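    # Minimal usage sketch for review_to_wordlist (illustrative input, not
    # taken from the dataset):
    #   KaggleWord2VecUtility.review_to_wordlist(
    #       "<br/>A great movie!", remove_stopwords=True)
    #   # -> ['great', 'movie']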
    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words.
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(raw_sentence,
                                                             remove_stopwords))
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists)
        return sentences


train = pd.read_csv("/Users/taposh/workspace/mlearning/nlp/sentiment/bow/labeledTrainData.tsv",
                    header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/Users/taposh/workspace/mlearning/nlp/sentiment/bow/testData.tsv",
                   header=0, delimiter="\t", quoting=3)
y = train["sentiment"]

print("Cleaning and parsing movie reviews...\n")
traindata = []
for i in range(0, len(train["review"])):
    traindata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(train["review"][i], False)))
testdata = []
for i in range(0, len(test["review"])):
    testdata.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(test["review"][i], False)))

print('vectorizing... ')
tfv = TfidfVectorizer(min_df=2, max_features=None, strip_accents='unicode',
                      analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), use_idf=2, smooth_idf=1,
                      sublinear_tf=1, stop_words='english')
X_all = traindata + testdata
lentrain = len(traindata)

print("fitting pipeline... ")
tfv.fit(X_all)
X_all = tfv.transform(X_all)

# RF: transform 1st column to numbers
# X_all[:, 0] = LabelEncoder().fit_transform(X_all[:, 0])

# for Logit
X = X_all[:lentrain]
X_test = X_all[lentrain:]

# solver='liblinear' is required for dual=True on current scikit-learn
model = LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=14,
                           fit_intercept=True, intercept_scaling=1,
                           class_weight=None, random_state=None,
                           solver='liblinear')
# http://nbviewer.ipython.org/gist/rjweiss/7577004
# model = RandomForestRegressor(n_estimators=150, min_samples_split=1)

print("36 Fold CV Score: ",
      np.mean(cross_val_score(model, X, y, cv=36, scoring='roc_auc')))

print("Retrain on all training data, predicting test labels...\n")
model.fit(X, y)
result = model.predict_proba(X_test)[:, 1]
# result = model.predict(X_test)
print(result)

# Use pandas to write the comma-separated output file
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv('/Users/taposh/workspace/mlearning/nlp/sentiment/bow/Bag_of_Words_model_v17.csv',
              quoting=3, escapechar=",", index=False, encoding='utf-8')
print("Wrote results to csv file")
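# The C values in the log below were swept by hand; the same search could be
# automated with scikit-learn's GridSearchCV. A minimal sketch (the parameter
# grid is illustrative, mirroring the hand-tried values, not an exact rerun):
#
#   from sklearn.model_selection import GridSearchCV
#   param_grid = {'C': [8, 9, 11, 12, 13, 14, 14.8, 15, 16, 17]}
#   grid = GridSearchCV(LogisticRegression(penalty='l2', dual=True,
#                                          solver='liblinear'),
#                       param_grid, scoring='roc_auc', cv=25)
#   grid.fit(X, y)
#   print(grid.best_params_, grid.best_score_)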
# Hand-tuning log (cross-validated ROC AUC for different C and tf settings):
# c=16      | 25-fold CV score: 0.96409904
# c=17      | 25-fold CV score: 0.96408976
# c=13      | 25-fold CV score: 0.96410064
# c=09      | 25-fold CV score: 0.96406416
# c=08      | 25-fold CV score: 0.96402832
# c=14      | 25-fold CV score: 0.96410448
# c=14 tf=2 | 25-fold CV score: 0.96444656 *
# c=14 tf=2 | 35-fold CV score: 0.96461634252
# c=14 tf=2 | 36-fold CV score: 0.964697569423
# c=14 tf=1 | 25-fold CV score: 0.96239056
# c=12      | 25-fold CV score: 0.96409408
# c=11      | 25-fold CV score: 0.96408656
# c=15      | 25-fold CV score: 0.9641014
# c=14.8    | 25-fold CV score: 0.96410384