#!/usr/bin/env python
# coding: utf-8

# # Exercise 08
#
# ## Analyze how travelers expressed their feelings on Twitter
#
# A sentiment analysis job about the problems of each major U.S. airline.
# Twitter data was scraped from February of 2015 and contributors were
# asked to first classify positive, negative, and neutral tweets, followed
# by categorizing negative reasons (such as "late flight" or "rude service").

# In[4]:


import pandas as pd
import numpy as np

# Enable inline plotting when running under IPython/Jupyter. A plain
# `python` run has no `get_ipython` builtin, so the magic is skipped there
# instead of aborting the script with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import matplotlib.pyplot as plt

# Read the tweets from the zipped CSV, using the first CSV column as the index.
# Both the archive and the member file handle are closed on exit.
import zipfile
with zipfile.ZipFile('../datasets/Tweets.zip', 'r') as z, \
        z.open('Tweets.csv') as f:
    tweets = pd.read_csv(f, index_col=0)

tweets.head()


# In[5]:


tweets.shape


# ### Proportion of tweets with each sentiment

# In[6]:


tweets['airline_sentiment'].value_counts()


# ### Proportion of tweets per airline
#

# In[7]:


tweets['airline'].value_counts()


# In[11]:


# Bar chart of the number of tweets per airline.
tweets["airline"].value_counts().plot(kind="bar", figsize=(8, 6), rot=0)


# In[12]:


# Stacked bar chart: one bar per airline, split by sentiment label.
pd.crosstab(
    index=tweets["airline"],
    columns=tweets["airline_sentiment"],
).plot(
    kind='bar',
    figsize=(10, 6),
    alpha=0.5,
    rot=0,
    stacked=True,
    title="Sentiment by airline",
)


# # Exercise 8.1
#
# Predict the sentiment using CountVectorizer, stopwords, n_grams, stemmer, TfidfVectorizer
#
# use Random Forest classifier

# In[32]:


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# In[18]:


# Features are the raw tweet text; the target encodes the sentiment label
# as -1 (negative), 0 (neutral) or 1 (positive).
X = tweets['text']
y = tweets['airline_sentiment'].map({'negative': -1, 'neutral': 0, 'positive': 1})


# In[ ]:





# # Exercise 8.2
#
# Train a Deep Neural Network with the following architecture:
#
# - Input = text
# - Dense(128)
# - Relu Activation
# - BatchNormalization
# - Dropout(0.5)
# - Dense(10, Softmax)
#
# Optimize with rmsprop, using categorical_crossentropy as the loss
#
# Hints:
# - test with two iterations then try more.
# - the learning rate can be adjusted
#
# Evaluate the performance using the testing set (approx. 55% with 50 epochs)

# In[75]:


from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import RMSprop
from keras.callbacks import History
from livelossplot import PlotLossesKeras


# In[ ]: