#!/usr/bin/env python
# coding: utf-8

# # Exercise 08
# 
# ## Analyze how travelers expressed their feelings on Twitter
# 
# A sentiment analysis job about the problems of each major U.S. airline. 
# Twitter data was scraped from February of 2015 and contributors were 
# asked to first classify positive, negative, and neutral tweets, followed
# by categorizing negative reasons (such as "late flight" or "rude service").

# In[4]:


import pandas as pd
import numpy as np

# Script form of the `%matplotlib inline` notebook magic. `get_ipython` is not
# defined or imported in this file; it is expected to be injected by the
# IPython/Jupyter runtime, so this line fails with NameError when the file is
# run with a plain `python` interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# Load the tweets from the zipped CSV, using the first CSV column as the index.
import zipfile
with zipfile.ZipFile('../datasets/Tweets.zip', 'r') as z:
    # Open the archive member in its own `with` so the member handle is closed
    # as soon as pandas has finished parsing it (previously it was never
    # closed explicitly).
    with z.open('Tweets.csv') as f:
        tweets = pd.read_csv(f, index_col=0)

# Preview the first rows: shown as cell output in a notebook, discarded when
# the file runs as a plain script.
tweets.head()


# In[5]:


# (number of rows, number of columns) of the loaded DataFrame.
tweets.shape


# ### Proportion of tweets with each sentiment

# In[6]:


# Absolute number of tweets per sentiment label.
# NOTE(review): the section heading says "proportion", but value_counts()
# returns counts; value_counts(normalize=True) would give fractions -- confirm
# which one is intended.
tweets['airline_sentiment'].value_counts()


# ### Proportion of tweets per airline
# 

# In[7]:


# Absolute number of tweets per airline.
# NOTE(review): the section heading says "proportion", but value_counts()
# returns counts; value_counts(normalize=True) would give fractions -- confirm
# which one is intended.
tweets['airline'].value_counts()


# In[11]:


# Bar chart of the number of tweets per airline. tweets["airline"] is already
# a Series (it is used that way in the value_counts() cell above), so it is
# used directly instead of being re-wrapped in pd.Series.
tweets["airline"].value_counts().plot(kind="bar", figsize=(8, 6), rot=0)


# In[12]:


# Stacked bar chart: one bar per airline, split by sentiment label.
# The crosstab counts tweets for every (airline, sentiment) pair.
pd.crosstab(
    tweets["airline"],
    tweets["airline_sentiment"],
).plot(
    kind='bar',
    stacked=True,
    rot=0,
    alpha=0.5,
    figsize=(10, 6),
    title="Sentiment by airline",
)


# # Exercise 8.1 
# 
# Predict the sentiment using CountVectorizer, stopwords, n_grams, stemmer, TfidfVectorizer
# 
# use Random Forest classifier

# In[32]:


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


# In[18]:


# Features: the raw tweet text.
X = tweets['text']
# Target: sentiment label encoded as an integer. Series.map with a dict turns
# any label missing from the mapping into NaN.
y = tweets['airline_sentiment'].map({
    'negative': -1,
    'neutral': 0,
    'positive': 1,
})


# In[ ]:


# # Exercise 8.2
# 
# Train a Deep Neural Network with the following architecture:
# 
# - Input = text 
# - Dense(128)
# - Relu Activation
# - BatchNormalization
# - Dropout(0.5)
# - Dense(10, Softmax)
# 
# Optimized using rmsprop using as loss categorical_crossentropy
# 
# Hints: 
# - test with two iterations then try more. 
# - the learning rate can be adjusted
# 
# Evaluate the performance using the testing set (approx. 55% with 50 epochs)

# In[75]:


from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import RMSprop
from keras.callbacks import History
from livelossplot import PlotLossesKeras


# In[ ]: