#!/usr/bin/env python
# coding: utf-8

# Personally, I've never been a fan or follower of Star Wars. I've never understood why this part of American culture gets so much love.
# 
# THAT BEING SAID...
# 
# We are here to explore FiveThirtyEight's survey so we can make it nice and tidy.

# In[1]:


# First things first: import the data. Note that some characters in the file are not valid UTF-8, so an explicit encoding is needed.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Load the survey; the file is decoded as ISO-8859-1 because some of its
# characters are not valid UTF-8.
star_wars = pd.read_csv(
    'star_wars.csv',
    encoding='ISO-8859-1',
)


# In[2]:


# Peek at the first ten rows of the raw survey.
star_wars.head(n=10)


# This table contains a lot of data that isn't very clear or properly filled in. Some of the column names could also use some trimming to make them easier to read.

# In[3]:


# Dimensions of the dataframe as (rows, columns).
star_wars.shape


# In[4]:


# Column labels of the dataset.
star_wars.columns


# In[5]:


# First row of the raw data, across all columns.
star_wars.iloc[0, :]


# In[6]:


# Keep only the rows that carry a RespondentID.
star_wars = star_wars.loc[star_wars['RespondentID'].notna()]


# In[7]:


# Check the dataframe dimensions again after filtering.
star_wars.shape


# **OBSERVATION**: It appears there was only **one** entry with a missing Respondent ID, which was the first row of the dataset.

# In[8]:


# Lookup table that turns the textual survey answers into booleans.
yes_no = {
    'Yes': True,
    'No': False,
}


# The 2nd and 3rd columns are collected in a list so the boolean mapping can be applied to both of them.

# In[9]:


second_third_column = [
    'Have you seen any of the 6 films in the Star Wars franchise?',
    'Do you consider yourself to be a fan of the Star Wars film franchise?',
]


# In[10]:


# Convert the answers in each listed column through the yes_no lookup.
for c in second_third_column:
    star_wars[c] = star_wars[c].map(yes_no)


# In[11]:


print(star_wars.head(n=5))


# Quickly check the value counts in both columns, including missing values.

# In[12]:


star_wars.iloc[:, 1].value_counts(dropna=False)


# In[13]:


star_wars.iloc[:, 2].value_counts(dropna=False)


# Now we clean up the first respondent columns

# In[14]:


# Show the first row of the six columns at positions 3 through 8.
star_wars.iloc[:, 3:9].head(n=1)


# In[15]:


# Lookup used to turn the per-film answer columns into booleans: a film title
# in the cell maps to True, a missing value maps to False.
# The NaN key is spelled ``np.nan`` because the ``np.NaN`` alias was removed in
# NumPy 2.0 and raises AttributeError there.
dic_map = {
    "Star Wars: Episode I  The Phantom Menace": True,
    'Star Wars: Episode II  Attack of the Clones': True,
    'Star Wars: Episode III  Revenge of the Sith': True,
    'Star Wars: Episode IV  A New Hope': True,
    'Star Wars: Episode V The Empire Strikes Back': True,
    'Star Wars: Episode VI Return of the Jedi': True,
    np.nan: False,
}


# In[16]:


# Value counts (missing values included) of the column at position 4 before mapping.
star_wars.iloc[:, 4].value_counts(dropna=False)


# In[17]:


# Run each of the six columns at positions 3 through 8 through dic_map.
for col in star_wars.columns[3:9]:
    star_wars[col] = star_wars[col].map(dic_map)


# In[18]:


print(star_wars.iloc[:, :9].head())


# In[19]:


# Give the six per-film answer columns short, uniform names.
star_wars = star_wars.rename(
    columns={
        "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1",
        'Unnamed: 4': 'seen_2',
        'Unnamed: 5': 'seen_3',
        'Unnamed: 6': 'seen_4',
        'Unnamed: 7': 'seen_5',
        'Unnamed: 8': 'seen_6',
    }
)
star_wars.iloc[:, :10].head(n=10)


# The columns from 9 to 14 inclusive ask the respondent to rank the Star Wars movies in order of most favorite to least favorite (from 1 to 6). Each column represents a movie, according to the scheme below:

# In[20]:


# Cast the six columns at positions 9 through 14 to float.
star_wars[star_wars.columns[9:15]] = star_wars.iloc[:, 9:15].astype(float)


# In[21]:


# Give the ranking columns short, uniform names.
star_wars = star_wars.rename(
    columns={
        "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "ranking_1",
        'Unnamed: 10': 'ranking_2',
        'Unnamed: 11': 'ranking_3',
        'Unnamed: 12': 'ranking_4',
        'Unnamed: 13': 'ranking_5',
        'Unnamed: 14': 'ranking_6',
    }
)
star_wars.iloc[:, :15].head(n=10)


# In[22]:


# Column-wise mean of the six columns at positions 9 through 14.
star_wars_mean_rankings = star_wars.iloc[:, 9:15].mean(axis=0)
star_wars_mean_rankings


# In[23]:


# ``get_ipython`` is not imported anywhere in this script; it only exists when
# IPython/Jupyter provides it. Look it up defensively so that running the
# exported script with plain ``python`` does not stop with a NameError here.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None

if ipython_shell is not None:
    # Equivalent of the ``%matplotlib inline`` notebook magic.
    ipython_shell.run_line_magic('matplotlib', 'inline')


# In[24]:


# Horizontal bar chart of the mean ranking of each movie.
star_wars_mean_rankings.plot(
    kind='barh',
    rot=0,
    color='blue',
    title='Average Ranking Per Star Wars Movie',
)


# From the chart above, we can see that Star Wars V - The Empire Strikes Back is the highest rated movie of the series, followed by Star Wars 6.

# Now, we want to see how many people watched each of these movies. This of course will affect the ratings.

# In[25]:


# Column-wise sum of the six columns at positions 3 through 8.
seen_set = star_wars.iloc[:, 3:9].sum(axis=0)
seen_set


# In[26]:


# Horizontal bar chart of those per-movie totals.
seen_set.plot(
    kind='barh',
    rot=0,
    color='blue',
    title='Viewer Count Per Star Wars Movie',
)


# Well, well well. 
# It appears the two most popular movies are also the most watched.
# It is a common trend for most star wars fans to prefer the older movies.

# So we want to see the differences between male and female opinions from the same data. Let's split them up.

# In[27]:


# Rows whose Gender value is "Male".
males = star_wars[star_wars["Gender"].eq("Male")]


# In[28]:


# Rows whose Gender value is "Female".
females = star_wars[star_wars["Gender"].eq("Female")]


# In[29]:


# Per-movie totals of the columns at positions 3 through 8, male respondents.
males.iloc[:, 3:9].sum().plot.barh(
    rot=0,
    color='blue',
    title='Male Viewer Count Per Star Wars Movie',
)


# In[30]:


# Same totals for female respondents.
females.iloc[:, 3:9].sum().plot.barh(
    rot=0,
    color='blue',
    title='Female Viewer Count Per Star Wars Movie',
)


# In[31]:


# Per-movie means of the columns at positions 9 through 14, male respondents.
males.iloc[:, 9:15].mean().plot(
    kind='barh',
    rot=0,
    color='blue',
    title='Male Average Ranking Per Star Wars Movie',
)


# In[32]:


# Same means for female respondents.
females.iloc[:, 9:15].mean().plot(
    kind='barh',
    rot=0,
    color='blue',
    title='Female Average Ranking Per Star Wars Movie',
)


# In[33]:


# Number of rows in the male subset.
males.shape[0]


# In[34]:


# Number of rows in the female subset.
females.shape[0]


# In[35]:


# Counts per Gender value over the whole frame (missing values excluded).
star_wars['Gender'].value_counts(dropna=True)


# In the code above, I just wanted to check how many men vs. women filled out the survey. Curious.

# From analyzing the bar graphs, we can see that, by a close margin in views, Star Wars: Episode V The Empire Strikes Back is the most viewed and highest ranked movie of the Star Wars franchise among males AND females.