#!/usr/bin/env python
# coding: utf-8

# Personally, I've never been a fan or follower of Star Wars. I've never understood why this part of American culture gets so much love.
#
# THAT BEING SAID...
#
# We are here to explore FiveThirtyEight's survey so we can make it nice and tidy.

# In[1]:


# First things first, import the data. Note some characters aren't in default utf-8,
# which is why the file is read with the ISO-8859-1 encoding.
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

star_wars = pd.read_csv('star_wars.csv', encoding='ISO-8859-1')


# In[2]:


# let's look at the first few rows
star_wars.head(10)


# This table contains lots of data that isn't very clear or properly filled. Some of the column names could also use some trimming to make reading easier.

# In[3]:


# look at dataframe dimension
star_wars.shape


# In[4]:


# look at columns of dataset
star_wars.columns


# In[5]:


star_wars.iloc[0]


# In[6]:


# remove any rows where Respondent ID is missing
# .copy() makes the filtered frame an independent object, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
star_wars = star_wars[star_wars['RespondentID'].notnull()].copy()


# In[7]:


# look at dataframe dimension again
star_wars.shape


# **OBSERVATION**: It appears there was only **one** entry with a missing Respondent ID, which was the first row of the dataset.

# In[8]:


# Lookup used to turn the survey's Yes/No answers into booleans;
# any other value (including missing answers) becomes NaN after .map().
yes_no = {'Yes': True, 'No': False}


# We make the 2nd and 3rd columns a sub-list so we can apply the boolean mapping to them.

# In[9]:


second_third_column = [
    'Have you seen any of the 6 films in the Star Wars franchise?',
    'Do you consider yourself to be a fan of the Star Wars film franchise?',
]


# In[10]:


for c in second_third_column:
    star_wars[c] = star_wars[c].map(yes_no)


# In[11]:


print(star_wars.head())


# I just want to quickly check the value counts in both columns.
# In[12]:


star_wars.iloc[:, 1].value_counts(dropna=False)


# In[13]:


star_wars.iloc[:, 2].value_counts(dropna=False)


# Now we clean up the first respondent columns

# In[14]:


star_wars.iloc[:, 3:9].head(1)


# In[15]:


# Maps a film title (the respondent ticked that film) to True and a missing
# answer to False.
# np.nan is spelled in lowercase because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): the title keys must match the CSV cell text exactly, including
# spacing; any value not listed here is mapped to NaN -- confirm against the raw file.
dic_map = {
    "Star Wars: Episode I The Phantom Menace": True,
    'Star Wars: Episode II Attack of the Clones': True,
    'Star Wars: Episode III Revenge of the Sith': True,
    'Star Wars: Episode IV A New Hope': True,
    'Star Wars: Episode V The Empire Strikes Back': True,
    'Star Wars: Episode VI Return of the Jedi': True,
    np.nan: False,
}


# In[16]:


star_wars.iloc[:, 4].value_counts(dropna=False)


# In[17]:


# Columns 3 to 8 (positional) hold the "which films have you seen" answers.
for col in star_wars.columns[3:9]:
    star_wars[col] = star_wars[col].map(dic_map)


# In[18]:


print(star_wars.iloc[:, 0:9].head())


# In[19]:


# Give the six "seen" columns short, uniform names.
star_wars = star_wars.rename(columns={
    "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1",
    'Unnamed: 4': 'seen_2',
    'Unnamed: 5': 'seen_3',
    'Unnamed: 6': 'seen_4',
    'Unnamed: 7': 'seen_5',
    'Unnamed: 8': 'seen_6',
})
star_wars.iloc[:, 0:10].head(10)


# The columns from 9 to 14 inclusive ask the respondent to rank the Star Wars movies in order of most favorite to least favorite (from 1 to 6).
# Each column represents a movie, according to the scheme below:

# In[20]:


# Cast the six ranking columns (positions 9 to 14) to float so they can be averaged.
star_wars[star_wars.columns[9:15]] = star_wars[star_wars.columns[9:15]].astype(float)


# In[21]:


# Give the six ranking columns short, uniform names.
star_wars = star_wars.rename(columns={
    "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "ranking_1",
    'Unnamed: 10': 'ranking_2',
    'Unnamed: 11': 'ranking_3',
    'Unnamed: 12': 'ranking_4',
    'Unnamed: 13': 'ranking_5',
    'Unnamed: 14': 'ranking_6',
})
star_wars.iloc[:, 0:15].head(10)


# In[22]:


star_wars_mean_rankings = star_wars.iloc[:, 9:15].mean()
star_wars_mean_rankings


# In[23]:


# get_ipython() only exists inside IPython/Jupyter. When this file is run as a
# plain script the name is undefined, so skip the inline-backend magic instead
# of crashing with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[24]:


# A rank of 1 means "favorite", so a LOWER average ranking is better.
star_wars_mean_rankings.plot.barh(rot=0, color='blue', title='Average Ranking Per Star Wars Movie')


# From the chart above, we can see that Star Wars V - The Empire Strikes Back is the highest rated movie of the series, followed by Star Wars 6.

# Now, we want to see how many people watched each of these movies. This of course will affect the ratings.

# In[25]:


# Summing the boolean "seen" columns counts the respondents who saw each film.
seen_set = star_wars.iloc[:, 3:9].sum()
seen_set


# In[26]:


seen_set.plot.barh(rot=0, color='blue', title='Viewer Count Per Star Wars Movie')


# Well, well, well.
# It appears the two most popular movies are also the most watched.
# It is a common trend for most Star Wars fans to prefer the older movies.

# So we want to see the differences between male and female opinions from the same data. Let's split them up.
# In[27]:


# Respondents who reported their gender as "Male".
males = star_wars.loc[star_wars["Gender"] == "Male"]


# In[28]:


# Respondents who reported their gender as "Female".
females = star_wars.loc[star_wars["Gender"] == "Female"]


# In[29]:


# Viewer counts: sum of the boolean "seen" columns (positions 3 to 8).
males.iloc[:, 3:9].sum().plot.barh(
    rot=0, color='blue', title='Male Viewer Count Per Star Wars Movie'
)


# In[30]:


females.iloc[:, 3:9].sum().plot.barh(
    rot=0, color='blue', title='Female Viewer Count Per Star Wars Movie'
)


# In[31]:


# Average rankings: mean of the ranking columns (positions 9 to 14).
males.iloc[:, 9:15].mean().plot(
    kind='barh', rot=0, color='blue', title='Male Average Ranking Per Star Wars Movie'
)


# In[32]:


females.iloc[:, 9:15].mean().plot(
    kind='barh', rot=0, color='blue', title='Female Average Ranking Per Star Wars Movie'
)


# In[33]:


males.shape[0]


# In[34]:


females.shape[0]


# In[35]:


star_wars["Gender"].value_counts()


# In the code above, I just wanted to check how many men vs women filled the survey. Curious.

# From analyzing the bar graphs, we can see that, by a close margin in views, Star Wars: Episode V The Empire Strikes Back is the most viewed and highest ranked movie of the Star Wars franchise among males AND females.