#!/usr/bin/env python
# coding: utf-8

# ## Movie Data Analysis
#
# #### Objective:
# Conduct exploratory data analysis on movie data
#
#
# ### Table of Contents
# * [Import and Explore Data](#importandexplore)
# * [Converting numeric variables to categorical variables](#convert)
# * [Visualization](#visualization)
# * [Import visualization libraries](#visuallibraries)
# * [Jointplots](#jointplots)
# * [Histograms](#histograms)
# * [Stacked Histograms](#stacked)
# * [KDE Plot](#kdeplot)
# * [Subplots()](#subplots)
# * [Violinplots vs. Boxplots](#violin_box)
# * [Facet Grid](#facetgrid)
# * [Coordinates and Diagonals](#coordinates)
# * [Building Dashboards](#dashboards)
# * [Styling Tips](#styling)
# * [Finishing Touches](#styling2)
# * [Section Recap](#recap)
#
# NOTE: this file is a notebook export. Bare expressions such as
# `movies.head()` only display a result when run cell-by-cell in
# Jupyter/IPython; executed as a plain script they are evaluated and discarded.

# ### Import Libraries

# In[2]:

import pandas as pd
import os

# In[3]:

os.getcwd()

# ---
#
# ### Import and Explore Data

# In[4]:

# The CSV path is relative, so it is resolved against the working directory
# shown by os.getcwd() above.
movies = pd.read_csv('Movie-Ratings.csv')

# In[4]:

len(movies)

# In[5]:

movies.head()

# In[6]:

movies.columns

# In[7]:

movies.info()

# In[8]:

movies.describe()

# In[5]:

# Rename column names
# The assignment is positional: it replaces all six column labels in order.
movies.columns = ['Film', 'Genre', 'CriticRating', 'AudienceRating',
                  'BudgetMillions', 'Year']

# In[6]:

movies.head()

# In[11]:

movies.info()

# In[12]:

movies.describe()

# ---
#
# ### Convert numeric variables into categorical variables
# [Table of Contents](#toc)

# In[7]:

# Assign category type to Film, Genre, and Year
movies.Film = movies.Film.astype('category')
movies.Genre = movies.Genre.astype('category')
movies.Year = movies.Year.astype('category')

# In[16]:

movies.info()

# In[17]:

movies.describe()

# In[18]:

movies.Genre.cat.categories

# ---
#
# ## Visualization
# [Table of Contents](#toc) |
# [Matplotlib Website](https://matplotlib.org)
#
# ### Import visualization libraries

# In[8]:

from matplotlib import pyplot as plt
import seaborn as sns

# `get_ipython` only exists inside IPython/Jupyter; guard the magic so the
# file also runs under a plain `python` interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import warnings
# Silences ALL warnings (including library deprecation notices) for a clean
# notebook display.
warnings.filterwarnings('ignore')

# ---
#
# ### Jointplots
# [Table of Contents](#toc) | [Section](#visualization)

# In[9]:

#<<< Chart 1
j = sns.jointplot(data=movies, x='CriticRating', y='AudienceRating', kind='hex')

# In[10]:

#<<< Chart 2
j = sns.jointplot(data=movies, x='CriticRating', y='BudgetMillions', kind='hex')

# In[28]:

#<<< Chart 3
j = sns.jointplot(data=movies, x='AudienceRating', y='BudgetMillions', kind='hex')

# In[12]:

sns.set_style('darkgrid')

# ---
#
# ### Histograms
# [Table of Contents](#toc) | [Section](#visualization)

# In[13]:

# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its documented replacement (requires seaborn >= 0.11).
m1 = sns.histplot(movies.AudienceRating, bins=15, kde=True, stat='density')

# In[30]:

m2 = sns.histplot(movies.CriticRating, bins=15, kde=True, stat='density')

# In[32]:

# Pyplot version
sns.set_style('white')
n1 = plt.hist(movies.AudienceRating, bins=15)

# In[33]:

n2 = plt.hist(movies.CriticRating, bins=15)

# #### NOTE: Chart Background Formatting
# To toggle chart background from grid to white background:
#
# **Display grid**
# ```python
# sns.set_style('darkgrid')
# ```
#
# **Hide grid**
# ```python
# sns.set_style('white')
# ```

# #### Analysis Notes:
# * Audience ratings are approximately normally distributed.
# * Critic ratings are roughly evenly (uniformly) distributed.
# * Highest ratings (both audience and critics) are for movies with budgets of 50 million or less.
# ---
#
# ### Stacked Histograms
# [Table of Contents](#toc) | [Section](#visualization)

# In[49]:

# Displaying histograms from multiple data sources in one chart
plt.hist(movies[movies.Genre == 'Action'].BudgetMillions, bins=15)
plt.hist(movies[movies.Genre == 'Drama'].BudgetMillions, bins=15)
plt.hist(movies[movies.Genre == 'Thriller'].BudgetMillions, bins=15)
plt.show()

# In[53]:

# Stack multiple data sources in one histogram chart (MANUAL METHOD)
plt.hist([movies[movies.Genre == 'Action'].BudgetMillions,
          movies[movies.Genre == 'Drama'].BudgetMillions,
          movies[movies.Genre == 'Thriller'].BudgetMillions],
         bins=15, stacked=True)
plt.show()

# In[ ]:

# In[97]:

# Stack multiple data sources in one histogram chart (Better Way Version 1)
# One budget Series per genre category, with the labels in the same order.
genres = movies.Genre.cat.categories
datasource = [movies[movies.Genre == genre].BudgetMillions for genre in genres]
data_label = list(genres)

plt.hist(datasource, bins=30, stacked=True, label=data_label)
plt.legend()
plt.show()

# In[98]:

# Stack multiple data sources in one histogram chart (Kirill's Method)
mylabels = list(movies.Genre.cat.categories)
list1 = [movies[movies.Genre == gen].BudgetMillions for gen in mylabels]

h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.legend()
plt.show()

# ---
#
# ### KDE Plot
# [*Kernel Density Estimate*](https://en.wikipedia.org/wiki/Kernel_density_estimation)
#
# [Table of Contents](#toc) | [Section](#visualization)
#
# NOTE: the seaborn calls below use the current keyword API
# (`x=`/`y=`, `fill=`, `thresh=`, `height=`) and need seaborn >= 0.11.
# Passing both variables positionally to `kdeplot` fails on seaborn >= 0.12.

# In[101]:

# Scatter Plot Method
# `height` is the current name of the former `size` parameter.
vis1 = sns.lmplot(data=movies, x='CriticRating', y='AudienceRating',
                  fit_reg=False, hue='Genre',
                  height=7, aspect=1)

# In[108]:

# KDE Plot
# fill=True shades the contours; thresh=0.05 leaves the lowest density level
# unshaded (the equivalent of the old shade=True, shade_lowest=False).
k1 = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                 fill=True, thresh=0.05, cmap='Reds')

# In[110]:

# KDE Plot (Smoother Chart)
k1 = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                 fill=True, thresh=0.05, cmap='Reds')
# TIP: draw the unfilled contour lines over the filled plot
k1b = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                  cmap='Reds')

# ---
#
# ### Subplots()
# [Table of Contents](#toc) | [Section](#visualization)

# In[121]:

sns.set_style('dark')
k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating)

# In[120]:

k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating)

# In[124]:

# Subplot Template
f, axes = plt.subplots(1, 2, figsize=(12, 6))

# In[125]:

# Subplot Example 1
f, axes = plt.subplots(1, 2, figsize=(12, 6))
k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating, ax=axes[0])
k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating, ax=axes[1])

# In[126]:

axes

# In[132]:

# Subplot Example 2 (Multi-dimensional plot)
f, axes = plt.subplots(2, 2, figsize=(12, 6))  # More than one dimension
# Need to specify row and column coordinates in the 'ax' parameter
k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating, ax=axes[0, 0])
k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating, ax=axes[1, 1])

# In[136]:

# Subplot Example 3 (Make it prettier)
f, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating, ax=axes[0])
k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating, ax=axes[1])

# Zoom in on the chart to a specific axis scale
# The axes share x (sharex=True), so setting the limit on k1 also applies to
# k2; a separate k2.set(xlim=...) call is not needed.
k1.set(xlim=(-20, 160))

# ---
#
# ### ViolinPlots vs BoxPlots
# [Table of Contents](#toc) | [Section](#visualization)

# In[140]:

#BoxPlots
w = sns.boxplot(data=movies, x='Genre', y='CriticRating')

# In[139]:

#ViolinPlots
z = sns.violinplot(data=movies, x='Genre', y='CriticRating')

# In[142]:

# Side by Side Comparison
f, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
w = sns.boxplot(data=movies, x='Genre', y='CriticRating', ax=axes[0])
z = sns.violinplot(data=movies, x='Genre', y='CriticRating', ax=axes[1])

# In[143]:

# Drill down to a specific genre
# BoxPlot
u = sns.boxplot(data=movies[movies.Genre == 'Drama'], x='Year', y='CriticRating')

# In[144]:

# ViolinPlot
v = sns.violinplot(data=movies[movies.Genre == 'Drama'], x='Year', y='CriticRating')

# In[145]:

# Side by Side Comparison
f, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
u = sns.boxplot(data=movies[movies.Genre == 'Drama'], x='Year', y='CriticRating',
                ax=axes[0])
v = sns.violinplot(data=movies[movies.Genre == 'Drama'], x='Year', y='CriticRating',
                   ax=axes[1])

# ### Analysis
# * ViolinPlots can sometimes provide a more visually intuitive way to identify concentration of observations
# * BoxPlots are preferred by executives because of the quartile displays
# * **NOTE: In a boxplot the shorter the quartile the denser the concentration of data points in that quartile**

# ---
#
# ### Facet Grid
# [Table of Contents](#toc) | [Section](#visualization)

# In[15]:

# Alternative (rows only): sns.FacetGrid(movies, row='Genre', hue='Genre')
g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')

# In[17]:

g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')
g = g.map(plt.scatter, 'CriticRating', 'AudienceRating')

# In[22]:

#Can populate with any type of chart
g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')
g = g.map(plt.hist, 'BudgetMillions')

# In[19]:

# More scatter plot examples
g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')
kws = dict(s=50, linewidth=0.5, edgecolor='black')
g = g.map(plt.scatter, 'CriticRating', 'AudienceRating', **kws)

# ---
#
# ### Coordinates and Diagonals
# [Table of Contents](#toc) | [Section](#visualization)

# In[26]:

# Controlling Axes and Adding Diagonals
g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')
kws = dict(s=50, linewidth=0.5, edgecolor='black')
g = g.map(plt.scatter, 'CriticRating', 'AudienceRating', **kws)
g.set(xlim=(0, 100), ylim=(0, 100))  # Not necessary in newer versions of Jupyter notebooks
# Draw the y = x diagonal on every facet.
for ax in g.axes.flat:
    ax.plot((0, 100), (0, 100), c='gray', ls='--')
g.add_legend()

# ---
#
# ### Building Dashboards
# [Table of Contents](#toc) | [Section](#visualization)

# In[27]:

#Import Libraries
from matplotlib import pyplot as plt
import seaborn as sns

# `get_ipython` only exists inside IPython/Jupyter; guard the magic so the
# file also runs under a plain `python` interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import warnings
warnings.filterwarnings('ignore')

# In[57]:

# Dashboard Example 1
sns.set_style('darkgrid')
f, axes = plt.subplots(2, 2, figsize=(15, 15))

k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating, ax=axes[0, 0])
k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating, ax=axes[0, 1])

k3 = sns.violinplot(data=movies, x='Year', y='BudgetMillions', ax=axes[1, 0])

#NOTE: a different charting approach for non-Seaborn (pyplot) charts
#replace 'plt' prefix/object with chart location 'axes[x,y]'
# Because plt.subplots() returns matplotlib Axes objects, a regular pyplot
# chart is placed in the grid by calling the chart method on the target Axes
# ('axes[row, col].') instead of using the 'plt.' prefix, e.g.:
#   axes[1, 1].hist(movies.CriticRating, bins=15)

k4 = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                 fill=True, thresh=0.05, cmap='Reds',
                 ax=axes[1, 1])
# TIP: draw the unfilled contour lines over the filled plot
k4b = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                  cmap='Reds', ax=axes[1, 1])

k1.set(xlim=(-20, 160))
k2.set(xlim=(-20, 160))

plt.show()

# ---
#
# ### Styling Tips
# [Table of Contents](#toc) | [Section](#visualization)

# In[43]:

#Import Libraries
from matplotlib import pyplot as plt
import seaborn as sns

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import warnings
warnings.filterwarnings('ignore')

# In[61]:

# Styling a dashboard

#Set background to black without a grid
sns.set_style('dark', {'axes.facecolor': 'black'})  # white, whitegrid, dark, darkgrid, ticks
f, axes = plt.subplots(2, 2, figsize=(15, 15))

#Plot [0,0]
# thresh=0 also fills the lowest density level (formerly shade_lowest=True).
k1 = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating,
                 fill=True, thresh=0, cmap='inferno',
                 ax=axes[0, 0])
k1b = sns.kdeplot(x=movies.BudgetMillions, y=movies.AudienceRating,
                  cmap='cool', ax=axes[0, 0])

#Plot [0,1]
k2 = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating,
                 fill=True, thresh=0, cmap='inferno',
                 ax=axes[0, 1])
# Contour-line overlay for plot [0,1]. seaborn's kdeplot needs the variables
# as x=/y= keywords (seaborn >= 0.11); two positional arguments fail on
# seaborn >= 0.12. Bound to its own name so `k2` keeps referring to the
# filled plot on the same axes.
k2b = sns.kdeplot(x=movies.BudgetMillions, y=movies.CriticRating,
                  cmap='cool', ax=axes[0, 1])

#Plot [1,0]
k3 = sns.violinplot(data=movies, x='Year', y='BudgetMillions', ax=axes[1, 0],
                    palette='YlOrRd')

#Plot [1,1]
# fill=True shades the contours; thresh=0.05 leaves the lowest density level
# unshaded (the equivalent of the old shade=True, shade_lowest=False).
k4 = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                 fill=True, thresh=0.05, cmap='Blues_r',
                 ax=axes[1, 1])
k4b = sns.kdeplot(x=movies.CriticRating, y=movies.AudienceRating,
                  cmap='gist_gray_r', ax=axes[1, 1])

k1.set(xlim=(-20, 160))
k2.set(xlim=(-20, 160))

plt.show()

# ---
#
# ### More Styling Tips
# [Table of Contents](#toc) | [Section](#visualization)

# #### Thematic Edits
# * Change background to 'whitegrid'
# * Add a chart into a plt.subplot() frame to change chart size
# * Include a chart title, and specify font size, color, and style
# * Add labels to x and y axes
# * Change the size of the chart ticks
# * Improve the legend appearance: change fontsize, backgroundcolor
# * OPTION: can specify actual color of the data
#

# In[85]:

#Thematic Edits
# One budget Series per genre category, with the labels in the same order.
mylabels = list(movies.Genre.cat.categories)
list1 = [movies[movies.Genre == gen].BudgetMillions for gen in mylabels]

sns.set_style('whitegrid')
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
# plt.* calls below draw on the axes just created by plt.subplots().
h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
# NOTE(review): 'Console' may not be an installed font family (possibly
# 'Consolas' was intended) - confirm; matplotlib falls back to its default
# font when the requested family is not found.
plt.title('Movie Budget Distribution', fontsize=35, color='DarkBlue',
          fontname='Console')
plt.ylabel('Number of Movies', fontsize=25, color='Red')
plt.xlabel('Budget', fontsize=25, color='Green')
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.legend(frameon=True, fancybox=True, shadow=True,
           framealpha=1, prop={'size': 20})
plt.show()

# ---
#
# ### Section Recap
# [Table of Contents](#toc) | [Section](#visualization)
#
# **In this section we learned:**
# 1. [Category data type in Python](#convert)
# 2. [Jointplots](#jointplots)
# 3. [Histograms](#histograms)
# 4. [Stacked Histograms](#stacked)
# 5. [KDE Plot](#kdeplot)
# 6. [Using the Subplots() function](#subplots)
# 7. [Violinplots vs. Boxplots](#violin_box)
# 8. [Creating a Facet Grid](#facetgrid)
# 9. [Coordinates and Diagonals](#coordinates)
# 10. [Building Dashboards](#dashboards)
# 11. [Styling Tips](#styling)
# 12. [Finishing Touches](#styling2)

# In[ ]: