Import Libraries

In [2]:
import pandas as pd
import os
In [3]:
# Show the current working directory; the relative CSV path used below is resolved against it.
os.getcwd()
Out[3]:
'/Users/wmemorgan/Google Drive/Education/Programming/Python_Programming_A-Z/Section_06_Advanced_Visualization'

Import and Explore Data

In [4]:
# Load the movie ratings dataset from a CSV file located in the working directory.
movies = pd.read_csv('Movie-Ratings.csv')
In [4]:
# Number of rows in the loaded DataFrame.
len(movies)
Out[4]:
559
In [5]:
# Preview the first rows to check how the file was parsed.
movies.head()
Out[5]:
Film Genre Rotten Tomatoes Ratings % Audience Ratings % Budget (million $) Year of release
0 (500) Days of Summer Comedy 87 81 8 2009
1 10,000 B.C. Adventure 9 44 105 2008
2 12 Rounds Action 30 52 20 2009
3 127 Hours Adventure 93 84 18 2010
4 17 Again Comedy 55 70 20 2009
In [6]:
# List the original column labels (they are replaced with shorter names further down).
movies.columns
Out[6]:
Index(['Film', 'Genre', 'Rotten Tomatoes Ratings %', 'Audience Ratings %',
       'Budget (million $)', 'Year of release'],
      dtype='object')
In [7]:
# Report each column's dtype and non-null count.
movies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Data columns (total 6 columns):
Film                         559 non-null object
Genre                        559 non-null object
Rotten Tomatoes Ratings %    559 non-null int64
Audience Ratings %           559 non-null int64
Budget (million $)           559 non-null int64
Year of release              559 non-null int64
dtypes: int64(4), object(2)
memory usage: 26.3+ KB
In [8]:
# Summary statistics for the numeric columns.
movies.describe()
Out[8]:
Rotten Tomatoes Ratings % Audience Ratings % Budget (million $) Year of release
count 559.000000 559.000000 559.000000 559.000000
mean 47.309481 58.744186 50.236136 2009.152057
std 26.413091 16.826887 48.731817 1.362632
min 0.000000 0.000000 0.000000 2007.000000
25% 25.000000 47.000000 20.000000 2008.000000
50% 46.000000 58.000000 35.000000 2009.000000
75% 70.000000 72.000000 65.000000 2010.000000
max 97.000000 96.000000 300.000000 2011.000000
In [5]:
# Replace the verbose CSV headers with short, identifier-friendly names.
# The assignment is positional: the list order must match the column order
# of the loaded file.
movies.columns = [
    'Film', 'Genre',
    'CriticRating', 'AudienceRating',
    'BudgetMillions', 'Year',
]
In [6]:
# Confirm that the short column names are now in place.
movies.head()
Out[6]:
Film Genre CriticRating AudienceRating BudgetMillions Year
0 (500) Days of Summer Comedy 87 81 8 2009
1 10,000 B.C. Adventure 9 44 105 2008
2 12 Rounds Action 30 52 20 2009
3 127 Hours Adventure 93 84 18 2010
4 17 Again Comedy 55 70 20 2009
In [11]:
# Re-check dtypes and non-null counts under the new column names.
movies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Data columns (total 6 columns):
Film              559 non-null object
Genre             559 non-null object
CriticRating      559 non-null int64
AudienceRating    559 non-null int64
BudgetMillions    559 non-null int64
Year              559 non-null int64
dtypes: int64(4), object(2)
memory usage: 26.3+ KB
In [12]:
# Summary statistics again, now labelled with the short column names.
movies.describe()
Out[12]:
CriticRating AudienceRating BudgetMillions Year
count 559.000000 559.000000 559.000000 559.000000
mean 47.309481 58.744186 50.236136 2009.152057
std 26.413091 16.826887 48.731817 1.362632
min 0.000000 0.000000 0.000000 2007.000000
25% 25.000000 47.000000 20.000000 2008.000000
50% 46.000000 58.000000 35.000000 2009.000000
75% 70.000000 72.000000 65.000000 2010.000000
max 97.000000 96.000000 300.000000 2011.000000

Convert Film, Genre, and Year into categorical variables

Table of Contents

In [7]:
# Store Film, Genre and Year with the 'category' dtype so they are treated
# as discrete groups rather than free text or numbers.
for column in ('Film', 'Genre', 'Year'):
    movies[column] = movies[column].astype('category')
In [16]:
# Verify that Film, Genre and Year now report the category dtype.
movies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Data columns (total 6 columns):
Film              559 non-null category
Genre             559 non-null category
CriticRating      559 non-null int64
AudienceRating    559 non-null int64
BudgetMillions    559 non-null int64
Year              559 non-null category
dtypes: category(3), int64(3)
memory usage: 40.3 KB
In [17]:
# Year is categorical now, so it no longer appears in the numeric summary.
movies.describe()
Out[17]:
CriticRating AudienceRating BudgetMillions
count 559.000000 559.000000 559.000000
mean 47.309481 58.744186 50.236136
std 26.413091 16.826887 48.731817
min 0.000000 0.000000 0.000000
25% 25.000000 47.000000 20.000000
50% 46.000000 58.000000 35.000000
75% 70.000000 72.000000 65.000000
max 97.000000 96.000000 300.000000
In [18]:
# List the distinct genre labels held by the categorical Genre column.
movies.Genre.cat.categories
Out[18]:
Index(['Action', 'Adventure', 'Comedy', 'Drama', 'Horror', 'Romance',
       'Thriller'],
      dtype='object')

Import visualization libraries

In [8]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Suppress every Python warning from here on to keep cell output uncluttered.
# NOTE(review): this also hides library deprecation notices; consider removing
# the filter temporarily when upgrading pandas/matplotlib/seaborn.
warnings.filterwarnings('ignore')

In [9]:
#<<< Chart 1: critic rating vs. audience rating as a hexagonal-bin joint plot
j = sns.jointplot(
    x='CriticRating',
    y='AudienceRating',
    data=movies,
    kind='hex',
)
In [10]:
#<<< Chart 2: critic rating vs. budget as a hexagonal-bin joint plot
j = sns.jointplot(
    x='CriticRating',
    y='BudgetMillions',
    data=movies,
    kind='hex',
)
In [28]:
#<<< Chart 3: audience rating vs. budget as a hexagonal-bin joint plot
j = sns.jointplot(
    x='AudienceRating',
    y='BudgetMillions',
    data=movies,
    kind='hex',
)
In [12]:
# Switch seaborn to the 'darkgrid' style for the plots that follow.
sns.set_style('darkgrid')

In [13]:
# Distribution of audience ratings, split into 15 bins.
# NOTE(review): newer seaborn releases deprecate distplot in favour of
# histplot/displot — confirm against the installed version before upgrading.
m1 = sns.distplot(movies['AudienceRating'], bins=15)
In [30]:
# Distribution of critic ratings, split into 15 bins.
m2 = sns.distplot(movies['CriticRating'], bins=15)
In [32]:
# Pyplot version: the audience-rating histogram drawn directly with
# matplotlib, after switching seaborn to the plain 'white' style.
sns.set_style('white')
n1 = plt.hist(movies['AudienceRating'], bins=15)
In [33]:
# Pyplot histogram of critic ratings with 15 bins.
n2 = plt.hist(movies['CriticRating'], bins=15)

NOTE: Chart Background Formatting

To toggle chart background from grid to white background:

Display grid

sns.set_style('darkgrid')

Hide grid

sns.set_style('white')

Analysis Notes:

  • Audience ratings approximately follow a normal distribution.
  • Critic ratings are spread fairly evenly across the range (roughly uniform).
  • The highest ratings (from both audiences and critics) go to movies with budgets of $50 million or less.

Stacked Histograms

Table of Contents | Section

In [49]:
# Displaying histograms from multiple data sources in one chart.
# Every plt.hist call draws on the same axes, so opaque bars from a later
# genre would cover the earlier ones. Partial transparency keeps all three
# distributions visible, and the legend shows which colour is which genre.
for genre in ('Action', 'Drama', 'Thriller'):
    plt.hist(movies[movies.Genre == genre].BudgetMillions, bins=15,
             alpha=0.5, label=genre)
plt.legend()
plt.show()
In [53]:
# Stack multiple data sources in one histogram chart (MANUAL METHOD).
# The genres are listed by hand. Their names are also passed as labels and a
# legend is drawn, so each coloured segment of the stack can be identified.
manual_genres = ['Action', 'Drama', 'Thriller']
plt.hist([movies[movies.Genre == genre].BudgetMillions for genre in manual_genres],
         bins=15, stacked=True, label=manual_genres)
plt.legend()
plt.show()
In [97]:
# Stack multiple data sources in one histogram chart (Better Way Version 1).
# Instead of naming genres by hand, take every category of the Genre column
# and build one budget series per genre, with the genre name as its label.
genres = movies.Genre.cat.categories
data_label = list(genres)
datasource = [movies[movies.Genre == name].BudgetMillions for name in data_label]

plt.hist(datasource, bins=30, stacked=True, label=data_label)
plt.legend()
plt.show()
In [98]:
# Stack multiple data sources in one histogram chart (Kirill's Method).
# One budget series per genre category, labelled with the genre name;
# rwidth=1 is passed through to plt.hist as the relative bar width.
mylabels = list(movies.Genre.cat.categories)
list1 = [movies[movies.Genre == gen].BudgetMillions for gen in mylabels]

h = plt.hist(list1, bins=30, stacked=True, rwidth=1, label=mylabels)
plt.legend()
plt.show()

In [101]:
# Scatter Plot Method: critic vs. audience rating, coloured by genre.
# fit_reg=False turns off the regression line so only the points are drawn.
# NOTE(review): later seaborn releases renamed `size` to `height` — confirm
# against the installed version before upgrading.
vis1 = sns.lmplot(
    data=movies,
    x='CriticRating',
    y='AudienceRating',
    fit_reg=False,
    hue='Genre',
    size=7,
    aspect=1,
)
In [108]:
# KDE Plot: two-variable density of critic vs. audience ratings, filled with
# the 'Reds' colour map; shade_lowest=False leaves the lowest level unfilled.
# NOTE(review): newer seaborn releases expect x=/y= keywords and rename the
# shading options — confirm against the installed version before upgrading.
k1 = sns.kdeplot(
    movies['CriticRating'],
    movies['AudienceRating'],
    shade=True,
    shade_lowest=False,
    cmap='Reds',
)
In [110]:
# KDE Plot (Smoother Chart): start from the same filled density plot.
k1 = sns.kdeplot(
    movies['CriticRating'],
    movies['AudienceRating'],
    shade=True,
    shade_lowest=False,
    cmap='Reds',
)
# TIP: draw the unshaded contour lines of the same data on top of the filled
# plot, using the same colour map, to give the chart a smoother look.
k1b = sns.kdeplot(
    movies['CriticRating'],
    movies['AudienceRating'],
    cmap='Reds',
)

In [121]:
# Switch to the 'dark' style, then plot the joint density of budget
# against audience rating.
sns.set_style('dark')
k1 = sns.kdeplot(movies['BudgetMillions'], movies['AudienceRating'])
In [120]:
# Joint density of budget against critic rating.
k2 = sns.kdeplot(movies['BudgetMillions'], movies['CriticRating'])
In [124]:
# Subplot Template: one figure holding a single row of two axes.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
In [125]:
# Subplot Example 1: place the two budget density plots side by side by
# passing each axes object to seaborn through the `ax` parameter.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
k1 = sns.kdeplot(movies['BudgetMillions'], movies['AudienceRating'], ax=axes[0])
k2 = sns.kdeplot(movies['BudgetMillions'], movies['CriticRating'], ax=axes[1])
In [126]:
# Inspect the array of axes objects returned by plt.subplots.
axes
Out[126]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x11622fd68>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x114d8d208>], dtype=object)
In [132]:
# Subplot Example 2 (Multi-dimensional plot)
# With more than one row, `axes` is two-dimensional, so each target panel is
# addressed by [row, column]. Only the top-left and bottom-right panels are
# drawn on here.
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 6))
k1 = sns.kdeplot(movies['BudgetMillions'], movies['AudienceRating'], ax=axes[0, 0])
k2 = sns.kdeplot(movies['BudgetMillions'], movies['CriticRating'], ax=axes[1, 1])
In [136]:
# Subplot Example 3 (Make it prettier)
# Both panels share their x and y axes, so they are drawn on a common scale.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharex=True, sharey=True)
k1 = sns.kdeplot(movies['BudgetMillions'], movies['AudienceRating'], ax=axes[0])
k2 = sns.kdeplot(movies['BudgetMillions'], movies['CriticRating'], ax=axes[1])
# Zoom in on a specific x-axis range. Because the x axis is shared, setting
# the limit on the first panel is enough; no separate call is needed for k2.
k1.set(xlim=(-20, 160))
Out[136]:
[(-20, 160)]

ViolinPlots vs BoxPlots

Table of Contents | Section

In [140]:
# BoxPlots: critic ratings summarised with one box per genre.
w = sns.boxplot(
    x='Genre',
    y='CriticRating',
    data=movies,
)
In [139]:
# ViolinPlots: the same critic-rating data drawn as one violin per genre.
z = sns.violinplot(
    x='Genre',
    y='CriticRating',
    data=movies,
)
In [142]:
# Side by Side Comparison: box plot on the left and violin plot on the right,
# drawn on shared axes so the two panels use the same scale.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharex=True, sharey=True)
w = sns.boxplot(x='Genre', y='CriticRating', data=movies, ax=axes[0])
z = sns.violinplot(x='Genre', y='CriticRating', data=movies, ax=axes[1])
In [143]:
# Drill down to a specific genre.
# BoxPlot: critic ratings of Drama films only, one box per release year.
u = sns.boxplot(
    x='Year',
    y='CriticRating',
    data=movies[movies.Genre == 'Drama'],
)
In [144]:
# ViolinPlot: critic ratings of Drama films only, one violin per release year.
v = sns.violinplot(
    x='Year',
    y='CriticRating',
    data=movies[movies.Genre == 'Drama'],
)
In [145]:
# Side by Side Comparison for Drama films: box plot on the left and violin
# plot on the right, on shared axes so both panels use the same scale.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharex=True, sharey=True)
u = sns.boxplot(
    x='Year',
    y='CriticRating',
    data=movies[movies.Genre == 'Drama'],
    ax=axes[0],
)
v = sns.violinplot(
    x='Year',
    y='CriticRating',
    data=movies[movies.Genre == 'Drama'],
    ax=axes[1],
)

Analysis

  • ViolinPlots can sometimes provide a more visually intuitive way to identify concentration of observations
  • BoxPlots are preferred by executives because of the quartile displays
  • NOTE: In a boxplot, the shorter a quartile segment is, the denser the concentration of data points in that quartile.

In [15]:
# Build a grid of panels with one row per Genre and one column per Year,
# coloured by Genre. (A rows-only variant would simply omit col='Year'.)
# NOTE(review): this only creates the grid; no plotting function is mapped
# onto it in the code visible here — confirm that a later cell does so.
g = sns.FacetGrid(movies, row='Genre', col='Year', hue='Genre')