In [ ]:
# Pull the `datascience` package's public names into the notebook namespace;
# `Table` and `make_array`, used in later cells, come from this star import.
from datascience import *
import numpy as np
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to all subsequent plots.
plots.style.use('fivethirtyeight')
# Force an edge colour on patches -- presumably so bars are drawn with visible borders.
plots.rcParams["patch.force_edgecolor"] = True

Lecture 8

Categorical Distribution

In [ ]:
# Load the CSV (expected in the working directory) into a Table and display it.
top = Table.read_table('top_movies_2017.csv')
top
In [ ]:
# Keep only the 'Studio' column.
studios = top.select('Studio')
studios
In [ ]:
# Group by studio: one row per distinct studio, with a count of its rows.
studio_distribution = studios.group('Studio')
In [ ]:
studio_distribution
In [ ]:
# Add up the counts (column index 1); this should equal the number of rows in `top`.
sum(studio_distribution.column(1))

Bar Charts

In [ ]:
# Horizontal bar chart of the counts, with one bar per value of 'Studio'.
studio_distribution.barh('Studio')
In [ ]:
# Same chart, with rows sorted by the count column (index 1), largest first;
# column index 0 ('Studio') supplies the bar labels.
studio_distribution.sort(1, descending=True).barh(0)

Numerical Distribution

In [ ]:
# Age of each movie measured from a fixed reference year, stored in `top`
# as an 'Age' column.
reference_year = 2018
ages = reference_year - top.column('Year')
top = top.with_column('Age', ages)
In [ ]:
# Display the table to confirm it now includes the 'Age' column.
top

Binning

In [ ]:
# Smallest and largest age, to see what range the bins must cover.
[min(ages), max(ages)]
In [ ]:
# Bin boundaries of deliberately unequal widths.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 100)
my_bins
In [ ]:
# Count the rows whose 'Age' falls in each bin.
top.bin('Age', bins = my_bins)
In [ ]:
# Total of the per-bin counts (column index 1).
sum(top.bin('Age', bins = my_bins).column(1))
In [ ]:
# Equal-width bins with boundaries 0, 25, 50, 75, 100.
top.bin('Age', bins = np.arange(0, 101, 25))
In [ ]:
# Boundaries 0, 25, 50 only.
# NOTE(review): ages beyond the last boundary are presumably left out of the
# counts, which looks like the point of this cell -- confirm against the output.
top.bin('Age', bins = np.arange(0, 60, 25))
In [ ]:
# Rows whose age is exactly 50, i.e. sitting on the last boundary used above.
top.where('Age', 50)

Histograms

In [ ]:
# Recap: the unequal-width boundaries and the resulting counts per bin.
my_bins
In [ ]:
top.bin('Age', bins = my_bins)
In [ ]:
# Histogram of 'Age' over the same bins; `unit` names the units of the horizontal axis.
top.hist('Age', bins = my_bins, unit = 'Year')
In [ ]:
# What *not* to do:
# NOTE(review): with normed = False the bars presumably show raw counts instead
# of densities, which misrepresents bins of unequal width -- confirm in the output.
top.hist('Age', bins = my_bins, unit = 'Year', normed = False)
In [ ]:
# The default histogram again, for comparison with the cell above.
top.hist('Age', bins = my_bins, unit = 'Year')
In [ ]:
# Equal-width bins: boundaries every 10 years from 0 through 100.
top.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year')
In [ ]:
# An integer `bins` asks for that many bins rather than explicit boundaries.
top.hist('Age', bins = 20, unit = 'Year')
In [ ]:
 
In [ ]:
 
In [ ]:
# No `bins` argument: the choice of bins is left to the library default.
top.hist('Age', unit='Year')
In [ ]:
# Back to the unequal-width bins.
top.hist('Age', bins = my_bins, unit = 'Year')
In [ ]:
# Per-bin counts for the same bins, saved under a name for inspection.
distribution = top.bin('Age', bins = my_bins)
In [ ]:
distribution
In [ ]:
# The [40, 65) bin holds 52 of the 200 movies;
# express that share as a percentage.

movies_in_bin = 52
total_movies = 200
percent = (movies_in_bin / total_movies) * 100
percent
In [ ]:
# Width of the [40, 65) bin: right end minus left end.
left_end, right_end = 40, 65
width = right_end - left_end
width
In [ ]:
# Bar height = percent of movies in the bin divided by the bin's width,
# i.e. percent per unit of the horizontal axis.
height = percent / width
height
In [ ]:
 
In [ ]:
 
In [ ]: