Companion to Lecture 4 of Harvard CS109: Data Science | Prepared by Chris Beaumont
This scene from Cast Away is an accurate metaphor for the amount of time you'll spend cleaning data, and the delirium you'll experience at the end.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#tell pandas to display wide tables as pretty HTML tables
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
I'd like to suggest a basic rubric for the early stages of exploratory data analysis in Python. This isn't universally applicable, but it does cover many patterns which recur in several data analysis contexts. It's useful to keep this rubric in mind when encountering a new dataset.
The basic workflow is as follows:

1. Build a DataFrame from the data (ideally, put all the data in this object)
2. Clean the DataFrame. It should have the following properties:
   - Each row describes a single object
   - Each column describes a property of that object
   - Columns are numeric whenever appropriate
   - Columns contain atomic properties that cannot be further decomposed
3. Explore global properties. Use histograms, scatter plots, and aggregation functions to summarize the data
4. Explore group properties. Use groupby and small multiples to compare subsets of the data

This process transforms your data into a format which is easier to work with, gives you a basic overview of the data's properties, and likely generates several questions for you to follow up on in subsequent analysis.
Here's a preview of the raw data we'll use -- it's a list of the 10,000 movies made since 1950 with the most IMDB user ratings. It was scraped about a year ago from individual IMDB movie pages. Download the data at http://bit.ly/cs109_imdb.
!head imdb_top_10000.txt
The text file is tab-separated and doesn't have any column headers. We set the appropriate keywords in pd.read_csv to handle this:
names = ['imdbID', 'title', 'year', 'score', 'votes', 'runtime', 'genres']
data = pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names).dropna()
print "Number of rows: %i" % data.shape[0]
data.head() # print the first 5 rows
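It's also worth checking which column types pandas inferred before going further; the issues described below show up immediately. A quick check, nothing specific to this dataset:

#check which dtype pandas inferred for each column
print data.dtypes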
There are several problems with the DataFrame at this point:

1. The runtime column describes a number, but is stored as a string
2. The genres column is not atomic -- it aggregates several genres together, which makes it hard, for example, to extract which movies are comedies
3. The movie year is repeated in the title and year columns

Fixing the runtime column
The following snippet converts a string like '142 mins.' to the number 142:
dirty = '142 mins.'
number, text = dirty.split(' ')
clean = int(number)
print clean
We can package this up into a list comprehension:
clean_runtime = [float(r.split(' ')[0]) for r in data.runtime]
data['runtime'] = clean_runtime
data.head()
We can use the concept of indicator variables to split the genres column into many columns. Each new column will correspond to a single genre, and each cell will be True or False.
#determine the unique genres
genres = set()
for m in data.genres:
    genres.update(g for g in m.split('|'))
genres = sorted(genres)
#make a column for each genre
for genre in genres:
    data[genre] = [genre in movie.split('|') for movie in data.genres]
data.head()
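As an aside, pandas can build the same indicator matrix in a single call, assuming your version provides Series.str.get_dummies:

#alternative to the loop above (don't run both):
#str.get_dummies splits each string on '|' and returns one 0/1 column per genre
dummies = data.genres.str.get_dummies(sep='|').astype(bool)
data = data.join(dummies)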
Each title ends with the release year in parentheses (e.g. 'Titanic (1997)'), so we can fix each element by stripping off the last 7 characters:
data['title'] = [t[0:-7] for t in data.title]
data.head()
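If you'd rather not assume the year suffix is always exactly 7 characters wide, a regex is more defensive. A sketch using the pandas string methods:

#strip a trailing " (yyyy)" only when it is actually present
data['title'] = data.title.str.replace(r'\s*\(\d{4}\)$', '')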
Call describe on the relevant columns for a quick numerical summary:
data[['score', 'runtime', 'year', 'votes']].describe()
#hmmm, a runtime of 0 looks suspicious. How many movies have that?
print len(data[data.runtime == 0])
#probably best to flag those bad data as NAN
data.loc[data.runtime == 0, 'runtime'] = np.nan
After flagging the bad runtimes, we repeat the summary:
data.runtime.describe()
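A quick sanity check confirms the flagging worked; describe skips NaN values, so the count drops accordingly:

#how many runtimes are now marked as missing?
print "%i movies have missing runtimes" % data.runtime.isnull().sum()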
# more movies in recent years, but not *very* recent movies (they haven't had time to receive lots of votes yet?)
plt.hist(data.year, bins=np.arange(1950, 2013), color='#cccccc')
plt.xlabel("Release Year")
remove_border()
plt.hist(data.score, bins=20, color='#cccccc')
plt.xlabel("IMDB rating")
remove_border()
plt.hist(data.runtime.dropna(), bins=50, color='#cccccc')
plt.xlabel("Runtime distribution")
remove_border()
#hmm, more bad, recent movies. Real, or a selection bias?
plt.scatter(data.year, data.score, lw=0, alpha=.08, color='k')
plt.xlabel("Year")
plt.ylabel("IMDB Rating")
remove_border()
plt.scatter(data.votes, data.score, lw=0, alpha=.2, color='k')
plt.xlabel("Number of Votes")
plt.ylabel("IMDB Rating")
plt.xscale('log')
remove_border()
# low-score movies with lots of votes
data[(data.votes > 9e4) & (data.score < 5)][['title', 'year', 'score', 'votes', 'genres']]
# The lowest rated movies
data[data.score == data.score.min()][['title', 'year', 'score', 'votes', 'genres']]
# The highest rated movies
data[data.score == data.score.max()][['title', 'year', 'score', 'votes', 'genres']]
Sum over several rows or columns

What genres are the most frequent?
#sum sums over rows by default
genre_count = np.sort(data[genres].sum())[::-1]
pd.DataFrame({'Genre Count': genre_count})
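Note that np.sort discards the genre labels. If you want the counts paired with their names, sort the Series itself. A sketch; pandas of this vintage spells the method Series.order, later renamed sort_values:

#sorted genre counts, with the labels kept
genre_sums = data[genres].sum()
print genre_sums.order(ascending=False)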
How many genres does a movie have, on average?
#axis=1 sums over columns instead
genre_count = data[genres].sum(axis=1)
print "Average movie has %0.2f genres" % genre_count.mean()
genre_count.describe()
Let's split up movies by decade
decade = (data.year // 10) * 10
tyd = data[['title', 'year']].copy()  #copy, so we modify a new frame rather than a slice of data
tyd['decade'] = decade
tyd.head()
GroupBy will gather movies into groups with equal decade values
#mean score for all movies in each decade
decade_mean = data.groupby(decade).score.mean()
decade_mean.name = 'Decade Mean'
print decade_mean
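Before trusting those means, it's worth checking how many movies each group contains -- sparse decades have noisier averages. A quick check using the same GroupBy:

#number of movies per decade
print data.groupby(decade).size()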
plt.plot(decade_mean.index, decade_mean.values, 'o-',
color='r', lw=3, label='Decade Average')
plt.scatter(data.year, data.score, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()
We can go one step further, and compute the scatter within each decade as well:
grouped_scores = data.groupby(decade).score
mean = grouped_scores.mean()
std = grouped_scores.std()
plt.plot(decade_mean.index, decade_mean.values, 'o-',
color='r', lw=3, label='Decade Average')
plt.fill_between(decade_mean.index, (decade_mean + std).values,
(decade_mean - std).values, color='r', alpha=.2)
plt.scatter(data.year, data.score, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()
You can also iterate over a GroupBy object. Each iteration yields two variables: one of the distinct values of the group key, and the subset of the dataframe where the key equals that value. To find the most popular movie each year:
for year, subset in data.groupby('year'):
    print year, subset[subset.score == subset.score.max()].title.values
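The same result can usually be had without an explicit loop. This sketch relies on idxmax returning the first row index at the maximum, so ties are silently dropped:

#loop-free alternative: index of the top-scoring movie per year
best = data.groupby('year').score.idxmax()
print data.loc[best, ['year', 'title', 'score']].head()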
Let's split up the movies by genre, and look at how their release year/runtime/IMDB score vary. The distribution for all movies is shown as a grey background.
This isn't a standard groupby, because a movie can belong to several genres at once, so the groups overlap. We can't use the groupby method here; a manual loop over genres is needed:
#create a 4x6 grid of plots.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8),
tight_layout=True)
bins = np.arange(1950, 2013, 3)
for ax, genre in zip(axes.ravel(), genres):
    ax.hist(data[data[genre] == 1].year,
            bins=bins, histtype='stepfilled', normed=True, color='r', alpha=.3, ec='none')
    ax.hist(data.year, bins=bins, histtype='stepfilled', ec='none', normed=True, zorder=0, color='#cccccc')
    ax.annotate(genre, xy=(1955, 3e-2), fontsize=14)
    ax.xaxis.set_ticks(np.arange(1950, 2013, 30))
    ax.set_yticks([])
    remove_border(ax, left=False)
    ax.set_xlabel('Year')
Some subtler patterns emerge in these plots. We can make the same comparison for runtime:
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(30, 240, 10)
for ax, genre in zip(axes.ravel(), genres):
    #dropna, since the flagged runtimes are now NaN
    ax.hist(data[data[genre] == 1].runtime.dropna(),
            bins=bins, histtype='stepfilled', color='r', ec='none', alpha=.3, normed=True)
    ax.hist(data.runtime.dropna(), bins=bins, normed=True,
            histtype='stepfilled', ec='none', color='#cccccc',
            zorder=0)
    ax.set_xticks(np.arange(30, 240, 60))
    ax.set_yticks([])
    ax.set_xlabel("Runtime [min]")
    remove_border(ax, left=False)
    ax.annotate(genre, xy=(230, .02), ha='right', fontsize=12)
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(0, 10, .5)
for ax, genre in zip(axes.ravel(), genres):
    ax.hist(data[data[genre] == 1].score,
            bins=bins, histtype='stepfilled', color='r', ec='none', alpha=.3, normed=True)
    ax.hist(data.score, bins=bins, normed=True,
            histtype='stepfilled', ec='none', color='#cccccc',
            zorder=0)
    ax.set_yticks([])
    ax.set_xlabel("Score")
    remove_border(ax, left=False)
    ax.set_ylim(0, .4)
    ax.annotate(genre, xy=(0, .2), ha='left', fontsize=12)