Data Analysis Examples¶

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [68]:

rc = {"figure.figsize" : (14, 6)}
sns.set(rc = rc)
my_palette = sns.color_palette("husl", 4)
sns.set_palette(my_palette)
sns.set_style("whitegrid")

USA.gov data¶

Load the dataset¶

Use the json module to load the data

In [4]:

path = "datasets/bitly_usagov/example.txt"

In [10]:

import json

In [17]:

records = [json.loads(line) for line in open(path)]
records[0]

Out[17]:

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

Count the number of timezones with pandas¶

In [19]:

df = pd.DataFrame(records)
df.head()

Out[19]:

	_heartbeat_	a	al	c	cy	g	gr	h	hc	hh	kw	l	ll	nk	r	t	tz	u
0	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Danvers	A6qOVH	MA	wfLQtf	1.331823e+09	1.usa.gov	NaN	orofrog	[42.576698, -70.954903]	1.0	http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...	1.331923e+09	America/New_York	http://www.ncbi.nlm.nih.gov/pubmed/22415991
1	NaN	GoogleMaps/RochesterNY	NaN	US	Provo	mwszkS	UT	mwszkS	1.308262e+09	j.mp	NaN	bitly	[40.218102, -111.613297]	0.0	http://www.AwareMap.com/	1.331923e+09	America/Denver	http://www.monroecounty.gov/etc/911/rss.php
2	NaN	Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...	en-US	US	Washington	xxr3Qb	DC	xxr3Qb	1.331920e+09	1.usa.gov	NaN	bitly	[38.9007, -77.043098]	1.0	http://t.co/03elZC4Q	1.331923e+09	America/New_York	http://boxer.senate.gov/en/press/releases/0316...
3	NaN	Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...	pt-br	BR	Braz	zCaLwp	27	zUtuOu	1.331923e+09	1.usa.gov	NaN	alelex88	[-23.549999, -46.616699]	0.0	direct	1.331923e+09	America/Sao_Paulo	http://apod.nasa.gov/apod/ap120312.html
4	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Shrewsbury	9b6kNl	MA	9b6kNl	1.273672e+09	bit.ly	NaN	bitly	[42.286499, -71.714699]	0.0	http://www.shrewsbury-ma.gov/selco/	1.331923e+09	America/New_York	http://www.shrewsbury-ma.gov/egov/gallery/1341...

In [20]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
_heartbeat_    120 non-null float64
a              3440 non-null object
al             3094 non-null object
c              2919 non-null object
cy             2919 non-null object
g              3440 non-null object
gr             2919 non-null object
h              3440 non-null object
hc             3440 non-null float64
hh             3440 non-null object
kw             93 non-null object
l              3440 non-null object
ll             2919 non-null object
nk             3440 non-null float64
r              3440 non-null object
t              3440 non-null float64
tz             3440 non-null object
u              3440 non-null object
dtypes: float64(4), object(14)
memory usage: 500.7+ KB

In [30]:

df["tz"][:10].value_counts()

Out[30]:

America/New_York     4
                     3
Europe/Warsaw        1
America/Denver       1
America/Sao_Paulo    1
Name: tz, dtype: int64

In [36]:

clean_tz = df["tz"].fillna("Missing")

In [38]:

clean_tz[clean_tz == ""] = "Unknown"

In [41]:

clean_tz.head(n = 20)

Out[41]:

0        America/New_York
1          America/Denver
2        America/New_York
3       America/Sao_Paulo
4        America/New_York
5        America/New_York
6           Europe/Warsaw
7                 Unknown
8                 Unknown
9                 Unknown
10    America/Los_Angeles
11       America/New_York
12       America/New_York
13                Missing
14       America/New_York
15         Asia/Hong_Kong
16         Asia/Hong_Kong
17       America/New_York
18         America/Denver
19            Europe/Rome
Name: tz, dtype: object

In [43]:

clean_tz[:20].value_counts()

Out[43]:

America/New_York       8
Unknown                3
America/Denver         2
Asia/Hong_Kong         2
Europe/Warsaw          1
America/Los_Angeles    1
Europe/Rome            1
America/Sao_Paulo      1
Missing                1
Name: tz, dtype: int64

In [69]:

sns.barplot(y = clean_tz[:10].values, x = clean_tz[:10].index)

Out[69]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f2a1dc9e6a0>

MovieLens data¶

In [ ]:

# Make display smaller
pd.options.display.max_rows = 15
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',header=None, names=mnames)

/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  after removing the cwd from sys.path.
/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.

In [73]:

users.head()

Out[73]:

	user_id	gender	age	occupation	zip
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455

In [74]:

ratings.head()

Out[74]:

	user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

In [75]:

movies.head()

Out[75]:

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

In [80]:

first_merge = pd.merge(ratings, users, on = "user_id")
first_merge.head()

Out[80]:

	user_id	movie_id	rating	timestamp	gender	age	occupation	zip
0	1	1193	5	978300760	F	1	10	48067
1	1	661	3	978302109	F	1	10	48067
2	1	914	3	978301968	F	1	10	48067
3	1	3408	4	978300275	F	1	10	48067
4	1	2355	5	978824291	F	1	10	48067

In [81]:

second_merge = pd.merge(first_merge, movies, on = "movie_id")
second_merge.head()

Out[81]:

	user_id	movie_id	rating	timestamp	gender	age	occupation	zip	title	genres
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama

In [82]:

second_merge.shape

Out[82]:

(1000209, 10)

In [119]:

df = second_merge

In [101]:

df.groupby(["gender", "title"])[["rating"]].agg("mean")

Out[101]:

		rating
gender	title
F	$1,000,000 Duck (1971)	3.375000
	'Night Mother (1986)	3.388889
	'Til There Was You (1997)	2.675676
	'burbs, The (1989)	2.793478
	...And Justice for All (1979)	3.828571
...	...	...
M	Zed & Two Noughts, A (1985)	3.380952
	Zero Effect (1998)	3.723140
	Zero Kelvin (Kj�rlighetens kj�tere) (1995)	3.500000
	Zeus and Roxanne (1997)	2.357143
	eXistenZ (1999)	3.289086

7152 rows × 1 columns

In [104]:

df.pivot_table("rating", index = "title", columns = "gender")

Out[104]:

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024
...	...	...
Zed & Two Noughts, A (1985)	3.500000	3.380952
Zero Effect (1998)	3.864407	3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)	NaN	3.500000
Zeus and Roxanne (1997)	2.777778	2.357143
eXistenZ (1999)	3.098592	3.289086

3706 rows × 2 columns

In [121]:

avg = df.pivot_table("rating", index = "title", columns = "gender", aggfunc = "mean")
avg

Out[121]:

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024
...	...	...
Zed & Two Noughts, A (1985)	3.500000	3.380952
Zero Effect (1998)	3.864407	3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)	NaN	3.500000
Zeus and Roxanne (1997)	2.777778	2.357143
eXistenZ (1999)	3.098592	3.289086

3706 rows × 2 columns

In [111]:

ratings_by_title = df.groupby("title")[["title"]].count()
ratings_by_title

Out[111]:

	title
title
$1,000,000 Duck (1971)	37
'Night Mother (1986)	70
'Til There Was You (1997)	52
'burbs, The (1989)	303
...And Justice for All (1979)	199
...	...
Zed & Two Noughts, A (1985)	29
Zero Effect (1998)	301
Zero Kelvin (Kj�rlighetens kj�tere) (1995)	2
Zeus and Roxanne (1997)	23
eXistenZ (1999)	410

3706 rows × 1 columns

In [ ]:

ratings_2500 = ratings_by_title.index[ratings_by_title.title >= 2500]
ratings_2500

In [ ]:

avg.loc[ratings_2500]

In [ ]: