import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
rc = {"figure.figsize" : (14, 6)}
sns.set(rc = rc)
my_palette = sns.color_palette("husl", 4)
sns.set_palette(my_palette)
sns.set_style("whitegrid")
Use the json
module to load the data
path = "datasets/bitly_usagov/example.txt"
import json
records = [json.loads(line) for line in open(path)]
records[0]
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', 'al': 'en-US,en;q=0.8', 'c': 'US', 'cy': 'Danvers', 'g': 'A6qOVH', 'gr': 'MA', 'h': 'wfLQtf', 'hc': 1331822918, 'hh': '1.usa.gov', 'l': 'orofrog', 'll': [42.576698, -70.954903], 'nk': 1, 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', 't': 1331923247, 'tz': 'America/New_York', 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
df = pd.DataFrame(records)
df.head()
_heartbeat_ | a | al | c | cy | g | gr | h | hc | hh | kw | l | ll | nk | r | t | tz | u | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Danvers | A6qOVH | MA | wfLQtf | 1.331823e+09 | 1.usa.gov | NaN | orofrog | [42.576698, -70.954903] | 1.0 | http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | 1.331923e+09 | America/New_York | http://www.ncbi.nlm.nih.gov/pubmed/22415991 |
1 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1.308262e+09 | j.mp | NaN | bitly | [40.218102, -111.613297] | 0.0 | http://www.AwareMap.com/ | 1.331923e+09 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
2 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-US | US | Washington | xxr3Qb | DC | xxr3Qb | 1.331920e+09 | 1.usa.gov | NaN | bitly | [38.9007, -77.043098] | 1.0 | http://t.co/03elZC4Q | 1.331923e+09 | America/New_York | http://boxer.senate.gov/en/press/releases/0316... |
3 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | pt-br | BR | Braz | zCaLwp | 27 | zUtuOu | 1.331923e+09 | 1.usa.gov | NaN | alelex88 | [-23.549999, -46.616699] | 0.0 | direct | 1.331923e+09 | America/Sao_Paulo | http://apod.nasa.gov/apod/ap120312.html |
4 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Shrewsbury | 9b6kNl | MA | 9b6kNl | 1.273672e+09 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0.0 | http://www.shrewsbury-ma.gov/selco/ | 1.331923e+09 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3560 entries, 0 to 3559 Data columns (total 18 columns): _heartbeat_ 120 non-null float64 a 3440 non-null object al 3094 non-null object c 2919 non-null object cy 2919 non-null object g 3440 non-null object gr 2919 non-null object h 3440 non-null object hc 3440 non-null float64 hh 3440 non-null object kw 93 non-null object l 3440 non-null object ll 2919 non-null object nk 3440 non-null float64 r 3440 non-null object t 3440 non-null float64 tz 3440 non-null object u 3440 non-null object dtypes: float64(4), object(14) memory usage: 500.7+ KB
df["tz"][:10].value_counts()
America/New_York 4 3 Europe/Warsaw 1 America/Denver 1 America/Sao_Paulo 1 Name: tz, dtype: int64
clean_tz = df["tz"].fillna("Missing")
clean_tz[clean_tz == ""] = "Unknown"
clean_tz.head(n = 20)
0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 Unknown 8 Unknown 9 Unknown 10 America/Los_Angeles 11 America/New_York 12 America/New_York 13 Missing 14 America/New_York 15 Asia/Hong_Kong 16 Asia/Hong_Kong 17 America/New_York 18 America/Denver 19 Europe/Rome Name: tz, dtype: object
clean_tz[:20].value_counts()
America/New_York 8 Unknown 3 America/Denver 2 Asia/Hong_Kong 2 Europe/Warsaw 1 America/Los_Angeles 1 Europe/Rome 1 America/Sao_Paulo 1 Missing 1 Name: tz, dtype: int64
sns.barplot(y = clean_tz[:10].values, x = clean_tz[:10].index)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2a1dc9e6a0>
# Make display smaller
pd.options.display.max_rows = 15
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',header=None, names=mnames)
/home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'. after removing the cwd from sys.path. /home/ubuntu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
users.head()
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
ratings.head()
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
movies.head()
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
first_merge = pd.merge(ratings, users, on = "user_id")
first_merge.head()
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 |
1 | 1 | 661 | 3 | 978302109 | F | 1 | 10 | 48067 |
2 | 1 | 914 | 3 | 978301968 | F | 1 | 10 | 48067 |
3 | 1 | 3408 | 4 | 978300275 | F | 1 | 10 | 48067 |
4 | 1 | 2355 | 5 | 978824291 | F | 1 | 10 | 48067 |
second_merge = pd.merge(first_merge, movies, on = "movie_id")
second_merge.head()
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
second_merge.shape
(1000209, 10)
df = second_merge
df.groupby(["gender", "title"])[["rating"]].agg("mean")
rating | ||
---|---|---|
gender | title | |
F | $1,000,000 Duck (1971) | 3.375000 |
'Night Mother (1986) | 3.388889 | |
'Til There Was You (1997) | 2.675676 | |
'burbs, The (1989) | 2.793478 | |
...And Justice for All (1979) | 3.828571 | |
... | ... | ... |
M | Zed & Two Noughts, A (1985) | 3.380952 |
Zero Effect (1998) | 3.723140 | |
Zero Kelvin (Kj�rlighetens kj�tere) (1995) | 3.500000 | |
Zeus and Roxanne (1997) | 2.357143 | |
eXistenZ (1999) | 3.289086 |
7152 rows × 1 columns
df.pivot_table("rating", index = "title", columns = "gender")
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
... | ... | ... |
Zed & Two Noughts, A (1985) | 3.500000 | 3.380952 |
Zero Effect (1998) | 3.864407 | 3.723140 |
Zero Kelvin (Kj�rlighetens kj�tere) (1995) | NaN | 3.500000 |
Zeus and Roxanne (1997) | 2.777778 | 2.357143 |
eXistenZ (1999) | 3.098592 | 3.289086 |
3706 rows × 2 columns
avg = df.pivot_table("rating", index = "title", columns = "gender", aggfunc = "mean")
avg
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
... | ... | ... |
Zed & Two Noughts, A (1985) | 3.500000 | 3.380952 |
Zero Effect (1998) | 3.864407 | 3.723140 |
Zero Kelvin (Kj�rlighetens kj�tere) (1995) | NaN | 3.500000 |
Zeus and Roxanne (1997) | 2.777778 | 2.357143 |
eXistenZ (1999) | 3.098592 | 3.289086 |
3706 rows × 2 columns
ratings_by_title = df.groupby("title")[["title"]].count()
ratings_by_title
title | |
---|---|
title | |
$1,000,000 Duck (1971) | 37 |
'Night Mother (1986) | 70 |
'Til There Was You (1997) | 52 |
'burbs, The (1989) | 303 |
...And Justice for All (1979) | 199 |
... | ... |
Zed & Two Noughts, A (1985) | 29 |
Zero Effect (1998) | 301 |
Zero Kelvin (Kj�rlighetens kj�tere) (1995) | 2 |
Zeus and Roxanne (1997) | 23 |
eXistenZ (1999) | 410 |
3706 rows × 1 columns
ratings_2500 = ratings_by_title.index[ratings_by_title.title >= 2500]
ratings_2500
avg.loc[ratings_2500]