#!/usr/bin/env python
# coding: utf-8

# # Data Analysis Examples

# ## 1.USA.gov Data from Bitly

# In[ ]:


from numpy.random import randn
import numpy as np
np.random.seed(123)
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_rows = 20


# In [5]: path = 'datasets/bitly_usagov/example.txt'
# 
# In [6]: open(path).readline()
# Out[6]: '{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11
# (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1,
# "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l":
# "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r":
# "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u":
# "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc":
# 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

# In[ ]:


import json
path = 'datasets/bitly_usagov/example.txt'
records = [json.loads(line) for line in open(path)]


# In [18]: records[0]
# Out[18]:
# {'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko)
# Chrome/17.0.963.78 Safari/535.11',
#  'al': 'en-US,en;q=0.8',
#  'c': 'US',
#  'cy': 'Danvers',
#  'g': 'A6qOVH',
#  'gr': 'MA',
#  'h': 'wfLQtf',
#  'hc': 1331822918,
#  'hh': '1.usa.gov',
#  'l': 'orofrog',
#  'll': [42.576698, -70.954903],
#  'nk': 1,
#  'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
#  't': 1331923247,
#  'tz': 'America/New_York',
#  'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

# ### Counting Time Zones in Pure Python

# In[ ]:


time_zones = [rec['tz'] for rec in records]


# In[ ]:


time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]


# In[ ]:


def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts


# In[ ]:


from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int) # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts


# In[ ]:


counts = get_counts(time_zones)
counts['America/New_York']
len(time_zones)


# In[ ]:


def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]


# In[ ]:


top_counts(counts)


# In[ ]:


from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)


# ### Counting Time Zones with pandas

# In[ ]:


import pandas as pd
frame = pd.DataFrame(records)
frame.info()
frame['tz'][:10]


# In[ ]:


tz_counts = frame['tz'].value_counts()
tz_counts[:10]


# In[ ]:


clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]


# In[ ]:


plt.figure(figsize=(10, 4))


# In[ ]:


import seaborn as sns
subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)


# In[ ]:


frame['a'][1]
frame['a'][50]
frame['a'][51][:50]  # long line


# In[ ]:


results = pd.Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
results.value_counts()[:8]


# In[ ]:


cframe = frame[frame.a.notnull()]


# In[ ]:


cframe = cframe.copy()


# In[ ]:


cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
                        'Windows', 'Not Windows')
cframe['os'][:5]


# In[ ]:


by_tz_os = cframe.groupby(['tz', 'os'])


# In[ ]:


agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]


# In[ ]:


# Use to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]


# In[ ]:


count_subset = agg_counts.take(indexer[-10:])
count_subset


# In[ ]:


agg_counts.sum(1).nlargest(10)


# In[ ]:


plt.figure()


# In[ ]:


# Rearrange the data for plotting
count_subset = count_subset.stack()
count_subset.name = 'total'
count_subset = count_subset.reset_index()
count_subset[:10]
sns.barplot(x='total', y='tz', hue='os',  data=count_subset)


# In[ ]:


def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group

results = count_subset.groupby('tz').apply(norm_total)


# In[ ]:


plt.figure()


# In[ ]:


sns.barplot(x='normed_total', y='tz', hue='os',  data=results)


# In[ ]:


g = count_subset.groupby('tz')
results2 = count_subset.total / g.total.transform('sum')


# ## MovieLens 1M Dataset

# In[ ]:


import pandas as pd

# Make display smaller
pd.options.display.max_rows = 10

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames)


# In[ ]:


users[:5]
ratings[:5]
movies[:5]
ratings


# In[ ]:


data = pd.merge(pd.merge(ratings, users), movies)
data
data.iloc[0]


# In[ ]:


mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
mean_ratings[:5]


# In[ ]:


ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles


# In[ ]:


# Select rows on the index
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings


# In[ ]:


mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
                           'Seven Samurai (Shichinin no samurai) (1954)'})


# In[ ]:


top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings[:10]


# ### Measuring Rating Disagreement

# In[ ]:


mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']


# In[ ]:


sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:10]


# In[ ]:


# Reverse order of rows, take first 10 rows
sorted_by_diff[::-1][:10]


# In[ ]:


# Standard deviation of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
# Filter down to active_titles
rating_std_by_title = rating_std_by_title.loc[active_titles]
# Order Series by value in descending order
rating_std_by_title.sort_values(ascending=False)[:10]


# ## US Baby Names 1880–2010

# In [4]: names.head(10)
# Out[4]:
#         name sex  births  year
# 0       Mary   F    7065  1880
# 1       Anna   F    2604  1880
# 2       Emma   F    2003  1880
# 3  Elizabeth   F    1939  1880
# 4     Minnie   F    1746  1880
# 5   Margaret   F    1578  1880
# 6        Ida   F    1472  1880
# 7      Alice   F    1414  1880
# 8     Bertha   F    1320  1880
# 9      Sarah   F    1288  1880

# In[ ]:


get_ipython().system('head -n 10 datasets/babynames/yob1880.txt')


# In[ ]:


import pandas as pd
names1880 = pd.read_csv('datasets/babynames/yob1880.txt',
                        names=['name', 'sex', 'births'])
names1880


# In[ ]:


names1880.groupby('sex').births.sum()


# In[ ]:


years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = 'datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)

    frame['year'] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)


# In[ ]:


names


# In[ ]:


total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc=sum)
total_births.tail()
total_births.plot(title='Total births by sex and year')


# In[ ]:


def add_prop(group):
    group['prop'] = group.births / group.births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)


# In[ ]:


names


# In[ ]:


names.groupby(['year', 'sex']).prop.sum()


# In[ ]:


def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# Drop the group index, not needed
top1000.reset_index(inplace=True, drop=True)


# pieces = []
# for year, group in names.groupby(['year', 'sex']):
#     pieces.append(group.sort_values(by='births', ascending=False)[:1000])
# top1000 = pd.concat(pieces, ignore_index=True)

# In[ ]:


top1000


# ### Analyzing Naming Trends

# In[ ]:


boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']


# In[ ]:


total_births = top1000.pivot_table('births', index='year',
                                   columns='name',
                                   aggfunc=sum)


# In[ ]:


total_births.info()
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title="Number of births per year")


# #### Measuring the increase in naming diversity

# In[ ]:


plt.figure()


# In[ ]:


table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))


# In[ ]:


df = boys[boys.year == 2010]
df


# In[ ]:


prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]
prop_cumsum.values.searchsorted(0.5)


# In[ ]:


df = boys[boys.year == 1900]
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1


# In[ ]:


def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')


# In[ ]:


fig = plt.figure()


# In[ ]:


diversity.head()
diversity.plot(title="Number of popular names in top 50%")


# #### The “last letter” revolution

# In[ ]:


# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc=sum)


# In[ ]:


subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()


# In[ ]:


subtable.sum()
letter_prop = subtable / subtable.sum()
letter_prop


# In[ ]:


import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)


# In[ ]:


plt.subplots_adjust(hspace=0.25)


# In[ ]:


letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()


# In[ ]:


plt.close('all')


# In[ ]:


fig = plt.figure()


# In[ ]:


dny_ts.plot()


# #### Boy names that became girl names (and vice versa)

# In[ ]:


all_names = pd.Series(top1000.name.unique())
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
lesley_like


# In[ ]:


filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()


# In[ ]:


table = filtered.pivot_table('births', index='year',
                             columns='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()


# In[ ]:


fig = plt.figure()


# In[ ]:


table.plot(style={'M': 'k-', 'F': 'k--'})


# ## USDA Food Database

# {
#   "id": 21441,
#   "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,
# Wing, meat and skin with breading",
#   "tags": ["KFC"],
#   "manufacturer": "Kentucky Fried Chicken",
#   "group": "Fast Foods",
#   "portions": [
#     {
#       "amount": 1,
#       "unit": "wing, with skin",
#       "grams": 68.0
#     },
# 
#     ...
#   ],
#   "nutrients": [
#     {
#       "value": 20.8,
#       "units": "g",
#       "description": "Protein",
#       "group": "Composition"
#     },
# 
#     ...
#   ]
# }

# In[ ]:


import json
db = json.load(open('datasets/usda_food/database.json'))
len(db)


# In[ ]:


db[0].keys()
db[0]['nutrients'][0]
nutrients = pd.DataFrame(db[0]['nutrients'])
nutrients[:7]


# In[ ]:


info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(db, columns=info_keys)
info[:5]
info.info()


# In[ ]:


pd.value_counts(info.group)[:10]


# In[ ]:


nutrients = []

for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)


# In[ ]:


nutrients


# In[ ]:


nutrients.duplicated().sum()  # number of duplicates
nutrients = nutrients.drop_duplicates()


# In[ ]:


col_mapping = {'description' : 'food',
               'group'       : 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info.info()
col_mapping = {'description' : 'nutrient',
               'group' : 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients


# In[ ]:


ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata.info()
ndata.iloc[30000]


# In[ ]:


fig = plt.figure()


# In[ ]:


result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')


# In[ ]:


by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.loc[x.value.idxmax()]
get_minimum = lambda x: x.loc[x.value.idxmin()]

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

# make the food a little smaller
max_foods.food = max_foods.food.str[:50]


# In[ ]:


max_foods.loc['Amino Acids']['food']


# ## 2012 Federal Election Commission Database

# In[ ]:


fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')
fec.info()


# In[ ]:


fec.iloc[123456]


# In[ ]:


unique_cands = fec.cand_nm.unique()
unique_cands
unique_cands[2]


# In[ ]:


parties = {'Bachmann, Michelle': 'Republican',
           'Cain, Herman': 'Republican',
           'Gingrich, Newt': 'Republican',
           'Huntsman, Jon': 'Republican',
           'Johnson, Gary Earl': 'Republican',
           'McCotter, Thaddeus G': 'Republican',
           'Obama, Barack': 'Democrat',
           'Paul, Ron': 'Republican',
           'Pawlenty, Timothy': 'Republican',
           'Perry, Rick': 'Republican',
           "Roemer, Charles E. 'Buddy' III": 'Republican',
           'Romney, Mitt': 'Republican',
           'Santorum, Rick': 'Republican'}


# In[ ]:


fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
# Add it as a column
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()


# In[ ]:


(fec.contb_receipt_amt > 0).value_counts()


# In[ ]:


fec = fec[fec.contb_receipt_amt > 0]


# In[ ]:


fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]


# ### Donation Statistics by Occupation and Employer

# In[ ]:


fec.contbr_occupation.value_counts()[:10]


# In[ ]:


occ_mapping = {
   'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
   'INFORMATION REQUESTED' : 'NOT PROVIDED',
   'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',
   'C.E.O.': 'CEO'
}

# If no mapping provided, return x
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)


# In[ ]:


emp_mapping = {
   'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
   'INFORMATION REQUESTED' : 'NOT PROVIDED',
   'SELF' : 'SELF-EMPLOYED',
   'SELF EMPLOYED' : 'SELF-EMPLOYED',
}

# If no mapping provided, return x
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)


# In[ ]:


by_occupation = fec.pivot_table('contb_receipt_amt',
                                index='contbr_occupation',
                                columns='party', aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm


# In[ ]:


plt.figure()


# In[ ]:


over_2mm.plot(kind='barh')


# In[ ]:


def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)


# In[ ]:


grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
grouped.apply(get_top_amounts, 'contbr_employer', n=10)


# ### Bucketing Donation Amounts

# In[ ]:


bins = np.array([0, 1, 10, 100, 1000, 10000,
                 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
labels


# In[ ]:


grouped = fec_mrbo.groupby(['cand_nm', labels])
grouped.size().unstack(0)


# In[ ]:


plt.figure()


# In[ ]:


bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sums
normed_sums[:-2].plot(kind='barh')


# ### Donation Statistics by State

# In[ ]:


grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1) > 100000]
totals[:10]


# In[ ]:


percent = totals.div(totals.sum(1), axis=0)
percent[:10]


# ## Conclusion