import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   # needed for the figures further down

pd.__version__

# Series basics
values = [1.5, 5, -5, 4, 2]
s = pd.Series(values)
s
s.values
s.index

labels = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(values, index=labels)
s2
s2.index
s2.index[0]
s2.iloc[2]        # by position
s2['c']           # by label
s2[:3]            # integer slices are positional
s2[['a', 'c', 'd']]

# missing data: reindexing with an unknown label ('f') yields NaN
s2.reindex(['a', 'c', 'd', 'f'])
s3 = s2.reindex(['a', 'c', 'd', 'f'])
s3[s3.isnull()]
s3[s3.notnull()]
s3.dropna()
s3.fillna(0)
s3.mean()         # NaN is skipped
s3.fillna(s3.mean())

# DataFrame: a dict of Series, aligned on their indexes
data = {
    'two': s2,
    'three': s3.fillna(s3.mean()),
}
df = pd.DataFrame(data)
df
df.index.name = 'edition'
df.columns.name = 'number'
df

np.set_printoptions(precision=4)
np.random.randn(5, 3).T
df.T

df['three']
df
df2 = pd.DataFrame(data, columns=['one', 'two', 'three'])   # 'one' is absent, so it is all-NaN
df2
df
df[['two', 'three']]

# label-based row/column selection
df.loc['c']
df.loc['c':]
df.loc[['c', 'e']]
df.loc[['c', 'e'], 'three']
df.loc[['c', 'e'], ['two', 'three']]

# adding and deleting columns
df['four'] = df['two'] * 2
df
del df['four']
df
df.three          # attribute access works for valid Python names
df['three']
df['index'] = 0   # a column named 'index' is unrelated to df.index
df
df.index

# Stack Overflow pandas-tag data
!head -n 5 PandasUsers.csv
askers = pd.read_csv('PandasUsers.csv')
questions = pd.read_csv('PandasPosts.csv')
helpers = pd.read_csv('PandasAnswerers.csv')
answers = pd.read_csv('PandasAnswers.csv')
askers
askers.iloc[100]
askers.iloc[100]['CreationDate']
questions
answers

askers.DisplayName
asker_count = askers.DisplayName.value_counts()
helper_count = helpers.DisplayName.value_counts()
ask_and_answer = asker_count.index.intersection(helper_count.index)
len(ask_and_answer)
len(askers)
len(helpers)

users = pd.concat([askers, helpers], ignore_index=True)
users
users.DisplayName.duplicated().sum()
~users.Id.duplicated()                  # True marks the first occurrence of each Id
users = users[~users.Id.duplicated()]   # keep distinct users only

askers = askers.set_index('Id')
helpers = helpers.set_index('Id')
len(askers.index.intersection(helpers.index))
users
questions

# attach user metadata to each post
questions = pd.merge(questions, users, left_on='OwnerUserId', right_on='Id',
                     suffixes=['_post', '_user'])
answers = pd.merge(answers, users, left_on='OwnerUserId', right_on='Id',
                   suffixes=['_post', '_user'])
column_mapping = {
    'CreationDate_post': 'PostTime',
    'CreationDate_user': 'UserJoinTime',
}
questions = questions.rename(columns=column_mapping)
answers = answers.rename(columns=column_mapping)

# dates and times
questions.PostTime[0]
pd.to_datetime('2012-11-19 02:12:10')
pd.to_datetime('11/12/2012', dayfirst=True)
pd.to_datetime('11/12/12', format='%d/%m/%y')
questions.PostTime.dtype
pd.to_datetime(questions.PostTime)
pd.to_datetime(questions.PostTime)[0]
stamp = pd.to_datetime(questions.PostTime)[0]
stamp
stamp.tz_localize('utc')
stamp.tz_localize('utc').tz_convert('Asia/Singapore')
questions['PostTime'] = pd.to_datetime(questions.PostTime)
answers['PostTime'] = pd.to_datetime(answers.PostTime)
answers.PostTime[5]

# you can do this in read_csv
posts2 = pd.read_csv('PandasPosts.csv', parse_dates=['CreationDate'])
posts2.CreationDate[0]

# most-viewed questions
questions.sort_values('ViewCount', ascending=False).ViewCount
top10 = questions.sort_values('ViewCount', ascending=False)[:10]
top10[['Title', 'ViewCount']]

# a time-indexed Series of view counts
views = questions.ViewCount.values
views = pd.Series(views, index=questions.PostTime)
views
views.index
views.index[0]
views.index.year

year = views.index.year
month = views.index.month
views.groupby(year)
sizes = views.groupby(year).size()
sizes
sums = views.groupby(year).sum()
sums
reordered = sizes.loc[[2013, 2012, 2011]]
sums / reordered        # division aligns on the index labels
print(views.count())    # excludes missing data (if any)
print(len(views))

grouped = views.groupby(year)
grouped.agg(['count', 'sum', 'min', 'max', 'median', 'mean'])
views.sort_values(ascending=False)[:5]

def topn(group, n=5):
    return group.sort_values(ascending=False)[:n]

grouped = views.groupby(year)
result = grouped.apply(topn)
result
result[2012]
result
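# A minimal, self-contained sketch of the grouped.apply(topn) pattern above,
# on synthetic data (`demo` and `out` are made-up names for illustration).
# apply() concatenates each group's top-n slice under a hierarchical index:
# the outer level is the group key (the year), the inner level is the
# original timestamp index, so out[2012] selects one year's winners.
_rng = np.random.default_rng(0)
_idx = pd.to_datetime(['2011-03-01', '2011-07-15', '2012-01-02',
                       '2012-06-30', '2012-11-19', '2013-02-10'])
demo = pd.Series(_rng.integers(10, 1000, size=6), index=_idx)
out = demo.groupby(demo.index.year).apply(topn, n=2)
out
out[2012]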
# grouping by year and month together
year = views.index.year
month = views.index.month
keys = [year, month]
stats = ['count']
grouped = views.groupby(keys)
result = grouped.agg(stats)
result
result.unstack(0)            # pivot the first level (year) into columns
result.unstack(0).fillna(0)
result
result.index.names = ['year', 'month']
result
result.unstack('year')
result.unstack('year').stack('year')

stats = ['count', 'sum', 'mean', 'median']
result = grouped.agg(stats)
result.index.names = ['year', 'month']
result
pd.options.display.max_columns = 20
pd.options.display.width = 1000
result.unstack('year')

views.groupby(year).agg(foo='mean')   # named aggregation: compute the mean, label the column 'foo'

# BLACK MAGIC! a closure factory that builds named quantile aggregators
def q(q):
    def f(x):
        return x.quantile(q)
    f.__name__ = 'quantile_%d' % int(q * 100)
    return f

views.groupby(year).agg([q(0.1), q(0.5), q(0.9)])

# door-prize drawing
np.random.seed(130612001)
attendees = open('attendees.txt').read().split()
np.random.shuffle(attendees)
attendees = pd.Series(attendees)
attendees[:7]

# resampling requires a DatetimeIndex
views.resample('A').sum()    # annual
views.resample('M').sum()    # monthly
views.groupby(year).sum()

# who answers the most questions?
answers.DisplayName
answers.groupby('DisplayName').size()
num_answers = answers.groupby('DisplayName').size()
num_answers.sort_values(ascending=False)
answers.Score

def sortd(self):
    return self.sort_values(ascending=False)

pd.Series.sortd = sortd      # monkey-patch a convenience method

# pd.options.display.max_rows = 1000
by_name = answers.groupby('DisplayName')
result = by_name['Score'].agg(['mean', 'count'])
result = result.sort_values('mean', ascending=False)
result[:20]
result['count'] >= 10
(result['count'] >= 10).sum()
result[result['count'] >= 10]

stats = ['mean', 'min', 'max', 'std', 'count']
result = by_name['Score'].agg(stats)
result = result.sort_values('std', ascending=False)
result[result['count'] >= 10]
top_so_users = result[result['count'] >= 10].index
top_so_users

# Doing this 21 times would be slow
answers.DisplayName == 'Wes McKinney'
'Wes McKinney' in top_so_users
answers.DisplayName.isin(top_so_users)
crit = answers.DisplayName.isin(top_so_users)
answers_subset = answers[crit]
answers_subset

# Doh, have to convert to a DatetimeIndex to get .year
year = pd.DatetimeIndex(answers_subset.PostTime).year
year[:20]
keys = ['DisplayName', year]
grouped = answers_subset.groupby(keys)
stats = ['mean', 'min', 'max', 'std', 'count']
result = grouped['Score'].agg(stats)
# result = result.sort_values('std', ascending=False)
result

np.log(users.Reputation).hist(bins=100)

# bucket answerers by reputation
buckets = [0, 100, 1000, 10000, 1000000]
rep_bucket = pd.cut(answers.Reputation, buckets)
answers.groupby(rep_bucket).size()
len(answers.DisplayName.unique())
grouped = answers.groupby(rep_bucket)

def distinct(names):
    return len(names.unique())

statistics = {
    'Score': ['mean', 'median', 'count'],
    'Age': ['min', 'max'],
    'DisplayName': [distinct],
}
grouped.agg(statistics)

users.iloc[users.Reputation.argmax()]   # the highest-reputation user

# USDA food database (nested JSON)
import json
path = 'foods-2011-10-03.json'
with open(path) as f:
    db = json.load(f)
print(type(db))
len(db)
db[0]
db[0]['nutrients'][0]

# a list of dicts becomes a DataFrame; missing keys become NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 3, 'c': 4}])
pd.DataFrame(db[0]['nutrients'])
pd.DataFrame(db[0]['nutrients']).head()
db[0]['nutrients'][0]
db[0]['id']

# flatten the nested nutrient records, tagging each with its food's id
all_nutrients = []
for rec in db:
    for nutrient in rec['nutrients']:
        nutrient['id'] = rec['id']
        all_nutrients.append(nutrient)
nutrients = pd.DataFrame(all_nutrients)
nutrients.head()
pd.DataFrame(all_nutrients, columns=['value'])
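# For reference: newer pandas can express the double loop above in one call
# with pd.json_normalize. This is a sketch, not part of the original session,
# and it assumes each record has a 'nutrients' list plus an 'id' field.
# It must run on a freshly loaded db: the loop above mutates the nested
# dicts in place by adding an 'id' key, which would clash with meta='id'.
# nutrients_alt = pd.json_normalize(db, record_path='nutrients', meta='id')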
# food metadata: id, name, and food group
of_interest = ['id', 'description', 'group']
meta = pd.DataFrame(db, columns=of_interest)
meta
nutrients

# 'description' and 'group' appear in both tables, so rename before merging
mapping = {
    'description': 'nutrient',
    'group': 'ngroup',
}
nutrients = nutrients.rename(columns=mapping)
# COULD HAVE DONE
# nutrients.rename(columns=mapping, inplace=True)

mapping = {
    'description': 'food',
    'group': 'fgroup',
}
meta = meta.rename(columns=mapping)

data = pd.merge(meta, nutrients, on='id')
data

# which food has the most caffeine?
cafdata = data[data.nutrient == 'Caffeine']
cafdata
data.loc[172]
cafdata.loc[172]      # boolean filtering keeps the original labels
cafdata.units.value_counts()

# MAXIMUM IN NUMPY: the position of the max value
cafdata.value.values.argmax()
# INDEX LABEL OF THE MAXIMUM VALUE
cafdata.value.idxmax()
data.loc[cafdata.value.idxmax()]
cafdata.dtypes

def max_value(group):
    return group.loc[group['value'].idxmax()]

max_value(cafdata)
cafdata.groupby('fgroup').apply(max_value).dtypes
cafdata.groupby('fgroup').apply(max_value)

grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results
pd.options.display.width = 10000

def topn(group, by=None, n=5):
    return group.sort_values(by, ascending=False)[:n]

result = cafdata.groupby('fgroup').apply(topn, by='value', n=3)
result.loc['Beverages']

data.nutrient.unique()
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results.loc['Zinc, Zn']
results.loc['Zinc, Zn'].value
results.loc['Zinc, Zn'].value.plot(kind='barh')

plt.figure(figsize=(10, 8))
medians = data.groupby(['nutrient', 'fgroup']).value.median()
medians.loc['Zinc, Zn'].plot(kind='barh')

plt.figure(figsize=(10, 8))
maxes = data.groupby(['nutrient', 'fgroup']).value.max()
np.log(maxes.loc['Caffeine']).plot(kind='barh')

data[['food', 'nutrient']].describe()
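# Footnote: a sketch of an alternative to the max_value/apply pattern above.
# groupby(...)['value'].idxmax() returns the row label of each group's max,
# so one vectorized .loc lookup replaces the Python-level apply. It assumes
# no group is all-NaN in 'value' (idxmax would raise there). `max_labels`
# and `results_alt` are made-up names for illustration.
max_labels = data.groupby(['nutrient', 'fgroup'])['value'].idxmax()
results_alt = data.loc[max_labels]
results_alt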