import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   # needed for the figures further down

pd.__version__

# Series basics
values = [1.5, 5, -5, 4, 2]
s = pd.Series(values)
s
s.values
s.index

labels = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(values, index=labels)
s2
s2.index
s2.index[0]
s2.iloc[2]        # by position
s2['c']           # by label
s2[:3]            # integer slices are positional
s2[['a', 'c', 'd']]

# missing data: reindexing with an unknown label ('f') yields NaN
s2.reindex(['a', 'c', 'd', 'f'])
s3 = s2.reindex(['a', 'c', 'd', 'f'])
s3[s3.isnull()]
s3[s3.notnull()]
s3.dropna()
s3.fillna(0)
s3.mean()         # NaN is skipped
s3.fillna(s3.mean())

# DataFrame: a dict of Series, aligned on their indexes
data = {
    'two': s2,
    'three': s3.fillna(s3.mean()),
}
df = pd.DataFrame(data)
df
df.index.name = 'edition'
df.columns.name = 'number'
df

np.set_printoptions(precision=4)
np.random.randn(5, 3).T
df.T

df['three']
df
df2 = pd.DataFrame(data, columns=['one', 'two', 'three'])   # 'one' is absent, so it is all-NaN
df2
df
df[['two', 'three']]

# label-based row/column selection
df.loc['c']
df.loc['c':]
df.loc[['c', 'e']]
df.loc[['c', 'e'], 'three']
df.loc[['c', 'e'], ['two', 'three']]

# adding and deleting columns
df['four'] = df['two'] * 2
df
del df['four']
df
df.three          # attribute access works for valid Python names
df['three']
df['index'] = 0   # a column named 'index' is unrelated to df.index
df
df.index

# Stack Overflow pandas-tag data
!head -n 5 PandasUsers.csv
askers = pd.read_csv('PandasUsers.csv')
questions = pd.read_csv('PandasPosts.csv')
helpers = pd.read_csv('PandasAnswerers.csv')
answers = pd.read_csv('PandasAnswers.csv')
askers
askers.iloc[100]
askers.iloc[100]['CreationDate']
questions
answers

askers.DisplayName
asker_count = askers.DisplayName.value_counts()
helper_count = helpers.DisplayName.value_counts()
ask_and_answer = asker_count.index.intersection(helper_count.index)
len(ask_and_answer)
len(askers)
len(helpers)

users = pd.concat([askers, helpers], ignore_index=True)
users
users.DisplayName.duplicated().sum()
~users.Id.duplicated()                  # True marks the first occurrence of each Id
users = users[~users.Id.duplicated()]   # keep distinct users only

askers = askers.set_index('Id')
helpers = helpers.set_index('Id')
len(askers.index.intersection(helpers.index))
users
questions

# attach user metadata to each post
questions = pd.merge(questions, users, left_on='OwnerUserId', right_on='Id',
                     suffixes=['_post', '_user'])
answers = pd.merge(answers, users, left_on='OwnerUserId', right_on='Id',
                   suffixes=['_post', '_user'])
column_mapping = {
    'CreationDate_post': 'PostTime',
    'CreationDate_user': 'UserJoinTime',
}
questions = questions.rename(columns=column_mapping)
answers = answers.rename(columns=column_mapping)

# dates and times
questions.PostTime[0]
pd.to_datetime('2012-11-19 02:12:10')
pd.to_datetime('11/12/2012', dayfirst=True)
pd.to_datetime('11/12/12', format='%d/%m/%y')
questions.PostTime.dtype
pd.to_datetime(questions.PostTime)
pd.to_datetime(questions.PostTime)[0]
stamp = pd.to_datetime(questions.PostTime)[0]
stamp
stamp.tz_localize('utc')
stamp.tz_localize('utc').tz_convert('Asia/Singapore')
questions['PostTime'] = pd.to_datetime(questions.PostTime)
answers['PostTime'] = pd.to_datetime(answers.PostTime)
answers.PostTime[5]

# you can do this in read_csv
posts2 = pd.read_csv('PandasPosts.csv', parse_dates=['CreationDate'])
posts2.CreationDate[0]

# most-viewed questions
questions.sort_values('ViewCount', ascending=False).ViewCount
top10 = questions.sort_values('ViewCount', ascending=False)[:10]
top10[['Title', 'ViewCount']]

# a time-indexed Series of view counts
views = questions.ViewCount.values
views = pd.Series(views, index=questions.PostTime)
views
views.index
views.index[0]
views.index.year

year = views.index.year
month = views.index.month
views.groupby(year)
sizes = views.groupby(year).size()
sizes
sums = views.groupby(year).sum()
sums
reordered = sizes.loc[[2013, 2012, 2011]]
sums / reordered        # division aligns on the index labels
print(views.count())    # excludes missing data (if any)
print(len(views))

grouped = views.groupby(year)
grouped.agg(['count', 'sum', 'min', 'max', 'median', 'mean'])
views.sort_values(ascending=False)[:5]

def topn(group, n=5):
    return group.sort_values(ascending=False)[:n]

grouped = views.groupby(year)
result = grouped.apply(topn)
result
result[2012]
result
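# A minimal, self-contained sketch of the grouped.apply(topn) pattern above,
# on synthetic data (`demo` and `out` are made-up names for illustration).
# apply() concatenates each group's top-n slice under a hierarchical index:
# the outer level is the group key (the year), the inner level is the
# original timestamp index, so out[2012] selects one year's winners.
_rng = np.random.default_rng(0)
_idx = pd.to_datetime(['2011-03-01', '2011-07-15', '2012-01-02',
                       '2012-06-30', '2012-11-19', '2013-02-10'])
demo = pd.Series(_rng.integers(10, 1000, size=6), index=_idx)
out = demo.groupby(demo.index.year).apply(topn, n=2)
out
out[2012]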
# grouping by year and month together
year = views.index.year
month = views.index.month
keys = [year, month]
stats = ['count']
grouped = views.groupby(keys)
result = grouped.agg(stats)
result
result.unstack(0)            # pivot the first level (year) into columns
result.unstack(0).fillna(0)
result
result.index.names = ['year', 'month']
result
result.unstack('year')
result.unstack('year').stack('year')

stats = ['count', 'sum', 'mean', 'median']
result = grouped.agg(stats)
result.index.names = ['year', 'month']
result
pd.options.display.max_columns = 20
pd.options.display.width = 1000
result.unstack('year')

views.groupby(year).agg(foo='mean')   # named aggregation: compute the mean, label the column 'foo'

# BLACK MAGIC! a closure factory that builds named quantile aggregators
def q(q):
    def f(x):
        return x.quantile(q)
    f.__name__ = 'quantile_%d' % int(q * 100)
    return f

views.groupby(year).agg([q(0.1), q(0.5), q(0.9)])

# door-prize drawing
np.random.seed(130612001)
attendees = open('attendees.txt').read().split()
np.random.shuffle(attendees)
attendees = pd.Series(attendees)
attendees[:7]

# resampling requires a DatetimeIndex
views.resample('A').sum()    # annual
views.resample('M').sum()    # monthly
views.groupby(year).sum()

# who answers the most questions?
answers.DisplayName
answers.groupby('DisplayName').size()
num_answers = answers.groupby('DisplayName').size()
num_answers.sort_values(ascending=False)
answers.Score

def sortd(self):
    return self.sort_values(ascending=False)

pd.Series.sortd = sortd      # monkey-patch a convenience method

# pd.options.display.max_rows = 1000
by_name = answers.groupby('DisplayName')
result = by_name['Score'].agg(['mean', 'count'])
result = result.sort_values('mean', ascending=False)
result[:20]
result['count'] >= 10
(result['count'] >= 10).sum()
result[result['count'] >= 10]

stats = ['mean', 'min', 'max', 'std', 'count']
result = by_name['Score'].agg(stats)
result = result.sort_values('std', ascending=False)
result[result['count'] >= 10]
top_so_users = result[result['count'] >= 10].index
top_so_users

# Doing this 21 times would be slow
answers.DisplayName == 'Wes McKinney'
'Wes McKinney' in top_so_users
answers.DisplayName.isin(top_so_users)
crit = answers.DisplayName.isin(top_so_users)
answers_subset = answers[crit]
answers_subset

# Doh, have to convert to a DatetimeIndex to get .year
year = pd.DatetimeIndex(answers_subset.PostTime).year
year[:20]
keys = ['DisplayName', year]
grouped = answers_subset.groupby(keys)
stats = ['mean', 'min', 'max', 'std', 'count']
result = grouped['Score'].agg(stats)
# result = result.sort_values('std', ascending=False)
result

np.log(users.Reputation).hist(bins=100)

# bucket answerers by reputation
buckets = [0, 100, 1000, 10000, 1000000]
rep_bucket = pd.cut(answers.Reputation, buckets)
answers.groupby(rep_bucket).size()
len(answers.DisplayName.unique())
grouped = answers.groupby(rep_bucket)

def distinct(names):
    return len(names.unique())

statistics = {
    'Score': ['mean', 'median', 'count'],
    'Age': ['min', 'max'],
    'DisplayName': [distinct],
}
grouped.agg(statistics)

users.iloc[users.Reputation.argmax()]   # the highest-reputation user

# USDA food database (nested JSON)
import json
path = 'foods-2011-10-03.json'
with open(path) as f:
    db = json.load(f)
print(type(db))
len(db)
db[0]
db[0]['nutrients'][0]

# a list of dicts becomes a DataFrame; missing keys become NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'a': 2, 'b': 3, 'c': 4}])
pd.DataFrame(db[0]['nutrients'])
pd.DataFrame(db[0]['nutrients']).head()
db[0]['nutrients'][0]
db[0]['id']

# flatten the nested nutrient records, tagging each with its food's id
all_nutrients = []
for rec in db:
    for nutrient in rec['nutrients']:
        nutrient['id'] = rec['id']
        all_nutrients.append(nutrient)
nutrients = pd.DataFrame(all_nutrients)
nutrients.head()
pd.DataFrame(all_nutrients, columns=['value'])
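# For reference: newer pandas can express the double loop above in one call
# with pd.json_normalize. This is a sketch, not part of the original session,
# and it assumes each record has a 'nutrients' list plus an 'id' field.
# It must run on a freshly loaded db: the loop above mutates the nested
# dicts in place by adding an 'id' key, which would clash with meta='id'.
# nutrients_alt = pd.json_normalize(db, record_path='nutrients', meta='id')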
# food metadata: id, name, and food group
of_interest = ['id', 'description', 'group']
meta = pd.DataFrame(db, columns=of_interest)
meta
nutrients

# 'description' and 'group' appear in both tables, so rename before merging
mapping = {
    'description': 'nutrient',
    'group': 'ngroup',
}
nutrients = nutrients.rename(columns=mapping)
# COULD HAVE DONE
# nutrients.rename(columns=mapping, inplace=True)

mapping = {
    'description': 'food',
    'group': 'fgroup',
}
meta = meta.rename(columns=mapping)

data = pd.merge(meta, nutrients, on='id')
data

# which food has the most caffeine?
cafdata = data[data.nutrient == 'Caffeine']
cafdata
data.loc[172]
cafdata.loc[172]      # boolean filtering keeps the original labels
cafdata.units.value_counts()

# MAXIMUM IN NUMPY: the position of the max value
cafdata.value.values.argmax()
# INDEX LABEL OF THE MAXIMUM VALUE
cafdata.value.idxmax()
data.loc[cafdata.value.idxmax()]
cafdata.dtypes

def max_value(group):
    return group.loc[group['value'].idxmax()]

max_value(cafdata)
cafdata.groupby('fgroup').apply(max_value).dtypes
cafdata.groupby('fgroup').apply(max_value)

grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results
pd.options.display.width = 10000

def topn(group, by=None, n=5):
    return group.sort_values(by, ascending=False)[:n]

result = cafdata.groupby('fgroup').apply(topn, by='value', n=3)
result.loc['Beverages']

data.nutrient.unique()
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results.loc['Zinc, Zn']
results.loc['Zinc, Zn'].value
results.loc['Zinc, Zn'].value.plot(kind='barh')

plt.figure(figsize=(10, 8))
medians = data.groupby(['nutrient', 'fgroup']).value.median()
medians.loc['Zinc, Zn'].plot(kind='barh')

plt.figure(figsize=(10, 8))
maxes = data.groupby(['nutrient', 'fgroup']).value.max()
np.log(maxes.loc['Caffeine']).plot(kind='barh')

data[['food', 'nutrient']].describe()
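# Footnote: a sketch of an alternative to the max_value/apply pattern above.
# groupby(...)['value'].idxmax() returns the row label of each group's max,
# so one vectorized .loc lookup replaces the Python-level apply. It assumes
# no group is all-NaN in 'value' (idxmax would raise there). `max_labels`
# and `results_alt` are made-up names for illustration.
max_labels = data.groupby(['nutrient', 'fgroup'])['value'].idxmax()
results_alt = data.loc[max_labels]
results_alt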