In [1]:

import pandas as pd
import numpy as np
pd.__version__

Out[1]:

'0.11.0'

In [2]:

values = [1.5, 5, -5, 4, 2]

s = pd.Series(values)
s

Out[2]:

0    1.5
1    5.0
2   -5.0
3    4.0
4    2.0
dtype: float64

In [3]:

s.values

Out[3]:

array([ 1.5,  5. , -5. ,  4. ,  2. ])

In [4]:

s.index

Out[4]:

Int64Index([0, 1, 2, 3, 4], dtype=int64)

In [5]:

labels = ['a', 'b', 'c', 'd', 'e']

s2 = pd.Series(values, index=labels)
s2

Out[5]:

a    1.5
b    5.0
c   -5.0
d    4.0
e    2.0
dtype: float64

In [6]:

s2.index

Out[6]:

Index([a, b, c, d, e], dtype=object)

In [7]:

s2.index[0]

Out[7]:

'a'

In [8]:

s2[2]

Out[8]:

-5.0

In [9]:

s2['c']

Out[9]:

-5.0

In [10]:

s2[:3]

Out[10]:

a    1.5
b    5.0
c   -5.0
dtype: float64

In [11]:

s2[['a', 'c', 'd']]

Out[11]:

a    1.5
c   -5.0
d    4.0
dtype: float64

In [12]:

# missing data
s2[['a', 'c', 'd', 'f']]

Out[12]:

a    1.5
c   -5.0
d    4.0
f    NaN
dtype: float64

In [13]:

s3 = s2[['a', 'c', 'd', 'f']]
s3[s3.isnull()]

Out[13]:

f   NaN
dtype: float64

In [14]:

s3[s3.notnull()]

Out[14]:

a    1.5
c   -5.0
d    4.0
dtype: float64

In [15]:

s3.dropna()

Out[15]:

a    1.5
c   -5.0
d    4.0
dtype: float64

In [16]:

s3.fillna(0)

Out[16]:

a    1.5
c   -5.0
d    4.0
f    0.0
dtype: float64

In [17]:

s3.mean()

Out[17]:

0.16666666666666666

In [18]:

s3.fillna(s3.mean())

Out[18]:

a    1.500000
c   -5.000000
d    4.000000
f    0.166667
dtype: float64

In [19]:

data = {
  'two': s2,
  'three': s3.fillna(s3.mean())
}

df = pd.DataFrame(data)
df

Out[19]:

	three	two
a	1.500000	1.5
b	NaN	5.0
c	-5.000000	-5.0
d	4.000000	4.0
e	NaN	2.0
f	0.166667	NaN

In [20]:

df.index.name = 'edition'
df.columns.name = 'number'
df

Out[20]:

number	three	two
edition
a	1.500000	1.5
b	NaN	5.0
c	-5.000000	-5.0
d	4.000000	4.0
e	NaN	2.0
f	0.166667	NaN

In [21]:

np.set_printoptions(precision=4)
np.random.randn(5, 3).T

Out[21]:

array([[-0.5463, -0.3112, -0.9632,  1.8207,  2.6747],
       [ 0.3955,  0.8636, -0.0378, -0.3928, -0.4491],
       [-0.4356,  0.9094, -0.9029, -2.0247,  0.2625]])

In [22]:

df.T

Out[22]:

edition	a	b	c	d	e	f
number
three	1.5	NaN	-5	4	NaN	0.166667
two	1.5	5	-5	4	2	NaN

In [23]:

df['three']

Out[23]:

edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64

In [24]:

df

Out[24]:

number	three	two
edition
a	1.500000	1.5
b	NaN	5.0
c	-5.000000	-5.0
d	4.000000	4.0
e	NaN	2.0
f	0.166667	NaN

In [25]:

df2 = pd.DataFrame(data, columns=['one', 'two', 'three'])
df2

Out[25]:

	one	two	three
a	NaN	1.5	1.500000
b	NaN	5.0	NaN
c	NaN	-5.0	-5.000000
d	NaN	4.0	4.000000
e	NaN	2.0	NaN
f	NaN	NaN	0.166667

In [26]:

df

Out[26]:

number	three	two
edition
a	1.500000	1.5
b	NaN	5.0
c	-5.000000	-5.0
d	4.000000	4.0
e	NaN	2.0
f	0.166667	NaN

In [27]:

df[['two', 'three']]

Out[27]:

number	two	three
edition
a	1.5	1.500000
b	5.0	NaN
c	-5.0	-5.000000
d	4.0	4.000000
e	2.0	NaN
f	NaN	0.166667

In [28]:

df.ix['c']

Out[28]:

number
three    -5
two      -5
Name: c, dtype: float64

In [29]:

df.ix['c':]

Out[29]:

number	three	two
edition
c	-5.000000	-5
d	4.000000	4
e	NaN	2
f	0.166667	NaN

In [30]:

df.ix[['c', 'e']]

Out[30]:

number	three	two
c	-5	-5
e	NaN	2

In [31]:

df.ix[['c', 'e'], 'three']

Out[31]:

c    -5
e   NaN
Name: three, dtype: float64

In [32]:

df.ix[['c', 'e'], ['two', 'three']]

Out[32]:

	two	three
c	-5	-5
e	2	NaN

In [33]:

df['four'] = df['two'] * 2
df

Out[33]:

number	three	two	four
edition
a	1.500000	1.5	3
b	NaN	5.0	10
c	-5.000000	-5.0	-10
d	4.000000	4.0	8
e	NaN	2.0	4
f	0.166667	NaN	NaN

In [34]:

del df['four']
df

Out[34]:

	three	two
edition
a	1.500000	1.5
b	NaN	5.0
c	-5.000000	-5.0
d	4.000000	4.0
e	NaN	2.0
f	0.166667	NaN

In [35]:

df.three

Out[35]:

edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64

In [36]:

df['three']

Out[36]:

edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64

In [37]:

df['index'] = 0
df

Out[37]:

	three	two	index
edition
a	1.500000	1.5	0
b	NaN	5.0	0
c	-5.000000	-5.0	0
d	4.000000	4.0	0
e	NaN	2.0	0
f	0.166667	NaN	0

In [38]:

df.index

Out[38]:

Index([a, b, c, d, e, f], dtype=object)

In [39]:

!head -n 5 PandasUsers.csv

Id,Reputation,CreationDate,DisplayName,LastAccessDate,WebsiteUrl,Location,AboutMe,Views,UpVotes,DownVotes,EmailHash,Age
1843099,100,2012-11-21 19:25:04,user1843099,2013-02-03 03:46:50,,,,0,16,0,17a9c54b937f8b505dcb3b0b5b34cbb6,
854739,766,2011-07-20 20:29:21,mikebmassey,2013-02-02 17:40:38,http://Itsprivate...,"Charlotte, NC","<p>I'm a analytics guy for a financial company.  Passionate about new technology.  Trying to simultaneously learn 5 different programming languages and failing at all of them...</p>
",0,242,3,61af09414341715d110f08b7505fe114,35.0
1479269,472,2012-06-25 07:10:33,dmvianna,2013-01-31 10:20:43,,"Melbourne, Australia","<p>Former neuroscientist now data analyst.</p>

In [40]:

askers = pd.read_csv('PandasUsers.csv')
questions = pd.read_csv('PandasPosts.csv')
helpers = pd.read_csv('PandasAnswerers.csv')
answers = pd.read_csv('PandasAnswers.csv')

In [41]:

askers

Out[41]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 490 entries, 0 to 489
Data columns (total 13 columns):
Id                490  non-null values
Reputation        490  non-null values
CreationDate      490  non-null values
DisplayName       490  non-null values
LastAccessDate    490  non-null values
WebsiteUrl        81  non-null values
Location          143  non-null values
AboutMe           105  non-null values
Views             490  non-null values
UpVotes           490  non-null values
DownVotes         490  non-null values
EmailHash         490  non-null values
Age               91  non-null values
dtypes: float64(1), int64(5), object(7)

In [42]:

askers.ix[100]

Out[42]:

Id                                                 1301710
Reputation                                            3110
CreationDate                           2012-03-29 19:23:17
DisplayName                                            bmu
LastAccessDate                         2013-02-02 22:18:29
WebsiteUrl                                             NaN
Location                                               NaN
AboutMe           <p>My about me is currently blank.</p>\n
Views                                                    0
UpVotes                                                975
DownVotes                                               38
EmailHash                 c09e91f8758f4ea83dbaa5e199ebb71a
Age                                                    NaN
Name: 100, dtype: object

In [43]:

askers.ix[100]['CreationDate']

Out[43]:

'2012-03-29 19:23:17'

In [44]:

questions

Out[44]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 952 entries, 0 to 951
Data columns (total 18 columns):
Id                  952  non-null values
PostTypeId          952  non-null values
AcceptedAnswerId    656  non-null values
CreationDate        952  non-null values
Score               952  non-null values
ViewCount           952  non-null values
Body                952  non-null values
OwnerUserId         952  non-null values
OwnerDisplayName    1  non-null values
LastEditorUserId    438  non-null values
LastEditDate        438  non-null values
LastActivityDate    952  non-null values
Title               952  non-null values
Tags                952  non-null values
AnswerCount         903  non-null values
CommentCount        438  non-null values
FavoriteCount       303  non-null values
ClosedDate          21  non-null values
dtypes: float64(5), int64(5), object(8)

In [45]:

answers

Out[45]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1331 entries, 0 to 1330
Data columns (total 13 columns):
Id                       1331  non-null values
PostTypeId               1331  non-null values
ParentId                 1331  non-null values
CreationDate             1331  non-null values
Score                    1331  non-null values
Body                     1331  non-null values
OwnerUserId              1329  non-null values
OwnerDisplayName         2  non-null values
LastEditorUserId         366  non-null values
LastEditorDisplayName    2  non-null values
LastEditDate             368  non-null values
LastActivityDate         1331  non-null values
CommentCount             804  non-null values
dtypes: float64(3), int64(4), object(6)

In [46]:

askers.DisplayName

Out[46]:

0          user1843099
1          mikebmassey
2             dmvianna
3            ezbentley
4                Ralph
5           Evan Davey
6      Nicola Vianello
7                Roger
8                Shane
9                roysc
10                RJCL
11                 PhE
12    Martin C. Martin
13               Einar
14              javier
...
475           briant57
476        user1470604
477            cpcloud
478             mhubig
479          akiladila
480        user1246428
481             Shakti
482            PlagTag
483          user14121
484               Tony
485             GBadge
486    Skylar Saveland
487          sashkello
488        user1498485
489          A Alstone
Name: DisplayName, Length: 490, dtype: object

In [47]:

asker_count = askers.DisplayName.value_counts()
helper_count = helpers.DisplayName.value_counts()

In [48]:

ask_and_answer = asker_count.index.intersection(helper_count.index)

In [49]:

len(ask_and_answer)

Out[49]:

In [50]:

len(askers)

Out[50]:

In [51]:

len(helpers)

Out[51]:

In [52]:

users = askers.append(helpers, ignore_index=True)
users

Out[52]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 809 entries, 0 to 808
Data columns (total 13 columns):
Id                809  non-null values
Reputation        809  non-null values
CreationDate      809  non-null values
DisplayName       809  non-null values
LastAccessDate    809  non-null values
WebsiteUrl        195  non-null values
Location          298  non-null values
AboutMe           240  non-null values
Views             809  non-null values
UpVotes           809  non-null values
DownVotes         809  non-null values
EmailHash         809  non-null values
Age               200  non-null values
dtypes: float64(1), int64(5), object(7)

In [53]:

users.DisplayName.duplicated().sum()

Out[53]:

In [54]:

-users.Id.duplicated()

Out[54]:

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
794     True
795     True
796    False
797     True
798     True
799     True
800     True
801     True
802     True
803     True
804     True
805    False
806     True
807    False
808     True
Name: Id, Length: 809, dtype: bool

In [55]:

# converts to distinct users
users = users[-users.Id.duplicated()]

In [56]:

askers = askers.set_index('Id')
helpers = helpers.set_index('Id')

len(askers.index.intersection(helpers.index))

Out[56]:

In [57]:

users

Out[57]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 694 entries, 0 to 808
Data columns (total 13 columns):
Id                694  non-null values
Reputation        694  non-null values
CreationDate      694  non-null values
DisplayName       694  non-null values
LastAccessDate    694  non-null values
WebsiteUrl        167  non-null values
Location          255  non-null values
AboutMe           202  non-null values
Views             694  non-null values
UpVotes           694  non-null values
DownVotes         694  non-null values
EmailHash         694  non-null values
Age               170  non-null values
dtypes: float64(1), int64(5), object(7)

In [58]:

questions

Out[58]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 952 entries, 0 to 951
Data columns (total 18 columns):
Id                  952  non-null values
PostTypeId          952  non-null values
AcceptedAnswerId    656  non-null values
CreationDate        952  non-null values
Score               952  non-null values
ViewCount           952  non-null values
Body                952  non-null values
OwnerUserId         952  non-null values
OwnerDisplayName    1  non-null values
LastEditorUserId    438  non-null values
LastEditDate        438  non-null values
LastActivityDate    952  non-null values
Title               952  non-null values
Tags                952  non-null values
AnswerCount         903  non-null values
CommentCount        438  non-null values
FavoriteCount       303  non-null values
ClosedDate          21  non-null values
dtypes: float64(5), int64(5), object(8)

In [59]:

questions = pd.merge(questions, users,
                     left_on='OwnerUserId',
                     right_on='Id',
                     suffixes=['_post', '_user'])

answers = pd.merge(answers, users,
                   left_on='OwnerUserId',
                   right_on='Id',
                   suffixes=['_post', '_user'])

In [60]:

column_mapping = {
    'CreationDate_post': 'PostTime',
    'CreationDate_user': 'UserJoinTime'    
}

questions = questions.rename(columns=column_mapping)
answers = answers.rename(columns=column_mapping)

In [61]:

questions.PostTime[0]

Out[61]:

'2012-11-19 02:12:10'

In [62]:

pd.to_datetime('2012-11-19 02:12:10')

Out[62]:

datetime.datetime(2012, 11, 19, 2, 12, 10)

In [63]:

pd.to_datetime('11/12/2012', dayfirst=True)

Out[63]:

datetime.datetime(2012, 12, 11, 0, 0)

In [64]:

pd.to_datetime('11/12/12', format='%d/%m/%y')

Out[64]:

datetime.datetime(2012, 11, 12, 0, 0)

In [65]:

questions.PostTime.dtype

Out[65]:

dtype('O')

In [66]:

pd.to_datetime(questions.PostTime)

Out[66]:

0    2012-11-19 02:12:10
1    2012-11-20 17:22:04
2    2012-12-30 17:19:57
3    2012-12-30 22:43:48
4    2012-12-31 20:22:16
5    2012-11-19 00:04:20
6    2012-11-08 22:04:48
7    2012-11-17 15:59:38
8    2012-11-19 16:15:37
9    2012-11-21 11:23:14
10   2012-11-19 13:15:45
11   2013-01-06 08:44:13
12   2012-08-29 21:47:00
13   2012-09-03 15:16:55
14   2012-09-11 10:58:50
...
937   2012-11-11 13:26:27
938   2012-11-13 02:18:10
939   2012-11-08 01:51:17
940   2012-11-08 23:33:25
941   2012-11-14 23:29:30
942   2012-11-14 12:04:13
943   2012-11-15 19:11:57
944   2012-11-16 17:57:11
945   2012-11-16 17:58:23
946   2012-11-15 20:27:19
947   2012-11-16 12:10:27
948   2012-11-18 22:14:06
949   2012-11-14 22:14:50
950   2012-11-17 19:34:36
951   2012-11-18 22:22:39
Name: PostTime, Length: 952, dtype: datetime64[ns]

In [67]:

pd.to_datetime(questions.PostTime)[0]

Out[67]:

<Timestamp: 2012-11-19 02:12:10>

In [68]:

stamp = pd.to_datetime(questions.PostTime)[0]
stamp

Out[68]:

<Timestamp: 2012-11-19 02:12:10>

In [69]:

stamp.tz_localize('utc')

Out[69]:

<Timestamp: 2012-11-19 02:12:10+0000 UTC, tz=UTC>

In [70]:

stamp.tz_localize('utc').tz_convert('Asia/Singapore')

Out[70]:

<Timestamp: 2012-11-19 10:12:10+0800 SGT, tz=Asia/Singapore>

In [71]:

questions['PostTime'] = pd.to_datetime(questions.PostTime)
answers['PostTime'] = pd.to_datetime(answers.PostTime)

In [72]:

answers.PostTime[5]

Out[72]:

<Timestamp: 2012-11-07 01:23:02>

In [73]:

# you can do this in read_csv
posts2 = pd.read_csv('PandasPosts.csv', parse_dates=['CreationDate'])
posts2.CreationDate[0]

Out[73]:

<Timestamp: 2012-11-19 02:12:10>

In [74]:

questions.sort('ViewCount', ascending=False).ViewCount

Out[74]:

444    6454
490    4228
615    3238
486    2913
515    2624
443    2484
672    2422
849    2066
612    2052
504    2035
681    1992
497    1884
269    1817
582    1777
550    1768
...
25     27
228    27
407    26
440    25
422    24
394    24
63     23
123    23
324    22
501    21
65     21
433    21
64     19
459    19
481     5
Name: ViewCount, Length: 952, dtype: int64

In [75]:

top10 = questions.sort('ViewCount', ascending=False)[:10]
top10[['Title', 'ViewCount']]

Out[75]:

	Title	ViewCount
444	Tutorial on PANDAS and PYTABLES	6454
490	What is the most efficient way to loop through...	4228
615	add one row in a pandas.DataFrame	3238
486	append two data frame with pandas	2913
515	Add indexed column to DataFrame with pandas	2624
443	Sort a pandas DataMatrix in ascending order	2484
672	Renaming columns in pandas	2422
849	How to fix Python Numpy/Pandas installation?	2066
612	iterating row by row through a pandas dataframe	2052
504	How to get the correlation between two timeser...	2035

In [76]:

views = questions.ViewCount.values

views = pd.Series(views, index=questions.PostTime)
views

Out[76]:

PostTime
2012-11-19 02:12:10     74
2012-11-20 17:22:04    107
2012-12-30 17:19:57     65
2012-12-30 22:43:48     53
2012-12-31 20:22:16     48
2012-11-19 00:04:20     71
2012-11-08 22:04:48     64
2012-11-17 15:59:38    238
2012-11-19 16:15:37     31
2012-11-21 11:23:14     44
2012-11-19 13:15:45    158
2013-01-06 08:44:13     67
2012-08-29 21:47:00    266
2012-09-03 15:16:55    365
2012-09-11 10:58:50    127
...
2012-11-11 13:26:27    114
2012-11-13 02:18:10    151
2012-11-08 01:51:17     92
2012-11-08 23:33:25    110
2012-11-14 23:29:30    135
2012-11-14 12:04:13     87
2012-11-15 19:11:57    140
2012-11-16 17:57:11     92
2012-11-16 17:58:23     65
2012-11-15 20:27:19    158
2012-11-16 12:10:27    125
2012-11-18 22:14:06    109
2012-11-14 22:14:50     40
2012-11-17 19:34:36     74
2012-11-18 22:22:39    154
Length: 952, dtype: int64

In [77]:

views.index

Out[77]:

<class 'pandas.tseries.index.DatetimeIndex'>
[2012-11-19 02:12:10, ..., 2012-11-18 22:22:39]
Length: 952, Freq: None, Timezone: None

In [78]:

views.index[0]

Out[78]:

<Timestamp: 2012-11-19 02:12:10>

In [79]:

views.index.year

Out[79]:

array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013,
       2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012,
       2012, 2013, 2012, 2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012,
       2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2013, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012,
       2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2013, 2013, 2013, 2012, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2013,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012,
       2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013, 2012, 2013, 2012,
       2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013,
       2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013,
       2013, 2013, 2011, 2011, 2011, 2013, 2013, 2013, 2013, 2012, 2012,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2013, 2011, 2013, 2011, 2011,
       2013, 2011, 2011, 2013, 2012, 2012, 2011, 2013, 2012, 2011, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2013, 2011, 2012, 2011, 2012,
       2012, 2012, 2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012], dtype=int32)

In [80]:

year = views.index.year
month = views.index.month

views.groupby(year)

Out[80]:

<pandas.core.groupby.SeriesGroupBy at 0x106ab0050>

In [81]:

sizes = views.groupby(year).size()
sizes

Out[81]:

2011     13
2012    766
2013    173
dtype: int64

In [82]:

sums = views.groupby(year).sum()
sums

Out[82]:

2011     25083
2012    208237
2013     10520
dtype: int64

In [83]:

reordered = sizes[[2013, 2012, 2011]]

In [84]:

sums / reordered

Out[84]:

2011    1929
2012     271
2013      60
dtype: int64

In [85]:

print views.count()  # excludes missing data (if any)
print len(views)

952
952

In [86]:

grouped = views.groupby(year)

grouped.agg(['count', 'sum', 'min', 'max', 'median', 'mean'])

Out[86]:

	count	sum	min	max	median	mean
2011	13	25083	368	6454	1128	1929.461538
2012	766	208237	27	3238	143	271.849869
2013	173	10520	5	518	51	60.809249

In [87]:

views.order(ascending=False)[:5]

Out[87]:

PostTime
2011-05-10 19:55:08    6454
2011-10-20 14:46:14    4228
2012-05-23 08:12:31    3238
2011-10-15 08:21:17    2913
2012-03-18 22:34:26    2624
dtype: int64

In [88]:

def topn(group, n=5):
    return group.order(ascending=False)[:n]
 
grouped = views.groupby(year)

result = grouped.apply(topn)
result

Out[88]:

      PostTime           
2011  2011-05-10 19:55:08    6454
      2011-10-20 14:46:14    4228
      2011-10-15 08:21:17    2913
      2011-04-05 21:13:50    2484
      2011-06-24 12:31:45    2035
2012  2012-05-23 08:12:31    3238
      2012-03-18 22:34:26    2624
      2012-07-05 14:21:15    2422
      2012-09-15 11:30:35    2066
      2012-05-23 23:21:18    2052
2013  2013-01-15 23:45:53     518
      2013-01-07 15:42:28     459
      2013-01-10 16:20:32     335
      2013-01-11 18:18:42     184
      2013-01-16 09:35:36     158
dtype: int64

In [89]:

result[2012]

Out[89]:

PostTime
2012-05-23 08:12:31    3238
2012-03-18 22:34:26    2624
2012-07-05 14:21:15    2422
2012-09-15 11:30:35    2066
2012-05-23 23:21:18    2052
dtype: int64

In [90]:

result

Out[90]:

      PostTime           
2011  2011-05-10 19:55:08    6454
      2011-10-20 14:46:14    4228
      2011-10-15 08:21:17    2913
      2011-04-05 21:13:50    2484
      2011-06-24 12:31:45    2035
2012  2012-05-23 08:12:31    3238
      2012-03-18 22:34:26    2624
      2012-07-05 14:21:15    2422
      2012-09-15 11:30:35    2066
      2012-05-23 23:21:18    2052
2013  2013-01-15 23:45:53     518
      2013-01-07 15:42:28     459
      2013-01-10 16:20:32     335
      2013-01-11 18:18:42     184
      2013-01-16 09:35:36     158
dtype: int64

In [91]:

year = views.index.year
month = views.index.month

keys = [year, month]
stats = ['count']

grouped = views.groupby(keys)

result = grouped.agg(stats)
result

Out[91]:

		count
2011	3	1
	4	2
	5	1
	6	1
	9	1
	10	4
	11	2
	12	1
2012	1	6
	2	4
	3	29
	4	27
	5	44
	6	50
	7	67
	8	91
	9	94
	10	105
	11	121
	12	128
2013	1	161
2013	2	12

Stack and Unstack¶

In [92]:

result.unstack(0)

Out[92]:

	count
	2011	2012	2013
1	NaN	6	161
2	NaN	4	12
3	1	29	NaN
4	2	27	NaN
5	1	44	NaN
6	1	50	NaN
7	NaN	67	NaN
8	NaN	91	NaN
9	1	94	NaN
10	4	105	NaN
11	2	121	NaN
12	1	128	NaN

In [93]:

result.unstack(0).fillna(0)

Out[93]:

	count
	2011	2012	2013
1	0	6	161
2	0	4	12
3	1	29	0
4	2	27	0
5	1	44	0
6	1	50	0
7	0	67	0
8	0	91	0
9	1	94	0
10	4	105	0
11	2	121	0
12	1	128	0

In [94]:

result

Out[94]:

		count
2011	3	1
	4	2
	5	1
	6	1
	9	1
	10	4
	11	2
	12	1
2012	1	6
	2	4
	3	29
	4	27
	5	44
	6	50
	7	67
	8	91
	9	94
	10	105
	11	121
	12	128
2013	1	161
2013	2	12

In [95]:

result.index.names = ['year', 'month']
result

Out[95]:

		count
year	month
2011	3	1
	4	2
	5	1
	6	1
	9	1
	10	4
	11	2
	12	1
2012	1	6
	2	4
	3	29
	4	27
	5	44
	6	50
	7	67
	8	91
	9	94
	10	105
	11	121
	12	128
2013	1	161
2013	2	12

In [96]:

result.unstack('year')

Out[96]:

	count
year	2011	2012	2013
month
1	NaN	6	161
2	NaN	4	12
3	1	29	NaN
4	2	27	NaN
5	1	44	NaN
6	1	50	NaN
7	NaN	67	NaN
8	NaN	91	NaN
9	1	94	NaN
10	4	105	NaN
11	2	121	NaN
12	1	128	NaN

In [97]:

result.unstack('year').stack('year')

Out[97]:

		count
month	year
1	2012	6
1	2013	161
2	2012	4
2	2013	12
3	2011	1
3	2012	29
4	2011	2
4	2012	27
5	2011	1
5	2012	44
6	2011	1
6	2012	50
7	2012	67
8	2012	91
9	2011	1
9	2012	94
10	2011	4
10	2012	105
11	2011	2
11	2012	121
12	2011	1
12	2012	128

In [98]:

stats = ['count', 'sum', 'mean', 'median']
result = grouped.agg(stats)
result.index.names = ['year', 'month']
result

Out[98]:

		count	sum	mean	median
year	month
2011	3	1	1128	1128.000000	1128.0
	4	2	3914	1957.000000	1957.0
	5	1	6454	6454.000000	6454.0
	6	1	2035	2035.000000	2035.0
	9	1	960	960.000000	960.0
	10	4	8590	2147.500000	1989.5
	11	2	1203	601.500000	601.5
	12	1	799	799.000000	799.0
2012	1	6	4901	816.833333	544.5
	2	4	3137	784.250000	505.0
	3	29	19587	675.413793	533.0
	4	27	16477	610.259259	487.0
	5	44	24072	547.090909	430.0
	6	50	21305	426.100000	272.5
	7	67	28852	430.626866	289.0
	8	91	26752	293.978022	206.0
	9	94	21712	230.978723	157.5
	10	105	16909	161.038095	125.0
	11	121	13561	112.074380	93.0
	12	128	10972	85.718750	66.0
2013	1	161	10106	62.770186	52.0
2013	2	12	414	34.500000	34.0

In [99]:

pd.options.display.max_columns = 20
pd.options.display.line_width = 1000
result.unstack('year')

Out[99]:

	count			sum			mean			median
year	2011	2012	2013	2011	2012	2013	2011	2012	2013	2011	2012	2013
month
1	NaN	6	161	NaN	4901	10106	NaN	816.833333	62.770186	NaN	544.5	52
2	NaN	4	12	NaN	3137	414	NaN	784.250000	34.500000	NaN	505.0	34
3	1	29	NaN	1128	19587	NaN	1128.0	675.413793	NaN	1128.0	533.0	NaN
4	2	27	NaN	3914	16477	NaN	1957.0	610.259259	NaN	1957.0	487.0	NaN
5	1	44	NaN	6454	24072	NaN	6454.0	547.090909	NaN	6454.0	430.0	NaN
6	1	50	NaN	2035	21305	NaN	2035.0	426.100000	NaN	2035.0	272.5	NaN
7	NaN	67	NaN	NaN	28852	NaN	NaN	430.626866	NaN	NaN	289.0	NaN
8	NaN	91	NaN	NaN	26752	NaN	NaN	293.978022	NaN	NaN	206.0	NaN
9	1	94	NaN	960	21712	NaN	960.0	230.978723	NaN	960.0	157.5	NaN
10	4	105	NaN	8590	16909	NaN	2147.5	161.038095	NaN	1989.5	125.0	NaN
11	2	121	NaN	1203	13561	NaN	601.5	112.074380	NaN	601.5	93.0	NaN
12	1	128	NaN	799	10972	NaN	799.0	85.718750	NaN	799.0	66.0	NaN

In [100]:

views.groupby(year).agg({'foo': np.mean})

Out[100]:

	foo
2011	1929.461538
2012	271.849869
2013	60.809249

In [101]:

# BLACK MAGIC!

def q(q):
    def f(x):
        return x.quantile(q)
    f.__name__ = 'quantile_%d' % int(q * 100)
    return f
    
views.groupby(year).agg([q(0.1), q(0.5), q(0.9)])

Out[101]:

	quantile_10	quantile_50	quantile_90
2011	466.2	1128	3965.0
2012	58.0	143	599.0
2013	29.0	51	88.8

In [102]:

np.random.seed(130612001)
attendees = open('attendees.txt').read().split()
np.random.shuffle(attendees)
attendees = pd.Series(attendees)
attendees[:7]

Out[102]:

0         khattri.vaibhav@gmail.com
1               giangdh89@gmail.com
2          johnchoo.lives@gmail.com
3    kukkillayavu@gis.a-star.edu.sg
4          wonglz@i2r.a-star.edu.sg
5          chenhl@i2r.a-star.edu.sg
6              kuanwh1996@gmail.com
dtype: object

In [103]:

views.resample('A', how='sum')

Out[103]:

PostTime
2011-12-31     25083
2012-12-31    208237
2013-12-31     10520
Freq: A-DEC, dtype: int64

In [104]:

views.resample('M', how='sum')

Out[104]:

PostTime
2011-03-31     1128
2011-04-30     3914
2011-05-31     6454
2011-06-30     2035
2011-07-31      NaN
2011-08-31      NaN
2011-09-30      960
2011-10-31     8590
2011-11-30     1203
2011-12-31      799
2012-01-31     4901
2012-02-29     3137
2012-03-31    19587
2012-04-30    16477
2012-05-31    24072
2012-06-30    21305
2012-07-31    28852
2012-08-31    26752
2012-09-30    21712
2012-10-31    16909
2012-11-30    13561
2012-12-31    10972
2013-01-31    10106
2013-02-28      414
Freq: M, dtype: float64

In [105]:

views.groupby(year).sum()

Out[105]:

2011     25083
2012    208237
2013     10520
dtype: int64

Grouping tables¶

In [106]:

answers.DisplayName

Out[106]:

0     Calvin Cheng
1     Calvin Cheng
2     Calvin Cheng
3     Calvin Cheng
4     Calvin Cheng
5     Calvin Cheng
6     Calvin Cheng
7     Calvin Cheng
8     Calvin Cheng
9     Calvin Cheng
10    Calvin Cheng
11    Calvin Cheng
12    Calvin Cheng
13    Calvin Cheng
14     Andy Hayden
...
1314                   Tom
1315       Henry Gomersall
1316          milkypostman
1317              RParadox
1318             Mr. Squig
1319                Austin
1320            waitingkuo
1321              Victor K
1322                monkut
1323           Hyperboreus
1324             Dan Allan
1325               mankoff
1326    Yaroslav Halchenko
1327        Felix Zumstein
1328            dantes_419
Name: DisplayName, Length: 1329, dtype: object

In [107]:

answers.groupby('DisplayName').size()

Out[107]:

DisplayName
0xc0de                 1
Aaron                  1
Aaron Digulla          1
Abe                    1
Abhi                   1
Abhranil Das           1
Abraham D Flaxman      1
Adam                   1
Adam Greenhall         1
Adobe                  1
Akavall                1
Alexander Stefanov     1
Alok                   2
Aman                  18
Amyunimus              1
...
user1569050    1
user1763885    1
user1827356    1
user1843099    3
user1917577    1
user1988295    1
user333700     3
user948652     2
vartec         1
velotron       1
waitingkuo     1
wiseveri       1
zach           4
zarthur        1
Óscar López    1
Length: 319, dtype: int64

In [108]:

num_answers = answers.groupby('DisplayName').size()
num_answers.order(ascending=False)

Out[108]:

DisplayName
Wes McKinney        208
Andy Hayden         116
Chang She            89
Wouter Overmeire     72
crewbum              40
BrenBarn             37
DSM                  35
Zelazny7             34
eumiro               31
unutbu               30
root                 20
bmu                  18
Matti John           18
Aman                 18
Rutger Kassies       14
...
Anurag Uniyal         1
Anaphory              1
Amyunimus             1
Alexander Stefanov    1
Akavall               1
Adobe                 1
Adam Greenhall        1
Adam                  1
Abraham D Flaxman     1
Abhranil Das          1
Abhi                  1
Abe                   1
Aaron Digulla         1
Aaron                 1
0xc0de                1
Length: 319, dtype: int64

In [109]:

answers.Score

Out[109]:

0     2
1     0
2     1
3     2
4     1
5     1
6     0
7     2
8     1
9     0
10    2
11    2
12    0
13    0
14    2
...
1314    3
1315    4
1316    0
1317    1
1318    0
1319    1
1320    2
1321    2
1322    2
1323    1
1324    2
1325    4
1326    4
1327    1
1328    0
Name: Score, Length: 1329, dtype: int64

In [110]:

def sortd(self):
    return self.order(ascending=False)
pd.Series.sortd = sortd

In [111]:

# pd.options.display.max_rows = 1000
by_name = answers.groupby('DisplayName')
result = by_name['Score'].agg(['mean', 'count'])

result = result.sort('mean', ascending=False)
result[:20]

Out[111]:

	mean	count
DisplayName
gcalmettes	16.000000	1
Dirk Eddelbuettel	12.000000	1
Matthew Dowle	10.000000	2
tr33hous	9.000000	1
Dani Arribas-Bel	8.000000	1
Lennart Regebro	8.000000	1
Joe Kington	7.000000	1
minrk	6.000000	1
lexual	6.000000	1
Bi Rico	6.000000	2
Sven Marnach	5.000000	2
user1234440	5.000000	1
dbaupp	5.000000	1
Brian Keegan	5.000000	1
luke14free	5.000000	1
jseabold	5.000000	2
andrew cooke	5.000000	1
Nick Crawford	5.000000	1
Mike Pennington	4.500000	2
ogrisel	4.333333	3

In [112]:

result['count'] >= 10

Out[112]:

DisplayName
gcalmettes           False
Dirk Eddelbuettel    False
Matthew Dowle        False
tr33hous             False
Dani Arribas-Bel     False
Lennart Regebro      False
Joe Kington          False
minrk                False
lexual               False
Bi Rico              False
Sven Marnach         False
user1234440          False
dbaupp               False
Brian Keegan         False
luke14free           False
...
gbronner         False
roysc            False
rocketman        False
Ross R           False
Rich Signell     False
Rian             False
Randy Olson      False
Rachel Gallen    False
Jason Morgan     False
nat              False
Jeff Tratner     False
mspadaccino      False
0xc0de           False
velotron         False
Paul Hiemstra    False
Name: count, Length: 319, dtype: bool

In [113]:

(result['count'] >= 10).sum()

Out[113]:

In [114]:

result[result['count'] >= 10]

Out[114]:

	mean	count
DisplayName
Paul H	3.454545	11
unutbu	3.000000	30
BrenBarn	2.675676	37
Wes McKinney	2.610577	208
DSM	2.371429	35
EMS	2.153846	13
Wouter Overmeire	2.097222	72
bmu	2.055556	18
eumiro	1.967742	31
Aman	1.888889	18
Zelazny7	1.882353	34
Matti John	1.666667	18
Andy Hayden	1.586207	116
crewbum	1.525000	40
root	1.500000	20
Jeff	1.454545	11
Chang She	1.325843	89
Rutger Kassies	1.285714	14
lbolla	1.272727	11
Calvin Cheng	1.000000	14
locojay	0.900000	10

In [115]:

stats = ['mean', 'min', 'max', 'std', 'count']
result = by_name['Score'].agg(stats)
result = result.sort('std', ascending=False)
result[result['count'] >= 10]

Out[115]:

	mean	min	max	std	count
DisplayName
Paul H	3.454545	1	18	4.926736	11
Wes McKinney	2.610577	0	40	4.042920	208
BrenBarn	2.675676	0	9	2.055535	37
bmu	2.055556	0	6	1.862074	18
unutbu	3.000000	0	8	1.819435	30
EMS	2.153846	0	6	1.818706	13
Aman	1.888889	0	5	1.604732	18
Wouter Overmeire	2.097222	0	6	1.584784	72
crewbum	1.525000	0	6	1.568725	40
eumiro	1.967742	0	6	1.471595	31
DSM	2.371429	0	6	1.373825	35
Zelazny7	1.882353	0	5	1.365477	34
Chang She	1.325843	0	7	1.346505	89
root	1.500000	0	5	1.317893	20
Andy Hayden	1.586207	0	6	1.305966	116
Rutger Kassies	1.285714	0	4	1.204388	14
lbolla	1.272727	0	4	1.103713	11
Jeff	1.454545	0	3	1.035725	11
locojay	0.900000	0	3	0.994429	10
Matti John	1.666667	0	4	0.970143	18
Calvin Cheng	1.000000	0	2	0.877058	14

In [116]:

top_so_users = result[result['count'] >= 10].index
top_so_users

Out[116]:

Index([Paul H, Wes McKinney, BrenBarn, bmu, unutbu, EMS, Aman, Wouter Overmeire, crewbum, eumiro, DSM, Zelazny7, Chang She, root, Andy Hayden, Rutger Kassies, lbolla, Jeff, locojay, Matti John, Calvin Cheng], dtype=object)

In [117]:

# Doing this 21 times would be slow
answers.DisplayName == 'Wes McKinney'

Out[117]:

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
1314    False
1315    False
1316    False
1317    False
1318    False
1319    False
1320    False
1321    False
1322    False
1323    False
1324    False
1325    False
1326    False
1327    False
1328    False
Name: DisplayName, Length: 1329, dtype: bool

In [118]:

'Wes McKinney' in top_so_users

Out[118]:

True

In [119]:

answers.DisplayName.isin(top_so_users)

Out[119]:

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
1314    False
1315    False
1316    False
1317    False
1318    False
1319    False
1320    False
1321    False
1322    False
1323    False
1324    False
1325    False
1326    False
1327    False
1328    False
Name: DisplayName, Length: 1329, dtype: bool

In [120]:

crit = answers.DisplayName.isin(top_so_users)
answers_subset = answers[crit]
answers_subset

Out[120]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 850 entries, 0 to 945
Data columns (total 26 columns):
Id_post                  850  non-null values
PostTypeId               850  non-null values
ParentId                 850  non-null values
PostTime                 850  non-null values
Score                    850  non-null values
Body                     850  non-null values
OwnerUserId              850  non-null values
OwnerDisplayName         0  non-null values
LastEditorUserId         238  non-null values
LastEditorDisplayName    1  non-null values
LastEditDate             239  non-null values
LastActivityDate         850  non-null values
CommentCount             556  non-null values
Id_user                  850  non-null values
Reputation               850  non-null values
UserJoinTime             850  non-null values
DisplayName              850  non-null values
LastAccessDate           850  non-null values
WebsiteUrl               414  non-null values
Location                 542  non-null values
AboutMe                  617  non-null values
Views                    850  non-null values
UpVotes                  850  non-null values
DownVotes                850  non-null values
EmailHash                850  non-null values
Age                      419  non-null values
dtypes: datetime64[ns](1), float64(4), int64(9), object(12)

In [121]:

# Doh, have to convert to index to do this
year = pd.DatetimeIndex(answers_subset.PostTime).year
year[:20]

Out[121]:

array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013], dtype=int32)

In [122]:

keys = ['DisplayName', year]

grouped = answers_subset.groupby(keys)

stats = ['mean', 'min', 'max', 'std', 'count']
result = grouped['Score'].agg(stats)
# result = result.sort('std', ascending=False)
result

Out[122]:

		mean	min	max	std	count
DisplayName
Aman	2012	2.000000	0	5	1.581139	17
Aman	2013	0.000000	0	0	NaN	1
Andy Hayden	2012	1.450704	0	6	1.307000	71
Andy Hayden	2013	1.800000	0	5	1.289820	45
BrenBarn	2012	2.705882	0	9	2.139519	34
BrenBarn	2013	2.333333	2	3	0.577350	3
Calvin Cheng	2012	1.000000	0	2	0.877058	14
Chang She	2012	1.325843	0	7	1.346505	89
DSM	2012	2.478261	0	6	1.473079	23
DSM	2013	2.166667	0	4	1.193416	12
EMS	2012	2.153846	0	6	1.818706	13
Jeff	2012	1.500000	0	3	1.290994	4
Jeff	2013	1.428571	0	3	0.975900	7
Matti John	2012	1.666667	0	4	0.970143	18
Paul H	2012	2.000000	1	4	1.154701	7
Paul H	2013	6.000000	1	18	8.041559	4
Rutger Kassies	2012	1.363636	0	4	1.286291	11
Rutger Kassies	2013	1.000000	0	2	1.000000	3
Wes McKinney	2011	8.125000	1	21	7.180082	8
	2012	2.450777	0	40	3.780282	193
	2013	0.714286	0	2	0.951190	7
Wouter Overmeire	2012	2.097222	0	6	1.584784	72
Zelazny7	2012	2.125000	0	5	1.726888	8
Zelazny7	2013	1.807692	0	5	1.265519	26
bmu	2012	1.875000	0	5	1.668333	16
bmu	2013	3.500000	1	6	3.535534	2
crewbum	2012	1.354839	0	6	1.427081	31
crewbum	2013	2.111111	0	5	1.964971	9
eumiro	2012	1.892857	0	6	1.523624	28
eumiro	2013	2.666667	2	3	0.577350	3
lbolla	2012	1.272727	0	4	1.103713	11
locojay	2012	0.777778	0	3	0.971825	9
locojay	2013	2.000000	2	2	NaN	1
root	2012	1.187500	0	5	1.223043	16
root	2013	2.750000	2	4	0.957427	4
unutbu	2011	8.000000	8	8	NaN	1
	2012	3.000000	0	7	1.878673	18
	2013	2.545455	2	5	0.934199	11

In [123]:

np.log(users.Reputation).hist(bins=100)

Out[123]:

<matplotlib.axes.AxesSubplot at 0x1075a5b90>

In [124]:

buckets = [0, 100, 1000, 10000, 1000000]
rep_bucket = pd.cut(answers.Reputation, buckets)

In [125]:

answers.groupby(rep_bucket).size()

Out[125]:

Reputation
(0, 100]            120
(100, 1000]         237
(1000, 10000]       783
(10000, 1000000]    189
dtype: int64

In [126]:

len(answers.DisplayName.unique())

Out[126]:

In [127]:

grouped = answers.groupby(rep_bucket)

def distinct(names):
    return len(names.unique())

statistics = {
    'Score': ['mean', 'median', 'count'],
    'Age': ['min', 'max'],
    'DisplayName': [distinct]
}

grouped.agg(statistics)

Out[127]:

	Age		Score			DisplayName
	min	max	mean	median	count	distinct
Reputation
(0, 100]	23	45	0.966667	0	120	87
(100, 1000]	17	46	1.413502	1	237	111
(1000, 10000]	19	49	1.978289	1	783	83
(10000, 1000000]	24	64	2.470899	2	189	38

In [128]:

users.irow(users.Reputation.argmax())

Out[128]:

Id                                                            20862
Reputation                                                   220371
CreationDate                                    2008-09-23 02:06:20
DisplayName                                  Ignacio Vazquez-Abrams
LastAccessDate                                  2013-02-02 22:20:15
WebsiteUrl                                                      NaN
Location                                                        NaN
AboutMe           <p><a href="http://whathaveyoutried.com/" rel=...
Views                                                          5231
UpVotes                                                        1426
DownVotes                                                        63
EmailHash                          2343ae368d3241278581ce6c87f62a25
Age                                                             NaN
Name: 532, dtype: object

New dataset time!¶

In [129]:

import json
path = 'foods-2011-10-03.json'

db = json.load(open(path))

In [130]:

print type(db)
len(db)

<type 'list'>

Out[130]:

In [131]:

db[0]

Out[131]:

{u'description': u'Cheese, caraway',
 u'group': u'Dairy and Egg Products',
 u'id': 1008,
 u'manufacturer': u'',
 u'nutrients': [{u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83},
  {u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83}],
 u'portions': [{u'amount': 1, u'grams': 28.35, u'unit': u'oz'}],
 u'tags': []}

In [132]:

db[0]['nutrients'][0]

Out[132]:

{u'description': u'Protein',
 u'group': u'Composition',
 u'units': u'g',
 u'value': 25.18}

In [133]:

pd.DataFrame([{'a': 1, 'b': 2},
              {'a': 2, 'b': 3, 'c': 4}])

Out[133]:

	a	b	c
0	1	2	NaN
1	2	3	4

In [134]:

pd.DataFrame(db[0]['nutrients'])

Out[134]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 0 to 161
Data columns (total 4 columns):
description    162  non-null values
group          162  non-null values
units          162  non-null values
value          162  non-null values
dtypes: float64(1), object(3)

In [135]:

pd.DataFrame(db[0]['nutrients']).head()

Out[135]:

	description	group	units	value
0	Protein	Composition	g	25.18
1	Total lipid (fat)	Composition	g	29.20
2	Carbohydrate, by difference	Composition	g	3.06
3	Ash	Other	g	3.28
4	Energy	Energy	kcal	376.00

In [136]:

db[0]['nutrients'][0]

Out[136]:

{u'description': u'Protein',
 u'group': u'Composition',
 u'units': u'g',
 u'value': 25.18}

In [137]:

db[0]['id']

Out[137]:

In [138]:

all_nutrients = []
for rec in db:
    for nutrient in rec['nutrients']:
        nutrient['id'] = rec['id']
        all_nutrients.append(nutrient)

nutrients = pd.DataFrame(all_nutrients)

In [139]:

nutrients.head()

Out[139]:

	description	group	id	units	value
0	Protein	Composition	1008	g	25.18
1	Total lipid (fat)	Composition	1008	g	29.20
2	Carbohydrate, by difference	Composition	1008	g	3.06
3	Ash	Other	1008	g	3.28
4	Energy	Energy	1008	kcal	376.00

In [140]:

pd.DataFrame(all_nutrients, columns=['value'])

Out[140]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 1 columns):
value    389355  non-null values
dtypes: float64(1)

In [141]:

of_interest = ['id', 'description', 'group']
meta = pd.DataFrame(db, columns=of_interest)
meta

Out[141]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6636 entries, 0 to 6635
Data columns (total 3 columns):
id             6636  non-null values
description    6636  non-null values
group          6636  non-null values
dtypes: int64(1), object(2)

In [142]:

nutrients

Out[142]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 5 columns):
description    389355  non-null values
group          389355  non-null values
id             389355  non-null values
units          389355  non-null values
value          389355  non-null values
dtypes: float64(1), int64(1), object(3)

In [143]:

mapping = {
    'description': 'nutrient',
    'group': 'ngroup'
}
nutrients = nutrients.rename(columns=mapping)

# COULD HAVE DONE
# nutrients.rename(columns=mapping, inplace=True)

mapping = {
    'description': 'food',
    'group': 'fgroup'
}
meta = meta.rename(columns=mapping)

In [144]:

data = pd.merge(meta, nutrients, on='id')

In [145]:

data

Out[145]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 7 columns):
id          389355  non-null values
food        389355  non-null values
fgroup      389355  non-null values
nutrient    389355  non-null values
ngroup      389355  non-null values
units       389355  non-null values
value       389355  non-null values
dtypes: float64(1), int64(1), object(5)

In [146]:

cafdata = data[data.nutrient == 'Caffeine']
cafdata

Out[146]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3911 entries, 172 to 389312
Data columns (total 7 columns):
id          3911  non-null values
food        3911  non-null values
fgroup      3911  non-null values
nutrient    3911  non-null values
ngroup      3911  non-null values
units       3911  non-null values
value       3911  non-null values
dtypes: float64(1), int64(1), object(5)

In [147]:

data.ix[172]

Out[147]:

id                            1009
food               Cheese, cheddar
fgroup      Dairy and Egg Products
nutrient                  Caffeine
ngroup                       Other
units                           mg
value                            0
Name: 172, dtype: object

In [148]:

cafdata.ix[172]

Out[148]:

id                            1009
food               Cheese, cheddar
fgroup      Dairy and Egg Products
nutrient                  Caffeine
ngroup                       Other
units                           mg
value                            0
Name: 172, dtype: object

In [149]:

cafdata.units.value_counts()

Out[149]:

mg    3911
dtype: int64

In [150]:

# MAXIMUM IN NUMPY
cafdata.value.values.argmax()

Out[150]:

In [151]:

# INDEX OF THE MAXIMUM VALUE
cafdata.value.idxmax()

Out[151]:

In [152]:

data.ix[cafdata.value.idxmax()]

Out[152]:

id                                      14366
food        Tea, instant, unsweetened, powder
fgroup                              Beverages
nutrient                             Caffeine
ngroup                                  Other
units                                      mg
value                                    3680
Name: 336702, dtype: object

In [153]:

cafdata.dtypes

Out[153]:

id            int64
food         object
fgroup       object
nutrient     object
ngroup       object
units        object
value       float64
dtype: object

In [154]:

def max_value(group):
    return group.ix[group['value'].idxmax()]

max_value(cafdata)

Out[154]:

id                                      14366
food        Tea, instant, unsweetened, powder
fgroup                              Beverages
nutrient                             Caffeine
ngroup                                  Other
units                                      mg
value                                    3680
Name: 336702, dtype: object

In [156]:

cafdata.groupby('fgroup').apply(max_value).dtypes

Out[156]:

id          object
food        object
fgroup      object
nutrient    object
ngroup      object
units       object
value       object
dtype: object

In [157]:

cafdata.groupby('fgroup').apply(max_value)

Out[157]:

	id	food	fgroup	nutrient	ngroup	units	value
fgroup
Baby Foods	3008	Babyfood, meat, ham, strained	Baby Foods	Caffeine	Other	mg	0
Baked Products	18174	Cookies, graham crackers, chocolate-coated	Baked Products	Caffeine	Other	mg	46
Beef Products	13000	Beef, grass-fed, strip steaks, lean only, raw	Beef Products	Caffeine	Other	mg	0
Beverages	14366	Tea, instant, unsweetened, powder	Beverages	Caffeine	Other	mg	3680
Breakfast Cereals	8294	Cereals ready-to-eat, QUAKER, QUAKER COCOA BLASTS	Breakfast Cereals	Caffeine	Other	mg	21
Cereal Grains and Pasta	20004	Barley, hulled	Cereal Grains and Pasta	Caffeine	Other	mg	0
Dairy and Egg Products	43260	Beverage, instant breakfast powder, chocolate,...	Dairy and Egg Products	Caffeine	Other	mg	52
Ethnic Foods	35009	Whale, beluga, meat, dried (Alaska Native)	Ethnic Foods	Caffeine	Other	mg	0
Fast Foods	21030	Fast foods, cookies, chocolate chip	Fast Foods	Caffeine	Other	mg	11
Fats and Oils	2050	Lard	Fats and Oils	Caffeine	Other	mg	0
Finfish and Shellfish Products	15001	Fish, anchovy, european, raw	Finfish and Shellfish Products	Caffeine	Other	mg	0
Fruits and Fruit Juices	9002	Acerola juice, raw	Fruits and Fruit Juices	Caffeine	Other	mg	0
Lamb, Veal, and Game Products	17094	Veal, leg (top round), separable lean and fat,...	Lamb, Veal, and Game Products	Caffeine	Other	mg	0
Legumes and Legume Products	16166	Soymilk, chocolate, unfortified	Legumes and Legume Products	Caffeine	Other	mg	2
Meals, Entrees, and Sidedishes	22247	Macaroni and Cheese, canned entree	Meals, Entrees, and Sidedishes	Caffeine	Other	mg	0
Nut and Seed Products	12014	Seeds, pumpkin and squash seed kernels, dried	Nut and Seed Products	Caffeine	Other	mg	0
Pork Products	10000	Pork, fresh, enhanced, composite of separable ...	Pork Products	Caffeine	Other	mg	0
Poultry Products	5008	Chicken, broilers or fryers, meat and skin, co...	Poultry Products	Caffeine	Other	mg	0
Restaurant Foods	36617	Restaurant, Chinese, lemon chicken	Restaurant Foods	Caffeine	Other	mg	0
Sausages and Luncheon Meats	7008	Bologna, beef and pork	Sausages and Luncheon Meats	Caffeine	Other	mg	0
Snacks	25031	Formulated bar, ZONE PERFECT CLASSIC CRUNCH BA...	Snacks	Caffeine	Other	mg	11
Soups, Sauces, and Gravies	3073	Soup, cream of asparagus, canned, condensed	Soups, Sauces, and Gravies	Caffeine	Other	mg	0
Spices and Herbs	1025	Spices, allspice, ground	Spices and Herbs	Caffeine	Other	mg	0
Sweets	19268	Candies, dark chocolate coated coffee beans	Sweets	Caffeine	Other	mg	839
Vegetables and Vegetable Products	11001	Alfalfa seeds, sprouted, raw	Vegetables and Vegetable Products	Caffeine	Other	mg	0

In [158]:

grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results

Out[158]:

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2246 entries, (Adjusted Protein, Sweets) to (Zinc, Zn, Vegetables and Vegetable Products)
Data columns (total 7 columns):
id          2246  non-null values
food        2246  non-null values
fgroup      2246  non-null values
nutrient    2246  non-null values
ngroup      2246  non-null values
units       2246  non-null values
value       2246  non-null values
dtypes: object(7)

In [159]:

pd.options.display.line_width = 10000
def topn(group, by=None, n=5):
    return group.sort(by, ascending=False)[:n]

result = cafdata.groupby('fgroup').apply(topn, by='value', n=3)
result.ix['Beverages']

Out[159]:

	id	food	fgroup	nutrient	ngroup	units	value
336702	14366	Tea, instant, unsweetened, powder	Beverages	Caffeine	Other	mg	3680
334542	14214	Coffee, instant, regular, powder	Beverages	Caffeine	Other	mg	3142
336913	14375	Tea, instant, sweetened with sodium saccharin,...	Beverages	Caffeine	Other	mg	2240

In [160]:

data.nutrient.unique()

Out[160]:

array([u'Protein', u'Total lipid (fat)', u'Carbohydrate, by difference',
       u'Ash', u'Energy', u'Water', u'Fiber, total dietary',
       u'Calcium, Ca', u'Iron, Fe', u'Magnesium, Mg', u'Phosphorus, P',
       u'Potassium, K', u'Sodium, Na', u'Zinc, Zn', u'Copper, Cu',
       u'Manganese, Mn', u'Selenium, Se', u'Vitamin A, IU', u'Retinol',
       u'Vitamin A, RAE', u'Vitamin C, total ascorbic acid', u'Thiamin',
       u'Riboflavin', u'Niacin', u'Pantothenic acid', u'Vitamin B-6',
       u'Folate, total', u'Vitamin B-12', u'Folic acid', u'Folate, food',
       u'Folate, DFE', u'Cholesterol', u'Fatty acids, total saturated',
       u'Fatty acids, total monounsaturated',
       u'Fatty acids, total polyunsaturated', u'Tryptophan', u'Threonine',
       u'Isoleucine', u'Leucine', u'Lysine', u'Methionine', u'Cystine',
       u'Phenylalanine', u'Tyrosine', u'Valine', u'Arginine', u'Histidine',
       u'Alanine', u'Aspartic acid', u'Glutamic acid', u'Glycine',
       u'Proline', u'Serine', u'Sucrose', u'Lactose', u'Maltose',
       u'Alcohol, ethyl', u'Caffeine', u'Theobromine', u'Sugars, total',
       u'Fluoride, F', u'Carotene, beta', u'Carotene, alpha',
       u'Vitamin E (alpha-tocopherol)', u'Vitamin D',
       u'Vitamin D3 (cholecalciferol)', u'Vitamin D (D2 + D3)',
       u'Cryptoxanthin, beta', u'Lycopene', u'Lutein + zeaxanthin',
       u'Tocopherol, gamma', u'Tocopherol, delta', u'Choline, total',
       u'Vitamin K (phylloquinone)', u'Betaine', u'Vitamin E, added',
       u'Vitamin B-12, added', u'Dihydrophylloquinone',
       u'Tocopherol, beta', u'Phytosterols', u'Glucose (dextrose)',
       u'Fructose', u'Galactose', u'Fatty acids, total trans',
       u'Stigmasterol', u'Campesterol', u'Beta-sitosterol',
       u'Fatty acids, total trans-monoenoic',
       u'Fatty acids, total trans-polyenoic', u'Hydroxyproline', u'Starch',
       u'Menaquinone-4', u'Vitamin D2 (ergocalciferol)',
       u'Adjusted Protein'], dtype=object)

In [167]:

grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results.ix['Zinc, Zn']

Out[167]:

	id	food	fgroup	nutrient	ngroup	units	value
fgroup
Baby Foods	3192	Babyfood, cereal, oatmeal, with applesauce and...	Baby Foods	Zinc, Zn	Elements	mg	7.5
Baked Products	18374	Leavening agents, yeast, baker's, compressed	Baked Products	Zinc, Zn	Elements	mg	9.97
Beef Products	13981	Beef, chuck, short ribs, boneless, separable l...	Beef Products	Zinc, Zn	Elements	mg	12.28
Beverages	14557	Chocolate-flavor beverage mix for milk, powder...	Beverages	Zinc, Zn	Elements	mg	6.82
Breakfast Cereals	8504	Cereals ready-to-eat, Ralston Enriched Bran fl...	Breakfast Cereals	Zinc, Zn	Elements	mg	64.33
Cereal Grains and Pasta	20078	Wheat germ, crude	Cereal Grains and Pasta	Zinc, Zn	Elements	mg	12.29
Dairy and Egg Products	43260	Beverage, instant breakfast powder, chocolate,...	Dairy and Egg Products	Zinc, Zn	Elements	mg	15.4
Ethnic Foods	35194	Agave, dried (Southwest)	Ethnic Foods	Zinc, Zn	Elements	mg	12.1
Fast Foods	21048	Fast foods, oysters, battered or breaded, and ...	Fast Foods	Zinc, Zn	Elements	mg	11.25
Fats and Oils	2066	Salad dressing, russian dressing, low calorie	Fats and Oils	Zinc, Zn	Elements	mg	2.41
Finfish and Shellfish Products	15170	Mollusks, oyster, eastern, canned	Finfish and Shellfish Products	Zinc, Zn	Elements	mg	90.95
Fruits and Fruit Juices	9030	Apricots, dehydrated (low-moisture), sulfured,...	Fruits and Fruit Juices	Zinc, Zn	Elements	mg	1
Lamb, Veal, and Game Products	17202	Veal, variety meats and by-products, liver, raw	Lamb, Veal, and Game Products	Zinc, Zn	Elements	mg	12.02
Legumes and Legume Products	16176	Soy protein isolate, PROTEIN TECHNOLOGIES INTE...	Legumes and Legume Products	Zinc, Zn	Elements	mg	60
Meals, Entrees, and Sidedishes	22959	Macaroni and cheese dinner with dry sauce mix,...	Meals, Entrees, and Sidedishes	Zinc, Zn	Elements	mg	1.42
Nut and Seed Products	12011	Seeds, cottonseed meal, partially defatted (gl...	Nut and Seed Products	Zinc, Zn	Elements	mg	12.32
Pork Products	10111	Pork, fresh, variety meats and by-products, li...	Pork Products	Zinc, Zn	Elements	mg	6.72
Poultry Products	5180	Turkey, all classes, neck, meat only, cooked, ...	Poultry Products	Zinc, Zn	Elements	mg	7.12
Restaurant Foods	36608	CRACKER BARREL, grilled sirloin steak	Restaurant Foods	Zinc, Zn	Elements	mg	5.67
Sausages and Luncheon Meats	7912	Roast beef spread	Sausages and Luncheon Meats	Zinc, Zn	Elements	mg	6.08
Snacks	25006	Formulated bar, MARS SNACKFOOD US, SNICKERS Ma...	Snacks	Zinc, Zn	Elements	mg	13.57
Soups, Sauces, and Gravies	6048	Soup, oyster stew, canned, condensed	Soups, Sauces, and Gravies	Zinc, Zn	Elements	mg	8.4
Spices and Herbs	2008	Spices, chervil, dried	Spices and Herbs	Zinc, Zn	Elements	mg	8.8
Sweets	19078	Baking chocolate, unsweetened, squares	Sweets	Zinc, Zn	Elements	mg	9.63
Vegetables and Vegetable Products	11268	Mushrooms, shiitake, dried	Vegetables and Vegetable Products	Zinc, Zn	Elements	mg	7.66

In [168]:

results.ix['Zinc, Zn'].value

Out[168]:

fgroup
Baby Foods                             7.5
Baked Products                        9.97
Beef Products                        12.28
Beverages                             6.82
Breakfast Cereals                    64.33
Cereal Grains and Pasta              12.29
Dairy and Egg Products                15.4
Ethnic Foods                          12.1
Fast Foods                           11.25
Fats and Oils                         2.41
Finfish and Shellfish Products       90.95
Fruits and Fruit Juices                  1
Lamb, Veal, and Game Products        12.02
Legumes and Legume Products             60
Meals, Entrees, and Sidedishes        1.42
Nut and Seed Products                12.32
Pork Products                         6.72
Poultry Products                      7.12
Restaurant Foods                      5.67
Sausages and Luncheon Meats           6.08
Snacks                               13.57
Soups, Sauces, and Gravies             8.4
Spices and Herbs                       8.8
Sweets                                9.63
Vegetables and Vegetable Products     7.66
Name: value, dtype: object

In [169]:

results.ix['Zinc, Zn'].value.plot(kind='barh')

Out[169]:

<matplotlib.axes.AxesSubplot at 0x10100a190>

In [170]:

plt.figure(figsize=(10, 8))
medians = data.groupby(['nutrient', 'fgroup']).value.median()
medians.ix['Zinc, Zn'].plot(kind='barh')

Out[170]:

<matplotlib.axes.AxesSubplot at 0x11e151090>

In [171]:

plt.figure(figsize=(10, 8))
maxes = data.groupby(['nutrient', 'fgroup']).value.max()
np.log(maxes.ix['Caffeine']).plot(kind='barh')

Out[171]:

<matplotlib.axes.AxesSubplot at 0x106c3abd0>

In [172]:

data[['food', 'nutrient']].describe()

Out[172]:

	food	nutrient
count	389355	389355
unique	6636	94
top	Milk, reduced fat, fluid, 2% milkfat, with add...	Energy
freq	270	13704

In [ ]: