In [1]:
import pandas as pd
import numpy as np
pd.__version__
Out[1]:
'0.11.0'
In [2]:
values = [1.5, 5, -5, 4, 2]

s = pd.Series(values)
s
Out[2]:
0    1.5
1    5.0
2   -5.0
3    4.0
4    2.0
dtype: float64
In [3]:
s.values
Out[3]:
array([ 1.5,  5. , -5. ,  4. ,  2. ])
In [4]:
s.index
Out[4]:
Int64Index([0, 1, 2, 3, 4], dtype=int64)
In [5]:
labels = ['a', 'b', 'c', 'd', 'e']

s2 = pd.Series(values, index=labels)
s2
Out[5]:
a    1.5
b    5.0
c   -5.0
d    4.0
e    2.0
dtype: float64
In [6]:
s2.index
Out[6]:
Index([a, b, c, d, e], dtype=object)
In [7]:
s2.index[0]
Out[7]:
'a'
In [8]:
s2[2]
Out[8]:
-5.0
In [9]:
s2['c']
Out[9]:
-5.0
In [10]:
s2[:3]
Out[10]:
a    1.5
b    5.0
c   -5.0
dtype: float64
In [11]:
s2[['a', 'c', 'd']]
Out[11]:
a    1.5
c   -5.0
d    4.0
dtype: float64
In [12]:
# missing data
s2[['a', 'c', 'd', 'f']]
Out[12]:
a    1.5
c   -5.0
d    4.0
f    NaN
dtype: float64
In [13]:
s3 = s2[['a', 'c', 'd', 'f']]
s3[s3.isnull()]
Out[13]:
f   NaN
dtype: float64
In [14]:
s3[s3.notnull()]
Out[14]:
a    1.5
c   -5.0
d    4.0
dtype: float64
In [15]:
s3.dropna()
Out[15]:
a    1.5
c   -5.0
d    4.0
dtype: float64
In [16]:
s3.fillna(0)
Out[16]:
a    1.5
c   -5.0
d    4.0
f    0.0
dtype: float64
In [17]:
s3.mean()
Out[17]:
0.16666666666666666
In [18]:
s3.fillna(s3.mean())
Out[18]:
a    1.500000
c   -5.000000
d    4.000000
f    0.166667
dtype: float64
In [19]:
data = {
  'two': s2,
  'three': s3.fillna(s3.mean())
}

df = pd.DataFrame(data)
df
Out[19]:
three two
a 1.500000 1.5
b NaN 5.0
c -5.000000 -5.0
d 4.000000 4.0
e NaN 2.0
f 0.166667 NaN
In [20]:
df.index.name = 'edition'
df.columns.name = 'number'
df
Out[20]:
number three two
edition
a 1.500000 1.5
b NaN 5.0
c -5.000000 -5.0
d 4.000000 4.0
e NaN 2.0
f 0.166667 NaN
In [21]:
np.set_printoptions(precision=4)
np.random.randn(5, 3).T
Out[21]:
array([[-0.5463, -0.3112, -0.9632,  1.8207,  2.6747],
       [ 0.3955,  0.8636, -0.0378, -0.3928, -0.4491],
       [-0.4356,  0.9094, -0.9029, -2.0247,  0.2625]])
In [22]:
df.T
Out[22]:
edition a b c d e f
number
three 1.5 NaN -5 4 NaN 0.166667
two 1.5 5 -5 4 2 NaN
In [23]:
df['three']
Out[23]:
edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64
In [24]:
df
Out[24]:
number three two
edition
a 1.500000 1.5
b NaN 5.0
c -5.000000 -5.0
d 4.000000 4.0
e NaN 2.0
f 0.166667 NaN
In [25]:
df2 = pd.DataFrame(data, columns=['one', 'two', 'three'])
df2
Out[25]:
one two three
a NaN 1.5 1.500000
b NaN 5.0 NaN
c NaN -5.0 -5.000000
d NaN 4.0 4.000000
e NaN 2.0 NaN
f NaN NaN 0.166667
In [26]:
df
Out[26]:
number three two
edition
a 1.500000 1.5
b NaN 5.0
c -5.000000 -5.0
d 4.000000 4.0
e NaN 2.0
f 0.166667 NaN
In [27]:
df[['two', 'three']]
Out[27]:
number two three
edition
a 1.5 1.500000
b 5.0 NaN
c -5.0 -5.000000
d 4.0 4.000000
e 2.0 NaN
f NaN 0.166667
In [28]:
df.ix['c']
Out[28]:
number
three    -5
two      -5
Name: c, dtype: float64
In [29]:
df.ix['c':]
Out[29]:
number three two
edition
c -5.000000 -5
d 4.000000 4
e NaN 2
f 0.166667 NaN
In [30]:
df.ix[['c', 'e']]
Out[30]:
number three two
c -5 -5
e NaN 2
In [31]:
df.ix[['c', 'e'], 'three']
Out[31]:
c    -5
e   NaN
Name: three, dtype: float64
In [32]:
df.ix[['c', 'e'], ['two', 'three']]
Out[32]:
two three
c -5 -5
e 2 NaN
In [33]:
df['four'] = df['two'] * 2
df
Out[33]:
number three two four
edition
a 1.500000 1.5 3
b NaN 5.0 10
c -5.000000 -5.0 -10
d 4.000000 4.0 8
e NaN 2.0 4
f 0.166667 NaN NaN
In [34]:
del df['four']
df
Out[34]:
three two
edition
a 1.500000 1.5
b NaN 5.0
c -5.000000 -5.0
d 4.000000 4.0
e NaN 2.0
f 0.166667 NaN
In [35]:
df.three
Out[35]:
edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64
In [36]:
df['three']
Out[36]:
edition
a          1.500000
b               NaN
c         -5.000000
d          4.000000
e               NaN
f          0.166667
Name: three, dtype: float64
In [37]:
df['index'] = 0
df
Out[37]:
three two index
edition
a 1.500000 1.5 0
b NaN 5.0 0
c -5.000000 -5.0 0
d 4.000000 4.0 0
e NaN 2.0 0
f 0.166667 NaN 0
In [38]:
df.index
Out[38]:
Index([a, b, c, d, e, f], dtype=object)
In [39]:
!head -n 5 PandasUsers.csv
Id,Reputation,CreationDate,DisplayName,LastAccessDate,WebsiteUrl,Location,AboutMe,Views,UpVotes,DownVotes,EmailHash,Age
1843099,100,2012-11-21 19:25:04,user1843099,2013-02-03 03:46:50,,,,0,16,0,17a9c54b937f8b505dcb3b0b5b34cbb6,
854739,766,2011-07-20 20:29:21,mikebmassey,2013-02-02 17:40:38,http://Itsprivate...,"Charlotte, NC","<p>I'm a analytics guy for a financial company.  Passionate about new technology.  Trying to simultaneously learn 5 different programming languages and failing at all of them...</p>
",0,242,3,61af09414341715d110f08b7505fe114,35.0
1479269,472,2012-06-25 07:10:33,dmvianna,2013-01-31 10:20:43,,"Melbourne, Australia","<p>Former neuroscientist now data analyst.</p>
In [40]:
askers = pd.read_csv('PandasUsers.csv')
questions = pd.read_csv('PandasPosts.csv')
helpers = pd.read_csv('PandasAnswerers.csv')
answers = pd.read_csv('PandasAnswers.csv')
In [41]:
askers
Out[41]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 490 entries, 0 to 489
Data columns (total 13 columns):
Id                490  non-null values
Reputation        490  non-null values
CreationDate      490  non-null values
DisplayName       490  non-null values
LastAccessDate    490  non-null values
WebsiteUrl        81  non-null values
Location          143  non-null values
AboutMe           105  non-null values
Views             490  non-null values
UpVotes           490  non-null values
DownVotes         490  non-null values
EmailHash         490  non-null values
Age               91  non-null values
dtypes: float64(1), int64(5), object(7)
In [42]:
askers.ix[100]
Out[42]:
Id                                                 1301710
Reputation                                            3110
CreationDate                           2012-03-29 19:23:17
DisplayName                                            bmu
LastAccessDate                         2013-02-02 22:18:29
WebsiteUrl                                             NaN
Location                                               NaN
AboutMe           <p>My about me is currently blank.</p>\n
Views                                                    0
UpVotes                                                975
DownVotes                                               38
EmailHash                 c09e91f8758f4ea83dbaa5e199ebb71a
Age                                                    NaN
Name: 100, dtype: object
In [43]:
askers.ix[100]['CreationDate']
Out[43]:
'2012-03-29 19:23:17'
In [44]:
questions
Out[44]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 952 entries, 0 to 951
Data columns (total 18 columns):
Id                  952  non-null values
PostTypeId          952  non-null values
AcceptedAnswerId    656  non-null values
CreationDate        952  non-null values
Score               952  non-null values
ViewCount           952  non-null values
Body                952  non-null values
OwnerUserId         952  non-null values
OwnerDisplayName    1  non-null values
LastEditorUserId    438  non-null values
LastEditDate        438  non-null values
LastActivityDate    952  non-null values
Title               952  non-null values
Tags                952  non-null values
AnswerCount         903  non-null values
CommentCount        438  non-null values
FavoriteCount       303  non-null values
ClosedDate          21  non-null values
dtypes: float64(5), int64(5), object(8)
In [45]:
answers
Out[45]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1331 entries, 0 to 1330
Data columns (total 13 columns):
Id                       1331  non-null values
PostTypeId               1331  non-null values
ParentId                 1331  non-null values
CreationDate             1331  non-null values
Score                    1331  non-null values
Body                     1331  non-null values
OwnerUserId              1329  non-null values
OwnerDisplayName         2  non-null values
LastEditorUserId         366  non-null values
LastEditorDisplayName    2  non-null values
LastEditDate             368  non-null values
LastActivityDate         1331  non-null values
CommentCount             804  non-null values
dtypes: float64(3), int64(4), object(6)
In [46]:
askers.DisplayName
Out[46]:
0          user1843099
1          mikebmassey
2             dmvianna
3            ezbentley
4                Ralph
5           Evan Davey
6      Nicola Vianello
7                Roger
8                Shane
9                roysc
10                RJCL
11                 PhE
12    Martin C. Martin
13               Einar
14              javier
...
475           briant57
476        user1470604
477            cpcloud
478             mhubig
479          akiladila
480        user1246428
481             Shakti
482            PlagTag
483          user14121
484               Tony
485             GBadge
486    Skylar Saveland
487          sashkello
488        user1498485
489          A Alstone
Name: DisplayName, Length: 490, dtype: object
In [47]:
asker_count = askers.DisplayName.value_counts()
helper_count = helpers.DisplayName.value_counts()
In [48]:
ask_and_answer = asker_count.index.intersection(helper_count.index)
In [49]:
len(ask_and_answer)
Out[49]:
118
In [50]:
len(askers)
Out[50]:
490
In [51]:
len(helpers)
Out[51]:
319
In [52]:
users = askers.append(helpers, ignore_index=True)
users
Out[52]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 809 entries, 0 to 808
Data columns (total 13 columns):
Id                809  non-null values
Reputation        809  non-null values
CreationDate      809  non-null values
DisplayName       809  non-null values
LastAccessDate    809  non-null values
WebsiteUrl        195  non-null values
Location          298  non-null values
AboutMe           240  non-null values
Views             809  non-null values
UpVotes           809  non-null values
DownVotes         809  non-null values
EmailHash         809  non-null values
Age               200  non-null values
dtypes: float64(1), int64(5), object(7)
In [53]:
users.DisplayName.duplicated().sum()
Out[53]:
120
In [54]:
-users.Id.duplicated()
Out[54]:
0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
794     True
795     True
796    False
797     True
798     True
799     True
800     True
801     True
802     True
803     True
804     True
805    False
806     True
807    False
808     True
Name: Id, Length: 809, dtype: bool
In [55]:
# converts to distinct users
users = users[-users.Id.duplicated()]
In [56]:
askers = askers.set_index('Id')
helpers = helpers.set_index('Id')

len(askers.index.intersection(helpers.index))
Out[56]:
115
In [57]:
users
Out[57]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 694 entries, 0 to 808
Data columns (total 13 columns):
Id                694  non-null values
Reputation        694  non-null values
CreationDate      694  non-null values
DisplayName       694  non-null values
LastAccessDate    694  non-null values
WebsiteUrl        167  non-null values
Location          255  non-null values
AboutMe           202  non-null values
Views             694  non-null values
UpVotes           694  non-null values
DownVotes         694  non-null values
EmailHash         694  non-null values
Age               170  non-null values
dtypes: float64(1), int64(5), object(7)
In [58]:
questions
Out[58]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 952 entries, 0 to 951
Data columns (total 18 columns):
Id                  952  non-null values
PostTypeId          952  non-null values
AcceptedAnswerId    656  non-null values
CreationDate        952  non-null values
Score               952  non-null values
ViewCount           952  non-null values
Body                952  non-null values
OwnerUserId         952  non-null values
OwnerDisplayName    1  non-null values
LastEditorUserId    438  non-null values
LastEditDate        438  non-null values
LastActivityDate    952  non-null values
Title               952  non-null values
Tags                952  non-null values
AnswerCount         903  non-null values
CommentCount        438  non-null values
FavoriteCount       303  non-null values
ClosedDate          21  non-null values
dtypes: float64(5), int64(5), object(8)
In [59]:
questions = pd.merge(questions, users,
                     left_on='OwnerUserId',
                     right_on='Id',
                     suffixes=['_post', '_user'])

answers = pd.merge(answers, users,
                   left_on='OwnerUserId',
                   right_on='Id',
                   suffixes=['_post', '_user'])
In [60]:
column_mapping = {
    'CreationDate_post': 'PostTime',
    'CreationDate_user': 'UserJoinTime'    
}

questions = questions.rename(columns=column_mapping)
answers = answers.rename(columns=column_mapping)
In [61]:
questions.PostTime[0]
Out[61]:
'2012-11-19 02:12:10'
In [62]:
pd.to_datetime('2012-11-19 02:12:10')
Out[62]:
datetime.datetime(2012, 11, 19, 2, 12, 10)
In [63]:
pd.to_datetime('11/12/2012', dayfirst=True)
Out[63]:
datetime.datetime(2012, 12, 11, 0, 0)
In [64]:
pd.to_datetime('11/12/12', format='%d/%m/%y')
Out[64]:
datetime.datetime(2012, 11, 12, 0, 0)
In [65]:
questions.PostTime.dtype
Out[65]:
dtype('O')
In [66]:
pd.to_datetime(questions.PostTime)
Out[66]:
0    2012-11-19 02:12:10
1    2012-11-20 17:22:04
2    2012-12-30 17:19:57
3    2012-12-30 22:43:48
4    2012-12-31 20:22:16
5    2012-11-19 00:04:20
6    2012-11-08 22:04:48
7    2012-11-17 15:59:38
8    2012-11-19 16:15:37
9    2012-11-21 11:23:14
10   2012-11-19 13:15:45
11   2013-01-06 08:44:13
12   2012-08-29 21:47:00
13   2012-09-03 15:16:55
14   2012-09-11 10:58:50
...
937   2012-11-11 13:26:27
938   2012-11-13 02:18:10
939   2012-11-08 01:51:17
940   2012-11-08 23:33:25
941   2012-11-14 23:29:30
942   2012-11-14 12:04:13
943   2012-11-15 19:11:57
944   2012-11-16 17:57:11
945   2012-11-16 17:58:23
946   2012-11-15 20:27:19
947   2012-11-16 12:10:27
948   2012-11-18 22:14:06
949   2012-11-14 22:14:50
950   2012-11-17 19:34:36
951   2012-11-18 22:22:39
Name: PostTime, Length: 952, dtype: datetime64[ns]
In [67]:
pd.to_datetime(questions.PostTime)[0]
Out[67]:
<Timestamp: 2012-11-19 02:12:10>
In [68]:
stamp = pd.to_datetime(questions.PostTime)[0]
stamp
Out[68]:
<Timestamp: 2012-11-19 02:12:10>
In [69]:
stamp.tz_localize('utc')
Out[69]:
<Timestamp: 2012-11-19 02:12:10+0000 UTC, tz=UTC>
In [70]:
stamp.tz_localize('utc').tz_convert('Asia/Singapore')
Out[70]:
<Timestamp: 2012-11-19 10:12:10+0800 SGT, tz=Asia/Singapore>
In [71]:
questions['PostTime'] = pd.to_datetime(questions.PostTime)
answers['PostTime'] = pd.to_datetime(answers.PostTime)
In [72]:
answers.PostTime[5]
Out[72]:
<Timestamp: 2012-11-07 01:23:02>
In [73]:
# you can do this in read_csv
posts2 = pd.read_csv('PandasPosts.csv', parse_dates=['CreationDate'])
posts2.CreationDate[0]
Out[73]:
<Timestamp: 2012-11-19 02:12:10>
In [74]:
questions.sort('ViewCount', ascending=False).ViewCount
Out[74]:
444    6454
490    4228
615    3238
486    2913
515    2624
443    2484
672    2422
849    2066
612    2052
504    2035
681    1992
497    1884
269    1817
582    1777
550    1768
...
25     27
228    27
407    26
440    25
422    24
394    24
63     23
123    23
324    22
501    21
65     21
433    21
64     19
459    19
481     5
Name: ViewCount, Length: 952, dtype: int64
In [75]:
top10 = questions.sort('ViewCount', ascending=False)[:10]
top10[['Title', 'ViewCount']]
Out[75]:
Title ViewCount
444 Tutorial on PANDAS and PYTABLES 6454
490 What is the most efficient way to loop through... 4228
615 add one row in a pandas.DataFrame 3238
486 append two data frame with pandas 2913
515 Add indexed column to DataFrame with pandas 2624
443 Sort a pandas DataMatrix in ascending order 2484
672 Renaming columns in pandas 2422
849 How to fix Python Numpy/Pandas installation? 2066
612 iterating row by row through a pandas dataframe 2052
504 How to get the correlation between two timeser... 2035
In [76]:
views = questions.ViewCount.values

views = pd.Series(views, index=questions.PostTime)
views
Out[76]:
PostTime
2012-11-19 02:12:10     74
2012-11-20 17:22:04    107
2012-12-30 17:19:57     65
2012-12-30 22:43:48     53
2012-12-31 20:22:16     48
2012-11-19 00:04:20     71
2012-11-08 22:04:48     64
2012-11-17 15:59:38    238
2012-11-19 16:15:37     31
2012-11-21 11:23:14     44
2012-11-19 13:15:45    158
2013-01-06 08:44:13     67
2012-08-29 21:47:00    266
2012-09-03 15:16:55    365
2012-09-11 10:58:50    127
...
2012-11-11 13:26:27    114
2012-11-13 02:18:10    151
2012-11-08 01:51:17     92
2012-11-08 23:33:25    110
2012-11-14 23:29:30    135
2012-11-14 12:04:13     87
2012-11-15 19:11:57    140
2012-11-16 17:57:11     92
2012-11-16 17:58:23     65
2012-11-15 20:27:19    158
2012-11-16 12:10:27    125
2012-11-18 22:14:06    109
2012-11-14 22:14:50     40
2012-11-17 19:34:36     74
2012-11-18 22:22:39    154
Length: 952, dtype: int64
In [77]:
views.index
Out[77]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2012-11-19 02:12:10, ..., 2012-11-18 22:22:39]
Length: 952, Freq: None, Timezone: None
In [78]:
views.index[0]
Out[78]:
<Timestamp: 2012-11-19 02:12:10>
In [79]:
views.index.year
Out[79]:
array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013,
       2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012,
       2012, 2013, 2012, 2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012,
       2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012,
       2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2013, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012,
       2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2013,
       2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2013, 2013, 2013, 2012, 2013, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2013,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012,
       2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013, 2012, 2013, 2012,
       2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013,
       2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013,
       2013, 2013, 2011, 2011, 2011, 2013, 2013, 2013, 2013, 2012, 2012,
       2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
       2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013,
       2013, 2012, 2013, 2013, 2013, 2013, 2013, 2011, 2013, 2011, 2011,
       2013, 2011, 2011, 2013, 2012, 2012, 2011, 2013, 2012, 2011, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2013, 2011, 2012, 2011, 2012,
       2012, 2012, 2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012], dtype=int32)
In [80]:
year = views.index.year
month = views.index.month

views.groupby(year)
Out[80]:
<pandas.core.groupby.SeriesGroupBy at 0x106ab0050>
In [81]:
sizes = views.groupby(year).size()
sizes
Out[81]:
2011     13
2012    766
2013    173
dtype: int64
In [82]:
sums = views.groupby(year).sum()
sums
Out[82]:
2011     25083
2012    208237
2013     10520
dtype: int64
In [83]:
reordered = sizes[[2013, 2012, 2011]]
In [84]:
sums / reordered
Out[84]:
2011    1929
2012     271
2013      60
dtype: int64
In [85]:
print views.count()  # excludes missing data (if any)
print len(views)
952
952
In [86]:
grouped = views.groupby(year)

grouped.agg(['count', 'sum', 'min', 'max', 'median', 'mean'])
Out[86]:
count sum min max median mean
2011 13 25083 368 6454 1128 1929.461538
2012 766 208237 27 3238 143 271.849869
2013 173 10520 5 518 51 60.809249
In [87]:
views.order(ascending=False)[:5]
Out[87]:
PostTime
2011-05-10 19:55:08    6454
2011-10-20 14:46:14    4228
2012-05-23 08:12:31    3238
2011-10-15 08:21:17    2913
2012-03-18 22:34:26    2624
dtype: int64
In [88]:
def topn(group, n=5):
    return group.order(ascending=False)[:n]
 
grouped = views.groupby(year)

result = grouped.apply(topn)
result
Out[88]:
      PostTime           
2011  2011-05-10 19:55:08    6454
      2011-10-20 14:46:14    4228
      2011-10-15 08:21:17    2913
      2011-04-05 21:13:50    2484
      2011-06-24 12:31:45    2035
2012  2012-05-23 08:12:31    3238
      2012-03-18 22:34:26    2624
      2012-07-05 14:21:15    2422
      2012-09-15 11:30:35    2066
      2012-05-23 23:21:18    2052
2013  2013-01-15 23:45:53     518
      2013-01-07 15:42:28     459
      2013-01-10 16:20:32     335
      2013-01-11 18:18:42     184
      2013-01-16 09:35:36     158
dtype: int64
In [89]:
result[2012]
Out[89]:
PostTime
2012-05-23 08:12:31    3238
2012-03-18 22:34:26    2624
2012-07-05 14:21:15    2422
2012-09-15 11:30:35    2066
2012-05-23 23:21:18    2052
dtype: int64
In [90]:
result
Out[90]:
      PostTime           
2011  2011-05-10 19:55:08    6454
      2011-10-20 14:46:14    4228
      2011-10-15 08:21:17    2913
      2011-04-05 21:13:50    2484
      2011-06-24 12:31:45    2035
2012  2012-05-23 08:12:31    3238
      2012-03-18 22:34:26    2624
      2012-07-05 14:21:15    2422
      2012-09-15 11:30:35    2066
      2012-05-23 23:21:18    2052
2013  2013-01-15 23:45:53     518
      2013-01-07 15:42:28     459
      2013-01-10 16:20:32     335
      2013-01-11 18:18:42     184
      2013-01-16 09:35:36     158
dtype: int64
In [91]:
year = views.index.year
month = views.index.month

keys = [year, month]
stats = ['count']

grouped = views.groupby(keys)

result = grouped.agg(stats)
result
Out[91]:
count
2011 3 1
4 2
5 1
6 1
9 1
10 4
11 2
12 1
2012 1 6
2 4
3 29
4 27
5 44
6 50
7 67
8 91
9 94
10 105
11 121
12 128
2013 1 161
2 12

Stack and Unstack

In [92]:
result.unstack(0)
Out[92]:
count
2011 2012 2013
1 NaN 6 161
2 NaN 4 12
3 1 29 NaN
4 2 27 NaN
5 1 44 NaN
6 1 50 NaN
7 NaN 67 NaN
8 NaN 91 NaN
9 1 94 NaN
10 4 105 NaN
11 2 121 NaN
12 1 128 NaN
In [93]:
result.unstack(0).fillna(0)
Out[93]:
count
2011 2012 2013
1 0 6 161
2 0 4 12
3 1 29 0
4 2 27 0
5 1 44 0
6 1 50 0
7 0 67 0
8 0 91 0
9 1 94 0
10 4 105 0
11 2 121 0
12 1 128 0
In [94]:
result
Out[94]:
count
2011 3 1
4 2
5 1
6 1
9 1
10 4
11 2
12 1
2012 1 6
2 4
3 29
4 27
5 44
6 50
7 67
8 91
9 94
10 105
11 121
12 128
2013 1 161
2 12
In [95]:
result.index.names = ['year', 'month']
result
Out[95]:
count
year month
2011 3 1
4 2
5 1
6 1
9 1
10 4
11 2
12 1
2012 1 6
2 4
3 29
4 27
5 44
6 50
7 67
8 91
9 94
10 105
11 121
12 128
2013 1 161
2 12
In [96]:
result.unstack('year')
Out[96]:
count
year 2011 2012 2013
month
1 NaN 6 161
2 NaN 4 12
3 1 29 NaN
4 2 27 NaN
5 1 44 NaN
6 1 50 NaN
7 NaN 67 NaN
8 NaN 91 NaN
9 1 94 NaN
10 4 105 NaN
11 2 121 NaN
12 1 128 NaN
In [97]:
result.unstack('year').stack('year')
Out[97]:
count
month year
1 2012 6
2013 161
2 2012 4
2013 12
3 2011 1
2012 29
4 2011 2
2012 27
5 2011 1
2012 44
6 2011 1
2012 50
7 2012 67
8 2012 91
9 2011 1
2012 94
10 2011 4
2012 105
11 2011 2
2012 121
12 2011 1
2012 128
In [98]:
stats = ['count', 'sum', 'mean', 'median']
result = grouped.agg(stats)
result.index.names = ['year', 'month']
result
Out[98]:
count sum mean median
year month
2011 3 1 1128 1128.000000 1128.0
4 2 3914 1957.000000 1957.0
5 1 6454 6454.000000 6454.0
6 1 2035 2035.000000 2035.0
9 1 960 960.000000 960.0
10 4 8590 2147.500000 1989.5
11 2 1203 601.500000 601.5
12 1 799 799.000000 799.0
2012 1 6 4901 816.833333 544.5
2 4 3137 784.250000 505.0
3 29 19587 675.413793 533.0
4 27 16477 610.259259 487.0
5 44 24072 547.090909 430.0
6 50 21305 426.100000 272.5
7 67 28852 430.626866 289.0
8 91 26752 293.978022 206.0
9 94 21712 230.978723 157.5
10 105 16909 161.038095 125.0
11 121 13561 112.074380 93.0
12 128 10972 85.718750 66.0
2013 1 161 10106 62.770186 52.0
2 12 414 34.500000 34.0
In [99]:
pd.options.display.max_columns = 20
pd.options.display.line_width = 1000
result.unstack('year')
Out[99]:
count sum mean median
year 2011 2012 2013 2011 2012 2013 2011 2012 2013 2011 2012 2013
month
1 NaN 6 161 NaN 4901 10106 NaN 816.833333 62.770186 NaN 544.5 52
2 NaN 4 12 NaN 3137 414 NaN 784.250000 34.500000 NaN 505.0 34
3 1 29 NaN 1128 19587 NaN 1128.0 675.413793 NaN 1128.0 533.0 NaN
4 2 27 NaN 3914 16477 NaN 1957.0 610.259259 NaN 1957.0 487.0 NaN
5 1 44 NaN 6454 24072 NaN 6454.0 547.090909 NaN 6454.0 430.0 NaN
6 1 50 NaN 2035 21305 NaN 2035.0 426.100000 NaN 2035.0 272.5 NaN
7 NaN 67 NaN NaN 28852 NaN NaN 430.626866 NaN NaN 289.0 NaN
8 NaN 91 NaN NaN 26752 NaN NaN 293.978022 NaN NaN 206.0 NaN
9 1 94 NaN 960 21712 NaN 960.0 230.978723 NaN 960.0 157.5 NaN
10 4 105 NaN 8590 16909 NaN 2147.5 161.038095 NaN 1989.5 125.0 NaN
11 2 121 NaN 1203 13561 NaN 601.5 112.074380 NaN 601.5 93.0 NaN
12 1 128 NaN 799 10972 NaN 799.0 85.718750 NaN 799.0 66.0 NaN
In [100]:
views.groupby(year).agg({'foo': np.mean})
Out[100]:
foo
2011 1929.461538
2012 271.849869
2013 60.809249
In [101]:
# BLACK MAGIC!

def q(q):
    def f(x):
        return x.quantile(q)
    f.__name__ = 'quantile_%d' % int(q * 100)
    return f
    
views.groupby(year).agg([q(0.1), q(0.5), q(0.9)])
Out[101]:
quantile_10 quantile_50 quantile_90
2011 466.2 1128 3965.0
2012 58.0 143 599.0
2013 29.0 51 88.8
In [102]:
np.random.seed(130612001)
attendees = open('attendees.txt').read().split()
np.random.shuffle(attendees)
attendees = pd.Series(attendees)
attendees[:7]
In [103]:
views.resample('A', how='sum')
Out[103]:
PostTime
2011-12-31     25083
2012-12-31    208237
2013-12-31     10520
Freq: A-DEC, dtype: int64
In [104]:
views.resample('M', how='sum')
Out[104]:
PostTime
2011-03-31     1128
2011-04-30     3914
2011-05-31     6454
2011-06-30     2035
2011-07-31      NaN
2011-08-31      NaN
2011-09-30      960
2011-10-31     8590
2011-11-30     1203
2011-12-31      799
2012-01-31     4901
2012-02-29     3137
2012-03-31    19587
2012-04-30    16477
2012-05-31    24072
2012-06-30    21305
2012-07-31    28852
2012-08-31    26752
2012-09-30    21712
2012-10-31    16909
2012-11-30    13561
2012-12-31    10972
2013-01-31    10106
2013-02-28      414
Freq: M, dtype: float64
In [105]:
views.groupby(year).sum()
Out[105]:
2011     25083
2012    208237
2013     10520
dtype: int64

Grouping tables

In [106]:
answers.DisplayName
Out[106]:
0     Calvin Cheng
1     Calvin Cheng
2     Calvin Cheng
3     Calvin Cheng
4     Calvin Cheng
5     Calvin Cheng
6     Calvin Cheng
7     Calvin Cheng
8     Calvin Cheng
9     Calvin Cheng
10    Calvin Cheng
11    Calvin Cheng
12    Calvin Cheng
13    Calvin Cheng
14     Andy Hayden
...
1314                   Tom
1315       Henry Gomersall
1316          milkypostman
1317              RParadox
1318             Mr. Squig
1319                Austin
1320            waitingkuo
1321              Victor K
1322                monkut
1323           Hyperboreus
1324             Dan Allan
1325               mankoff
1326    Yaroslav Halchenko
1327        Felix Zumstein
1328            dantes_419
Name: DisplayName, Length: 1329, dtype: object
In [107]:
answers.groupby('DisplayName').size()
Out[107]:
DisplayName
0xc0de                 1
Aaron                  1
Aaron Digulla          1
Abe                    1
Abhi                   1
Abhranil Das           1
Abraham D Flaxman      1
Adam                   1
Adam Greenhall         1
Adobe                  1
Akavall                1
Alexander Stefanov     1
Alok                   2
Aman                  18
Amyunimus              1
...
user1569050    1
user1763885    1
user1827356    1
user1843099    3
user1917577    1
user1988295    1
user333700     3
user948652     2
vartec         1
velotron       1
waitingkuo     1
wiseveri       1
zach           4
zarthur        1
Óscar López    1
Length: 319, dtype: int64
In [108]:
num_answers = answers.groupby('DisplayName').size()
num_answers.order(ascending=False)
Out[108]:
DisplayName
Wes McKinney        208
Andy Hayden         116
Chang She            89
Wouter Overmeire     72
crewbum              40
BrenBarn             37
DSM                  35
Zelazny7             34
eumiro               31
unutbu               30
root                 20
bmu                  18
Matti John           18
Aman                 18
Rutger Kassies       14
...
Anurag Uniyal         1
Anaphory              1
Amyunimus             1
Alexander Stefanov    1
Akavall               1
Adobe                 1
Adam Greenhall        1
Adam                  1
Abraham D Flaxman     1
Abhranil Das          1
Abhi                  1
Abe                   1
Aaron Digulla         1
Aaron                 1
0xc0de                1
Length: 319, dtype: int64
In [109]:
answers.Score
Out[109]:
0     2
1     0
2     1
3     2
4     1
5     1
6     0
7     2
8     1
9     0
10    2
11    2
12    0
13    0
14    2
...
1314    3
1315    4
1316    0
1317    1
1318    0
1319    1
1320    2
1321    2
1322    2
1323    1
1324    2
1325    4
1326    4
1327    1
1328    0
Name: Score, Length: 1329, dtype: int64
In [110]:
def sortd(self):
    return self.order(ascending=False)
pd.Series.sortd = sortd
In [111]:
# pd.options.display.max_rows = 1000
by_name = answers.groupby('DisplayName')
result = by_name['Score'].agg(['mean', 'count'])

result = result.sort('mean', ascending=False)
result[:20]
Out[111]:
mean count
DisplayName
gcalmettes 16.000000 1
Dirk Eddelbuettel 12.000000 1
Matthew Dowle 10.000000 2
tr33hous 9.000000 1
Dani Arribas-Bel 8.000000 1
Lennart Regebro 8.000000 1
Joe Kington 7.000000 1
minrk 6.000000 1
lexual 6.000000 1
Bi Rico 6.000000 2
Sven Marnach 5.000000 2
user1234440 5.000000 1
dbaupp 5.000000 1
Brian Keegan 5.000000 1
luke14free 5.000000 1
jseabold 5.000000 2
andrew cooke 5.000000 1
Nick Crawford 5.000000 1
Mike Pennington 4.500000 2
ogrisel 4.333333 3
In [112]:
result['count'] >= 10
Out[112]:
DisplayName
gcalmettes           False
Dirk Eddelbuettel    False
Matthew Dowle        False
tr33hous             False
Dani Arribas-Bel     False
Lennart Regebro      False
Joe Kington          False
minrk                False
lexual               False
Bi Rico              False
Sven Marnach         False
user1234440          False
dbaupp               False
Brian Keegan         False
luke14free           False
...
gbronner         False
roysc            False
rocketman        False
Ross R           False
Rich Signell     False
Rian             False
Randy Olson      False
Rachel Gallen    False
Jason Morgan     False
nat              False
Jeff Tratner     False
mspadaccino      False
0xc0de           False
velotron         False
Paul Hiemstra    False
Name: count, Length: 319, dtype: bool
In [113]:
(result['count'] >= 10).sum()
Out[113]:
21
In [114]:
result[result['count'] >= 10]
Out[114]:
mean count
DisplayName
Paul H 3.454545 11
unutbu 3.000000 30
BrenBarn 2.675676 37
Wes McKinney 2.610577 208
DSM 2.371429 35
EMS 2.153846 13
Wouter Overmeire 2.097222 72
bmu 2.055556 18
eumiro 1.967742 31
Aman 1.888889 18
Zelazny7 1.882353 34
Matti John 1.666667 18
Andy Hayden 1.586207 116
crewbum 1.525000 40
root 1.500000 20
Jeff 1.454545 11
Chang She 1.325843 89
Rutger Kassies 1.285714 14
lbolla 1.272727 11
Calvin Cheng 1.000000 14
locojay 0.900000 10
In [115]:
stats = ['mean', 'min', 'max', 'std', 'count']
result = by_name['Score'].agg(stats)
result = result.sort('std', ascending=False)
result[result['count'] >= 10]
Out[115]:
mean min max std count
DisplayName
Paul H 3.454545 1 18 4.926736 11
Wes McKinney 2.610577 0 40 4.042920 208
BrenBarn 2.675676 0 9 2.055535 37
bmu 2.055556 0 6 1.862074 18
unutbu 3.000000 0 8 1.819435 30
EMS 2.153846 0 6 1.818706 13
Aman 1.888889 0 5 1.604732 18
Wouter Overmeire 2.097222 0 6 1.584784 72
crewbum 1.525000 0 6 1.568725 40
eumiro 1.967742 0 6 1.471595 31
DSM 2.371429 0 6 1.373825 35
Zelazny7 1.882353 0 5 1.365477 34
Chang She 1.325843 0 7 1.346505 89
root 1.500000 0 5 1.317893 20
Andy Hayden 1.586207 0 6 1.305966 116
Rutger Kassies 1.285714 0 4 1.204388 14
lbolla 1.272727 0 4 1.103713 11
Jeff 1.454545 0 3 1.035725 11
locojay 0.900000 0 3 0.994429 10
Matti John 1.666667 0 4 0.970143 18
Calvin Cheng 1.000000 0 2 0.877058 14
In [116]:
top_so_users = result[result['count'] >= 10].index
top_so_users
Out[116]:
Index([Paul H, Wes McKinney, BrenBarn, bmu, unutbu, EMS, Aman, Wouter Overmeire, crewbum, eumiro, DSM, Zelazny7, Chang She, root, Andy Hayden, Rutger Kassies, lbolla, Jeff, locojay, Matti John, Calvin Cheng], dtype=object)
In [117]:
# Doing this 21 times would be slow
answers.DisplayName == 'Wes McKinney'
Out[117]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
1314    False
1315    False
1316    False
1317    False
1318    False
1319    False
1320    False
1321    False
1322    False
1323    False
1324    False
1325    False
1326    False
1327    False
1328    False
Name: DisplayName, Length: 1329, dtype: bool
In [118]:
'Wes McKinney' in top_so_users
Out[118]:
True
In [119]:
answers.DisplayName.isin(top_so_users)
Out[119]:
0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
1314    False
1315    False
1316    False
1317    False
1318    False
1319    False
1320    False
1321    False
1322    False
1323    False
1324    False
1325    False
1326    False
1327    False
1328    False
Name: DisplayName, Length: 1329, dtype: bool
In [120]:
crit = answers.DisplayName.isin(top_so_users)
answers_subset = answers[crit]
answers_subset
Out[120]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 850 entries, 0 to 945
Data columns (total 26 columns):
Id_post                  850  non-null values
PostTypeId               850  non-null values
ParentId                 850  non-null values
PostTime                 850  non-null values
Score                    850  non-null values
Body                     850  non-null values
OwnerUserId              850  non-null values
OwnerDisplayName         0  non-null values
LastEditorUserId         238  non-null values
LastEditorDisplayName    1  non-null values
LastEditDate             239  non-null values
LastActivityDate         850  non-null values
CommentCount             556  non-null values
Id_user                  850  non-null values
Reputation               850  non-null values
UserJoinTime             850  non-null values
DisplayName              850  non-null values
LastAccessDate           850  non-null values
WebsiteUrl               414  non-null values
Location                 542  non-null values
AboutMe                  617  non-null values
Views                    850  non-null values
UpVotes                  850  non-null values
DownVotes                850  non-null values
EmailHash                850  non-null values
Age                      419  non-null values
dtypes: datetime64[ns](1), float64(4), int64(9), object(12)
In [121]:
# Doh, have to convert to index to do this
year = pd.DatetimeIndex(answers_subset.PostTime).year
year[:20]
Out[121]:
array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
       2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013], dtype=int32)
In [122]:
keys = ['DisplayName', year]

grouped = answers_subset.groupby(keys)

stats = ['mean', 'min', 'max', 'std', 'count']
result = grouped['Score'].agg(stats)
# result = result.sort('std', ascending=False)
result
Out[122]:
mean min max std count
DisplayName
Aman 2012 2.000000 0 5 1.581139 17
2013 0.000000 0 0 NaN 1
Andy Hayden 2012 1.450704 0 6 1.307000 71
2013 1.800000 0 5 1.289820 45
BrenBarn 2012 2.705882 0 9 2.139519 34
2013 2.333333 2 3 0.577350 3
Calvin Cheng 2012 1.000000 0 2 0.877058 14
Chang She 2012 1.325843 0 7 1.346505 89
DSM 2012 2.478261 0 6 1.473079 23
2013 2.166667 0 4 1.193416 12
EMS 2012 2.153846 0 6 1.818706 13
Jeff 2012 1.500000 0 3 1.290994 4
2013 1.428571 0 3 0.975900 7
Matti John 2012 1.666667 0 4 0.970143 18
Paul H 2012 2.000000 1 4 1.154701 7
2013 6.000000 1 18 8.041559 4
Rutger Kassies 2012 1.363636 0 4 1.286291 11
2013 1.000000 0 2 1.000000 3
Wes McKinney 2011 8.125000 1 21 7.180082 8
2012 2.450777 0 40 3.780282 193
2013 0.714286 0 2 0.951190 7
Wouter Overmeire 2012 2.097222 0 6 1.584784 72
Zelazny7 2012 2.125000 0 5 1.726888 8
2013 1.807692 0 5 1.265519 26
bmu 2012 1.875000 0 5 1.668333 16
2013 3.500000 1 6 3.535534 2
crewbum 2012 1.354839 0 6 1.427081 31
2013 2.111111 0 5 1.964971 9
eumiro 2012 1.892857 0 6 1.523624 28
2013 2.666667 2 3 0.577350 3
lbolla 2012 1.272727 0 4 1.103713 11
locojay 2012 0.777778 0 3 0.971825 9
2013 2.000000 2 2 NaN 1
root 2012 1.187500 0 5 1.223043 16
2013 2.750000 2 4 0.957427 4
unutbu 2011 8.000000 8 8 NaN 1
2012 3.000000 0 7 1.878673 18
2013 2.545455 2 5 0.934199 11
In [123]:
np.log(users.Reputation).hist(bins=100)
Out[123]:
<matplotlib.axes.AxesSubplot at 0x1075a5b90>
In [124]:
buckets = [0, 100, 1000, 10000, 1000000]
rep_bucket = pd.cut(answers.Reputation, buckets)
In [125]:
answers.groupby(rep_bucket).size()
Out[125]:
Reputation
(0, 100]            120
(100, 1000]         237
(1000, 10000]       783
(10000, 1000000]    189
dtype: int64
In [126]:
len(answers.DisplayName.unique())
Out[126]:
319
In [127]:
grouped = answers.groupby(rep_bucket)

def distinct(names):
    return len(names.unique())

statistics = {
    'Score': ['mean', 'median', 'count'],
    'Age': ['min', 'max'],
    'DisplayName': [distinct]
}

grouped.agg(statistics)
Out[127]:
Age Score DisplayName
min max mean median count distinct
Reputation
(0, 100] 23 45 0.966667 0 120 87
(100, 1000] 17 46 1.413502 1 237 111
(1000, 10000] 19 49 1.978289 1 783 83
(10000, 1000000] 24 64 2.470899 2 189 38
In [128]:
users.irow(users.Reputation.argmax())
Out[128]:
Id                                                            20862
Reputation                                                   220371
CreationDate                                    2008-09-23 02:06:20
DisplayName                                  Ignacio Vazquez-Abrams
LastAccessDate                                  2013-02-02 22:20:15
WebsiteUrl                                                      NaN
Location                                                        NaN
AboutMe           <p><a href="http://whathaveyoutried.com/" rel=...
Views                                                          5231
UpVotes                                                        1426
DownVotes                                                        63
EmailHash                          2343ae368d3241278581ce6c87f62a25
Age                                                             NaN
Name: 532, dtype: object

New dataset time!

In [129]:
import json
path = 'foods-2011-10-03.json'

db = json.load(open(path))
In [130]:
print type(db)
len(db)
<type 'list'>
Out[130]:
6636
In [131]:
db[0]
Out[131]:
{u'description': u'Cheese, caraway',
 u'group': u'Dairy and Egg Products',
 u'id': 1008,
 u'manufacturer': u'',
 u'nutrients': [{u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83},
  {u'description': u'Protein',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 25.18},
  {u'description': u'Total lipid (fat)',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 29.2},
  {u'description': u'Carbohydrate, by difference',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 3.06},
  {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kcal',
   u'value': 376.0},
  {u'description': u'Water',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 39.28},
  {u'description': u'Energy',
   u'group': u'Energy',
   u'units': u'kJ',
   u'value': 1573.0},
  {u'description': u'Fiber, total dietary',
   u'group': u'Composition',
   u'units': u'g',
   u'value': 0.0},
  {u'description': u'Calcium, Ca',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 673.0},
  {u'description': u'Iron, Fe',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.64},
  {u'description': u'Magnesium, Mg',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 22.0},
  {u'description': u'Phosphorus, P',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 490.0},
  {u'description': u'Potassium, K',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Sodium, Na',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 690.0},
  {u'description': u'Zinc, Zn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 2.94},
  {u'description': u'Copper, Cu',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.024},
  {u'description': u'Manganese, Mn',
   u'group': u'Elements',
   u'units': u'mg',
   u'value': 0.021},
  {u'description': u'Selenium, Se',
   u'group': u'Elements',
   u'units': u'mcg',
   u'value': 14.5},
  {u'description': u'Vitamin A, IU',
   u'group': u'Vitamins',
   u'units': u'IU',
   u'value': 1054.0},
  {u'description': u'Retinol',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 262.0},
  {u'description': u'Vitamin A, RAE',
   u'group': u'Vitamins',
   u'units': u'mcg_RAE',
   u'value': 271.0},
  {u'description': u'Vitamin C, total ascorbic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.0},
  {u'description': u'Thiamin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.031},
  {u'description': u'Riboflavin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.45},
  {u'description': u'Niacin',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.18},
  {u'description': u'Pantothenic acid',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.19},
  {u'description': u'Vitamin B-6',
   u'group': u'Vitamins',
   u'units': u'mg',
   u'value': 0.074},
  {u'description': u'Folate, total',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Vitamin B-12',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.27},
  {u'description': u'Folic acid',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 0.0},
  {u'description': u'Folate, food',
   u'group': u'Vitamins',
   u'units': u'mcg',
   u'value': 18.0},
  {u'description': u'Folate, DFE',
   u'group': u'Vitamins',
   u'units': u'mcg_DFE',
   u'value': 18.0},
  {u'description': u'Tryptophan',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.324},
  {u'description': u'Threonine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.896},
  {u'description': u'Isoleucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.563},
  {u'description': u'Leucine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.412},
  {u'description': u'Lysine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.095},
  {u'description': u'Methionine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.659},
  {u'description': u'Cystine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.126},
  {u'description': u'Phenylalanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.326},
  {u'description': u'Tyrosine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.216},
  {u'description': u'Valine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.682},
  {u'description': u'Arginine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.952},
  {u'description': u'Histidine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.884},
  {u'description': u'Alanine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.711},
  {u'description': u'Aspartic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.618},
  {u'description': u'Glutamic acid',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 6.16},
  {u'description': u'Glycine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 0.439},
  {u'description': u'Proline',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 2.838},
  {u'description': u'Serine',
   u'group': u'Amino Acids',
   u'units': u'g',
   u'value': 1.472},
  {u'description': u'Cholesterol',
   u'group': u'Other',
   u'units': u'mg',
   u'value': 93.0},
  {u'description': u'Fatty acids, total saturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 18.584},
  {u'description': u'Fatty acids, total monounsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 8.275},
  {u'description': u'Fatty acids, total polyunsaturated',
   u'group': u'Other',
   u'units': u'g',
   u'value': 0.83}],
 u'portions': [{u'amount': 1, u'grams': 28.35, u'unit': u'oz'}],
 u'tags': []}
In [132]:
db[0]['nutrients'][0]
Out[132]:
{u'description': u'Protein',
 u'group': u'Composition',
 u'units': u'g',
 u'value': 25.18}
In [133]:
pd.DataFrame([{'a': 1, 'b': 2},
              {'a': 2, 'b': 3, 'c': 4}])
Out[133]:
a b c
0 1 2 NaN
1 2 3 4
In [134]:
pd.DataFrame(db[0]['nutrients'])
Out[134]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 0 to 161
Data columns (total 4 columns):
description    162  non-null values
group          162  non-null values
units          162  non-null values
value          162  non-null values
dtypes: float64(1), object(3)
In [135]:
pd.DataFrame(db[0]['nutrients']).head()
Out[135]:
description group units value
0 Protein Composition g 25.18
1 Total lipid (fat) Composition g 29.20
2 Carbohydrate, by difference Composition g 3.06
3 Ash Other g 3.28
4 Energy Energy kcal 376.00
In [136]:
db[0]['nutrients'][0]
Out[136]:
{u'description': u'Protein',
 u'group': u'Composition',
 u'units': u'g',
 u'value': 25.18}
In [137]:
db[0]['id']
Out[137]:
1008
In [138]:
all_nutrients = []
for rec in db:
    for nutrient in rec['nutrients']:
        nutrient['id'] = rec['id']
        all_nutrients.append(nutrient)

nutrients = pd.DataFrame(all_nutrients)
In [139]:
nutrients.head()
Out[139]:
description group id units value
0 Protein Composition 1008 g 25.18
1 Total lipid (fat) Composition 1008 g 29.20
2 Carbohydrate, by difference Composition 1008 g 3.06
3 Ash Other 1008 g 3.28
4 Energy Energy 1008 kcal 376.00
In [140]:
pd.DataFrame(all_nutrients, columns=['value'])
Out[140]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 1 columns):
value    389355  non-null values
dtypes: float64(1)
In [141]:
of_interest = ['id', 'description', 'group']
meta = pd.DataFrame(db, columns=of_interest)
meta
Out[141]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6636 entries, 0 to 6635
Data columns (total 3 columns):
id             6636  non-null values
description    6636  non-null values
group          6636  non-null values
dtypes: int64(1), object(2)
In [142]:
nutrients
Out[142]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 5 columns):
description    389355  non-null values
group          389355  non-null values
id             389355  non-null values
units          389355  non-null values
value          389355  non-null values
dtypes: float64(1), int64(1), object(3)
In [143]:
mapping = {
    'description': 'nutrient',
    'group': 'ngroup'
}
nutrients = nutrients.rename(columns=mapping)

# COULD HAVE DONE
# nutrients.rename(columns=mapping, inplace=True)

mapping = {
    'description': 'food',
    'group': 'fgroup'
}
meta = meta.rename(columns=mapping)
In [144]:
data = pd.merge(meta, nutrients, on='id')
In [145]:
data
Out[145]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 389355 entries, 0 to 389354
Data columns (total 7 columns):
id          389355  non-null values
food        389355  non-null values
fgroup      389355  non-null values
nutrient    389355  non-null values
ngroup      389355  non-null values
units       389355  non-null values
value       389355  non-null values
dtypes: float64(1), int64(1), object(5)
In [146]:
cafdata = data[data.nutrient == 'Caffeine']
cafdata
Out[146]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3911 entries, 172 to 389312
Data columns (total 7 columns):
id          3911  non-null values
food        3911  non-null values
fgroup      3911  non-null values
nutrient    3911  non-null values
ngroup      3911  non-null values
units       3911  non-null values
value       3911  non-null values
dtypes: float64(1), int64(1), object(5)
In [147]:
data.ix[172]
Out[147]:
id                            1009
food               Cheese, cheddar
fgroup      Dairy and Egg Products
nutrient                  Caffeine
ngroup                       Other
units                           mg
value                            0
Name: 172, dtype: object
In [148]:
cafdata.ix[172]
Out[148]:
id                            1009
food               Cheese, cheddar
fgroup      Dairy and Egg Products
nutrient                  Caffeine
ngroup                       Other
units                           mg
value                            0
Name: 172, dtype: object
In [149]:
cafdata.units.value_counts()
Out[149]:
mg    3911
dtype: int64
In [150]:
# MAXIMUM IN NUMPY
cafdata.value.values.argmax()
Out[150]:
3460
In [151]:
# INDEX OF THE MAXIMUM VALUE
cafdata.value.idxmax()
Out[151]:
336702
In [152]:
data.ix[cafdata.value.idxmax()]
Out[152]:
id                                      14366
food        Tea, instant, unsweetened, powder
fgroup                              Beverages
nutrient                             Caffeine
ngroup                                  Other
units                                      mg
value                                    3680
Name: 336702, dtype: object
In [153]:
cafdata.dtypes
Out[153]:
id            int64
food         object
fgroup       object
nutrient     object
ngroup       object
units        object
value       float64
dtype: object
In [154]:
def max_value(group):
    return group.ix[group['value'].idxmax()]

max_value(cafdata)
Out[154]:
id                                      14366
food        Tea, instant, unsweetened, powder
fgroup                              Beverages
nutrient                             Caffeine
ngroup                                  Other
units                                      mg
value                                    3680
Name: 336702, dtype: object
In [156]:
cafdata.groupby('fgroup').apply(max_value).dtypes
Out[156]:
id          object
food        object
fgroup      object
nutrient    object
ngroup      object
units       object
value       object
dtype: object
In [157]:
cafdata.groupby('fgroup').apply(max_value)
Out[157]:
id food fgroup nutrient ngroup units value
fgroup
Baby Foods 3008 Babyfood, meat, ham, strained Baby Foods Caffeine Other mg 0
Baked Products 18174 Cookies, graham crackers, chocolate-coated Baked Products Caffeine Other mg 46
Beef Products 13000 Beef, grass-fed, strip steaks, lean only, raw Beef Products Caffeine Other mg 0
Beverages 14366 Tea, instant, unsweetened, powder Beverages Caffeine Other mg 3680
Breakfast Cereals 8294 Cereals ready-to-eat, QUAKER, QUAKER COCOA BLASTS Breakfast Cereals Caffeine Other mg 21
Cereal Grains and Pasta 20004 Barley, hulled Cereal Grains and Pasta Caffeine Other mg 0
Dairy and Egg Products 43260 Beverage, instant breakfast powder, chocolate,... Dairy and Egg Products Caffeine Other mg 52
Ethnic Foods 35009 Whale, beluga, meat, dried (Alaska Native) Ethnic Foods Caffeine Other mg 0
Fast Foods 21030 Fast foods, cookies, chocolate chip Fast Foods Caffeine Other mg 11
Fats and Oils 2050 Lard Fats and Oils Caffeine Other mg 0
Finfish and Shellfish Products 15001 Fish, anchovy, european, raw Finfish and Shellfish Products Caffeine Other mg 0
Fruits and Fruit Juices 9002 Acerola juice, raw Fruits and Fruit Juices Caffeine Other mg 0
Lamb, Veal, and Game Products 17094 Veal, leg (top round), separable lean and fat,... Lamb, Veal, and Game Products Caffeine Other mg 0
Legumes and Legume Products 16166 Soymilk, chocolate, unfortified Legumes and Legume Products Caffeine Other mg 2
Meals, Entrees, and Sidedishes 22247 Macaroni and Cheese, canned entree Meals, Entrees, and Sidedishes Caffeine Other mg 0
Nut and Seed Products 12014 Seeds, pumpkin and squash seed kernels, dried Nut and Seed Products Caffeine Other mg 0
Pork Products 10000 Pork, fresh, enhanced, composite of separable ... Pork Products Caffeine Other mg 0
Poultry Products 5008 Chicken, broilers or fryers, meat and skin, co... Poultry Products Caffeine Other mg 0
Restaurant Foods 36617 Restaurant, Chinese, lemon chicken Restaurant Foods Caffeine Other mg 0
Sausages and Luncheon Meats 7008 Bologna, beef and pork Sausages and Luncheon Meats Caffeine Other mg 0
Snacks 25031 Formulated bar, ZONE PERFECT CLASSIC CRUNCH BA... Snacks Caffeine Other mg 11
Soups, Sauces, and Gravies 3073 Soup, cream of asparagus, canned, condensed Soups, Sauces, and Gravies Caffeine Other mg 0
Spices and Herbs 1025 Spices, allspice, ground Spices and Herbs Caffeine Other mg 0
Sweets 19268 Candies, dark chocolate coated coffee beans Sweets Caffeine Other mg 839
Vegetables and Vegetable Products 11001 Alfalfa seeds, sprouted, raw Vegetables and Vegetable Products Caffeine Other mg 0
In [158]:
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results
Out[158]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2246 entries, (Adjusted Protein, Sweets) to (Zinc, Zn, Vegetables and Vegetable Products)
Data columns (total 7 columns):
id          2246  non-null values
food        2246  non-null values
fgroup      2246  non-null values
nutrient    2246  non-null values
ngroup      2246  non-null values
units       2246  non-null values
value       2246  non-null values
dtypes: object(7)
In [159]:
pd.options.display.line_width = 10000
def topn(group, by=None, n=5):
    return group.sort(by, ascending=False)[:n]

result = cafdata.groupby('fgroup').apply(topn, by='value', n=3)
result.ix['Beverages']
Out[159]:
id food fgroup nutrient ngroup units value
336702 14366 Tea, instant, unsweetened, powder Beverages Caffeine Other mg 3680
334542 14214 Coffee, instant, regular, powder Beverages Caffeine Other mg 3142
336913 14375 Tea, instant, sweetened with sodium saccharin,... Beverages Caffeine Other mg 2240
In [160]:
data.nutrient.unique()
Out[160]:
array([u'Protein', u'Total lipid (fat)', u'Carbohydrate, by difference',
       u'Ash', u'Energy', u'Water', u'Fiber, total dietary',
       u'Calcium, Ca', u'Iron, Fe', u'Magnesium, Mg', u'Phosphorus, P',
       u'Potassium, K', u'Sodium, Na', u'Zinc, Zn', u'Copper, Cu',
       u'Manganese, Mn', u'Selenium, Se', u'Vitamin A, IU', u'Retinol',
       u'Vitamin A, RAE', u'Vitamin C, total ascorbic acid', u'Thiamin',
       u'Riboflavin', u'Niacin', u'Pantothenic acid', u'Vitamin B-6',
       u'Folate, total', u'Vitamin B-12', u'Folic acid', u'Folate, food',
       u'Folate, DFE', u'Cholesterol', u'Fatty acids, total saturated',
       u'Fatty acids, total monounsaturated',
       u'Fatty acids, total polyunsaturated', u'Tryptophan', u'Threonine',
       u'Isoleucine', u'Leucine', u'Lysine', u'Methionine', u'Cystine',
       u'Phenylalanine', u'Tyrosine', u'Valine', u'Arginine', u'Histidine',
       u'Alanine', u'Aspartic acid', u'Glutamic acid', u'Glycine',
       u'Proline', u'Serine', u'Sucrose', u'Lactose', u'Maltose',
       u'Alcohol, ethyl', u'Caffeine', u'Theobromine', u'Sugars, total',
       u'Fluoride, F', u'Carotene, beta', u'Carotene, alpha',
       u'Vitamin E (alpha-tocopherol)', u'Vitamin D',
       u'Vitamin D3 (cholecalciferol)', u'Vitamin D (D2 + D3)',
       u'Cryptoxanthin, beta', u'Lycopene', u'Lutein + zeaxanthin',
       u'Tocopherol, gamma', u'Tocopherol, delta', u'Choline, total',
       u'Vitamin K (phylloquinone)', u'Betaine', u'Vitamin E, added',
       u'Vitamin B-12, added', u'Dihydrophylloquinone',
       u'Tocopherol, beta', u'Phytosterols', u'Glucose (dextrose)',
       u'Fructose', u'Galactose', u'Fatty acids, total trans',
       u'Stigmasterol', u'Campesterol', u'Beta-sitosterol',
       u'Fatty acids, total trans-monoenoic',
       u'Fatty acids, total trans-polyenoic', u'Hydroxyproline', u'Starch',
       u'Menaquinone-4', u'Vitamin D2 (ergocalciferol)',
       u'Adjusted Protein'], dtype=object)
In [167]:
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results.ix['Zinc, Zn']
Out[167]:
id food fgroup nutrient ngroup units value
fgroup
Baby Foods 3192 Babyfood, cereal, oatmeal, with applesauce and... Baby Foods Zinc, Zn Elements mg 7.5
Baked Products 18374 Leavening agents, yeast, baker's, compressed Baked Products Zinc, Zn Elements mg 9.97
Beef Products 13981 Beef, chuck, short ribs, boneless, separable l... Beef Products Zinc, Zn Elements mg 12.28
Beverages 14557 Chocolate-flavor beverage mix for milk, powder... Beverages Zinc, Zn Elements mg 6.82
Breakfast Cereals 8504 Cereals ready-to-eat, Ralston Enriched Bran fl... Breakfast Cereals Zinc, Zn Elements mg 64.33
Cereal Grains and Pasta 20078 Wheat germ, crude Cereal Grains and Pasta Zinc, Zn Elements mg 12.29
Dairy and Egg Products 43260 Beverage, instant breakfast powder, chocolate,... Dairy and Egg Products Zinc, Zn Elements mg 15.4
Ethnic Foods 35194 Agave, dried (Southwest) Ethnic Foods Zinc, Zn Elements mg 12.1
Fast Foods 21048 Fast foods, oysters, battered or breaded, and ... Fast Foods Zinc, Zn Elements mg 11.25
Fats and Oils 2066 Salad dressing, russian dressing, low calorie Fats and Oils Zinc, Zn Elements mg 2.41
Finfish and Shellfish Products 15170 Mollusks, oyster, eastern, canned Finfish and Shellfish Products Zinc, Zn Elements mg 90.95
Fruits and Fruit Juices 9030 Apricots, dehydrated (low-moisture), sulfured,... Fruits and Fruit Juices Zinc, Zn Elements mg 1
Lamb, Veal, and Game Products 17202 Veal, variety meats and by-products, liver, raw Lamb, Veal, and Game Products Zinc, Zn Elements mg 12.02
Legumes and Legume Products 16176 Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Legumes and Legume Products Zinc, Zn Elements mg 60
Meals, Entrees, and Sidedishes 22959 Macaroni and cheese dinner with dry sauce mix,... Meals, Entrees, and Sidedishes Zinc, Zn Elements mg 1.42
Nut and Seed Products 12011 Seeds, cottonseed meal, partially defatted (gl... Nut and Seed Products Zinc, Zn Elements mg 12.32
Pork Products 10111 Pork, fresh, variety meats and by-products, li... Pork Products Zinc, Zn Elements mg 6.72
Poultry Products 5180 Turkey, all classes, neck, meat only, cooked, ... Poultry Products Zinc, Zn Elements mg 7.12
Restaurant Foods 36608 CRACKER BARREL, grilled sirloin steak Restaurant Foods Zinc, Zn Elements mg 5.67
Sausages and Luncheon Meats 7912 Roast beef spread Sausages and Luncheon Meats Zinc, Zn Elements mg 6.08
Snacks 25006 Formulated bar, MARS SNACKFOOD US, SNICKERS Ma... Snacks Zinc, Zn Elements mg 13.57
Soups, Sauces, and Gravies 6048 Soup, oyster stew, canned, condensed Soups, Sauces, and Gravies Zinc, Zn Elements mg 8.4
Spices and Herbs 2008 Spices, chervil, dried Spices and Herbs Zinc, Zn Elements mg 8.8
Sweets 19078 Baking chocolate, unsweetened, squares Sweets Zinc, Zn Elements mg 9.63
Vegetables and Vegetable Products 11268 Mushrooms, shiitake, dried Vegetables and Vegetable Products Zinc, Zn Elements mg 7.66
In [168]:
results.ix['Zinc, Zn'].value
Out[168]:
fgroup
Baby Foods                             7.5
Baked Products                        9.97
Beef Products                        12.28
Beverages                             6.82
Breakfast Cereals                    64.33
Cereal Grains and Pasta              12.29
Dairy and Egg Products                15.4
Ethnic Foods                          12.1
Fast Foods                           11.25
Fats and Oils                         2.41
Finfish and Shellfish Products       90.95
Fruits and Fruit Juices                  1
Lamb, Veal, and Game Products        12.02
Legumes and Legume Products             60
Meals, Entrees, and Sidedishes        1.42
Nut and Seed Products                12.32
Pork Products                         6.72
Poultry Products                      7.12
Restaurant Foods                      5.67
Sausages and Luncheon Meats           6.08
Snacks                               13.57
Soups, Sauces, and Gravies             8.4
Spices and Herbs                       8.8
Sweets                                9.63
Vegetables and Vegetable Products     7.66
Name: value, dtype: object
In [169]:
results.ix['Zinc, Zn'].value.plot(kind='barh')
Out[169]:
<matplotlib.axes.AxesSubplot at 0x10100a190>
In [170]:
plt.figure(figsize=(10, 8))
medians = data.groupby(['nutrient', 'fgroup']).value.median()
medians.ix['Zinc, Zn'].plot(kind='barh')
Out[170]:
<matplotlib.axes.AxesSubplot at 0x11e151090>
In [171]:
plt.figure(figsize=(10, 8))
maxes = data.groupby(['nutrient', 'fgroup']).value.max()
np.log(maxes.ix['Caffeine']).plot(kind='barh')
Out[171]:
<matplotlib.axes.AxesSubplot at 0x106c3abd0>
In [172]:
data[['food', 'nutrient']].describe()
Out[172]:
food nutrient
count 389355 389355
unique 6636 94
top Milk, reduced fat, fluid, 2% milkfat, with add... Energy
freq 270 13704