import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.pyplot import title,subplot,figure
import os
import sys
import seaborn as sns
import tak.tak as tw
%matplotlib inline
mpl.rcParams['figure.figsize'] = (8,6)
mpl.rcParams['font.weight'] = 'bold'
mpl.rcParams['font.size'] = 44.0
mpl.rcParams['figure.edgecolor'] = 'black' # <- noeffect
mpl.rcParams['axes.labelsize']=24.0
mpl.rcParams['axes.titlesize']=24.0
mpl.rcParams['axes.titleweight'] = 'bold'
mpl.rcParams['legend.fontsize'] = 18
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14
# see below for the properties that we can modify
#mpl.rcParams
tips = sns.load_dataset("tips")
figure()
sns.stripplot(x="day", y="total_bill", data=tips)
title('stripplot')
<matplotlib.text.Text at 0x7f4b1e672590>
# nested categorical variable with the "hue" parameter
figure()
sns.stripplot(x="day", y="total_bill", hue="time", data=tips);
title('nested categorical variable with the "hue" parameter')
<matplotlib.text.Text at 0x7f4b1e36bdd0>
#** AWESOME!!! MEMORY USAGE ALSO PRINTED OUT!!!
print tips.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 244 entries, 0 to 243 Data columns (total 7 columns): total_bill 244 non-null float64 tip 244 non-null float64 sex 244 non-null category smoker 244 non-null category day 244 non-null category time 244 non-null category size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 8.7 KB None
# pandas: different ways of boolean selection
tips[ tips['sex'].isin(['Female'])].describe()
tips[ tips['sex'] == 'Female' ].describe()
tips.query('sex == "Female"').describe()
total_bill | tip | size | |
---|---|---|---|
count | 87.000000 | 87.000000 | 87.000000 |
mean | 18.056897 | 2.833448 | 2.459770 |
std | 8.009209 | 1.159495 | 0.937644 |
min | 3.070000 | 1.000000 | 1.000000 |
25% | 12.750000 | 2.000000 | 2.000000 |
50% | 16.400000 | 2.750000 | 2.000000 |
75% | 21.520000 | 3.500000 | 3.000000 |
max | 44.300000 | 6.500000 | 6.000000 |
# query will do the column selection for you! (so won't return boolean DF)
tips.query('sex == "Male"').describe().T
tips.query('sex != "Male"').describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
total_bill | 87 | 18.056897 | 8.009209 | 3.07 | 12.75 | 16.40 | 21.52 | 44.3 |
tip | 87 | 2.833448 | 1.159495 | 1.00 | 2.00 | 2.75 | 3.50 | 6.5 |
size | 87 | 2.459770 | 0.937644 | 1.00 | 2.00 | 2.00 | 3.00 | 6.0 |
# get all rows where columns "a" and "b" have overlapping values
tips.query('sex in smoker') # <- concise and readable
tips[tips.sex.isin(tips.smoker)] # equivalent pythonic syntax...
tips.query('sex not in smoker')
tips[~tips.sex.isin(tips.smoker)] # equivalent pythonic syntax...
tips.query('sex == "Female" and tip < 3.5')
tips.query('sex == "Female" and smoker == "Yes"')
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
72 | 26.86 | 3.14 | Female | Yes | Sat | Dinner | 2 |
73 | 25.28 | 5.00 | Female | Yes | Sat | Dinner | 2 |
92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
93 | 16.32 | 4.30 | Female | Yes | Fri | Dinner | 2 |
100 | 11.35 | 2.50 | Female | Yes | Fri | Dinner | 2 |
101 | 15.38 | 3.00 | Female | Yes | Fri | Dinner | 2 |
102 | 44.30 | 2.50 | Female | Yes | Sat | Dinner | 3 |
103 | 22.42 | 3.48 | Female | Yes | Sat | Dinner | 2 |
109 | 14.31 | 4.00 | Female | Yes | Sat | Dinner | 2 |
164 | 17.51 | 3.00 | Female | Yes | Sun | Dinner | 2 |
168 | 10.59 | 1.61 | Female | Yes | Sat | Dinner | 2 |
169 | 10.63 | 2.00 | Female | Yes | Sat | Dinner | 2 |
178 | 9.60 | 4.00 | Female | Yes | Sun | Dinner | 2 |
186 | 20.90 | 3.50 | Female | Yes | Sun | Dinner | 3 |
188 | 18.15 | 3.50 | Female | Yes | Sun | Dinner | 3 |
191 | 19.81 | 4.19 | Female | Yes | Thur | Lunch | 2 |
197 | 43.11 | 5.00 | Female | Yes | Thur | Lunch | 4 |
198 | 13.00 | 2.00 | Female | Yes | Thur | Lunch | 2 |
201 | 12.74 | 2.01 | Female | Yes | Thur | Lunch | 2 |
202 | 13.00 | 2.00 | Female | Yes | Thur | Lunch | 2 |
203 | 16.40 | 2.50 | Female | Yes | Thur | Lunch | 2 |
205 | 16.47 | 3.23 | Female | Yes | Thur | Lunch | 3 |
209 | 12.76 | 2.23 | Female | Yes | Sat | Dinner | 2 |
213 | 13.27 | 2.50 | Female | Yes | Sat | Dinner | 2 |
214 | 28.17 | 6.50 | Female | Yes | Sat | Dinner | 3 |
215 | 12.90 | 1.10 | Female | Yes | Sat | Dinner | 2 |
219 | 30.14 | 3.09 | Female | Yes | Sat | Dinner | 4 |
221 | 13.42 | 3.48 | Female | Yes | Fri | Lunch | 2 |
225 | 16.27 | 2.50 | Female | Yes | Fri | Lunch | 2 |
226 | 10.09 | 2.00 | Female | Yes | Fri | Lunch | 2 |
229 | 22.12 | 2.88 | Female | Yes | Sat | Dinner | 2 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
# list-expressions also works in query
tips.query('sex == "Male" and day in ["Sun","Sat"]')
tips.query('sex in "Male" and day == ["Sun","Sat"]')
tips.query('sex == "Male" and day not in ["Sun","Sat"]')
tips.query('sex == "Male" and day != ["Sun","Sat"]')
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
77 | 27.20 | 4.00 | Male | No | Thur | Lunch | 4 |
78 | 22.76 | 3.00 | Male | No | Thur | Lunch | 2 |
79 | 17.29 | 2.71 | Male | No | Thur | Lunch | 2 |
80 | 19.44 | 3.00 | Male | Yes | Thur | Lunch | 2 |
81 | 16.66 | 3.40 | Male | No | Thur | Lunch | 2 |
83 | 32.68 | 5.00 | Male | Yes | Thur | Lunch | 2 |
84 | 15.98 | 2.03 | Male | No | Thur | Lunch | 2 |
86 | 13.03 | 2.00 | Male | No | Thur | Lunch | 2 |
87 | 18.28 | 4.00 | Male | No | Thur | Lunch | 2 |
88 | 24.71 | 5.85 | Male | No | Thur | Lunch | 2 |
89 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
90 | 28.97 | 3.00 | Male | Yes | Fri | Dinner | 2 |
91 | 22.49 | 3.50 | Male | No | Fri | Dinner | 2 |
95 | 40.17 | 4.73 | Male | Yes | Fri | Dinner | 4 |
96 | 27.28 | 4.00 | Male | Yes | Fri | Dinner | 2 |
97 | 12.03 | 1.50 | Male | Yes | Fri | Dinner | 2 |
98 | 21.01 | 3.00 | Male | Yes | Fri | Dinner | 2 |
99 | 12.46 | 1.50 | Male | No | Fri | Dinner | 2 |
120 | 11.69 | 2.31 | Male | No | Thur | Lunch | 2 |
122 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
123 | 15.95 | 2.00 | Male | No | Thur | Lunch | 2 |
126 | 8.52 | 1.48 | Male | No | Thur | Lunch | 2 |
129 | 22.82 | 2.18 | Male | No | Thur | Lunch | 3 |
130 | 19.08 | 1.50 | Male | No | Thur | Lunch | 2 |
138 | 16.00 | 2.00 | Male | Yes | Thur | Lunch | 2 |
141 | 34.30 | 6.70 | Male | No | Thur | Lunch | 6 |
142 | 41.19 | 5.00 | Male | No | Thur | Lunch | 5 |
148 | 9.78 | 1.73 | Male | No | Thur | Lunch | 2 |
149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
192 | 28.44 | 2.56 | Male | Yes | Thur | Lunch | 2 |
193 | 15.48 | 2.02 | Male | Yes | Thur | Lunch | 2 |
194 | 16.58 | 4.00 | Male | Yes | Thur | Lunch | 2 |
195 | 7.56 | 1.44 | Male | No | Thur | Lunch | 2 |
196 | 10.34 | 2.00 | Male | Yes | Thur | Lunch | 2 |
199 | 13.51 | 2.00 | Male | Yes | Thur | Lunch | 2 |
200 | 18.71 | 4.00 | Male | Yes | Thur | Lunch | 3 |
204 | 20.53 | 4.00 | Male | Yes | Thur | Lunch | 4 |
220 | 12.16 | 2.20 | Male | Yes | Fri | Lunch | 2 |
222 | 8.58 | 1.92 | Male | Yes | Fri | Lunch | 1 |
224 | 13.42 | 1.58 | Male | Yes | Fri | Lunch | 2 |
# original structure
tips.head(3)
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
# make gender an outer level index
tips_multi_index = tips.set_index('sex', append=True)
tips_multi_index.head(3)
total_bill | tip | smoker | day | time | size | ||
---|---|---|---|---|---|---|---|
sex | |||||||
0 | Female | 16.99 | 1.01 | No | Sun | Dinner | 2 |
1 | Male | 10.34 | 1.66 | No | Sun | Dinner | 3 |
2 | Male | 21.01 | 3.50 | No | Sun | Dinner | 3 |
# name the original index as "id"
tips_multi_index.index.rename(['id','sex'], inplace=True)
tips_multi_index.head(3)
total_bill | tip | smoker | day | time | size | ||
---|---|---|---|---|---|---|---|
id | sex | ||||||
0 | Female | 16.99 | 1.01 | No | Sun | Dinner | 2 |
1 | Male | 10.34 | 1.66 | No | Sun | Dinner | 3 |
2 | Male | 21.01 | 3.50 | No | Sun | Dinner | 3 |
#==============================================================================
# Note: really weird, but to make index into column, use reset_index
# http://stackoverflow.com/questions/20110170/turn-pandas-multi-index-into-column
#==============================================================================
print tips.equals( tips_multi_index.reset_index('sex',col_level = 3))
print ("we got false above since reset_index will plug the index into the 1st "
"column (ie, order not preserved)")
False we got false above since reset_index will plug the index into the 1st column (ie, order not preserved)
#==============================================================================
# Here's where multi-index shines IMO...summary stats based on groups!
#....(but wtf, methods like "describe()" and "quantile" doesn't have "level" arg
#==============================================================================
# using
tips_multi_index.std(level='sex')
total_bill | tip | size | |
---|---|---|---|
sex | |||
Male | 9.246469 | 1.489102 | 0.955997 |
Female | 8.009209 | 1.159495 | 0.937644 |
# name index as "subject"
tips_multi_index.index.name = 'subject'
#%% sort/group by gender
print tips_multi_index.sortlevel('sex').head()
print tips_multi_index.sortlevel('sex').tail()
total_bill tip smoker day time size id sex 1 Male 10.34 1.66 No Sun Dinner 3 2 Male 21.01 3.50 No Sun Dinner 3 3 Male 23.68 3.31 No Sun Dinner 2 5 Male 25.29 4.71 No Sun Dinner 4 6 Male 8.77 2.00 No Sun Dinner 2 total_bill tip smoker day time size id sex 226 Female 10.09 2.00 Yes Fri Lunch 2 229 Female 22.12 2.88 Yes Sat Dinner 2 238 Female 35.83 4.67 No Sat Dinner 3 240 Female 27.18 2.00 Yes Sat Dinner 2 243 Female 18.78 3.00 No Thur Dinner 2
tips_multi_index_gender_sorted = tips_multi_index.sortlevel('sex')
# reorder so gender is the outer level (hate how you can't do this in set_index()
tips_multi_index.reorder_levels([1,0]).head(10)
tips_multi_index = tips_multi_index.reorder_levels(['sex','id'])
tips_multi_index.head(3)
total_bill | tip | smoker | day | time | size | ||
---|---|---|---|---|---|---|---|
sex | id | ||||||
Female | 0 | 16.99 | 1.01 | No | Sun | Dinner | 2 |
Male | 1 | 10.34 | 1.66 | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 |
#%% where vs boolean...
idx1 = tips['sex'].where(tips['sex'] == 'Female') # <- returns a NAN
idx2 = tips['sex'] == 'Female' # <- returns True/False
tips.groupby('sex').agg(['count','mean','std',len,np.var])
total_bill | tip | size | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | len | var | count | mean | std | len | var | count | mean | std | len | var | |
sex | |||||||||||||||
Male | 157 | 20.744076 | 9.246469 | 157 | 85.497185 | 157 | 3.089618 | 1.489102 | 157 | 2.217424 | 157 | 2.630573 | 0.955997 | 157 | 0.913931 |
Female | 87 | 18.056897 | 8.009209 | 87 | 64.147429 | 87 | 2.833448 | 1.159495 | 87 | 1.344428 | 87 | 2.459770 | 0.937644 | 87 | 0.879177 |
tips.groupby(lambda idx: 0).agg(['count','mean','std',len,np.var])
total_bill | tip | size | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | len | var | count | mean | std | len | var | count | mean | std | len | var | |
0 | 244 | 19.785943 | 8.902412 | 244 | 79.252939 | 244 | 2.998279 | 1.383638 | 244 | 1.914455 | 244 | 2.569672 | 0.9511 | 244 | 0.904591 |
tips.columns
Index([u'total_bill', u'tip', u'sex', u'smoker', u'day', u'time', u'size'], dtype='object')