import pandas as pd
import numpy as np
df = pd.DataFrame({"key1" : ["a", "b", "a", "b", "a", "b", "a", "a"],
"key2" : ["1", "2", "3", "4", "1", "2", "3", "4"],
"data1" : [20, 21, 22, 23, 100, 200, 300, 400],
"data2" : np.random.randn(8)})
df
data1 | data2 | key1 | key2 | |
---|---|---|---|---|
0 | 20 | 1.993315 | a | 1 |
1 | 21 | 1.132069 | b | 2 |
2 | 22 | 0.135880 | a | 3 |
3 | 23 | -1.071349 | b | 4 |
4 | 100 | -0.585936 | a | 1 |
5 | 200 | -1.501368 | b | 2 |
6 | 300 | -0.688437 | a | 3 |
7 | 400 | 0.926304 | a | 4 |
df.groupby(df["key1"]).mean()
data1 | data2 | |
---|---|---|
key1 | ||
a | 168.400000 | 0.356225 |
b | 81.333333 | -0.480216 |
df.groupby("key1").mean()
data1 | data2 | |
---|---|---|
key1 | ||
a | 168.400000 | 0.356225 |
b | 81.333333 | -0.480216 |
means = df.groupby([df["key1"], df["key2"]]).mean()
means
data1 | data2 | ||
---|---|---|---|
key1 | key2 | ||
a | 1 | 60.0 | 0.703689 |
3 | 161.0 | -0.276278 | |
4 | 400.0 | 0.926304 | |
b | 2 | 110.5 | -0.184649 |
4 | 23.0 | -1.071349 |
df.groupby(["key1", "key2"]).mean()
data1 | data2 | ||
---|---|---|---|
key1 | key2 | ||
a | 1 | 60.0 | 0.703689 |
3 | 161.0 | -0.276278 | |
4 | 400.0 | 0.926304 | |
b | 2 | 110.5 | -0.184649 |
4 | 23.0 | -1.071349 |
means.unstack()
data1 | data2 | |||||||
---|---|---|---|---|---|---|---|---|
key2 | 1 | 2 | 3 | 4 | 1 | 2 | 3 | 4 |
key1 | ||||||||
a | 60.0 | NaN | 161.0 | 400.0 | 0.703689 | NaN | -0.276278 | 0.926304 |
b | NaN | 110.5 | NaN | 23.0 | NaN | -0.184649 | NaN | -1.071349 |
df.groupby(["key1", "key2"]).size()
key1 key2 a 1 2 3 2 4 1 b 2 2 4 1 dtype: int64
# iterating over groups
for name, group in df.groupby("key1"):
print(name)
print(group)
a data1 data2 key1 key2 0 20 1.993315 a 1 2 22 0.135880 a 3 4 100 -0.585936 a 1 6 300 -0.688437 a 3 7 400 0.926304 a 4 b data1 data2 key1 key2 1 21 1.132069 b 2 3 23 -1.071349 b 4 5 200 -1.501368 b 2
df.dtypes
data1 int64 data2 float64 key1 object key2 object dtype: object
df2 = df.groupby(df.dtypes, axis = 1)
dict(list(df2))
{dtype('int64'): data1 0 20 1 21 2 22 3 23 4 100 5 200 6 300 7 400, dtype('float64'): data2 0 1.993315 1 1.132069 2 0.135880 3 -1.071349 4 -0.585936 5 -1.501368 6 -0.688437 7 0.926304, dtype('O'): key1 key2 0 a 1 1 b 2 2 a 3 3 b 4 4 a 1 5 b 2 6 a 3 7 a 4}
df.groupby('key1')[['data1']].mean()
data1 | |
---|---|
key1 | |
a | 168.400000 |
b | 81.333333 |
df["key1"]
0 a 1 b 2 a 3 b 4 a 5 b 6 a 7 a Name: key1, dtype: object
df.count()
data1 8 data2 8 key1 8 key2 8 dtype: int64
def my_range(arr):
return arr.max() - arr.min()
df.groupby("key1").aggregate(my_range)
data1 | data2 | |
---|---|---|
key1 | ||
a | 380 | 2.681751 |
b | 179 | 2.633437 |
df.groupby("key1").describe()
data1 | data2 | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
key1 | ||||||||||||||||
a | 5.0 | 168.400000 | 172.547964 | 20.0 | 22.0 | 100.0 | 300.0 | 400.0 | 5.0 | 0.304734 | 0.896221 | -1.090432 | 0.270046 | 0.392229 | 0.553693 | 1.398133 |
b | 3.0 | 81.333333 | 102.773213 | 21.0 | 22.0 | 23.0 | 111.5 | 200.0 | 3.0 | 0.553255 | 0.466006 | 0.064604 | 0.333526 | 0.602447 | 0.797580 | 0.992713 |
df.groupby("key1").aggregate("describe")
data1 | data2 | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
key1 | ||||||||||||||||
a | 5.0 | 168.400000 | 172.547964 | 20.0 | 22.0 | 100.0 | 300.0 | 400.0 | 5.0 | 0.356225 | 1.121672 | -0.688437 | -0.585936 | 0.135880 | 0.926304 | 1.993315 |
b | 3.0 | 81.333333 | 102.773213 | 21.0 | 22.0 | 23.0 | 111.5 | 200.0 | 3.0 | -0.480216 | 1.412737 | -1.501368 | -1.286358 | -1.071349 | 0.030360 | 1.132069 |
df = pd.read_csv("examples/tips.csv")
df.head()
total_bill | tip | smoker | day | time | size | |
---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | No | Sun | Dinner | 4 |
df["tip_pct"] = df.tip / df.total_bill * 100
df.head()
total_bill | tip | smoker | day | time | size | tip_pct | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | No | Sun | Dinner | 2 | 5.944673 |
1 | 10.34 | 1.66 | No | Sun | Dinner | 3 | 16.054159 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 | 16.658734 |
3 | 23.68 | 3.31 | No | Sun | Dinner | 2 | 13.978041 |
4 | 24.59 | 3.61 | No | Sun | Dinner | 4 | 14.680765 |
df.groupby(["day", "smoker"])[["tip_pct"]].mean()
tip_pct | ||
---|---|---|
day | smoker | |
Fri | No | 15.165044 |
Yes | 17.478305 | |
Sat | No | 15.804766 |
Yes | 14.790607 | |
Sun | No | 16.011294 |
Yes | 18.725032 | |
Thur | No | 16.029808 |
Yes | 16.386327 |
df.groupby(["day", "smoker"]).tip_pct.aggregate('describe')
count | mean | std | min | 25% | 50% | 75% | max | ||
---|---|---|---|---|---|---|---|---|---|
day | smoker | ||||||||
Fri | No | 4.0 | 15.165044 | 2.812295 | 12.038523 | 13.723917 | 14.924093 | 16.365221 | 18.773467 |
Yes | 15.0 | 17.478305 | 5.129267 | 10.355540 | 13.373871 | 17.391304 | 20.924019 | 26.348039 | |
Sat | No | 45.0 | 15.804766 | 3.976730 | 5.679667 | 13.623978 | 15.015198 | 18.391451 | 29.198966 |
Yes | 42.0 | 14.790607 | 6.137495 | 3.563814 | 9.179713 | 15.362439 | 19.050216 | 32.573290 | |
Sun | No | 57.0 | 16.011294 | 4.234723 | 5.944673 | 13.978041 | 16.166505 | 18.518519 | 25.267250 |
Yes | 19.0 | 18.725032 | 15.413424 | 6.565988 | 9.772265 | 13.812155 | 21.532480 | 71.034483 | |
Thur | No | 45.0 | 16.029808 | 3.877420 | 7.296137 | 13.774105 | 15.349194 | 18.484288 | 26.631158 |
Yes | 17.0 | 16.386327 | 3.938881 | 9.001406 | 14.803849 | 15.384615 | 19.483682 | 24.125452 |
df.groupby(["day", "smoker"]).tip_pct.agg(["mean", "sum", np.std])
mean | sum | std | ||
---|---|---|---|---|
day | smoker | |||
Fri | No | 15.165044 | 60.660177 | 2.812295 |
Yes | 17.478305 | 262.174578 | 5.129267 | |
Sat | No | 15.804766 | 711.214459 | 3.976730 |
Yes | 14.790607 | 621.205474 | 6.137495 | |
Sun | No | 16.011294 | 912.643775 | 4.234723 |
Yes | 18.725032 | 355.775601 | 15.413424 | |
Thur | No | 16.029808 | 721.341368 | 3.877420 |
Yes | 16.386327 | 278.567563 | 3.938881 |
import warnings
warnings.filterwarnings(action = "ignore")
df.groupby(["day", "smoker"]).tip_pct.agg({"average" : "mean",
"summation": "sum",
"standar dev" : np.std})
average | summation | standar dev | ||
---|---|---|---|---|
day | smoker | |||
Fri | No | 15.165044 | 60.660177 | 2.812295 |
Yes | 17.478305 | 262.174578 | 5.129267 | |
Sat | No | 15.804766 | 711.214459 | 3.976730 |
Yes | 14.790607 | 621.205474 | 6.137495 | |
Sun | No | 16.011294 | 912.643775 | 4.234723 |
Yes | 18.725032 | 355.775601 | 15.413424 | |
Thur | No | 16.029808 | 721.341368 | 3.877420 |
Yes | 16.386327 | 278.567563 | 3.938881 |
functions = ["count", "mean", "max"]
df.groupby(["day", "smoker"])["tip_pct", "total_bill"].agg(functions)
tip_pct | total_bill | ||||||
---|---|---|---|---|---|---|---|
count | mean | max | count | mean | max | ||
day | smoker | ||||||
Fri | No | 4 | 15.165044 | 18.773467 | 4 | 18.420000 | 22.75 |
Yes | 15 | 17.478305 | 26.348039 | 15 | 16.813333 | 40.17 | |
Sat | No | 45 | 15.804766 | 29.198966 | 45 | 19.661778 | 48.33 |
Yes | 42 | 14.790607 | 32.573290 | 42 | 21.276667 | 50.81 | |
Sun | No | 57 | 16.011294 | 25.267250 | 57 | 20.506667 | 48.17 |
Yes | 19 | 18.725032 | 71.034483 | 19 | 24.120000 | 45.35 | |
Thur | No | 45 | 16.029808 | 26.631158 | 45 | 17.113111 | 41.19 |
Yes | 17 | 16.386327 | 24.125452 | 17 | 19.190588 | 43.11 |
apply splits an object into pieces, invokes the epassed function on each piece, and then attempts to concatenate the pieces together
def top_five(data = df, n = 5, column = "tip_pct" ):
return df.sort_values(by = column)[-n:]
top_five(data = df)
total_bill | tip | smoker | day | time | size | tip_pct | |
---|---|---|---|---|---|---|---|
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 28.053517 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 29.198966 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 32.573290 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 41.666667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 71.034483 |
df.groupby("smoker").apply(top_five)
total_bill | tip | smoker | day | time | size | tip_pct | ||
---|---|---|---|---|---|---|---|---|
smoker | ||||||||
No | 183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 28.053517 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 29.198966 | |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 32.573290 | |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 41.666667 | |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 71.034483 | |
Yes | 183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 28.053517 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 29.198966 | |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 32.573290 | |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 41.666667 | |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 71.034483 |