import pandas as pd
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df
regiment | company | name | preTestScore | postTestScore | |
---|---|---|---|---|---|
0 | Nighthawks | 1st | Miller | 4 | 25 |
1 | Nighthawks | 1st | Jacobson | 24 | 94 |
2 | Nighthawks | 2nd | Ali | 31 | 57 |
3 | Nighthawks | 2nd | Milner | 2 | 62 |
4 | Dragoons | 1st | Cooze | 3 | 70 |
5 | Dragoons | 1st | Jacon | 4 | 25 |
6 | Dragoons | 2nd | Ryaner | 24 | 94 |
7 | Dragoons | 2nd | Sone | 31 | 57 |
8 | Scouts | 1st | Sloan | 2 | 62 |
9 | Scouts | 1st | Piger | 3 | 70 |
10 | Scouts | 2nd | Riani | 2 | 62 |
11 | Scouts | 2nd | Ali | 3 | 70 |
12 rows × 5 columns
groupby_regiment = df['preTestScore'].groupby(df['company'])
groupby_regiment
<pandas.core.groupby.SeriesGroupBy object at 0x1075fb610>
"This grouped variable is now a GroupBy object. It has not actually computed anything yet except for some intermediate data about the group key df['key1']. The idea is that this object has all of the information needed to then apply some operation to each of the groups." - Python for Data Analysis
Use list() to show what a grouping looks like
list(df['preTestScore'].groupby(df['company']))
[('1st', 0 4 1 24 4 3 5 4 8 2 9 3 Name: preTestScore, dtype: int64), ('2nd', 2 31 3 2 6 24 7 31 10 2 11 3 Name: preTestScore, dtype: int64)]
df['preTestScore'].groupby(df['company']).describe()
company 1st count 6.000000 mean 6.666667 std 8.524475 min 2.000000 25% 3.000000 50% 3.500000 75% 4.000000 max 24.000000 2nd count 6.000000 mean 15.500000 std 14.652645 min 2.000000 25% 2.250000 50% 13.500000 75% 29.250000 max 31.000000 dtype: float64
groupby_regiment.mean()
company 1st 6.666667 2nd 15.500000 dtype: float64
df['preTestScore'].groupby([df['regiment'], df['company']]).mean()
regiment company Dragoons 1st 3.5 2nd 27.5 Nighthawks 1st 14.0 2nd 16.5 Scouts 1st 2.5 2nd 2.5 dtype: float64
df['preTestScore'].groupby([df['regiment'], df['company']]).mean().unstack()
company | 1st | 2nd |
---|---|---|
regiment | ||
Dragoons | 3.5 | 27.5 |
Nighthawks | 14.0 | 16.5 |
Scouts | 2.5 | 2.5 |
3 rows × 2 columns
df.groupby(['regiment', 'company']).mean()
preTestScore | postTestScore | ||
---|---|---|---|
regiment | company | ||
Dragoons | 1st | 3.5 | 47.5 |
2nd | 27.5 | 75.5 | |
Nighthawks | 1st | 14.0 | 59.5 |
2nd | 16.5 | 59.5 | |
Scouts | 1st | 2.5 | 66.0 |
2nd | 2.5 | 66.0 |
6 rows × 2 columns
df.groupby(['regiment', 'company']).size()
regiment company Dragoons 1st 2 2nd 2 Nighthawks 1st 2 2nd 2 Scouts 1st 2 2nd 2 dtype: int64
# Group the dataframe by regiment, and for each regiment,
for name, group in df.groupby('regiment'):
# print the name of the regiment
print(name)
# print the data of that regiment
print(group)
Dragoons regiment company name preTestScore postTestScore 4 Dragoons 1st Cooze 3 70 5 Dragoons 1st Jacon 4 25 6 Dragoons 2nd Ryaner 24 94 7 Dragoons 2nd Sone 31 57 [4 rows x 5 columns] Nighthawks regiment company name preTestScore postTestScore 0 Nighthawks 1st Miller 4 25 1 Nighthawks 1st Jacobson 24 94 2 Nighthawks 2nd Ali 31 57 3 Nighthawks 2nd Milner 2 62 [4 rows x 5 columns] Scouts regiment company name preTestScore postTestScore 8 Scouts 1st Sloan 2 62 9 Scouts 1st Piger 3 70 10 Scouts 2nd Riani 2 62 11 Scouts 2nd Ali 3 70 [4 rows x 5 columns]
Specifically in this case: group by the data types of the columns (i.e. axis=1) and then use list() to view what that grouping looks like
list(df.groupby(df.dtypes, axis=1))
[(dtype('int64'), preTestScore postTestScore 0 4 25 1 24 94 2 31 57 3 2 62 4 3 70 5 4 25 6 24 94 7 31 57 8 2 62 9 3 70 10 2 62 11 3 70 [12 rows x 2 columns]), (dtype('O'), regiment company name 0 Nighthawks 1st Miller 1 Nighthawks 1st Jacobson 2 Nighthawks 2nd Ali 3 Nighthawks 2nd Milner 4 Dragoons 1st Cooze 5 Dragoons 1st Jacon 6 Dragoons 2nd Ryaner 7 Dragoons 2nd Sone 8 Scouts 1st Sloan 9 Scouts 1st Piger 10 Scouts 2nd Riani 11 Scouts 2nd Ali [12 rows x 3 columns])]
df.groupby('regiment').mean().add_prefix('mean_')
mean_preTestScore | mean_postTestScore | |
---|---|---|
regiment | ||
Dragoons | 15.50 | 61.5 |
Nighthawks | 15.25 | 59.5 |
Scouts | 2.50 | 66.0 |
3 rows × 2 columns
def get_stats(group):
return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
df['postTestScore'].groupby(df['categories']).apply(get_stats).unstack()
count | max | mean | min | |
---|---|---|---|---|
categories | ||||
Good | 8 | 70 | 63.75 | 57 |
Great | 2 | 94 | 94.00 | 94 |
Low | 2 | 25 | 25.00 | 25 |
3 rows × 4 columns