import pandas as pd
import numpy as np
data_url = 'http://vincentarelbundock.github.io/Rdatasets/csv/carData/Salaries.csv'
df = pd.read_csv(data_url, index_col=0)
df.head()
rank | discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|---|
1 | Prof | B | 19 | 18 | Male | 139750 |
2 | Prof | B | 20 | 16 | Male | 173200 |
3 | AsstProf | B | 4 | 3 | Male | 79750 |
4 | Prof | B | 45 | 39 | Male | 115000 |
5 | Prof | B | 40 | 41 | Male | 141500 |
import IPython
# Grouping by one factor
df_rank = df.groupby('rank')
# Getting all methods from the groupby object:
meth = [method_name for method_name in dir(df_rank)
if callable(getattr(df_rank, method_name)) & ~method_name.startswith('_')]
# Printing the result
print(IPython.utils.text.columnize(meth))
agg corr cumsum get_group mean pct_change sem transform aggregate corrwith describe head median pipe shift tshift all count diff hist min plot size var any cov expanding idxmax ngroup prod skew apply cumcount ffill idxmin nth quantile std backfill cummax fillna last nunique rank sum bfill cummin filter mad ohlc resample tail boxplot cumprod first max pad rolling take
df_rank.groups
{'AssocProf': Int64Index([ 6, 11, 25, 40, 42, 55, 56, 58, 59, 61, 64, 66, 90, 92, 93, 97, 105, 107, 108, 109, 112, 124, 131, 133, 139, 141, 142, 154, 157, 159, 163, 169, 175, 177, 178, 183, 187, 189, 194, 195, 196, 215, 218, 219, 223, 228, 232, 256, 258, 261, 285, 286, 294, 300, 317, 322, 323, 329, 335, 364, 368, 371, 380, 383], dtype='int64'), 'AsstProf': Int64Index([ 3, 12, 13, 14, 28, 29, 32, 34, 35, 36, 38, 50, 53, 60, 62, 65, 76, 79, 80, 84, 88, 91, 96, 113, 119, 120, 128, 130, 134, 144, 147, 150, 152, 155, 158, 161, 164, 165, 171, 180, 197, 198, 201, 209, 211, 227, 235, 238, 241, 254, 259, 273, 274, 275, 288, 290, 307, 309, 316, 326, 349, 355, 360, 377, 378, 381, 397], dtype='int64'), 'Prof': Int64Index([ 1, 2, 4, 5, 7, 8, 9, 10, 15, 16, ... 387, 388, 389, 390, 391, 392, 393, 394, 395, 396], dtype='int64', length=266)}
df_rank.get_group('AsstProf').head()
discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|
3 | B | 4 | 3 | Male | 79750 |
12 | B | 7 | 2 | Male | 79800 |
13 | B | 1 | 1 | Male | 77700 |
14 | B | 2 | 0 | Male | 78000 |
28 | B | 5 | 3 | Male | 82379 |
df_rank.head()
rank | discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|---|
1 | Prof | B | 19 | 18 | Male | 139750 |
2 | Prof | B | 20 | 16 | Male | 173200 |
3 | AsstProf | B | 4 | 3 | Male | 79750 |
4 | Prof | B | 45 | 39 | Male | 115000 |
5 | Prof | B | 40 | 41 | Male | 141500 |
6 | AssocProf | B | 6 | 6 | Male | 97000 |
7 | Prof | B | 30 | 23 | Male | 175000 |
11 | AssocProf | B | 12 | 8 | Male | 119800 |
12 | AsstProf | B | 7 | 2 | Male | 79800 |
13 | AsstProf | B | 1 | 1 | Male | 77700 |
14 | AsstProf | B | 2 | 0 | Male | 78000 |
25 | AssocProf | A | 13 | 8 | Female | 74830 |
28 | AsstProf | B | 5 | 3 | Male | 82379 |
40 | AssocProf | B | 9 | 9 | Male | 100938 |
42 | AssocProf | B | 23 | 23 | Male | 93418 |
df_rank.count()
discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|
rank | |||||
AssocProf | 64 | 64 | 64 | 64 | 64 |
AsstProf | 67 | 67 | 67 | 67 | 67 |
Prof | 266 | 266 | 266 | 266 | 266 |
df_rank.nunique()
rank | discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|---|
rank | ||||||
AssocProf | 1 | 2 | 24 | 21 | 2 | 63 |
AsstProf | 1 | 2 | 11 | 7 | 2 | 53 |
Prof | 1 | 2 | 42 | 50 | 2 | 261 |
df_null = df.mask(np.random.random(df.shape) < .1)
df_null.isnull().sum().reset_index(name='N Missing Values')
index | N Missing Values | |
---|---|---|
0 | rank | 44 |
1 | discipline | 41 |
2 | yrs.since.phd | 40 |
3 | yrs.service | 28 |
4 | sex | 45 |
5 | salary | 47 |
df_null.groupby('rank').nunique()
rank | discipline | yrs.since.phd | yrs.service | sex | salary | |
---|---|---|---|---|---|---|
rank | ||||||
AssocProf | 1 | 2 | 19 | 19 | 2 | 53 |
AsstProf | 1 | 2 | 11 | 7 | 2 | 42 |
Prof | 1 | 2 | 41 | 48 | 2 | 200 |
df_rank['salary'].mean().reset_index()
rank | salary | |
---|---|---|
0 | AssocProf | 93876.437500 |
1 | AsstProf | 80775.985075 |
2 | Prof | 126772.109023 |