import numpy as np
import pandas as pd
# Seeded legacy generator so the sample data is reproducible run to run.
rdg = np.random.RandomState(seed=2017)
# 20 integer ages drawn from [1, 78) and 20 fares drawn from [10, 100).
age = rdg.randint(low=1, high=78, size=20)
fare = rdg.uniform(low=10, high=100, size=20)
# Assemble the two columns, then round every value to 2 decimals.
df = pd.DataFrame({'Age': age, 'Fare': fare})
df = df.round(decimals=2)
# Equal-width binning of Age into 4 intervals; count the rows per interval.
pd.cut(df['Age'], bins=4).value_counts()
(0.928, 19.0] 8 (55.0, 73.0] 5 (37.0, 55.0] 5 (19.0, 37.0] 2 Name: Age, dtype: int64
# Hand-picked age bands: five edges give four intervals, one label each.
age_edges = [0, 17, 35, 59, 100]
age_labels = ['少年', '青年', '中年', '老年']
age_catogary = pd.cut(df['Age'], bins=age_edges, labels=age_labels)
# Count how many rows fall into each labelled band.
age_catogary.value_counts()
中年 8 少年 8 老年 3 青年 1 Name: Age, dtype: int64
# One-hot encode the age bands into Age_<label> indicator columns.
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns bool columns by default, which would display True/False instead of
# the 0/1 indicators this walkthrough shows.
age_dummies = pd.get_dummies(age_catogary, prefix='Age', dtype=np.uint8)
# Append the indicator columns to the frame (aligned on the row index).
df = df.join(age_dummies)
df.head()
| Age | Fare | Age_少年 | Age_青年 | Age_中年 | Age_老年 |
---|---|---|---|---|---|---|
0 | 60 | 76.21 | 0 | 0 | 0 | 1 |
1 | 10 | 71.77 | 1 | 0 | 0 | 0 |
2 | 71 | 24.76 | 0 | 0 | 0 | 1 |
3 | 14 | 71.50 | 1 | 0 | 0 | 0 |
4 | 43 | 43.19 | 0 | 0 | 1 | 0 |
# Split the ages into 3 equal-width intervals (4 edges).
# The last edge is max + 1, so the largest age lies strictly below it.
age_min, age_max = df['Age'].min(), df['Age'].max()
bins = np.linspace(start=age_min, stop=age_max + 1, num=4)
bins
array([ 1. , 25.33333333, 49.66666667, 74. ])
# Look up, for every age, the index of the interval (defined by `bins`)
# that contains it.
age_bins = np.digitize(df['Age'].values, bins=bins)
age_bins
array([3, 1, 3, 1, 2, 1, 3, 3, 1, 2, 1, 1, 3, 1, 2, 2, 2, 1, 3, 2], dtype=int64)
# Quantile-based binning of Age into 4 bins; this rebinds `age_bins`,
# replacing the np.digitize result computed earlier.
age_bins = pd.qcut(df['Age'], q=4)
# Count the rows per quantile bin.
age_bins.value_counts()
(53.0, 73.0] 5 (37.5, 53.0] 5 (9.75, 37.5] 5 (0.999, 9.75] 5 Name: Age, dtype: int64
def fare_rate_func(x, fares=None):
    """Label a fare by its position relative to the quartiles of a fare sample.

    Parameters
    ----------
    x : scalar
        The fare to classify.
    fares : array-like, optional
        Sample whose 25th and 75th percentiles define the cut points.
        Defaults to the module-level ``df['Fare']`` column, so existing
        one-argument calls (``Series.apply`` / ``Series.map``) are unchanged.

    Returns
    -------
    str
        ``'high'`` if ``x`` is at or below the 25th percentile,
        ``'middle'`` if it is above the 25th and at or below the 75th,
        ``'low'`` otherwise.
    """
    if fares is None:
        fares = df['Fare']
    # Compute both cut points in one call instead of up to three separate ones.
    lower, upper = np.percentile(fares, [25, 75])
    # NOTE(review): the cheapest quartile is labelled 'high' and the most
    # expensive 'low'. This looks inverted, but the labels are kept as-is
    # because downstream column names depend on them; confirm the intent.
    if x <= lower:
        return 'high'
    if x <= upper:
        return 'middle'
    return 'low'
# Label every fare with fare_rate_func. The labels stay in a standalone
# Series, so no temporary column has to be added to df and dropped again.
fare_rate = df['Fare'].apply(fare_rate_func)
# One-hot encode the labels into Fare_<label> indicator columns.
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns bool columns by default, which would display True/False instead of
# the 0/1 indicators this walkthrough shows.
fare_dummies = pd.get_dummies(fare_rate, prefix='Fare', dtype=np.uint8)
# Append the indicator columns to the frame (aligned on the row index).
df = df.join(fare_dummies)
df.head()
| Age | Fare | Age_少年 | Age_青年 | Age_中年 | Age_老年 | Fare_high | Fare_low | Fare_middle |
---|---|---|---|---|---|---|---|---|---|
0 | 60 | 76.21 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
1 | 10 | 71.77 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 71 | 24.76 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
3 | 14 | 71.50 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 43 | 43.19 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |