# 載入套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Build the sample DataFrame: a single "age" column with 16 observations.
ages = pd.Series(
    [18, 22, 25, 27, 7, 21, 23, 37, 30, 61, 45, 41, 9, 18, 80, 100],
    name="age",
).to_frame()
# Add "equal_width_age": equal-width binning of age into 4 intervals.
ages["equal_width_age"] = pd.cut(x=ages["age"], bins=4)
# Count how many ages fall into each equal-width interval.
ages["equal_width_age"].value_counts()  # every bin spans the same range of values
(6.907, 30.25] 10 (30.25, 53.5] 3 (76.75, 100.0] 2 (53.5, 76.75] 1 Name: equal_width_age, dtype: int64
# Add "equal_freq_age": equal-frequency (quantile) binning of age into 4 bins.
ages["equal_freq_age"] = pd.qcut(x=ages["age"], q=4)
# Count how many ages fall into each quantile bin.
ages["equal_freq_age"].value_counts()  # every bin holds the same number of rows
(42.0, 100.0] 4 (26.0, 42.0] 4 (20.25, 26.0] 4 (6.999, 20.25] 4 Name: equal_freq_age, dtype: int64
新增一個欄位 customized_age_grp,把 age 分為 (0, 10], (10, 20], (20, 30], (30, 50], (50, 100] 這五組,'(' 表示不包含,']' 表示包含
Hints: 執行 ??pd.cut,查看說明文件,了解其中 bins 這個參數的使用方式
新增一個欄位 customized_age_grp,把 age 分為 (0, 10], (10, 20], (20, 30], (30, 50], (50, 100] 這五組,'(' 表示不包含,']' 表示包含
# 載入套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Create the sample data: one "age" column with 16 values.
ages = pd.DataFrame(
    data={
        "age": [
            18, 22, 25, 27, 7, 21, 23, 37,
            30, 61, 45, 41, 9, 18, 80, 100,
        ],
    },
)
# Add "equal_width_age" by cutting age into 4 intervals of equal width.
ages["equal_width_age"] = pd.cut(x=ages["age"], bins=4)
新增一個欄位 customized_age_grp,把 age 分為 (0, 10], (10, 20], (20, 30], (30, 50], (50, 100] 這五組, '(' 表示不包含, ']' 表示包含
# Explicit bin edges give the five groups (0, 10], (10, 20], (20, 30],
# (30, 50], (50, 100]; right=True (the pd.cut default) makes each
# interval closed on the right and open on the left.
ages["customized_age_grp"] = pd.cut(
    x=ages["age"],
    bins=[0, 10, 20, 30, 50, 100],
    right=True,
)
# Count how many ages fall into each custom group.
ages["customized_age_grp"].value_counts()
(20, 30] 6 (50, 100] 3 (30, 50] 3 (10, 20] 2 (0, 10] 2 Name: customized_age_grp, dtype: int64
藉由查詢與改動參數的過程, 熟悉查詢函數的方法與理解參數性質, 並了解數值的離散化的調整工具
# labels=False makes pd.cut return the integer position of each bin
# (0 for the first bin, 1 for the second, ...) instead of Interval labels.
ages["customized_age_grp"] = pd.cut(
    x=ages["age"],
    bins=[0, 10, 20, 30, 50, 100],
    labels=False,
)
# Show the resulting integer bin code for every row.
ages["customized_age_grp"]
0 1 1 2 2 2 3 2 4 0 5 2 6 2 7 3 8 2 9 4 10 3 11 3 12 0 13 1 14 4 15 4 Name: customized_age_grp, dtype: int64
# Count how many rows fall into each value of the customized_age_grp column.
ages["customized_age_grp"].value_counts()
2 6 4 3 3 3 1 2 0 2 Name: customized_age_grp, dtype: int64
# include_lowest=False (the pd.cut default) keeps the first interval
# open on the left, so the groups are displayed as (0, 10], (10, 20], ...
ages["customized_age_grp"] = pd.cut(
    x=ages["age"],
    bins=[0, 10, 20, 30, 50, 100],
    include_lowest=False,
)
# Show the interval assigned to every row.
ages["customized_age_grp"]
0 (10, 20] 1 (20, 30] 2 (20, 30] 3 (20, 30] 4 (0, 10] 5 (20, 30] 6 (20, 30] 7 (30, 50] 8 (20, 30] 9 (50, 100] 10 (30, 50] 11 (30, 50] 12 (0, 10] 13 (10, 20] 14 (50, 100] 15 (50, 100] Name: customized_age_grp, dtype: category Categories (5, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 50] < (50, 100]]
# include_lowest=True makes the first interval include its left edge (0);
# pandas then displays that interval as (-0.001, 10.0] with float edges.
ages["customized_age_grp"] = pd.cut(
    x=ages["age"],
    bins=[0, 10, 20, 30, 50, 100],
    include_lowest=True,
)
# Count how many ages fall into each custom group.
ages["customized_age_grp"].value_counts()
(20.0, 30.0] 6 (50.0, 100.0] 3 (30.0, 50.0] 3 (10.0, 20.0] 2 (-0.001, 10.0] 2 Name: customized_age_grp, dtype: int64
# Inspect the equal-width binning: number of rows in each interval.
ages["equal_width_age"].value_counts(
)  # every bin spans the same range of values
(6.907, 30.25] 10 (30.25, 53.5] 3 (76.75, 100.0] 2 (53.5, 76.75] 1 Name: equal_width_age, dtype: int64
# Add "equal_freq_age" by cutting age at its quartiles (4 quantile bins).
ages["equal_freq_age"] = pd.qcut(x=ages["age"], q=4)
# Inspect the equal-frequency binning: number of rows in each bin.
ages["equal_freq_age"].value_counts()  # every bin holds the same number of rows
(42.0, 100.0] 4 (26.0, 42.0] 4 (20.25, 26.0] 4 (6.999, 20.25] 4 Name: equal_freq_age, dtype: int64
# Display the full DataFrame: the raw age column plus the binned columns.
ages
| | age | equal_width_age | customized_age_grp | equal_freq_age |
---|---|---|---|---|
0 | 18 | (6.907, 30.25] | (10.0, 20.0] | (6.999, 20.25] |
1 | 22 | (6.907, 30.25] | (20.0, 30.0] | (20.25, 26.0] |
2 | 25 | (6.907, 30.25] | (20.0, 30.0] | (20.25, 26.0] |
3 | 27 | (6.907, 30.25] | (20.0, 30.0] | (26.0, 42.0] |
4 | 7 | (6.907, 30.25] | (-0.001, 10.0] | (6.999, 20.25] |
5 | 21 | (6.907, 30.25] | (20.0, 30.0] | (20.25, 26.0] |
6 | 23 | (6.907, 30.25] | (20.0, 30.0] | (20.25, 26.0] |
7 | 37 | (30.25, 53.5] | (30.0, 50.0] | (26.0, 42.0] |
8 | 30 | (6.907, 30.25] | (20.0, 30.0] | (26.0, 42.0] |
9 | 61 | (53.5, 76.75] | (50.0, 100.0] | (42.0, 100.0] |
10 | 45 | (30.25, 53.5] | (30.0, 50.0] | (42.0, 100.0] |
11 | 41 | (30.25, 53.5] | (30.0, 50.0] | (26.0, 42.0] |
12 | 9 | (6.907, 30.25] | (-0.001, 10.0] | (6.999, 20.25] |
13 | 18 | (6.907, 30.25] | (10.0, 20.0] | (6.999, 20.25] |
14 | 80 | (76.75, 100.0] | (50.0, 100.0] | (42.0, 100.0] |
15 | 100 | (76.75, 100.0] | (50.0, 100.0] | (42.0, 100.0] |