Author：马肖

E-Mail：maxiaoscut@aliyun.com

GitHub：https://github.com/Albertsr

In [1]:

import numpy as np
import pandas as pd


def high_categorical(dataframe, high_discrete, k=3):
    """One-hot encode a high-cardinality feature, keeping only its k most frequent values.

    Every value that is not among the ``k`` most frequent distinct values of
    ``high_discrete`` is collapsed into the single bucket ``'other'`` before the
    dummy columns are built, so at most ``k + 1`` dummy columns are produced.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame the dummy columns are joined onto (alignment is by index).
        The caller's frame is not modified; a new frame is returned.
    high_discrete : pandas.Series
        The high-cardinality categorical feature, normally a column of
        ``dataframe``. Its ``name`` is used as the prefix of the dummy columns.
    k : int, default 3
        Number of most frequent distinct values that keep a column of their own.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` plus the dummy columns, with the original feature column
        removed when a column of that name exists in ``dataframe``.
    """
    column_name = high_discrete.name

    # value_counts() orders labels by descending frequency and skips missing
    # values. Slice the *index* positionally: an integer slice on the counts
    # Series itself is label-based when the labels are floats.
    top_categories = high_discrete.value_counts().index[:k]

    # Cast to object first so that mixing the 'other' string into a numeric
    # feature is an explicit, warning-free conversion. Missing values are never
    # among the top categories, so they end up in 'other' as well.
    keep_mask = high_discrete.isin(top_categories)
    collapsed = high_discrete.astype(object).where(keep_mask, 'other')

    feature_dummies = pd.get_dummies(collapsed, prefix=column_name)
    encoded = dataframe.join(feature_dummies)

    # An unnamed Series, or one that is not a column of `dataframe`, leaves
    # nothing to drop; only remove the raw feature when it is really there.
    if column_name in dataframe.columns:
        encoded = encoded.drop(columns=[column_name])
    return encoded

实验

In [2]:

# Reproducible toy data set: 5000 rows of random sales figures and zip codes.
# The seed and the order of the two draws determine the generated values.
np.random.seed(2019)
zipcode = np.random.randint(low=10000, high=10150, size=5000)  # 150 distinct codes
sales = np.random.randint(low=100, high=150, size=5000)
df = pd.DataFrame(
    {
        '销售额': sales,
        '邮编': zipcode,
    }
)
df.head()

Out[2]:

	销售额	邮编
0	142	10072
1	140	10114
2	130	10037
3	108	10024
4	136	10029

In [3]:

# Encode the zip-code column with k=5 and preview the first five rows.
high_categorical(dataframe=df, high_discrete=df['邮编'], k=5).head()

Out[3]:

	销售额	邮编_10114	邮编_other
0	142	0	1
1	140	1	0
2	130	0	1
3	108	0	1
4	136	0	1

Author：马肖¶

E-Mail：maxiaoscut@aliyun.com¶

GitHub：https://github.com/Albertsr¶

实验¶

GitHub：https://github.com/Albertsr ¶