# Model 1 -- first author's model (the author's name in the original Chinese comment was lost to mis-encoding)
!mkdir /home/kesci/work/features/
!mkdir /home/kesci/work/basic/
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn import metrics
'''
Some basic utility and environment helper functions.
'''
def data_path():
    """Return the directory the raw input log files are read from."""
    raw_dir = '/mnt/datasets/fusai/'
    return raw_dir
def basic_path():
    """Return the directory where the label file (data_label.csv) is stored."""
    label_dir = '/home/kesci/work/basic/'
    return label_dir
def features_path():
    """Return the directory where the generated feature CSV files are stored."""
    feature_dir = '/home/kesci/work/features/'
    return feature_dir
def split_data(data, columns, start_day, end_day):
    """Return the rows of *data* whose day value lies in [start_day, end_day].

    :param data: DataFrame to filter.
    :param columns: name of the (single) day column to compare.
    :param start_day: first day kept (inclusive).
    :param end_day: last day kept (inclusive).
    :return: the filtered rows, with their original index labels.
    """
    day_values = data[columns]
    not_too_early = day_values >= start_day
    not_too_late = day_values <= end_day
    return data[not_too_early & not_too_late]
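'''
About the list returned below:
days 1-10 are the base feature window; each entry says how many days the
window is extended by.
For example, to use days 1-18 the entry is 18-10=8;
for the test set, days 1-30, the entry is 30-10=20.
To use fewer windows, just edit the list accordingly.
... To change the base feature window, edit the ups and downs functions below.
'''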
def features_addday_list():
    """Return the day offsets added to downs() to get each feature window's end day."""
    offsets = list(range(11))
    offsets.append(20)
    return offsets
def ups():
    """Return the first day of the feature window."""
    first_day = 1
    return first_day
def downs():
    """Return the last day of the base feature window (before any offset is added)."""
    base_last_day = 10
    return base_last_day
'''
Store the label data:
one label column is written per window.
'''
# App-launch log, loaded as a module-level frame with columns user_id and
# launch_day; get_label_list() below reads this global.
launch = pd.read_csv(data_path()+'app_launch_log.txt', sep='\t', header=None,
names=['user_id', 'launch_day'],
dtype={0: np.uint32, 1: np.uint8})
# Registration log, loaded as a module-level frame; the label-building section
# takes its user_id column from it.
# NOTE(review): the dtype dicts are keyed by integers, presumably column
# positions -- confirm against the pandas version in use.
register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\t', header=None,
names=['user_id', 'register_day', 'register_type', 'device_type'],
dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})
def get_label_list(start_day, end_day):
    """Return the distinct user_ids with an app launch in [start_day, end_day].

    Reads the module-level ``launch`` frame.

    :param start_day: first launch day considered (inclusive).
    :param end_day: last launch day considered (inclusive).
    :return: pandas Series of de-duplicated user ids.
    """
    launches_in_window = split_data(launch, 'launch_day', start_day, end_day)
    active_users = launches_in_window['user_id'].drop_duplicates()
    return pd.Series(active_users)
if __name__ == '__main__':
    # The label window is the 7 days that directly follow the base feature
    # window; it slides forward by one day for every label column.
    up = downs() + 1
    down = downs() + 7
    # One row per registered user; label columns are appended below.
    data = register.loc[:, ['user_id']]
    # One label column per entry of features_addday_list() except the last.
    for label_num in range(len(features_addday_list()) - 1):
        label_name = 'label_' + str(label_num)
        window_start = up + label_num
        window_end = down + label_num
        label_list = get_label_list(window_start, window_end)
        # 1 if the user launched the app inside the label window, else 0.
        is_active = data['user_id'].isin(label_list)
        data[label_name] = is_active.replace({True: 1, False: 0})
    data.to_csv(basic_path()+'data_label.csv', index=None)
    print('data_label.csv complete!')
'''
Registration-table features
'''
if __name__ == '__main__':
    up = ups()
    down = downs()
    for feature_num in features_addday_list():
        # Load the registration log (it is deleted again further down, so it
        # is re-read on every pass).
        register = pd.read_csv(data_path()+'user_register_log.txt', sep='\t', header=None,
                               names=['user_id','register_day','register_type','device_type'],
                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint8, 3: np.uint16})
        # Feature window of this pass: [feature_start, feature_end].
        feature_start = up + 0
        feature_end = down + feature_num
        # result_data collects the output features, feature_data is the raw
        # table the features are derived from; the other feature sections use
        # the same naming.
        # .copy() makes result_data an independent frame, so the column
        # assignment below does not write into a filtered slice of `register`.
        result_data = split_data(register, 'register_day', 1, feature_end).copy()
        feature_data = split_data(register, 'register_day', feature_start, feature_end)
        del register
        # # # # # # # # #
        # Build features (register_type and device_type are already columns
        # of result_data and are kept as they are).
        #
        # Days between each user's registration and the latest registration
        # day in the feature window.  The maximum is computed once with the
        # vectorised Series.max().
        last_register_day = feature_data['register_day'].max()
        result_data['maxday_red_registerday'] = last_register_day - feature_data['register_day']
        # Rows of result_data that are not in feature_data get NaN from the
        # index alignment above; they are filled with the latest day.
        result_data = result_data.fillna(last_register_day)
        del result_data['register_day']
        # # # # # # # # #
        # Save the result
        result_file_name = 'register_feature_'+str(feature_num)+'.csv'
        result_data.to_csv(features_path()+result_file_name, index=None)
        print(result_file_name+' complete!')
'''
Video-creation features
'''
def _merge_user_agg(result, data, value_col, aggfunc, out_name):
    """Left-join the per-user aggregate of *value_col* onto *result* as *out_name*."""
    aggregated = pd.pivot_table(data, index='user_id', values=value_col,
                                aggfunc=aggfunc).reset_index().rename(columns={value_col: out_name})
    return pd.merge(result, aggregated, on='user_id', how='left')


if __name__ == '__main__':
    up = ups()
    down = downs()
    for feature_num in features_addday_list():
        # Load the data (both tables are deleted again below, so they are
        # re-read on every pass).
        register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\t', header=None,
                               names=['user_id', 'register_day', 'register_type', 'device_type'],
                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})
        create = pd.read_csv(data_path() + 'video_create_log.txt', sep='\t', header=None,
                             names=['user_id', 'create_day'],
                             dtype={0: np.uint32, 1: np.uint8})
        # Feature window of this pass: [feature_start, feature_end].
        feature_start = up
        feature_end = down + feature_num
        result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]
        # .copy(): a helper column is added to feature_data further down, which
        # must not write into a filtered slice of `create`.
        feature_data = split_data(create, 'create_day', feature_start, feature_end).copy()
        del register
        del create
        # Loop-invariant maxima, computed once with the vectorised Series.max()
        # instead of repeatedly iterating the Series with the builtin max().
        last_register_day = result_data['register_day'].max()
        last_create_day = feature_data['create_day'].max()
        # # # # # # # # #
        # Build features
        #
        # Number of videos created per user (0 for users without any).
        result_data = _merge_user_agg(result_data, feature_data, 'create_day', 'count', 'create_count')
        result_data = result_data.fillna(0)
        # Mean / max / min creation day per user, plus its distance to the
        # user's registration day and to the latest registration day.
        for stat, aggfunc in (('mean', 'mean'), ('max', np.max), ('min', np.min)):
            stat_col = 'create_' + stat
            result_data = _merge_user_agg(result_data, feature_data, 'create_day', aggfunc, stat_col)
            result_data['create' + stat + '_red_register'] = result_data[stat_col] - result_data['register_day']
            result_data['maxday_red_create' + stat] = last_register_day - result_data[stat_col]
        # Users without any creation get -1 in all of the columns above.
        result_data = result_data.fillna(-1)
        # Span between the first and last creation day.
        result_data['max_red_min_create'] = result_data['create_max'] - result_data['create_min']
        # Whether the user created a video on the last day seen in the window.
        result_data['create_at_lastday'] = pd.Series(
            result_data['create_max'] == last_create_day).replace({True: 1, False: 0})
        # Re-express mean / max / min as "days before the last creation day".
        result_data['create_mean'] = last_create_day - result_data['create_mean']
        result_data['create_max'] = last_create_day - result_data['create_max']
        result_data['create_min'] = last_create_day - result_data['create_min']
        # Mean / variance of the gaps between a user's distinct creation days.
        feature_data_tmp = feature_data.drop_duplicates(['user_id', 'create_day']).sort_values(
            by=['user_id', 'create_day'])
        # Gap to the previous row; np.roll moves the last row to the front,
        # which is what the removed DataFrame.append construction built.
        # The first gap of every user crosses a user boundary (and may wrap
        # around in uint8); the a[1:] slices below discard it.
        create_days = np.array(feature_data_tmp['create_day'])
        feature_data_tmp['create_gap'] = create_days - np.roll(create_days, 1)
        result_data = _merge_user_agg(result_data, feature_data_tmp, 'create_gap',
                                      (lambda a: np.average(a[1:])), 'create_gap_mean')
        result_data = _merge_user_agg(result_data, feature_data_tmp, 'create_gap',
                                      (lambda a: np.var(a[1:])), 'create_gap_var')
        result_data = result_data.fillna(0)
        # Created on consecutive days throughout (mean gap of exactly 1), and
        # additionally up to the last day.
        result_data['always_create'] = [1 if i == 1 else 0 for i in result_data['create_gap_mean']]
        tmp = (result_data['create_at_lastday'] == 1).replace({True: 1, False: 0})
        result_data['always_create_atlast'] = tmp * result_data['always_create']
        del tmp
        # Variance / kurtosis / skewness of the creation days.
        result_data = _merge_user_agg(result_data, feature_data, 'create_day', np.var, 'create_var')
        result_data = _merge_user_agg(result_data, feature_data, 'create_day', pd.Series.kurt, 'create_kurt')
        result_data = _merge_user_agg(result_data, feature_data, 'create_day', pd.Series.skew, 'create_skew')
        result_data = result_data.fillna(0)
        # Largest number of videos created on a single day: count rows per
        # (user, day) through a constant helper column, then take the
        # per-user maximum.
        feature_data['max_create_in_oneday'] = 0
        feature_tmp = pd.pivot_table(feature_data, index=['user_id', 'create_day'], values='max_create_in_oneday',
                                     aggfunc='count').reset_index()
        feature_tmp = pd.DataFrame(feature_tmp.groupby(['user_id'])['max_create_in_oneday'].max()).reset_index()
        result_data = pd.merge(result_data, feature_tmp, on='user_id', how='left')
        result_data.fillna(0, inplace=True)
        del result_data['register_day']
        # # # # # # # #
        # Save the result
        result_file_name = 'create_feature_' + str(feature_num) + '.csv'
        result_data.to_csv(features_path() + result_file_name, index=None)
        print(result_file_name + ' complete!')
'''
Launch-table features
'''
def _merge_user_agg(result, data, value_col, aggfunc, out_name):
    """Left-join the per-user aggregate of *value_col* onto *result* as *out_name*."""
    aggregated = pd.pivot_table(data, index='user_id', values=value_col,
                                aggfunc=aggfunc).reset_index().rename(columns={value_col: out_name})
    return pd.merge(result, aggregated, on='user_id', how='left')


if __name__ == '__main__':
    up = ups()
    down = downs()
    for feature_num in features_addday_list():
        # Load the data (both tables are deleted again below, so they are
        # re-read on every pass).
        register = pd.read_csv(data_path()+'user_register_log.txt', sep='\t', header=None,
                               names=['user_id','register_day','register_type','device_type'],
                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})
        launch = pd.read_csv(data_path() + 'app_launch_log.txt', sep='\t', header=None,
                             names=['user_id', 'launch_day'],
                             dtype={0: np.uint32, 1: np.uint8})
        # Feature window of this pass: [feature_start, feature_end].
        feature_start = up
        feature_end = down + feature_num
        result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]
        feature_data = split_data(launch, 'launch_day', feature_start, feature_end)
        del register
        del launch
        # Loop-invariant maxima, computed once with the vectorised Series.max()
        # instead of repeatedly iterating the Series with the builtin max().
        last_register_day = result_data['register_day'].max()
        last_launch_day = feature_data['launch_day'].max()
        # # # # # # # # #
        # Build features
        #
        # Launch count per user, and the count divided by the span of launch
        # days present in the window.
        result_data = _merge_user_agg(result_data, feature_data, 'launch_day', 'count', 'launch_count')
        distance = last_launch_day - feature_data['launch_day'].min()
        result_data['launch_ratio'] = result_data['launch_count'] * 1.0 / distance
        result_data = result_data.fillna(0)
        # Mean / max / min launch day per user, plus its distance to the
        # user's registration day and to the latest registration day.
        for stat, aggfunc in (('mean', 'mean'), ('max', np.max)):
            stat_col = 'launch_' + stat
            result_data = _merge_user_agg(result_data, feature_data, 'launch_day', aggfunc, stat_col)
            result_data['launch' + stat + '_red_register'] = result_data[stat_col] - result_data['register_day']
            result_data['maxday_red_launch' + stat] = last_register_day - result_data[stat_col]
        # For the minimum only the distance to the latest registration day is
        # emitted; "launch_min - register_day" was disabled in the original.
        result_data = _merge_user_agg(result_data, feature_data, 'launch_day', np.min, 'launch_min')
        result_data['maxday_red_launchmin'] = last_register_day - result_data['launch_min']
        # Users without any launch get -1 in all of the columns above.
        result_data = result_data.fillna(-1)
        # Span between the first and last launch day.
        result_data['max_red_min_launch'] = result_data['launch_max'] - result_data['launch_min']
        # Whether the user launched the app on the last day seen in the window.
        result_data['launch_at_lastday'] = pd.Series(
            result_data['launch_max'] == last_launch_day).replace({True: 1, False: 0})
        # Re-express mean / max / min as "days before the last launch day".
        result_data['launch_mean'] = last_launch_day - result_data['launch_mean']
        result_data['launch_max'] = last_launch_day - result_data['launch_max']
        result_data['launch_min'] = last_launch_day - result_data['launch_min']
        # Mean / variance / maximum of the gaps between a user's distinct
        # launch days.
        feature_data_tmp = feature_data.drop_duplicates(['user_id', 'launch_day']).sort_values(
            by=['user_id', 'launch_day'])
        # Gap to the previous row; np.roll moves the last row to the front,
        # which is what the removed DataFrame.append construction built.
        # The first gap of every user crosses a user boundary (and may wrap
        # around in uint8); the a[1:] slices below discard it.
        launch_days = np.array(feature_data_tmp['launch_day'])
        feature_data_tmp['launch_gap'] = launch_days - np.roll(launch_days, 1)
        result_data = _merge_user_agg(result_data, feature_data_tmp, 'launch_gap',
                                      (lambda a: np.average(a[1:])), 'launch_gap_mean')
        result_data = _merge_user_agg(result_data, feature_data_tmp, 'launch_gap',
                                      (lambda a: np.var(a[1:])), 'launch_gap_var')
        result_data = _merge_user_agg(result_data, feature_data_tmp, 'launch_gap',
                                      (lambda a: np.max(a[1:])), 'launch_gap_max')
        result_data = result_data.fillna(0)
        # Launched on consecutive days throughout (mean gap of exactly 1), and
        # additionally up to the last day.
        result_data['always_launch'] = [1 if i == 1 else 0 for i in result_data['launch_gap_mean']]
        tmp = (result_data['launch_at_lastday'] == 1).replace({True: 1, False: 0})
        result_data['always_launch_atlast'] = tmp * result_data['always_launch']
        del tmp
        # Variance / kurtosis / skewness of the launch days.
        result_data = _merge_user_agg(result_data, feature_data, 'launch_day', np.var, 'launch_var')
        result_data = _merge_user_agg(result_data, feature_data, 'launch_day', pd.Series.kurt, 'launch_kurt')
        result_data = _merge_user_agg(result_data, feature_data, 'launch_day', pd.Series.skew, 'launch_skew')
        result_data = result_data.fillna(0)
        del result_data['register_day']
        # # # # # # # #
        # Save the result
        result_file_name = 'launch_feature_' + str(feature_num) + '.csv'
        result_data.to_csv(features_path() + result_file_name, index=None)
        print(result_file_name + ' complete!')
'''
Activity-table features
Note:
this one is really very slow
'''
def _merge_user_agg(result, data, value_col, aggfunc, out_name):
    """Left-join the per-user aggregate of *value_col* onto *result* as *out_name*."""
    aggregated = pd.pivot_table(data, index='user_id', values=value_col,
                                aggfunc=aggfunc).reset_index().rename(columns={value_col: out_name})
    return pd.merge(result, aggregated, on='user_id', how='left')


def _add_category_counts_and_ratios(result, data, category_col, prefix, n_categories):
    """Add per-user ``<prefix>_<i>_count`` and ``<prefix>_<i>_ratio`` columns.

    For every category value i in range(n_categories) the number of rows of
    *data* with ``category_col == i`` is counted per user; the ratio columns
    divide each count by the user's total over those categories.
    """
    total_name = prefix + '_count'
    counts = data.loc[:, ['user_id', category_col, 'act_day']].groupby(
        ['user_id', category_col]).count().reset_index().rename(columns={"act_day": total_name})
    count_names = [prefix + '_' + str(i) + '_count' for i in range(n_categories)]
    for i, fea_name in enumerate(count_names):
        one_category = counts[counts[category_col] == i].loc[:, ['user_id', total_name]].rename(
            columns={total_name: fea_name})
        result = pd.merge(result, one_category, how='left', on='user_id')
    result = result.fillna(0)
    # A total of 0 is replaced by 1 so that users without events get ratio 0
    # instead of a division by zero.
    denominator = result[count_names].sum(axis=1).replace(0, 1)
    for i, fea_name in enumerate(count_names):
        result[prefix + '_' + str(i) + '_ratio'] = result[fea_name] / denominator
    return result


# This section used to appear twice, back to back and byte-for-byte identical,
# with the second copy's banner string fused onto the first copy's final
# print() call (a syntax error).  It now runs once; the second copy only
# recomputed and rewrote the same activity_feature_*.csv files.
if __name__ == '__main__':
    up = ups()
    down = downs()
    for feature_num in features_addday_list():
        # Load the data (both tables are deleted again below, so they are
        # re-read on every pass).
        register = pd.read_csv(data_path() + 'user_register_log.txt', sep='\t', header=None,
                               names=['user_id', 'register_day', 'register_type', 'device_type'],
                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint16, 3: np.uint16})
        activity = pd.read_csv(data_path() + 'user_activity_log.txt', sep='\t', header=None,
                               names=['user_id', 'act_day', 'page', 'video_id', 'author_id', 'action_type'],
                               dtype={0: np.uint32, 1: np.uint8, 2: np.uint8, 3: np.uint32, 4: np.uint32, 5: np.uint8})
        # Feature window of this pass: [feature_start, feature_end].
        feature_start = up
        feature_end = down + feature_num
        result_data = split_data(register, 'register_day', 1, feature_end).loc[:, ['user_id', 'register_day']]
        feature_data = split_data(activity, 'act_day', feature_start, feature_end)
        del register
        del activity
        # Loop-invariant maxima, computed once with the vectorised Series.max()
        # instead of repeatedly iterating the Series with the builtin max().
        last_register_day = result_data['register_day'].max()
        last_act_day = feature_data['act_day'].max()
        # # # # # # # # #
        # Build features
        #
        # Number of activity rows per user (0 for users without any).
        result_data = _merge_user_agg(result_data, feature_data, 'act_day', 'count', 'act_count')
        result_data = result_data.fillna(0)
        # Mean / max / min activity day per user, plus its distance to the
        # user's registration day and to the latest registration day.
        for stat, aggfunc in (('mean', 'mean'), ('max', np.max), ('min', np.min)):
            stat_col = 'act_' + stat
            result_data = _merge_user_agg(result_data, feature_data, 'act_day', aggfunc, stat_col)
            result_data['act' + stat + '_red_register'] = result_data[stat_col] - result_data['register_day']
            result_data['maxday_red_act' + stat] = last_register_day - result_data[stat_col]
        # Users without any activity get -1 in all of the columns above.
        result_data = result_data.fillna(-1)
        # Whether the user was active on the last day seen in the window.
        result_data['act_at_lastday'] = pd.Series(
            result_data['act_max'] == last_act_day).replace({True: 1, False: 0})
        # Re-express mean / max / min as "days before the last activity day".
        result_data['act_mean'] = last_act_day - result_data['act_mean']
        result_data['act_max'] = last_act_day - result_data['act_max']
        result_data['act_min'] = last_act_day - result_data['act_min']
        # Number of rows in which the user is also the author
        # (user_id == author_id).
        own_video_rows = feature_data[feature_data['user_id'] == feature_data['author_id']]
        result_data = _merge_user_agg(result_data, own_video_rows, 'author_id', 'count', 'act_self_count')
        result_data = result_data.fillna(0)
        # Variance / kurtosis / skewness of the activity days.
        result_data = _merge_user_agg(result_data, feature_data, 'act_day', np.var, 'act_var')
        result_data = _merge_user_agg(result_data, feature_data, 'act_day', pd.Series.kurt, 'act_kurt')
        result_data = _merge_user_agg(result_data, feature_data, 'act_day', pd.Series.skew, 'act_skew')
        result_data = result_data.fillna(0)
        # Counts and ratios per action_type value 0..5.
        result_data = _add_category_counts_and_ratios(result_data, feature_data, 'action_type', 'action', 6)
        # Counts and ratios per page value 0..4.
        result_data = _add_category_counts_and_ratios(result_data, feature_data, 'page', 'page', 5)
        del result_data['register_day']
        # # # # # # # #
        # Save the result
        result_file_name = 'activity_feature_' + str(feature_num) + '.csv'
        result_data.to_csv(features_path() + result_file_name, index=None)
        print(result_file_name + ' complete!')
'''
Run the model
'''
import xgboost as xgb
def get_feature(num, data_label=None):
    """Assemble the feature table for feature window *num*.

    Reads the four ``*_feature_<num>.csv`` files from features_path() and
    left-joins launch, activity and create features onto the registration
    features by ``user_id``.

    :param num: window identifier used in the file names and, when labels are
        requested, in the label column name ``label_<num>``.
    :param data_label: optional label frame with a ``user_id`` column and a
        ``label_<num>`` column; when given, that column is joined on as
        ``label``.
    :return: DataFrame with one row per row of the registration feature file.
    """
    file_suffix = str(num) + '.csv'
    register = pd.read_csv(features_path()+'register_feature_'+file_suffix)
    create = pd.read_csv(features_path()+'create_feature_'+file_suffix)
    launch = pd.read_csv(features_path()+'launch_feature_'+file_suffix)
    activity = pd.read_csv(features_path()+'activity_feature_'+file_suffix)
    feature = pd.merge(register, launch, on='user_id', how='left')
    feature = pd.merge(feature, activity, on='user_id', how='left')
    feature = pd.merge(feature, create, on='user_id', how='left')
    # Release the component frames before the label join.
    del register
    del create
    del launch
    del activity
    if data_label is not None:
        label_name = 'label_' + str(num)
        # The original first filtered data_label to the users in `feature`
        # and then overwrote that result; the left join below already
        # restricts the labels to those users, so the filter is dropped.
        labels = data_label.loc[:, ['user_id', label_name]].rename(columns={label_name: 'label'})
        feature = pd.merge(feature, labels, on='user_id', how='left')
    return feature
if __name__ == '__main__':
    # Load the label data
    data_label = pd.read_csv(basic_path()+'data_label.csv')
    # Load the feature data: window 20 is the test set, windows 0..10 (each
    # joined with its own label column) are stacked into the training set.
    test_x = get_feature('20')
    # pd.concat replaces the chained DataFrame.append calls; DataFrame.append
    # was removed in pandas 2.0.  The frames are stacked in the same order.
    train_x = pd.concat([get_feature(str(window), data_label) for window in range(11)])
    train_y = train_x['label']
    test_user = test_x['user_id']
    # user_id and label must not be fed to the model as features.
    del train_x['user_id']
    del test_x['user_id']
    del train_x['label']
    # XGBoost training
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x)
    params = {
        # 'objective': 'binary:logistic',
        'objective': 'rank:pairwise',
        'eta': 0.03,
        'max_depth': 5,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'min_child_weight': 16,
        'silent': 1,
    }
    # xgb.train takes the evaluation list through `evals`; it has no
    # `watchlist` keyword, so the original call raised a TypeError.
    bst = xgb.train(params, dtrain, 1500, evals=[(dtrain, 'train')])
    pre_label = bst.predict(dtest)
    # Write the result file: user_id, predicted score (no header, no index).
    pd.DataFrame(data={0:test_user, 1:pre_label}).to_csv('/home/kesci/xjy_model.txt', index=None, header=None)
# Model 2 -- yuwei's model
# -*- coding: utf-8 -*-
"""
Created on Mon May 28 12:20:47 2018
@author: yuwei
"""
import pandas as pd
import numpy as np
import xgboost as xgb
import gc
#%%
def loadData():
    """Read the four raw log tables.

    :return: tuple ``(app, user_act, user_reg, vedio)`` of DataFrames holding
        the app-launch log, the user-activity log, the registration log and
        the video-creation log, in that order.
    """
    def _read_log(path, column_names):
        # All four logs are header-less, tab-separated UTF-8 text files.
        return pd.read_table(path, names=column_names, encoding='utf-8', sep='\t')

    # Author's note: covers every user of the registration table.
    app = _read_log(r'/mnt/datasets/fusai/app_launch_log.txt',
                    ['user_id', 'day'])
    # Author's note: covers 43708 users of the registration table.
    user_act = _read_log(r'/mnt/datasets/fusai/user_activity_log.txt',
                         ['user_id', 'day', 'page', 'video_id', 'author_id', 'action_type'])
    # Author's note: the registration table has 51709 users in total.
    user_reg = _read_log(r'/mnt/datasets/fusai/user_register_log.txt',
                         ['user_id', 'register_day', 'register_type', 'device_type'])
    # Author's note: covers only 7606 users.
    vedio = _read_log(r'/mnt/datasets/fusai/video_create_log.txt',
                      ['user_id', 'day'])
    return app, user_act, user_reg, vedio
#%%
def makeLabel(app,user_act,user_reg,vedio):
"ๆๆ "
"ๆต่ฏ้"
#10-30ๅท ๆชๆฅไธๅคฉ31-71
'''
te_app = app[app.day>=10];te_user_act = user_act[user_act.day>=10]
te_user_reg = user_reg[user_reg.register_day>=10];te_vedio = vedio[vedio.day>=10]
te1 = te_app[['user_id']].drop_duplicates()
te2 = te_user_act[['user_id']].drop_duplicates()
te3 = te_user_reg[['user_id']].drop_duplicates()
te4 = te_vedio[['user_id']].drop_duplicates()
te = pd.concat([te1,te2,te3,te4],axis=0)
test = te[['user_id']].drop_duplicates()
'''
test = user_reg[['user_id']].drop_duplicates()
"้ช่ฏ้-1"
#3-23ๅท
tr_app = app[(app.day>=1)&(app.day<=23)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=23)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=23)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=23)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train1 = tr[['user_id']].drop_duplicates()
#ๆชๆฅ7ๅคฉ๏ผ24-30ๅท
tr_app = app[(app.day>=24)&(app.day<=30)];tr_user_act = user_act[(user_act.day>=24)&(user_act.day<=30)]
tr_user_reg = user_reg[(user_reg.register_day>=24)&(user_reg.register_day<=30)];tr_vedio = vedio[(vedio.day>=24)&(vedio.day<=30)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train1_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train1_true['label'] = 1
train1 = pd.merge(train1,train1_true,on='user_id',how='left')
train1= train1.fillna(0)
del train1_true;gc.collect();
"่ฎญ็ป้-2"
tr_app = app[(app.day>=1)&(app.day<=22)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=22)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=22)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=22)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train2 = tr[['user_id']].drop_duplicates()
#ๆชๆฅ7ๅคฉ๏ผ23-29ๅท
tr_app = app[(app.day>=23)&(app.day<=29)];tr_user_act = user_act[(user_act.day>=23)&(user_act.day<=29)]
tr_user_reg = user_reg[(user_reg.register_day>=23)&(user_reg.register_day<=29)];tr_vedio = vedio[(vedio.day>=23)&(vedio.day<=29)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train2_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train2_true['label'] = 1
train2 = pd.merge(train2,train2_true,on='user_id',how='left')
train2= train2.fillna(0)
del train2_true;gc.collect();
"่ฎญ็ป้-3"
tr_app = app[(app.day>=1)&(app.day<=21)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=21)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=21)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=21)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train3 = tr[['user_id']].drop_duplicates()
#ๆชๆฅ7ๅคฉ๏ผ22-28ๅท
tr_app = app[(app.day>=22)&(app.day<=28)];tr_user_act = user_act[(user_act.day>=22)&(user_act.day<=28)]
tr_user_reg = user_reg[(user_reg.register_day>=22)&(user_reg.register_day<=28)];tr_vedio = vedio[(vedio.day>=22)&(vedio.day<=28)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train3_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train3_true['label'] = 1
train3 = pd.merge(train3,train3_true,on='user_id',how='left')
train3= train3.fillna(0)
del train3_true;gc.collect();
"่ฎญ็ป้-4"
tr_app = app[(app.day>=1)&(app.day<=20)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=20)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=20)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=20)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train4 = tr[['user_id']].drop_duplicates()
#ๆชๆฅ7ๅคฉ๏ผ21-27ๅท
tr_app = app[(app.day>=21)&(app.day<=27)];tr_user_act = user_act[(user_act.day>=21)&(user_act.day<=27)]
tr_user_reg = user_reg[(user_reg.register_day>=21)&(user_reg.register_day<=27)];tr_vedio = vedio[(vedio.day>=21)&(vedio.day<=27)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train4_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train4_true['label'] = 1
train4 = pd.merge(train4,train4_true,on='user_id',how='left')
train4= train4.fillna(0)
del train4_true;gc.collect();
"่ฎญ็ป้-5"
tr_app = app[(app.day>=1)&(app.day<=19)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=19)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=19)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=19)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train5 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 20-26
tr_app = app[(app.day>=20)&(app.day<=26)];tr_user_act = user_act[(user_act.day>=20)&(user_act.day<=26)]
tr_user_reg = user_reg[(user_reg.register_day>=20)&(user_reg.register_day<=26)];tr_vedio = vedio[(vedio.day>=20)&(vedio.day<=26)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train5_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train5_true['label'] = 1
train5 = pd.merge(train5,train5_true,on='user_id',how='left')
train5 = train5.fillna(0)
del train5_true;gc.collect();
"่ฎญ็ป้-6"
tr_app = app[(app.day>=1)&(app.day<=18)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=18)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=18)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=18)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train6 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 19-25
tr_app = app[(app.day>=19)&(app.day<=25)];tr_user_act = user_act[(user_act.day>=19)&(user_act.day<=25)]
tr_user_reg = user_reg[(user_reg.register_day>=19)&(user_reg.register_day<=25)];tr_vedio = vedio[(vedio.day>=19)&(vedio.day<=25)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train6_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train6_true['label'] = 1
train6 = pd.merge(train6,train6_true,on='user_id',how='left')
train6 = train6.fillna(0)
del train6_true;gc.collect();
"่ฎญ็ป้-7"
tr_app = app[(app.day>=1)&(app.day<=17)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=17)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=17)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=17)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train7 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 18-24
tr_app = app[(app.day>=18)&(app.day<=24)];tr_user_act = user_act[(user_act.day>=18)&(user_act.day<=24)]
tr_user_reg = user_reg[(user_reg.register_day>=18)&(user_reg.register_day<=24)];tr_vedio = vedio[(vedio.day>=18)&(vedio.day<=24)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train7_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train7_true['label'] = 1
train7 = pd.merge(train7,train7_true,on='user_id',how='left')
train7 = train7.fillna(0)
del train7_true;gc.collect();
"่ฎญ็ป้-8"
tr_app = app[(app.day>=1)&(app.day<=16)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=16)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=16)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=16)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train8 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 17-23
tr_app = app[(app.day>=17)&(app.day<=23)];tr_user_act = user_act[(user_act.day>=17)&(user_act.day<=23)]
tr_user_reg = user_reg[(user_reg.register_day>=17)&(user_reg.register_day<=23)];tr_vedio = vedio[(vedio.day>=17)&(vedio.day<=23)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train8_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train8_true['label'] = 1
train8 = pd.merge(train8,train8_true,on='user_id',how='left')
train8 = train8.fillna(0)
del train8_true;gc.collect();
"่ฎญ็ป้-9"
tr_app = app[(app.day>=1)&(app.day<=15)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=15)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=15)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=15)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train9 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 16-22
tr_app = app[(app.day>=16)&(app.day<=22)];tr_user_act = user_act[(user_act.day>=16)&(user_act.day<=22)]
tr_user_reg = user_reg[(user_reg.register_day>=16)&(user_reg.register_day<=22)];tr_vedio = vedio[(vedio.day>=16)&(vedio.day<=22)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train9_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train9_true['label'] = 1
train9 = pd.merge(train9,train9_true,on='user_id',how='left')
train9 = train9.fillna(0)
del train9_true;gc.collect();
"่ฎญ็ป้-10"
tr_app = app[(app.day>=1)&(app.day<=14)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=14)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=14)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=14)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train10 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 15-21
tr_app = app[(app.day>=15)&(app.day<=21)];tr_user_act = user_act[(user_act.day>=15)&(user_act.day<=21)]
tr_user_reg = user_reg[(user_reg.register_day>=15)&(user_reg.register_day<=21)];tr_vedio = vedio[(vedio.day>=15)&(vedio.day<=21)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train10_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train10_true['label'] = 1
train10 = pd.merge(train10,train10_true,on='user_id',how='left')
train10 = train10.fillna(0)
del train10_true;gc.collect();
"่ฎญ็ป้-11"
tr_app = app[(app.day>=1)&(app.day<=13)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=13)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=13)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=13)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train11 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 14-20
tr_app = app[(app.day>=14)&(app.day<=20)];tr_user_act = user_act[(user_act.day>=14)&(user_act.day<=20)]
tr_user_reg = user_reg[(user_reg.register_day>=14)&(user_reg.register_day<=20)];tr_vedio = vedio[(vedio.day>=14)&(vedio.day<=20)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train11_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train11_true['label'] = 1
train11 = pd.merge(train11,train11_true,on='user_id',how='left')
train11 = train11.fillna(0)
del train11_true;gc.collect();
"่ฎญ็ป้-12"
tr_app = app[(app.day>=1)&(app.day<=12)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=12)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=12)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=12)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train12 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 13-19
tr_app = app[(app.day>=13)&(app.day<=19)];tr_user_act = user_act[(user_act.day>=13)&(user_act.day<=19)]
tr_user_reg = user_reg[(user_reg.register_day>=13)&(user_reg.register_day<=19)];tr_vedio = vedio[(vedio.day>=13)&(vedio.day<=19)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train12_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train12_true['label'] = 1
train12 = pd.merge(train12,train12_true,on='user_id',how='left')
train12 = train12.fillna(0)
del train12_true;gc.collect();
"่ฎญ็ป้-13"
tr_app = app[(app.day>=1)&(app.day<=11)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=11)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=11)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=11)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train13 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 12-18
tr_app = app[(app.day>=12)&(app.day<=18)];tr_user_act = user_act[(user_act.day>=12)&(user_act.day<=18)]
tr_user_reg = user_reg[(user_reg.register_day>=12)&(user_reg.register_day<=18)];tr_vedio = vedio[(vedio.day>=12)&(vedio.day<=18)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train13_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train13_true['label'] = 1
train13 = pd.merge(train13,train13_true,on='user_id',how='left')
train13 = train13.fillna(0)
del train13_true;gc.collect();
"่ฎญ็ป้-14"
tr_app = app[(app.day>=1)&(app.day<=10)];tr_user_act = user_act[(user_act.day>=1)&(user_act.day<=10)]
tr_user_reg = user_reg[(user_reg.register_day>=1)&(user_reg.register_day<=10)];tr_vedio = vedio[(vedio.day>=1)&(vedio.day<=10)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train14 = tr[['user_id']].drop_duplicates()
# Label window (next 7 days): days 11-17
tr_app = app[(app.day>=11)&(app.day<=17)];tr_user_act = user_act[(user_act.day>=11)&(user_act.day<=17)]
tr_user_reg = user_reg[(user_reg.register_day>=11)&(user_reg.register_day<=17)];tr_vedio = vedio[(vedio.day>=11)&(vedio.day<=17)]
tr1 = tr_app[['user_id']].drop_duplicates()
tr2 = tr_user_act[['user_id']].drop_duplicates()
tr3 = tr_user_reg[['user_id']].drop_duplicates()
tr4 = tr_vedio[['user_id']].drop_duplicates()
tr = pd.concat([tr1,tr2,tr3,tr4],axis=0)
train14_true = tr[['user_id']].drop_duplicates()
#ๅทฆ่ฟๆฅๆๆ
train14_true['label'] = 1
train14 = pd.merge(train14,train14_true,on='user_id',how='left')
train14 = train14.fillna(0)
del train14_true;gc.collect();
del tr1;gc.collect();
del tr2;gc.collect();
del tr3;gc.collect();
del tr4;gc.collect();
del tr;gc.collect();
del tr_app;gc.collect();
del tr_user_reg;gc.collect();
del tr_vedio;gc.collect();
return test,train1,train2,train3,train4,train5,train6,train7,train8,train9,train10,train11,train12,train13,train14
#%%
def _continuous_run_scores(s, day_min):
    """Score every run of consecutive days found in a ':'-joined day string.

    Each distinct day in ``s`` gets the weight ``day - f_start + 1`` with
    ``f_start = day_min - 21``, so later days weigh more.  A "run" is a
    maximal sequence of at least two calendar-consecutive days; its score is
    the sum of the weights of all days in the run.  Isolated days are ignored.

    Args:
        s: Day numbers joined with ':' (duplicates and any order allowed),
            e.g. ``"3:1:2:2"``.
        day_min: First day of the label window; the weighting origin is
            ``day_min - 21``.

    Returns:
        A NumPy array with one score per run, or ``[0]`` when ``s`` contains
        no run of two or more consecutive days.
    """
    f_start = day_min - 21
    days = sorted({int(x) for x in s.split(':')})

    scores = []
    run_score = 0      # weighted sum of the run currently being scanned
    run_length = 0     # number of days in that run
    previous_day = None
    for day in days:
        if previous_day is not None and day - previous_day != 1:
            # Gap: the run ended on previous_day.  The previous version
            # closed the run with the weight of the day *after* the gap
            # instead of the run's own last day, and detected runs via
            # "count != 0" rather than by their length.
            if run_length > 1:
                scores.append(run_score)
            run_score = 0
            run_length = 0
        run_score += day - f_start + 1
        run_length += 1
        previous_day = day
    if run_length > 1:
        scores.append(run_score)

    if not scores:
        # Keep the reducers below well-defined when there is no run at all.
        scores.append(0)
    return np.array(scores)


def culContinuousMeanLaunchDay(s,day_min):
    """Return the mean weighted score over all consecutive-day runs in ``s``.

    See ``_continuous_run_scores`` for how runs are found and scored.
    Returns 0.0 when ``s`` has no run of two or more consecutive days.
    """
    return np.mean(_continuous_run_scores(s, day_min))
#%%
def culContinuousMaxLaunchDay(s,day_min):
    """Return the largest weighted score over all consecutive-day runs in ``s``.

    See ``_continuous_run_scores`` for how runs are found and scored.
    Returns 0 when ``s`` has no run of two or more consecutive days.
    """
    return np.max(_continuous_run_scores(s, day_min))
#%%
def culContinuousMinLaunchDay(s,day_min):
    """Return the smallest weighted score over all consecutive-day runs in ``s``.

    See ``_continuous_run_scores`` for how runs are found and scored.
    Returns 0 when ``s`` has no run of two or more consecutive days.
    """
    return np.min(_continuous_run_scores(s, day_min))
#%%
def genFeature(day_min,day_max,data,app,user_act,user_reg,vedio):
"็นๅพๆๅ"
#ไฟๅญๅๅง่กจ
ans = data.copy()
#่ฎก็ฎๆๅคงๆๅฐๅนณๅไฝฟ็จๆถ้ด
app = app[(app.day>=day_min-21)&(app.day<=day_max-7)]
vedio = vedio[(vedio.day>=day_min-21)&(vedio.day<=day_max-7)]
user_act = user_act[(user_act.day>=day_min-21)&(user_act.day<=day_max-7)]
a = app.copy()
a['s'] = a.day
a['s'] = a.s.astype('str')
a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()
print(1)
a['s_mean_day'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))
print(2)
a['s_max_day'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))
print(3)
a = a[['user_id','s_mean_day','s_max_day']]
ans = pd.merge(ans,a,on='user_id',how='left')
del a;gc.collect();
a = vedio.copy()
a['s'] = a.day
a['s'] = a.s.astype('str')
a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()
print(1)
a['s_mean_day_1'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))
print(2)
a['s_max_day_1'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))
print(3)
a = a[['user_id','s_mean_day_1','s_max_day_1']]
ans = pd.merge(ans,a,on='user_id',how='left')
del a;gc.collect();
a = user_act.copy()
a['s'] = a.day
a['s'] = a.s.astype('str')
a = a.groupby(['user_id'])['s'].agg(lambda x:':'.join(x)).reset_index()
print(1)
a['s_mean_day_2'] = a.s.apply(lambda x:culContinuousMeanLaunchDay(x,day_min))
print(2)
a['s_max_day_2'] = a.s.apply(lambda x:culContinuousMaxLaunchDay(x,day_min))
print(3)
a = a[['user_id','s_mean_day_2','s_max_day_2']]
ans = pd.merge(ans,a,on='user_id',how='left')
del a;gc.collect();
#%%
"ๆๅ ๆณจๅไฟกๆฏ่กจ ็นๅพ"
ans = pd.merge(ans,user_reg,on='user_id',how='left')
#ๆณจๅๆฅๆ่ท็ฆปๆๅฐ/ๆๅคงๆฅๆ่ท็ฆป
ans['register_sub_min'] = day_min - ans['register_day']
ans['register_sub_max'] = day_max - ans['register_day']
#็ฆปๆฃๆณจๅ็ฑปๅ
register_type_df = pd.get_dummies(ans['register_type'],prefix = 'register_type')
ans = pd.concat([ans,register_type_df],axis=1)
del ans['register_type']
del register_type_df;gc.collect();
#%%
"ๆทปๅ ๆถ้ดๅทฎ"
ans['day_median_sub'] = day_min - ans['register_day']
#%%
"ๆๅ app ็นๅพ"
app = app[(app.day>=day_min-21)&(app.day<=day_max-7)]
#็ป่ฎก็จๆทๅจๅ21ๅคฉๅฏ็จappๅคๅฐๆฌก
app['app_count'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['app_count'] = ans['app_count']/ans['day_median_sub']
#็ป่ฎก็จๆทๆ่ฟไธๆฌกไฝฟ็จapp่ท็ฆปๆๅฐ/ๆๅคงๆฅๆ่ท็ฆป
#ๆๅฐไฝฟ็จappๆถ้ด
app['app_day_min'] = app['day']
feat = pd.pivot_table(app,index=['user_id'],values='app_day_min',aggfunc='min').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#ๆๅคงไฝฟ็จappๆถ้ด
app['app_day_max'] = app['day']
feat = pd.pivot_table(app,index=['user_id'],values='app_day_max',aggfunc='max').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['appmin_sub_day_min'] = day_min - ans['app_day_min']
ans['appmax_sub_day_min'] = day_min - ans['app_day_max']
ans['appmin_sub_day_max'] = day_max - ans['app_day_min']
ans['appmax_sub_day_max'] = day_max - ans['app_day_max']
del ans['app_day_min'];del ans['app_day_max']
"ๅๅ็ฒๅบฆ"
#็ป่ฎก็จๆทๅจๅ14ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-14)&(app.day<=day_max-7)]
app['app_count_14'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_14',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['app_count_14'] = ans['app_count_14']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ10ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-10)&(app.day<=day_max-7)]
app['app_count_10'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_10',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['app_count_10'] = ans['app_count_10']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ7ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-7)&(app.day<=day_max-7)]
app['app_count_7'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_7',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['app_count_7'] = ans['app_count_7']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ5ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-5)&(app.day<=day_max-7)]
app['app_count_5'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_5',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ4ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-4)&(app.day<=day_max-7)]
app['app_count_4'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_4',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ3ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-3)&(app.day<=day_max-7)]
app['app_count_3'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_3',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ2ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-2)&(app.day<=day_max-7)]
app['app_count_2'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_2',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ1ๅคฉๅฏ็จappๅคๅฐๆฌก
app = app[(app.day>=day_min-1)&(app.day<=day_max-7)]
app['app_count_1'] = app['user_id']
feat = pd.pivot_table(app,index=['user_id'],values='app_count_1',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#%%
"ๆๅ vedio ่กจ็นๅพ"
vedio = vedio[(vedio.day>=day_min-21)&(vedio.day<=day_max-7)]
#็ป่ฎก็จๆทๅจๅ21ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio['vedio_count'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['vedio_count'] = ans['vedio_count']/ans['day_median_sub']
#็ป่ฎก็จๆทๆ่ฟไธๆฌกๆๆ่ง้ข่ท็ฆปๆๅฐ/ๆๅคงๆฅๆ่ท็ฆป
#ๆๅฐไฝฟ็จvedioๆถ้ด
vedio['vedio_day_min'] = vedio['day']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_day_min',aggfunc='min').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#ๆๅคงไฝฟ็จvedioๆถ้ด
vedio['vedio_day_max'] = vedio['day']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_day_max',aggfunc='max').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['vediomin_sub_day_min'] = day_min - ans['vedio_day_min']
ans['vediomax_sub_day_min'] = day_min - ans['vedio_day_max']
ans['vediomin_sub_day_max'] = day_max - ans['vedio_day_min']
ans['vediomax_sub_day_max'] = day_max - ans['vedio_day_max']
del ans['vedio_day_min'];del ans['vedio_day_max']
"ๅๅ็ฒๅบฆ"
#็ป่ฎก็จๆทๅจๅ14ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-14)&(vedio.day<=day_max-7)]
vedio['vedio_count_14'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_14',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['vedio_count_14'] = ans['vedio_count_14']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ10ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-10)&(vedio.day<=day_max-7)]
vedio['vedio_count_10'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_10',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['vedio_count_10'] = ans['vedio_count_10']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ7ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-7)&(vedio.day<=day_max-7)]
vedio['vedio_count_7'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_7',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['vedio_count_7'] = ans['vedio_count_7']/ans['day_median_sub']
#็ป่ฎก็จๆทๅจๅ5ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-5)&(vedio.day<=day_max-7)]
vedio['vedio_count_5'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_5',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ4ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-4)&(vedio.day<=day_max-7)]
vedio['vedio_count_4'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_4',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ3ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-3)&(vedio.day<=day_max-7)]
vedio['vedio_count_3'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_3',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ2ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-2)&(vedio.day<=day_max-7)]
vedio['vedio_count_2'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_2',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#็ป่ฎก็จๆทๅจๅ1ๅคฉๆๆ่ง้ขๅคๅฐๆฌก
vedio = vedio[(vedio.day>=day_min-1)&(vedio.day<=day_max-7)]
vedio['vedio_count_1'] = vedio['user_id']
feat = pd.pivot_table(vedio,index=['user_id'],values='vedio_count_1',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
#%%
"ๆๅ activity ่กจ็นๅพ"
user_act = user_act[(user_act.day>=day_min-21)&(user_act.day<=day_max-7)]
# Count the user's total activity records over the previous 21 days
user_act['user_act_count'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_count'] = ans['user_act_count']/ans['day_median_sub']
#%%
"ๅๅ็ฒๅบฆ"
#%% ๅ18ๅคฉ
'''
user_act = user_act[(user_act.day>=day_min-18)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ18ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_18'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_18',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_count_18'] = ans['user_act_count_18']/ans['day_median_sub']
'''
#%% ๅ14ๅคฉ
user_act = user_act[(user_act.day>=day_min-14)&(user_act.day<=day_max-7)]
# Count the user's total activity records over the previous 14 days
user_act['user_act_count_14'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_14',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_count_14'] = ans['user_act_count_14']/ans['day_median_sub']
"ๅฏนpage็ฑปๅซ็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_14 = user_act[user_act.page==0]
page_0_14['page_0_14_count'] = page_0_14['user_id']
feat = pd.pivot_table(page_0_14,index=['user_id'],values='page_0_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_0_14_count'] = ans['page_0_14_count']/ans['day_median_sub']
del page_0_14;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_14 = user_act[user_act.page==1]
page_1_14['page_1_14_count'] = page_1_14['user_id']
feat = pd.pivot_table(page_1_14,index=['user_id'],values='page_1_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_1_14_count'] = ans['page_1_14_count']/ans['day_median_sub']
del page_1_14;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_14 = user_act[user_act.page==2]
page_2_14['page_2_14_count'] = page_2_14['user_id']
feat = pd.pivot_table(page_2_14,index=['user_id'],values='page_2_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_2_14_count'] = ans['page_2_14_count']/ans['day_median_sub']
del page_2_14;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_14 = user_act[user_act.page==3]
page_3_14['page_3_14_count'] = page_3_14['user_id']
feat = pd.pivot_table(page_3_14,index=['user_id'],values='page_3_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_3_14_count'] = ans['page_3_14_count']/ans['day_median_sub']
del page_3_14;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_14 = user_act[user_act.page==4]
page_4_14['page_4_14_count'] = page_4_14['user_id']
feat = pd.pivot_table(page_4_14,index=['user_id'],values='page_4_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_4_14_count'] = ans['page_4_14_count']/ans['day_median_sub']
del page_4_14;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_14 = user_act[user_act.action_type==0]
user_act_0_14['user_act_0_14_count'] = user_act_0_14['user_id']
feat = pd.pivot_table(user_act_0_14,index=['user_id'],values='user_act_0_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_0_14_count'] = ans['user_act_0_14_count']/ans['day_median_sub']
del user_act_0_14;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_14 = user_act[user_act.action_type==1]
user_act_1_14['user_act_1_14_count'] = user_act_1_14['user_id']
feat = pd.pivot_table(user_act_1_14,index=['user_id'],values='user_act_1_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_1_14_count'] = ans['user_act_1_14_count']/ans['day_median_sub']
del user_act_1_14;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_14 = user_act[user_act.action_type==2]
user_act_2_14['user_act_2_14_count'] = user_act_2_14['user_id']
feat = pd.pivot_table(user_act_2_14,index=['user_id'],values='user_act_2_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_2_14_count'] = ans['user_act_2_14_count']/ans['day_median_sub']
del user_act_2_14;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_14 = user_act[user_act.action_type==3]
user_act_3_14['user_act_3_14_count'] = user_act_3_14['user_id']
feat = pd.pivot_table(user_act_3_14,index=['user_id'],values='user_act_3_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_3_14_count'] = ans['user_act_3_14_count']/ans['day_median_sub']
del user_act_3_14;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_14 = user_act[user_act.action_type==4]
user_act_4_14['user_act_4_14_count'] = user_act_4_14['user_id']
feat = pd.pivot_table(user_act_4_14,index=['user_id'],values='user_act_4_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_4_14_count'] = ans['user_act_4_14_count']/ans['day_median_sub']
del user_act_4_14;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_14 = user_act[user_act.action_type==5]
user_act_5_14['user_act_5_14_count'] = user_act_5_14['user_id']
feat = pd.pivot_table(user_act_5_14,index=['user_id'],values='user_act_5_14_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_5_14_count'] = ans['user_act_5_14_count']/ans['day_median_sub']
del user_act_5_14;gc.collect();
#%% ๅ10ๅคฉ
user_act = user_act[(user_act.day>=day_min-10)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ10ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_10'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_10',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_count_10'] = ans['user_act_count_10']/ans['day_median_sub']
"ๅฏนpage็ฑปๅซ็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_10 = user_act[user_act.page==0]
page_0_10['page_0_10_count'] = page_0_10['user_id']
feat = pd.pivot_table(page_0_10,index=['user_id'],values='page_0_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_0_10_count'] = ans['page_0_10_count']/ans['day_median_sub']
del page_0_10;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_10 = user_act[user_act.page==1]
page_1_10['page_1_10_count'] = page_1_10['user_id']
feat = pd.pivot_table(page_1_10,index=['user_id'],values='page_1_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_1_10_count'] = ans['page_1_10_count']/ans['day_median_sub']
del page_1_10;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_10 = user_act[user_act.page==2]
page_2_10['page_2_10_count'] = page_2_10['user_id']
feat = pd.pivot_table(page_2_10,index=['user_id'],values='page_2_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_2_10_count'] = ans['page_2_10_count']/ans['day_median_sub']
del page_2_10;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_10 = user_act[user_act.page==3]
page_3_10['page_3_10_count'] = page_3_10['user_id']
feat = pd.pivot_table(page_3_10,index=['user_id'],values='page_3_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_3_10_count'] = ans['page_3_10_count']/ans['day_median_sub']
del page_3_10;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_10 = user_act[user_act.page==4]
page_4_10['page_4_10_count'] = page_4_10['user_id']
feat = pd.pivot_table(page_4_10,index=['user_id'],values='page_4_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_4_10_count'] = ans['page_4_10_count']/ans['day_median_sub']
del page_4_10;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_10 = user_act[user_act.action_type==0]
user_act_0_10['user_act_0_10_count'] = user_act_0_10['user_id']
feat = pd.pivot_table(user_act_0_10,index=['user_id'],values='user_act_0_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_0_10_count'] = ans['user_act_0_10_count']/ans['day_median_sub']
del user_act_0_10;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_10 = user_act[user_act.action_type==1]
user_act_1_10['user_act_1_10_count'] = user_act_1_10['user_id']
feat = pd.pivot_table(user_act_1_10,index=['user_id'],values='user_act_1_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_1_10_count'] = ans['user_act_1_10_count']/ans['day_median_sub']
del user_act_1_10 ;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_10 = user_act[user_act.action_type==2]
user_act_2_10['user_act_2_10_count'] = user_act_2_10['user_id']
feat = pd.pivot_table(user_act_2_10,index=['user_id'],values='user_act_2_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_2_10_count'] = ans['user_act_2_10_count']/ans['day_median_sub']
del user_act_2_10 ;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_10 = user_act[user_act.action_type==3]
user_act_3_10['user_act_3_10_count'] = user_act_3_10['user_id']
feat = pd.pivot_table(user_act_3_10,index=['user_id'],values='user_act_3_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_3_10_count'] = ans['user_act_3_10_count']/ans['day_median_sub']
del user_act_3_10 ;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_10 = user_act[user_act.action_type==4]
user_act_4_10['user_act_4_10_count'] = user_act_4_10['user_id']
feat = pd.pivot_table(user_act_4_10,index=['user_id'],values='user_act_4_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_4_10_count'] = ans['user_act_4_10_count']/ans['day_median_sub']
del user_act_4_10 ;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_10 = user_act[user_act.action_type==5]
user_act_5_10['user_act_5_10_count'] = user_act_5_10['user_id']
feat = pd.pivot_table(user_act_5_10,index=['user_id'],values='user_act_5_10_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_5_10_count'] = ans['user_act_5_10_count']/ans['day_median_sub']
del user_act_5_10;gc.collect();
#%% ๅ7ๅคฉ
user_act = user_act[(user_act.day>=day_min-7)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ7ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_7'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_7',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_count_7'] = ans['user_act_count_7']/ans['day_median_sub']
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_7 = user_act[user_act.page==0]
page_0_7['page_0_7_count'] = page_0_7['user_id']
feat = pd.pivot_table(page_0_7,index=['user_id'],values='page_0_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_0_7_count'] = ans['page_0_7_count']/ans['day_median_sub']
del page_0_7;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_7 = user_act[user_act.page==1]
page_1_7['page_1_7_count'] = page_1_7['user_id']
feat = pd.pivot_table(page_1_7,index=['user_id'],values='page_1_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_1_7_count'] = ans['page_1_7_count']/ans['day_median_sub']
del page_1_7;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_7 = user_act[user_act.page==2]
page_2_7['page_2_7_count'] = page_2_7['user_id']
feat = pd.pivot_table(page_2_7,index=['user_id'],values='page_2_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_2_7_count'] = ans['page_2_7_count']/ans['day_median_sub']
del page_2_7;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_7 = user_act[user_act.page==3]
page_3_7['page_3_7_count'] = page_3_7['user_id']
feat = pd.pivot_table(page_3_7,index=['user_id'],values='page_3_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_3_7_count'] = ans['page_3_7_count']/ans['day_median_sub']
del page_3_7;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_7 = user_act[user_act.page==4]
page_4_7['page_4_7_count'] = page_4_7['user_id']
feat = pd.pivot_table(page_4_7,index=['user_id'],values='page_4_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['page_4_7_count'] = ans['page_4_7_count']/ans['day_median_sub']
del page_4_7;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_7 = user_act[user_act.action_type==0]
user_act_0_7['user_act_0_7_count'] = user_act_0_7['user_id']
feat = pd.pivot_table(user_act_0_7,index=['user_id'],values='user_act_0_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_0_7_count'] = ans['user_act_0_7_count']/ans['day_median_sub']
del user_act_0_7;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_7 = user_act[user_act.action_type==1]
user_act_1_7['user_act_1_7_count'] = user_act_1_7['user_id']
feat = pd.pivot_table(user_act_1_7,index=['user_id'],values='user_act_1_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_1_7_count'] = ans['user_act_1_7_count']/ans['day_median_sub']
del user_act_1_7;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_7 = user_act[user_act.action_type==2]
user_act_2_7['user_act_2_7_count'] = user_act_2_7['user_id']
feat = pd.pivot_table(user_act_2_7,index=['user_id'],values='user_act_2_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_2_7_count'] = ans['user_act_2_7_count']/ans['day_median_sub']
del user_act_2_7;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_7 = user_act[user_act.action_type==3]
user_act_3_7['user_act_3_7_count'] = user_act_3_7['user_id']
feat = pd.pivot_table(user_act_3_7,index=['user_id'],values='user_act_3_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_3_7_count'] = ans['user_act_3_7_count']/ans['day_median_sub']
del user_act_3_7;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_7 = user_act[user_act.action_type==4]
user_act_4_7['user_act_4_7_count'] = user_act_4_7['user_id']
feat = pd.pivot_table(user_act_4_7,index=['user_id'],values='user_act_4_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_4_7_count'] = ans['user_act_4_7_count']/ans['day_median_sub']
del user_act_4_7;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_7 = user_act[user_act.action_type==5]
user_act_5_7['user_act_5_7_count'] = user_act_5_7['user_id']
feat = pd.pivot_table(user_act_5_7,index=['user_id'],values='user_act_5_7_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
ans['user_act_5_7_count'] = ans['user_act_5_7_count']/ans['day_median_sub']
del user_act_5_7;gc.collect();
#%% ๅ5ๅคฉ
user_act = user_act[(user_act.day>=day_min-5)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ5ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_5'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_5',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_5 = user_act[user_act.page==0]
page_0_5['page_0_5_count'] = page_0_5['user_id']
feat = pd.pivot_table(page_0_5,index=['user_id'],values='page_0_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_0_5;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_5 = user_act[user_act.page==1]
page_1_5['page_1_5_count'] = page_1_5['user_id']
feat = pd.pivot_table(page_1_5,index=['user_id'],values='page_1_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_1_5;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_5 = user_act[user_act.page==2]
page_2_5['page_2_5_count'] = page_2_5['user_id']
feat = pd.pivot_table(page_2_5,index=['user_id'],values='page_2_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_2_5;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_5 = user_act[user_act.page==3]
page_3_5['page_3_5_count'] = page_3_5['user_id']
feat = pd.pivot_table(page_3_5,index=['user_id'],values='page_3_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_3_5;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_5 = user_act[user_act.page==4]
page_4_5['page_4_5_count'] = page_4_5['user_id']
feat = pd.pivot_table(page_4_5,index=['user_id'],values='page_4_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_4_5;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_5 = user_act[user_act.action_type==0]
user_act_0_5['user_act_0_5_count'] = user_act_0_5['user_id']
feat = pd.pivot_table(user_act_0_5,index=['user_id'],values='user_act_0_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_0_5;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_5 = user_act[user_act.action_type==1]
user_act_1_5['user_act_1_5_count'] = user_act_1_5['user_id']
feat = pd.pivot_table(user_act_1_5,index=['user_id'],values='user_act_1_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_1_5 ;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_5 = user_act[user_act.action_type==2]
user_act_2_5['user_act_2_5_count'] = user_act_2_5['user_id']
feat = pd.pivot_table(user_act_2_5,index=['user_id'],values='user_act_2_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_2_5 ;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_5 = user_act[user_act.action_type==3]
user_act_3_5['user_act_3_5_count'] = user_act_3_5['user_id']
feat = pd.pivot_table(user_act_3_5,index=['user_id'],values='user_act_3_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_3_5 ;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_5 = user_act[user_act.action_type==4]
user_act_4_5['user_act_4_5_count'] = user_act_4_5['user_id']
feat = pd.pivot_table(user_act_4_5,index=['user_id'],values='user_act_4_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_4_5 ;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_5 = user_act[user_act.action_type==5]
user_act_5_5['user_act_5_5_count'] = user_act_5_5['user_id']
feat = pd.pivot_table(user_act_5_5,index=['user_id'],values='user_act_5_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_5_5;gc.collect();
#%% ๅ4ๅคฉ
user_act = user_act[(user_act.day>=day_min-4)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ4ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_4'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_4',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_4 = user_act[user_act.page==0]
page_0_4['page_0_4_count'] = page_0_4['user_id']
feat = pd.pivot_table(page_0_4,index=['user_id'],values='page_0_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_0_4;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_4 = user_act[user_act.page==1]
page_1_4['page_1_4_count'] = page_1_4['user_id']
feat = pd.pivot_table(page_1_4,index=['user_id'],values='page_1_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_1_4;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_4 = user_act[user_act.page==2]
page_2_4['page_2_4_count'] = page_2_4['user_id']
feat = pd.pivot_table(page_2_4,index=['user_id'],values='page_2_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_2_4;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_4 = user_act[user_act.page==3]
page_3_4['page_3_4_count'] = page_3_4['user_id']
feat = pd.pivot_table(page_3_4,index=['user_id'],values='page_3_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_3_4;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_4 = user_act[user_act.page==4]
page_4_4['page_4_4_count'] = page_4_4['user_id']
feat = pd.pivot_table(page_4_4,index=['user_id'],values='page_4_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_4_4;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_4 = user_act[user_act.action_type==0]
user_act_0_4['user_act_0_4_count'] = user_act_0_4['user_id']
feat = pd.pivot_table(user_act_0_4,index=['user_id'],values='user_act_0_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_0_4;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_4 = user_act[user_act.action_type==1]
user_act_1_4['user_act_1_5_count'] = user_act_1_4['user_id']
feat = pd.pivot_table(user_act_1_4,index=['user_id'],values='user_act_1_5_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_1_4 ;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_4 = user_act[user_act.action_type==2]
user_act_2_4['user_act_2_4_count'] = user_act_2_4['user_id']
feat = pd.pivot_table(user_act_2_4,index=['user_id'],values='user_act_2_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_2_4 ;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_4 = user_act[user_act.action_type==3]
user_act_3_4['user_act_3_4_count'] = user_act_3_4['user_id']
feat = pd.pivot_table(user_act_3_4,index=['user_id'],values='user_act_3_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_3_4 ;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_4 = user_act[user_act.action_type==4]
user_act_4_4['user_act_4_4_count'] = user_act_4_4['user_id']
feat = pd.pivot_table(user_act_4_4,index=['user_id'],values='user_act_4_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_4_4;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_4 = user_act[user_act.action_type==5]
user_act_5_4['user_act_5_4_count'] = user_act_5_4['user_id']
feat = pd.pivot_table(user_act_5_4,index=['user_id'],values='user_act_5_4_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_5_4 ;gc.collect();
#%% ๅ3ๅคฉ
user_act = user_act[(user_act.day>=day_min-3)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ3ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_3'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_3',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_3 = user_act[user_act.page==0]
page_0_3['page_0_3_count'] = page_0_3['user_id']
feat = pd.pivot_table(page_0_3,index=['user_id'],values='page_0_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_0_3;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_3 = user_act[user_act.page==1]
page_1_3['page_1_3_count'] = page_1_3['user_id']
feat = pd.pivot_table(page_1_3,index=['user_id'],values='page_1_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_1_3;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_3 = user_act[user_act.page==2]
page_2_3['page_2_3_count'] = page_2_3['user_id']
feat = pd.pivot_table(page_2_3,index=['user_id'],values='page_2_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_2_3;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_3 = user_act[user_act.page==3]
page_3_3['page_3_3_count'] = page_3_3['user_id']
feat = pd.pivot_table(page_3_3,index=['user_id'],values='page_3_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_3_3;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_3 = user_act[user_act.page==4]
page_4_3['page_4_3_count'] = page_4_3['user_id']
feat = pd.pivot_table(page_4_3,index=['user_id'],values='page_4_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_4_3;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_3 = user_act[user_act.action_type==0]
user_act_0_3['user_act_0_3_count'] = user_act_0_3['user_id']
feat = pd.pivot_table(user_act_0_3,index=['user_id'],values='user_act_0_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_0_3;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_3 = user_act[user_act.action_type==1]
user_act_1_3['user_act_1_3_count'] = user_act_1_3['user_id']
feat = pd.pivot_table(user_act_1_3,index=['user_id'],values='user_act_1_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_1_3;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_3 = user_act[user_act.action_type==2]
user_act_2_3['user_act_2_3_count'] = user_act_2_3['user_id']
feat = pd.pivot_table(user_act_2_3,index=['user_id'],values='user_act_2_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_2_3;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_3 = user_act[user_act.action_type==3]
user_act_3_3['user_act_3_3_count'] = user_act_3_3['user_id']
feat = pd.pivot_table(user_act_3_3,index=['user_id'],values='user_act_3_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_3_3;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_3 = user_act[user_act.action_type==4]
user_act_4_3['user_act_4_3_count'] = user_act_4_3['user_id']
feat = pd.pivot_table(user_act_4_3,index=['user_id'],values='user_act_4_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_4_3;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_3 = user_act[user_act.action_type==5]
user_act_5_3['user_act_5_3_count'] = user_act_5_3['user_id']
feat = pd.pivot_table(user_act_5_3,index=['user_id'],values='user_act_5_3_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_5_3;gc.collect();
#%% ๅ2ๅคฉ
user_act = user_act[(user_act.day>=day_min-2)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ2ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_2'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_2',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_2 = user_act[user_act.page==0]
page_0_2['page_0_2_count'] = page_0_2['user_id']
feat = pd.pivot_table(page_0_2,index=['user_id'],values='page_0_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_0_2;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_2 = user_act[user_act.page==1]
page_1_2['page_1_2_count'] = page_1_2['user_id']
feat = pd.pivot_table(page_1_2,index=['user_id'],values='page_1_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_1_2;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_2 = user_act[user_act.page==2]
page_2_2['page_2_2_count'] = page_2_2['user_id']
feat = pd.pivot_table(page_2_2,index=['user_id'],values='page_2_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_2_2;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_2 = user_act[user_act.page==3]
page_3_2['page_3_2_count'] = page_3_2['user_id']
feat = pd.pivot_table(page_3_2,index=['user_id'],values='page_3_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_3_2;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_2 = user_act[user_act.page==4]
page_4_2['page_4_2_count'] = page_4_2['user_id']
feat = pd.pivot_table(page_4_2,index=['user_id'],values='page_4_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_4_2;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_2 = user_act[user_act.action_type==0]
user_act_0_2['user_act_0_2_count'] = user_act_0_2['user_id']
feat = pd.pivot_table(user_act_0_2,index=['user_id'],values='user_act_0_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_0_2;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_2 = user_act[user_act.action_type==1]
user_act_1_2['user_act_1_2_count'] = user_act_1_2['user_id']
feat = pd.pivot_table(user_act_1_2,index=['user_id'],values='user_act_1_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_1_2;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_2 = user_act[user_act.action_type==2]
user_act_2_2['user_act_2_2_count'] = user_act_2_2['user_id']
feat = pd.pivot_table(user_act_2_2,index=['user_id'],values='user_act_2_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_2_2 ;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_2 = user_act[user_act.action_type==3]
user_act_3_2['user_act_3_2_count'] = user_act_3_2['user_id']
feat = pd.pivot_table(user_act_3_2,index=['user_id'],values='user_act_3_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_3_2 ;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_2 = user_act[user_act.action_type==4]
user_act_4_2['user_act_4_2_count'] = user_act_4_2['user_id']
feat = pd.pivot_table(user_act_4_2,index=['user_id'],values='user_act_4_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_4_2 ;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_2 = user_act[user_act.action_type==5]
user_act_5_2['user_act_5_2_count'] = user_act_5_2['user_id']
feat = pd.pivot_table(user_act_5_2,index=['user_id'],values='user_act_5_2_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_5_2;gc.collect();
#%% ๅ1ๅคฉ
user_act = user_act[(user_act.day>=day_min-1)&(user_act.day<=day_max-7)]
#็ป่ฎกๅ1ๅคฉๅ
ฑๆดป่ทๅคๅฐๆฌก
user_act['user_act_count_1'] = user_act['user_id']
feat = pd.pivot_table(user_act,index=['user_id'],values='user_act_count_1',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
"ๅฏนpageๆฏ็ง่ฟ่ก็ป่ฎก"
#็ป่ฎกpageไธบ0็ๆฌกๆฐ
page_0_1 = user_act[user_act.page==0]
page_0_1['page_0_1_count'] = page_0_1['user_id']
feat = pd.pivot_table(page_0_1,index=['user_id'],values='page_0_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_0_1;gc.collect();
#็ป่ฎกpageไธบ1็ๆฌกๆฐ
page_1_1 = user_act[user_act.page==1]
page_1_1['page_1_1_count'] = page_1_1['user_id']
feat = pd.pivot_table(page_1_1,index=['user_id'],values='page_1_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_1_1;gc.collect();
#็ป่ฎกpageไธบ2็ๆฌกๆฐ
page_2_1 = user_act[user_act.page==2]
page_2_1['page_2_1_count'] = page_2_1['user_id']
feat = pd.pivot_table(page_2_1,index=['user_id'],values='page_2_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_2_1;gc.collect();
#็ป่ฎกpageไธบ3็ๆฌกๆฐ
page_3_1 = user_act[user_act.page==3]
page_3_1['page_3_1_count'] = page_3_1['user_id']
feat = pd.pivot_table(page_3_1,index=['user_id'],values='page_3_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_3_1;gc.collect();
#็ป่ฎกpageไธบ4็ๆฌกๆฐ
page_4_1 = user_act[user_act.page==4]
page_4_1['page_4_1_count'] = page_4_1['user_id']
feat = pd.pivot_table(page_4_1,index=['user_id'],values='page_4_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del page_4_1;gc.collect();
"ๅฏนaction_type่ฟ่ก็ป่ฎก"
#็ป่ฎกaction_typeไธบ0็ๆฌกๆฐ
user_act_0_1 = user_act[user_act.action_type==0]
user_act_0_1['user_act_0_1_count'] = user_act_0_1['user_id']
feat = pd.pivot_table(user_act_0_1,index=['user_id'],values='user_act_0_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_0_1;gc.collect();
#็ป่ฎกaction_typeไธบ1็ๆฌกๆฐ
user_act_1_1 = user_act[user_act.action_type==1]
user_act_1_1['user_act_1_1_count'] = user_act_1_1['user_id']
feat = pd.pivot_table(user_act_1_1,index=['user_id'],values='user_act_1_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_1_1;gc.collect();
#็ป่ฎกaction_typeไธบ2็ๆฌกๆฐ
user_act_2_1 = user_act[user_act.action_type==2]
user_act_2_1['user_act_2_1_count'] = user_act_2_1['user_id']
feat = pd.pivot_table(user_act_2_1,index=['user_id'],values='user_act_2_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_2_1;gc.collect();
#็ป่ฎกaction_typeไธบ3็ๆฌกๆฐ
user_act_3_1 = user_act[user_act.action_type==3]
user_act_3_1['user_act_3_1_count'] = user_act_3_1['user_id']
feat = pd.pivot_table(user_act_3_1,index=['user_id'],values='user_act_3_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_3_1;gc.collect();
#็ป่ฎกaction_typeไธบ4็ๆฌกๆฐ
user_act_4_1 = user_act[user_act.action_type==4]
user_act_4_1['user_act_4_1_count'] = user_act_4_1['user_id']
feat = pd.pivot_table(user_act_4_1,index=['user_id'],values='user_act_4_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_4_1;gc.collect();
#็ป่ฎกaction_typeไธบ5็ๆฌกๆฐ
user_act_5_1 = user_act[user_act.action_type==5]
user_act_5_1['user_act_5_1_count'] = user_act_5_1['user_id']
feat = pd.pivot_table(user_act_5_1,index=['user_id'],values='user_act_5_1_count',aggfunc='count').reset_index()
ans = pd.merge(ans,feat,on='user_id',how='left')
del user_act_5_1;gc.collect();
del feat;gc.collect();
del day_min;gc.collect();
del day_max;gc.collect();
del data;gc.collect();
del app;gc.collect();
del user_act;gc.collect();
del user_reg;gc.collect();
del vedio;gc.collect();
return ans
#%%
def modelXgb(train,test):
    """Train an XGBoost binary classifier on ``train`` and score ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``user_id``, ``register_day`` and ``label``. ``label`` is
        the target; every remaining column is used as a feature.
    test : pandas.DataFrame
        Must contain ``user_id`` and ``register_day``; every remaining column
        is used as a feature. Features are handed to XGBoost as plain arrays
        (``.values``), so they are matched to the training features by
        position, not by name.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``predicted_score``, sorted by
        ``predicted_score`` in descending order.
    """
    train_y = train['label'].values
    train_x = train.drop(['user_id','register_day','label'],axis=1).values
    test_x = test.drop(['user_id','register_day'],axis=1).values
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x)
    # The raw arrays are no longer referenced once the DMatrix objects exist.
    del train_x, test_x
    gc.collect()

    # Model parameters (trailing notes are the values recorded in the
    # original comments, presumably earlier settings).
    params = {'booster': 'gbtree',
              'objective':'binary:logistic',
              'eval_metric':'auc',
              'eta': 0.03,
              'max_depth': 5,  # 6
              'colsample_bytree': 0.9,  # 0.8
              'subsample': 0.9,
              'scale_pos_weight': 1,
              'min_child_weight': 18  # 2
              }

    # Train; the training set itself is the only evaluation set.
    watchlist = [(dtrain,'train')]
    bst = xgb.train(params, dtrain, num_boost_round=1500,evals=watchlist)

    # Predict.
    predict = bst.predict(dtest)
    del dtrain, dtest
    gc.collect()

    # Take an explicit copy: assigning a new column into the bare column
    # selection raises SettingWithCopyWarning.
    test_xy = test[['user_id']].copy()
    test_xy['predicted_score'] = predict
    return test_xy.sort_values('predicted_score', ascending=False)
#%%
def main():
"่ฎญ็ปๆจกๅ"
print('ไธ่ฝฝๆฐๆฎ...')
app,user_act,user_reg,vedio = loadData()
print('ๆๆ ๆฐๆฎ...')
test,train1,train2,train3,train4,train5,train6,train7,train8,train9,train10,train11,train12,train13,train14 = makeLabel(app,user_act,user_reg,vedio)
print('ๆๅte็นๅพ...')
te = genFeature(31,37,test,app,user_act,user_reg,vedio)
del test;gc.collect();
print('ๆต่ฏ้ๆๅๅฎๆ...')
#่ฎญ็ป้ๆๅ็นๅพ
print('ๆๅtr1็นๅพ...')
tr1 = genFeature(24,30,train1,app,user_act,user_reg,vedio)
del train1;gc.collect();
print('ๆๅtr2็นๅพ...')
tr2 = genFeature(23,29,train2,app,user_act,user_reg,vedio)
del train2;gc.collect();
print('ๆๅtr3็นๅพ...')
tr3 = genFeature(22,28,train3,app,user_act,user_reg,vedio)
del train3;gc.collect();
print('ๆๅtr4็นๅพ...')
tr4 = genFeature(21,27,train4,app,user_act,user_reg,vedio)
del train4;gc.collect();
print('ๆๅtr5็นๅพ...')
tr5 = genFeature(20,26,train5,app,user_act,user_reg,vedio)
del train5;gc.collect();
print('ๆๅtr6็นๅพ...')
tr6 = genFeature(19,25,train6,app,user_act,user_reg,vedio)
del train6;gc.collect();
print('ๆๅtr7็นๅพ...')
tr7 = genFeature(18,24,train7,app,user_act,user_reg,vedio)
del train7;gc.collect();
print('ๆๅtr8็นๅพ...')
tr8 = genFeature(17,23,train8,app,user_act,user_reg,vedio)
del train8;gc.collect();
print('ๆๅtr9็นๅพ...')
tr9 = genFeature(16,22,train9,app,user_act,user_reg,vedio)
del train9;gc.collect();
print('ๆๅtr10็นๅพ...')
tr10 = genFeature(15,21,train10,app,user_act,user_reg,vedio)
del train10;gc.collect();
print('ๆๅtr11็นๅพ...')
tr11 = genFeature(14,20,train11,app,user_act,user_reg,vedio)
del train11;gc.collect();
print('ๆๅtr12็นๅพ...')
tr12 = genFeature(13,19,train12,app,user_act,user_reg,vedio)
del train12;gc.collect();
print('ๆๅtr13็นๅพ...')
tr13 = genFeature(12,18,train13,app,user_act,user_reg,vedio)
del train13;gc.collect();
print('ๆๅtr14็นๅพ...')
tr14 = genFeature(11,17,train14,app,user_act,user_reg,vedio)
del train14;gc.collect();
del app;gc.collect();
del user_act;gc.collect();
del user_reg;gc.collect();
del vedio;gc.collect();
#ๅๅนถ่ฎญ็ป้
tr = pd.concat([tr1,tr2,tr3,tr4,tr5,tr6,tr7,tr8,tr9,tr10,tr11,tr12,tr13,tr14],axis=0)
del tr1;gc.collect();
del tr2;gc.collect();
del tr3;gc.collect();
del tr4;gc.collect();
del tr5;gc.collect();
del tr6;gc.collect();
del tr7;gc.collect();
del tr8;gc.collect();
del tr9;gc.collect();
del tr10;gc.collect();
del tr11;gc.collect();
del tr12;gc.collect();
del tr13;gc.collect();
del tr14;gc.collect();
print('ๅผๅง่ฎญ็ปๆจกๅ...')
#่ฎญ็ปๆจกๅ
answer = modelXgb(tr,te)
print('็ปๆ่ฎญ็ปๆจกๅ...')
#ๅฏผๅบ็ปๆ
answer.to_csv('/home/kesci/work/yw_model.txt',index=False, header=None)
# Submit my_submission.txt for evaluation. Reminder: submission files for this competition must be in txt format.
#!./kesci_submit -token 61c8c5af49a7abd0 -file ans_0723_2.txt
#%%
if __name__ == '__main__':
    # Main entry point (the original note here was a mis-encoded, no-op
    # string literal that had been split across two lines).
    main()
#%%
# Model 3 -- Shao Zi-?'s model (the author's name is partly lost to mis-encoding)
import pandas as pd
import numpy as np
import xgboost as xgb
def readData():
    """Load the four raw, header-less, tab-separated competition logs.

    Each file is read with ``pd.read_csv`` and then given explicit column
    names. A progress message is printed before each file is read.

    Returns:
        tuple: ``(user_launch, user_register, video_create, user_activity)``
        as pandas DataFrames, in that order.
    """
    # (progress message, file path, column names), in loading order.
    sources = (
        ('app launch',
         r'/mnt/datasets/fusai/app_launch_log.txt',
         ['user_id', 'day']),
        ('user register',
         r'/mnt/datasets/fusai/user_register_log.txt',
         ['user_id', 'register_day', 'register_type', 'device_type']),
        ('video create',
         r'/mnt/datasets/fusai/video_create_log.txt',
         ['user_id', 'day']),
        ('user activity',
         r'/mnt/datasets/fusai/user_activity_log.txt',
         ['user_id', 'day', 'page', 'video_id', 'author_id', 'action_type']),
    )

    print('reading...')
    tables = []
    for message, path, column_names in sources:
        print(message)
        table = pd.read_csv(path, sep='\t', header=None)
        table.columns = column_names
        tables.append(table)
    print('reading have finished!')

    user_launch, user_register, video_create, user_activity = tables
    return user_launch, user_register, video_create, user_activity
def labelRegister(user_launch, user_register, video_create, user_activity, f_start, f_end):
    """Label registered users by whether they were active in a day window.

    A user is considered active when at least one row of ``user_launch``,
    ``video_create`` or ``user_activity`` has ``f_start <= day <= f_end``.

    Args:
        user_launch: DataFrame with ``user_id`` and ``day`` columns.
        user_register: DataFrame with ``user_id`` and ``register_day`` columns.
        video_create: DataFrame with ``user_id`` and ``day`` columns.
        user_activity: DataFrame with (at least) ``user_id`` and ``day`` columns.
        f_start: first day (inclusive) of the label window.
        f_end: last day (inclusive) of the label window.

    Returns:
        A new DataFrame holding the rows of ``user_register`` whose
        ``register_day`` is at most ``f_start - 1``, with an added ``label``
        column: 1 for users active in the window, 0 otherwise. Every NaN in
        the merged frame (not only in ``label``) is replaced by 0.
    """
    print('labeling...')

    # Stack all three logs in one pass; only user_id/day are needed from the
    # activity log.
    events = pd.concat(
        [user_launch, video_create, user_activity[['user_id', 'day']]]
    ).reset_index(drop=True)

    in_window = (events.day >= f_start) & (events.day <= f_end)
    # Build the labelled frame as a fresh object rather than mutating a slice
    # in place, which is what triggered SettingWithCopyWarning before.
    active = events.loc[in_window, ['user_id']].drop_duplicates().assign(label=1)

    eligible = user_register[user_register.register_day <= (f_start - 1)]
    labelled = pd.merge(eligible, active, on=['user_id'], how='left')
    # Users without a match have NaN in `label`; turn those into 0.
    labelled.fillna(0, inplace=True)

    print('labeling have finished!')
    return labelled
def getMaxMinNormalization(feature):
    """Rescale ``feature`` linearly so its minimum maps to 0 and maximum to 1.

    Args:
        feature: array-like of numbers (the callers in this file pass a
            pandas Series).

    Returns:
        ``(feature - min) / (max - min)``. When all values are equal (this
        includes a single-element input) the range is zero; in that case
        zeros are returned instead of the NaN that 0/0 would produce.
    """
    upper = np.max(feature)
    lower = np.min(feature)
    value_range = upper - lower
    shifted = feature - lower
    # Only take the shortcut for a scalar range; a per-column range (e.g. a
    # DataFrame argument) keeps the plain element-wise division.
    if np.ndim(value_range) == 0 and value_range == 0:
        return shifted * 1.0  # keep a float result, like the division does
    return shifted / value_range
def getLabel(user_launch, user_register, video_create, user_activity):
    """Build the two labelled register tables via ``labelRegister``.

    The first table uses days 24-30 as the label window, the second uses
    days 17-23. A progress message is printed before each one.

    Returns:
        tuple: ``(register_have_label1, register_have_label2)``.
    """
    # (progress message, first day, last day) of each label window.
    label_windows = (('label1...', 24, 30), ('label2...', 17, 23))

    labelled = []
    for message, first_day, last_day in label_windows:
        print(message)
        labelled.append(
            labelRegister(user_launch, user_register, video_create,
                          user_activity, first_day, last_day))

    register_have_label1, register_have_label2 = labelled
    return register_have_label1, register_have_label2
def getLastDayFeature(user_launch, user_register, video_create, user_activity, f_start, f_end):
    """Append min-max-normalised count features for the window [f_start, f_end].

    All counts use only log rows with ``f_start <= day <= f_end``. With
    ``span = f_end - f_start + 1`` the following columns are left-joined onto
    ``user_register`` (in this order), and NaN is replaced by 0 after each
    group of joins:

    * launch and video-creation counts: per user, then summed per
      (register_type, device_type), per device_type and per register_type;
    * per-user activity counts for each ``page`` in 0..4 and each
      ``action_type`` in 0..5;
    * per-user number of distinct ``video_id`` and distinct ``author_id``;
    * activity counts per (register_type, device_type), device_type and
      register_type (no per-user activity column is added);
    * for each ``action_type`` in 0..5, the number of activity rows whose
      ``author_id`` equals the user's id.

    Args:
        user_launch: DataFrame with ``user_id`` and ``day``.
        user_register: DataFrame with ``user_id``, ``register_type`` and
            ``device_type``.
        video_create: DataFrame with ``user_id`` and ``day``.
        user_activity: DataFrame with ``user_id``, ``day``, ``page``,
            ``video_id``, ``author_id`` and ``action_type``.
        f_start: first day (inclusive) of the feature window.
        f_end: last day (inclusive) of the feature window.

    Returns:
        A new DataFrame: ``user_register`` with the feature columns added.
    """
    print('get last day feature...')
    span = f_end - f_start + 1

    # (column prefix, grouping keys) for the register/device-type aggregates,
    # in the order their columns are appended.
    type_groupings = (
        ('device_register_type', ['register_type', 'device_type']),
        ('device_type', ['device_type']),
        ('register_type', ['register_type']),
    )

    def in_window(log):
        """Return the rows of *log* whose day lies in [f_start, f_end]."""
        return log[(log.day >= f_start) & (log.day <= f_end)]

    def summed(frame, keys, column):
        """Sum *column* per *keys* group and return a flat frame."""
        return pd.pivot_table(frame, index=keys, values=column,
                              aggfunc='sum').reset_index()

    def counted(frame, key, column):
        """Count the rows of *frame* per *key*, storing the count in *column*."""
        tagged = frame[[key]].copy()
        tagged[column] = 1
        return summed(tagged, [key], column)

    def type_level(per_user, register, user_column, kind):
        """Roll per-user counts up to each type grouping and normalise them."""
        typed = pd.merge(per_user,
                         register[['user_id', 'register_type', 'device_type']],
                         on=['user_id'], how='left')
        typed = typed[['register_type', 'device_type', user_column]]
        features = []
        for prefix, keys in type_groupings:
            name = '%s_%d_before_%s_count' % (prefix, span, kind)
            grouped = summed(typed, keys, user_column)
            grouped.rename(columns={user_column: name}, inplace=True)
            grouped[name] = getMaxMinNormalization(grouped[name])
            features.append((keys, grouped))
        return features

    def attach(register, feature, keys):
        """Left-join *feature* onto *register* on *keys*."""
        return pd.merge(register, feature, on=keys, how='left')

    # Launch and video-creation counts: per user plus the type-level roll-ups.
    for kind, log in (('launch', user_launch), ('video_create', video_create)):
        user_column = 'user_%d_before_%s_count' % (span, kind)
        per_user = counted(in_window(log), 'user_id', user_column)
        # Type-level sums must be taken from the raw (un-normalised) counts.
        type_features = type_level(per_user, user_register, user_column, kind)
        per_user[user_column] = getMaxMinNormalization(per_user[user_column])
        user_register = attach(user_register, per_user, ['user_id'])
        for keys, feature in type_features:
            user_register = attach(user_register, feature, keys)
        user_register.fillna(0, inplace=True)

    activity = in_window(user_activity)

    # Per-user activity counts split by page (0..4) and action type (0..5).
    for column, template, n_values in (
            ('page', 'user_%d_before_page_count_%d', 5),
            ('action_type', 'user_%d_before_action_type_count_%d', 6)):
        for value in range(n_values):
            name = template % (span, value)
            per_user = counted(activity[activity[column] == value], 'user_id', name)
            per_user[name] = getMaxMinNormalization(per_user[name])
            user_register = attach(user_register, per_user, ['user_id'])
            user_register.fillna(0, inplace=True)

    # Per-user number of distinct videos / distinct authors interacted with.
    for column, kind in (('video_id', 'watch_video_type'),
                         ('author_id', 'watch_video_author_type')):
        pairs = activity[['user_id', column]].drop_duplicates()
        pairs[column] = 1  # one unit per distinct (user, video/author) pair
        per_user = summed(pairs, ['user_id'], column)
        per_user[column] = getMaxMinNormalization(per_user[column])
        per_user.rename(
            columns={column: 'user_%d_before_%s_count' % (span, kind)},
            inplace=True)
        user_register = attach(user_register, per_user, ['user_id'])
        user_register.fillna(0, inplace=True)

    # Activity counts: only the type-level roll-ups are added here.
    user_column = 'user_%d_before_activity_count' % span
    per_user = counted(activity, 'user_id', user_column)
    for keys, feature in type_level(per_user, user_register, user_column, 'activity'):
        user_register = attach(user_register, feature, keys)
    user_register.fillna(0, inplace=True)

    # Per action type: rows in which the user appears as the author.
    for value in range(6):
        name = 'author_%d_before_action_type_%d_showed_count' % (span, value)
        per_author = counted(activity[activity.action_type == value], 'author_id', name)
        per_author[name] = getMaxMinNormalization(per_author[name])
        # Join author-side counts onto users by treating author_id as user_id.
        per_author.rename(columns={'author_id': 'user_id'}, inplace=True)
        user_register = attach(user_register, per_author, ['user_id'])
        user_register.fillna(0, inplace=True)

    print('get last day feature have finished!')
    return user_register
def getAllTimeFeature(user_launch, user_register, video_create, user_activity, f_start, f_end):
print('get all time feature...')
t1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = 1
feat = pd.pivot_table(t1, index=['user_id'],
values='user_%d_before_launch_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
t1_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')
t1_1 = t1_1[['register_type', 'device_type', 'user_%d_before_launch_count' % (f_end - f_start + 1)]]
feat1 = pd.pivot_table(t1_1, index=['register_type', 'device_type'],
values='user_%d_before_launch_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat1.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):
'device_register_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)
feat1['device_register_type_%d_before_launch_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat1['device_register_type_%d_before_launch_count' %
(f_end - f_start + 1)])
feat2 = pd.pivot_table(t1_1, index=['device_type'],
values='user_%d_before_launch_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat2.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):
'device_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)
feat2['device_type_%d_before_launch_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat2['device_type_%d_before_launch_count' %
(f_end - f_start + 1)])
feat3 = pd.pivot_table(t1_1, index=['register_type'],
values='user_%d_before_launch_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat3.rename(columns={'user_%d_before_launch_count' % (f_end - f_start + 1):
'register_type_%d_before_launch_count' % (f_end - f_start + 1)}, inplace=True)
feat3['register_type_%d_before_launch_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat3['register_type_%d_before_launch_count' %
(f_end - f_start + 1)])
t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = \
(t1['day'] - f_start) / (f_end - f_start)
t1 = pd.pivot_table(t1, index=['user_id'],
values='user_%d_before_launch_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
t1['user_%d_before_launch_count' % (f_end - f_start + 1)] = \
getMaxMinNormalization(t1['user_%d_before_launch_count' % (f_end - f_start + 1)])
user_register = pd.merge(user_register, t1, on=['user_id'], how='left')
user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')
user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')
user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')
user_register.fillna(0, inplace=True)
t2 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = 1
feat = pd.pivot_table(t2, index=['user_id'],
values='user_%d_before_video_create_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
t2_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')
t2_1 = t2_1[['register_type', 'device_type', 'user_%d_before_video_create_count' % (f_end - f_start + 1)]]
feat1 = pd.pivot_table(t2_1, index=['register_type', 'device_type'],
values='user_%d_before_video_create_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat1.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):
'device_register_type_%d_before_video_create_count'
% (f_end - f_start + 1)}, inplace=True)
feat1['device_register_type_%d_before_video_create_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat1['device_register_type_%d_before_video_create_count' %
(f_end - f_start + 1)])
feat2 = pd.pivot_table(t2_1, index=['device_type'],
values='user_%d_before_video_create_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat2.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):
'device_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)
feat2['device_type_%d_before_video_create_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat2['device_type_%d_before_video_create_count' %
(f_end - f_start + 1)])
feat3 = pd.pivot_table(t2_1, index=['register_type'],
values='user_%d_before_video_create_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat3.rename(columns={'user_%d_before_video_create_count' % (f_end - f_start + 1):
'register_type_%d_before_video_create_count' % (f_end - f_start + 1)}, inplace=True)
feat3['register_type_%d_before_video_create_count' %
(f_end - f_start + 1)] = \
getMaxMinNormalization(feat3['register_type_%d_before_video_create_count' %
(f_end - f_start + 1)])
t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = \
(t2['day'] - f_start) / (f_end - f_start)
t2 = pd.pivot_table(t2, index=['user_id'],
values='user_%d_before_video_create_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
t2['user_%d_before_video_create_count' % (f_end - f_start + 1)] = \
getMaxMinNormalization(t2['user_%d_before_video_create_count'
% (f_end - f_start + 1)])
user_register = pd.merge(user_register, t2, on=['user_id'], how='left')
user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')
user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')
user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')
user_register.fillna(0, inplace=True)
for i in range(5):
t3 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end) &
(user_activity.page == i)][['user_id', 'day']]
t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] \
= (t3['day'] - f_start) / (f_end - f_start)
t3 = pd.pivot_table(t3, index=['user_id'],
values='user_%d_before_page_count_%d' % (f_end - f_start + 1, i),
aggfunc='sum').reset_index()
t3['user_%d_before_page_count_%d' % (f_end - f_start + 1, i)] = \
getMaxMinNormalization(t3['user_%d_before_page_count_%d'
% (f_end - f_start + 1, i)])
user_register = pd.merge(user_register, t3, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
for j in range(6):
t4 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end) &
(user_activity.action_type == j)][['user_id', 'day']]
t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = \
(t4['day'] - f_start) / (f_end - f_start)
t4 = pd.pivot_table(t4, index=['user_id'],
values='user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j),
aggfunc='sum').reset_index()
t4['user_%d_before_action_type_count_%d' % (f_end - f_start + 1, j)] = \
getMaxMinNormalization(t4['user_%d_before_action_type_count_%d'
% (f_end - f_start + 1, j)])
user_register = pd.merge(user_register, t4, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t5 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'video_id', 'day']]
t5 = t5.groupby(['user_id', 'video_id']).agg('max').reset_index()
t5['video_id'] = (t5['day'] - f_start) / (f_end - f_start)
t5 = pd.pivot_table(t5, index=['user_id'],
values='video_id',
aggfunc='sum').reset_index()
t5['video_id'] = getMaxMinNormalization(t5['video_id'])
t5.rename(columns={'video_id': 'user_%d_before_watch_video_type_count'
% (f_end - f_start + 1)}, inplace=True)
user_register = pd.merge(user_register, t5, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t6 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'author_id', 'day']]
t6 = t6.groupby(['user_id', 'author_id']).agg('max').reset_index()
t6['author_id'] = (t6['day'] - f_start) / (f_end - f_start)
t6 = pd.pivot_table(t6, index=['user_id'],
values='author_id',
aggfunc='sum').reset_index()
t6['author_id'] = getMaxMinNormalization(t6['author_id'])
t6.rename(columns={'author_id': 'user_%d_before_watch_video_author_type_count'
% (f_end - f_start + 1)}, inplace=True)
user_register = pd.merge(user_register, t6, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t7 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t7['user_%d_before_activity_count' % (f_end - f_start + 1)] = 1
feat = pd.pivot_table(t7, index=['user_id'],
values='user_%d_before_activity_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
t7_1 = pd.merge(feat, user_register[['user_id', 'register_type', 'device_type']], on=['user_id'], how='left')
t7_1 = t7_1[['register_type', 'device_type', 'user_%d_before_activity_count' % (f_end - f_start + 1)]]
feat1 = pd.pivot_table(t7_1, index=['register_type', 'device_type'],
values='user_%d_before_activity_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat2 = pd.pivot_table(t7_1, index=['device_type'],
values='user_%d_before_activity_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat3 = pd.pivot_table(t7_1, index=['register_type'],
values='user_%d_before_activity_count' % (f_end - f_start + 1),
aggfunc='sum').reset_index()
feat1.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):
'device_register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)
feat1['device_register_type_%d_before_activity_count' % (f_end - f_start + 1)] = \
feat1['device_register_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)
feat2.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):
'device_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)
feat2['device_type_%d_before_activity_count' % (f_end - f_start + 1)] = \
feat2['device_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)
feat3.rename(columns={'user_%d_before_activity_count' % (f_end - f_start + 1):
'register_type_%d_before_activity_count' % (f_end - f_start + 1)}, inplace=True)
feat3['register_type_%d_before_activity_count' % (f_end - f_start + 1)] = \
feat3['register_type_%d_before_activity_count' % (f_end - f_start + 1)] / (f_end - f_start + 1)
feat1['device_register_type_%d_before_activity_count'
% (f_end - f_start + 1)] = \
getMaxMinNormalization(feat1['device_register_type_%d_before_activity_count'
% (f_end - f_start + 1)])
feat2['device_type_%d_before_activity_count'
% (f_end - f_start + 1)] = \
getMaxMinNormalization(feat2['device_type_%d_before_activity_count'
% (f_end - f_start + 1)])
feat3['register_type_%d_before_activity_count'
% (f_end - f_start + 1)] = \
getMaxMinNormalization(feat3['register_type_%d_before_activity_count'
% (f_end - f_start + 1)])
user_register = pd.merge(user_register, feat1, on=['register_type', 'device_type'], how='left')
user_register = pd.merge(user_register, feat2, on=['device_type'], how='left')
user_register = pd.merge(user_register, feat3, on=['register_type'], how='left')
user_register.fillna(0, inplace=True)
t8_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t8_1 = t8_1.groupby(['user_id']).agg('max').reset_index()
t8_1.rename(columns={'day': 'max_day'}, inplace=True)
t8_2 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t8_2.drop_duplicates(inplace=True)
t8_2['day'] = 1
t8_2 = t8_2.groupby(['user_id']).agg('sum').reset_index()
t8_2.rename(columns={'day': 'user_%d_before_launch_day_distance'
% (f_end - f_start + 1)}, inplace=True)
t8 = pd.merge(t8_1, t8_2, on=['user_id'], how='left')
t8['user_%d_before_launch_day_distance' % (f_end - f_start + 1)] = \
(t8['user_%d_before_launch_day_distance' % (f_end - f_start + 1)] /
(f_end - f_start + 1)) * (t8['max_day'] - f_start + 1)
t8 = t8[['user_id', 'user_%d_before_launch_day_distance' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t8, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t9_1 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()
t9_1.rename(columns={'day': 'max_day'}, inplace=True)
t9_2 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_2.drop_duplicates(inplace=True)
t9_2['day'] = 1
t9_2 = t9_2.groupby(['user_id']).agg('sum').reset_index()
t9_2.rename(columns={'day': 'user_%d_before_video_create_day_distance'
% (f_end - f_start + 1)}, inplace=True)
t9 = pd.merge(t9_1, t9_2, on=['user_id'], how='left')
t9['user_%d_before_video_create_day_distance' % (f_end - f_start + 1)] = \
(t9['user_%d_before_video_create_day_distance' % (f_end - f_start + 1)] /
(f_end - f_start + 1)) * (t9['max_day'] - f_start + 1)
t9 = t9[['user_id', 'user_%d_before_video_create_day_distance' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t9, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t10_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()
t10_1.rename(columns={'day': 'max_day'}, inplace=True)
t10_2 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_2.drop_duplicates(inplace=True)
t10_2['day'] = 1
t10_2 = t10_2.groupby(['user_id']).agg('sum').reset_index()
t10_2.rename(columns={'day': 'user_%d_before_activity_day_distance'
% (f_end - f_start + 1)}, inplace=True)
t10 = pd.merge(t10_1, t10_2, on=['user_id'], how='left')
t10['user_%d_before_activity_day_distance' % (f_end - f_start + 1)] = \
(t10['user_%d_before_activity_day_distance' % (f_end - f_start + 1)] /
(f_end - f_start + 1)) * (t10['max_day'] - f_start + 1)
t10 = t10[['user_id', 'user_%d_before_activity_day_distance' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t10, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
for i in range(4):
t13 = user_activity[(user_activity.day >= f_start) & (user_activity.day <= f_end)
& (user_activity.action_type == i)][['author_id', 'day']]
t13['author_%d_before_action_type_%d_showed_count' % (f_end - f_start + 1, i)] = \
(t13['day'] - f_start) / (f_end - f_start)
t13 = pd.pivot_table(t13, index=['author_id'],
values='author_%d_before_action_type_%d_showed_count'
% (f_end - f_start + 1, i),
aggfunc='sum').reset_index()
t13['author_%d_before_action_type_%d_showed_count'
% (f_end - f_start + 1, i)] = getMaxMinNormalization(t13['author_%d_before_action_type_%d_showed_count'
% (f_end - f_start + 1, i)])
t13.rename(columns={'author_id': 'user_id'}, inplace=True)
user_register = pd.merge(user_register, t13, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t15 = user_launch[(user_launch.day <= f_end) &
(user_launch.day >= f_start)][['user_id', 'day']]
t15 = t15.groupby(['user_id']).agg('max').reset_index()
t15.columns = ['user_id', 'max_day']
user_register = pd.merge(user_register, t15, on=['user_id'], how='left')
user_register['user_launch_max_day_distance'] = f_end - user_register['max_day'] + 1
user_register.fillna(999, inplace=True)
user_register.drop(['max_day'], axis=1, inplace=True)
t16 = video_create[(video_create.day <= f_end) &
(video_create.day >= f_start)][['user_id', 'day']]
t16 = t16.groupby(['user_id']).agg('max').reset_index()
t16.columns = ['user_id', 'max_day']
user_register = pd.merge(user_register, t16, on=['user_id'], how='left')
user_register['user_video_create_max_day_distance'] = f_end - user_register['max_day'] + 1
user_register.fillna(999, inplace=True)
user_register.drop(['max_day'], axis=1, inplace=True)
for i in range(4):
t17 = user_activity[(user_activity.day <= f_end) &
(user_activity.day >= f_start) &
(user_activity.action_type == i)][['user_id', 'day']]
t17 = t17.groupby(['user_id']).agg('max').reset_index()
t17.columns = ['user_id', 'max_day']
user_register = pd.merge(user_register, t17, on=['user_id'], how='left')
user_register['user_activity_action_type_%d_max_day_distance' % (i)] \
= f_end - user_register['max_day'] + 1
user_register.fillna(999, inplace=True)
user_register.drop(['max_day'], axis=1, inplace=True)
t18 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t18['day'] = t18['day'].astype('str')
t18 = t18.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()
t18['user_launch_number'] = t18['day'].apply(lambda x: len(x.split(':')))
t18 = t18[t18.user_launch_number >= 1]
t18 = t18[['user_id', 'day']]
def culContinuousMeanLaunchDay(s):
    """Average the weighted size of each run of consecutive launch days.

    Args:
        s: colon-separated day numbers, e.g. ``'3:4:6'``; duplicates are
            ignored.

    Returns:
        Each distinct day contributes ``day - f_start + 1`` (``f_start`` is
        taken from the enclosing scope). The contributions are summed within
        every run of consecutive days, and the mean of those run sums is
        returned.
    """
    launch_days = sorted({int(token) for token in s.split(':')})
    run_totals = []
    running = 0
    previous_day = None
    for day in launch_days:
        # A gap of more than one day closes the current run.
        if previous_day is not None and day - previous_day != 1:
            run_totals.append(running)
            running = 0
        running += day - f_start + 1
        previous_day = day
    run_totals.append(running)  # close the final run
    return np.mean(np.array(run_totals))
t18['user_%d_before_continuous_mean_launch_day' % (f_end - f_start + 1)] \
= t18.day.apply(culContinuousMeanLaunchDay)
t18 = t18[['user_id', 'user_%d_before_continuous_mean_launch_day' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t18, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t19 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t19['day'] = t19['day'].astype('str')
t19 = t19.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()
t19['user_create_video_number'] = t19['day'].apply(lambda x: len(x.split(':')))
t19 = t19[t19.user_create_video_number >= 1]
t19 = t19[['user_id', 'day']]
def culContinuousMeanVideoCreateDay(s):
    """Average the weighted size of each run of consecutive video-creation days.

    Args:
        s: colon-separated day numbers, e.g. ``'3:4:6'``; duplicates are
            ignored.

    Returns:
        Each distinct day contributes ``day - f_start + 1`` (``f_start`` is
        taken from the enclosing scope). The contributions are summed within
        every run of consecutive days, and the mean of those run sums is
        returned.
    """
    creation_days = sorted({int(token) for token in s.split(':')})
    run_totals = []
    running = 0
    previous_day = None
    for day in creation_days:
        # A gap of more than one day closes the current run.
        if previous_day is not None and day - previous_day != 1:
            run_totals.append(running)
            running = 0
        running += day - f_start + 1
        previous_day = day
    run_totals.append(running)  # close the final run
    return np.mean(np.array(run_totals))
t19['user_%d_before_continuous_mean_create_video_day' % (f_end - f_start + 1)] \
= t19.day.apply(culContinuousMeanVideoCreateDay)
t19 = t19[['user_id', 'user_%d_before_continuous_mean_create_video_day' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t19, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t20 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t20['day'] = t20['day'].astype('str')
t20 = t20.groupby(['user_id']).agg(lambda x: ':'.join(x)).reset_index()
t20['user_activity_number'] = t20['day'].apply(lambda x: len(x.split(':')))
t20 = t20[t20.user_activity_number >= 1]
t20 = t20[['user_id', 'day']]
def culContinuousMeanUserActivityDay(s):
    """Average the per-run weights of the activity days listed in ``s``.

    ``s`` is a ':'-separated string of day numbers. Distinct days are sorted;
    every day is weighted ``day - f_start + 1`` (``f_start`` is read from the
    enclosing scope). Weights of days that follow each other without a gap
    are summed into one run total, and the mean of the run totals is returned.
    """
    days = sorted(int(token) for token in set(s.split(':')))
    weights = [day - f_start + 1 for day in days]
    # Start the first run with the first day, then extend or open runs.
    run_totals = [weights[0]]
    for idx in range(1, len(days)):
        if days[idx] - days[idx - 1] == 1:
            run_totals[-1] += weights[idx]
        else:
            run_totals.append(weights[idx])
    return np.mean(np.array(run_totals))
t20['user_%d_before_continuous_mean_user_activity_day' % (f_end - f_start + 1)] \
= t20.day.apply(culContinuousMeanUserActivityDay)
t20 = t20[['user_id', 'user_%d_before_continuous_mean_user_activity_day' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t20, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t24_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t24_1.drop_duplicates(inplace=True)
t24_1['day'] = t24_1['day'].astype('str')
t24_1 = t24_1.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
t24_1['user_launch_number'] = t24_1['day'].apply(lambda x: len(x.split(':')))
t24_1 = t24_1[t24_1.user_launch_number >= 1]
t24_1 = t24_1[['user_id', 'day']]
t24_1.columns = ['user_id', 'launch_day']
t24_2 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t24_2['day'] = t24_2['day'].astype('str')
t24_2 = t24_2.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
t24_2['user_video_number'] = t24_2['day'].apply(lambda x: len(x.split(':')))
t24_2 = t24_2[t24_2.user_video_number >= 1]
t24_2 = t24_2[['user_id', 'day']]
t24_2.columns = ['user_id', 'video_day']
t24 = pd.merge(t24_1, t24_2, on=['user_id'], how='right')
t24['day'] = t24['launch_day'] + ',' + t24['video_day']
t24 = t24[t24.day.notnull()]
t24 = t24[['user_id', 'day']]
def videoCreateFrequencyAfterLaunch(s):
    """Return the mean weighted video-creation total per launch interval.

    ``s`` has the form ``"<launch days>,<video days>"`` where each part is a
    ':'-separated list of day numbers. Distinct launch days are sorted; for
    every pair of neighbouring launch days the weights ``day - f_start + 1``
    of the video days falling in ``[launch, next_launch)`` are summed, and a
    final total covers video days on or after the last launch day. Video days
    before the first launch day are not counted. ``f_start`` is read from the
    enclosing scope.
    """
    launch_part, video_part = s.split(',')
    launch_days = sorted(int(token) for token in set(launch_part.split(':')))
    video_days = sorted(int(token) for token in video_part.split(':'))

    totals = [
        sum(day - f_start + 1 for day in video_days if lower <= day < upper)
        for lower, upper in zip(launch_days, launch_days[1:])
    ]
    # Open-ended interval starting at the most recent launch day.
    last_launch = launch_days[-1]
    totals.append(sum(day - f_start + 1 for day in video_days if day >= last_launch))
    return np.mean(np.array(totals))
t24['user_%d_before_create_video_after_launch_frequency' % (f_end - f_start + 1)] = \
t24.day.apply(videoCreateFrequencyAfterLaunch)
t24 = t24[['user_id', 'user_%d_before_create_video_after_launch_frequency' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t24, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t25_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t25_1.drop_duplicates(inplace=True)
t25_1['day'] = t25_1['day'].astype('str')
t25_1 = t25_1.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
t25_1['user_launch_number'] = t25_1['day'].apply(lambda x: len(x.split(':')))
t25_1 = t25_1[t25_1.user_launch_number >= 1]
t25_1 = t25_1[['user_id', 'day']]
t25_1.columns = ['user_id', 'launch_day']
t25_2 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t25_2['day'] = t25_2['day'].astype('str')
t25_2 = t25_2.groupby(['user_id'])['day'].agg(lambda x: ':'.join(x)).reset_index()
t25_2['user_activity_number'] = t25_2['day'].apply(lambda x: len(x.split(':')))
t25_2 = t25_2[t25_2.user_activity_number >= 1]
t25_2 = t25_2[['user_id', 'day']]
t25_2.columns = ['user_id', 'activity_day']
t25 = pd.merge(t25_1, t25_2, on=['user_id'], how='right')
t25['day'] = t25['launch_day'] + ',' + t25['activity_day']
t25 = t25[t25.day.notnull()]
t25 = t25[['user_id', 'day']]
def activityFrequencyAfterLaunch(s):
    """Return the mean weighted activity total per launch interval.

    ``s`` has the form ``"<launch days>,<activity days>"``, both parts being
    ':'-separated day numbers. Activity days are bucketed by the sorted,
    distinct launch days: one bucket per ``[launch, next_launch)`` interval
    plus one bucket from the last launch day onwards. Each activity day adds
    ``day - f_start + 1`` to its bucket (``f_start`` is read from the
    enclosing scope); days before the first launch are ignored. The mean of
    the bucket totals is returned.
    """
    launch_part, activity_part = s.split(',')
    launch_days = sorted(int(token) for token in set(launch_part.split(':')))
    activity_days = sorted(int(token) for token in activity_part.split(':'))

    bucket_totals = []
    for position, lower in enumerate(launch_days):
        is_last = position == len(launch_days) - 1
        total = 0
        for day in activity_days:
            if day < lower:
                continue
            # The final bucket has no upper bound.
            if is_last or day < launch_days[position + 1]:
                total += day - f_start + 1
        bucket_totals.append(total)
    return np.mean(np.array(bucket_totals))
t25['user_%d_before_activity_after_launch_frequency' % (f_end - f_start + 1)] = \
t25.day.apply(activityFrequencyAfterLaunch)
t25 = t25[['user_id', 'user_%d_before_activity_after_launch_frequency' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t25, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t26 = user_activity[(user_activity.day <= f_end) &
(user_activity.day >= f_start)][['user_id', 'day']]
t26['user_day_count'] = 1
t26 = t26.groupby(['user_id', 'day']).agg('sum').reset_index()
t26['day'] = t26['day'].astype('str')
t26['user_day_count'] = t26['user_day_count'].astype('str')
t26['user_day_and_day_count'] = t26['day'] + ':' + t26['user_day_count']
t26 = t26[['user_id', 'user_day_and_day_count']]
t26 = t26.groupby(['user_id'])['user_day_and_day_count'].agg(lambda x: ','.join(x)).reset_index()
t26['user_day_number'] = t26.user_day_and_day_count.apply(lambda x: len(x.split(',')))
t26 = t26[t26.user_day_number > 1]
t26 = t26[['user_id', 'user_day_and_day_count']]
def calculateAcceleration(s):
    """Return the mean day-to-day slope of the activity counts encoded in ``s``.

    ``s`` is a ','-separated list of ``"day:count"`` items. Days are sorted and
    for each pair of neighbouring days the slope
    ``(count_next - count_prev) / (day_next - day_prev)`` is computed; the
    mean of those slopes is returned.
    """
    items = [item.split(':') for item in s.split(',')]
    ordered_days = sorted(int(parts[0]) for parts in items)
    count_by_day = {int(parts[0]): int(parts[1]) for parts in items}
    slopes = [
        (count_by_day[later] - count_by_day[earlier]) / (later - earlier)
        for earlier, later in zip(ordered_days, ordered_days[1:])
    ]
    return np.mean(np.array(slopes))
t26['user_whole_day_activity_acceleration'] = \
t26.user_day_and_day_count.apply(calculateAcceleration)
t26 = t26[['user_id', 'user_whole_day_activity_acceleration']]
user_register = pd.merge(user_register, t26, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t29 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t29['day'] = t29['day'].astype('str')
t29 = t29.groupby(['user_id'])['day'].agg(lambda x: '-'.join(x)).reset_index()
t29['user_day_number'] = t29.day.apply(lambda x: len(x.split('-')))
t29 = t29[t29.user_day_number > 1]
t29 = t29[['user_id', 'day']]
def culculateFrequncy(s):
    """Return the mean gap between neighbouring launch days.

    ``s`` is a '-'-separated string of day numbers; the days are sorted and
    the mean of the successive differences is returned.
    """
    ordered = sorted(int(token) for token in s.split('-'))
    return np.mean(np.diff(np.array(ordered)))
t29['user_%d_before_launch_day_mean_frequncy' % (f_end - f_start + 1)] = t29.day.apply(culculateFrequncy)
t29 = t29[['user_id', 'user_%d_before_launch_day_mean_frequncy' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t29, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t30 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t30['day'] = t30['day'].astype('str')
t30 = t30.groupby(['user_id'])['day'].agg(lambda x : '-'.join(x)).reset_index()
t30['user_day_number'] = t30.day.apply(lambda x: len(x.split('-')))
t30 = t30[t30.user_day_number > 1]
t30 = t30[['user_id', 'day']]
def culculateFrequncy(s):
    """Return the mean gap between neighbouring video-creation days.

    ``s`` is a '-'-separated string of day numbers; the days are sorted and
    the mean of the successive differences is returned.
    """
    ordered = np.sort(np.array([int(token) for token in s.split('-')]))
    return np.mean(np.diff(ordered))
t30['user_%d_before_video_create_day_mean_frequncy' % (f_end - f_start + 1)] = t30.day.apply(culculateFrequncy)
t30 = t30[['user_id', 'user_%d_before_video_create_day_mean_frequncy' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t30, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t31 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t31['day'] = t31['day'].astype('str')
t31 = t31.groupby(['user_id'])['day'].agg(lambda x: '-'.join(x)).reset_index()
t31['user_day_number'] = t31.day.apply(lambda x: len(x.split('-')))
t31 = t31[t31.user_day_number > 1]
t31 = t31[['user_id', 'day']]
def culculateFrequncy(s):
    """Return the mean gap between neighbouring activity days.

    ``s`` is a '-'-separated string of day numbers; the days are sorted and
    the mean of the successive differences is returned.
    """
    ordered = np.array(sorted(map(int, s.split('-'))))
    gaps = np.diff(ordered)
    return np.mean(gaps)
t31['user_%d_before_activity_day_mean_frequncy' % (f_end - f_start + 1)] = t31.day.apply(culculateFrequncy)
t31 = t31[['user_id', 'user_%d_before_activity_day_mean_frequncy' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t31, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t9_1 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()
t9_1.rename(columns={'day': 'max_day'}, inplace=True)
t9_2 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_2 = t9_2.groupby(['user_id']).agg('min').reset_index()
t9_2.rename(columns={'day': 'min_day'}, inplace=True)
t9 = pd.merge(t9_1, t9_2, on=['user_id'], how='left')
t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] = t9['max_day'] - t9['min_day']
t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] = \
(t9['user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)] /
(f_end - f_start + 1)) * (t9['max_day'] - f_start)
t9 = t9[['user_id', 'user_%d_before_video_create_day_distance2' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t9, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t10_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()
t10_1.rename(columns={'day': 'max_day'}, inplace=True)
t10_2 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_2 = t10_2.groupby(['user_id']).agg('min').reset_index()
t10_2.rename(columns={'day': 'min_day'}, inplace=True)
t10 = pd.merge(t10_1, t10_2, on=['user_id'], how='left')
t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] = t10['max_day'] - t10['min_day']
t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] = \
(t10['user_%d_before_activity_day_distance2' % (f_end - f_start + 1)] /
(f_end - f_start + 1)) * (t10['max_day'] - f_start)
t10 = t10[['user_id', 'user_%d_before_activity_day_distance2' % (f_end - f_start + 1)]]
user_register = pd.merge(user_register, t10, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t8_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t8_1 = t8_1.groupby(['user_id']).agg('max').reset_index()
t8_1.rename(columns={'day': 'max_day'}, inplace=True)
user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')
user_register['user_launch_max_day_register_day_distance'] = \
(user_register['max_day'] - user_register['register_day'] + 1)
user_register.drop(['max_day'], axis=1, inplace=True)
user_register.fillna(0, inplace = True)
t9_1 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_1 = t9_1.groupby(['user_id']).agg('max').reset_index()
t9_1.rename(columns={'day': 'max_day'}, inplace=True)
user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')
user_register['user_video_max_day_register_day_distance'] = \
(user_register['max_day'] - user_register['register_day'] + 1)
user_register.drop(['max_day'], axis=1, inplace=True)
user_register.fillna(0, inplace = True)
t10_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_1 = t10_1.groupby(['user_id']).agg('max').reset_index()
t10_1.rename(columns={'day': 'max_day'}, inplace=True)
user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')
user_register['user_activity_max_day_register_day_distance'] = \
(user_register['max_day'] - user_register['register_day'] + 1)
user_register.drop(['max_day'], axis=1, inplace=True)
user_register.fillna(0, inplace = True)
t32 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t32 = pd.merge(t32, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')
t32['user_launch_count_concerned_register_day'] = t32['day'] - t32['register_day'] + 1
t32 = pd.pivot_table(t32, values = 'user_launch_count_concerned_register_day',
index = 'user_id', aggfunc = 'sum').reset_index()
user_register = pd.merge(user_register, t32, on = ['user_id'], how = 'left')
user_register.fillna(0, inplace = True)
t33 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t33 = pd.merge(t33, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')
t33['user_video_create_count_concerned_register_day'] = t33['day'] - t33['register_day'] + 1
t33 = pd.pivot_table(t33, values = 'user_video_create_count_concerned_register_day',
index = 'user_id', aggfunc = 'sum').reset_index()
user_register = pd.merge(user_register, t33, on = ['user_id'], how = 'left')
user_register.fillna(0, inplace = True)
for i in range(4):
t33 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end) &
(user_activity.action_type == i)][['user_id', 'day']]
t33 = pd.merge(t33, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')
t33['user_activity_action_type_%d_count_concerned_register_day'%(i)] = \
t33['day'] - t33['register_day'] + 1
t33 = pd.pivot_table(t33, values = 'user_activity_action_type_%d_count_concerned_register_day'%(i),
index = 'user_id', aggfunc = 'sum').reset_index()
user_register = pd.merge(user_register, t33, on = ['user_id'], how = 'left')
user_register.fillna(0, inplace = True)
t35 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'author_id', 'day']]
t35 = pd.pivot_table(t35, values = 'day',
index = ['user_id', 'author_id'],
aggfunc = np.max).reset_index()
t35 = pd.merge(t35, user_register[['user_id', 'register_day']], on = ['user_id'], how = 'left')
t35['author_id'] = t35['day'] - t35['register_day'] + 1
t35 = pd.pivot_table(t35, values = 'author_id',
index = ['user_id'],
aggfunc = 'sum').reset_index()\
.rename(columns = {'author_id' : 'user_author_type_count_concerned_register_day'})
user_register = pd.merge(user_register, t35, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
t36 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'author_id', 'day']]
t36 = pd.pivot_table(t36, values = 'day',
index = ['user_id', 'author_id'],
aggfunc = np.mean).reset_index()
t36 = pd.merge(t36, user_register[['user_id', 'register_day']], on=['user_id'], how='left')
t36['user_id'] = t36['day'] - t36['register_day'] + 1
t36 = pd.pivot_table(t36, values='user_id',
index=['author_id'],
aggfunc='sum').reset_index() \
.rename(columns={'user_id': 'author_user_type_mean_count_concerned_register_day'})
t36.rename(columns = {'author_id' : 'user_id'}, inplace = True)
user_register = pd.merge(user_register, t36, on=['user_id'], how='left')
user_register.fillna(0, inplace = True)
t8_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t8_1 = t8_1.groupby(['user_id']).agg('mean').reset_index()
t8_1.rename(columns={'day': 'mean_day'}, inplace=True)
user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')
user_register['user_launch_mean_day_register_day_distance'] = \
(user_register['mean_day'] - user_register['register_day'] + 1) / \
(f_end - user_register['register_day'] + 1)
user_register.drop(['mean_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
t9_1 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_1 = t9_1.groupby(['user_id']).agg('mean').reset_index()
t9_1.rename(columns={'day': 'mean_day'}, inplace=True)
user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')
user_register['user_video_mean_day_register_day_distance'] = \
(user_register['mean_day'] - user_register['register_day'] + 1) / \
(f_end - user_register['register_day'] + 1)
user_register.drop(['mean_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
t10_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_1 = t10_1.groupby(['user_id']).agg('mean').reset_index()
t10_1.rename(columns={'day': 'mean_day'}, inplace=True)
user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')
user_register['user_activity_mean_day_register_day_distance'] = \
(user_register['mean_day'] - user_register['register_day'] + 1) / \
(f_end - user_register['register_day'] + 1)
user_register.drop(['mean_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
t8_1 = user_launch[(user_launch.day >= f_start) &
(user_launch.day <= f_end)][['user_id', 'day']]
t8_1 = t8_1.groupby(['user_id']).agg('median').reset_index()
t8_1.rename(columns={'day': 'median_day'}, inplace=True)
user_register = pd.merge(user_register, t8_1, on=['user_id'], how='left')
user_register['user_launch_median_day_register_day_distance'] = \
(user_register['median_day'] - user_register['register_day'] + 1)
user_register.drop(['median_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
t9_1 = video_create[(video_create.day >= f_start) &
(video_create.day <= f_end)][['user_id', 'day']]
t9_1 = t9_1.groupby(['user_id']).agg('median').reset_index()
t9_1.rename(columns={'day': 'median_day'}, inplace=True)
user_register = pd.merge(user_register, t9_1, on=['user_id'], how='left')
user_register['user_video_median_day_register_day_distance'] = \
(user_register['median_day'] - user_register['register_day'] + 1)
user_register.drop(['median_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
t10_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end)][['user_id', 'day']]
t10_1 = t10_1.groupby(['user_id']).agg('median').reset_index()
t10_1.rename(columns={'day': 'median_day'}, inplace=True)
user_register = pd.merge(user_register, t10_1, on=['user_id'], how='left')
user_register['user_activity_median_day_register_day_distance'] = \
(user_register['median_day'] - user_register['register_day'] + 1)
user_register.drop(['median_day'], axis=1, inplace=True)
user_register.fillna(0, inplace=True)
for i in range(6):
t40_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end) &
(user_activity.action_type == i)][['user_id']]
t40_1['count1'] = 1
t40_1 = pd.pivot_table(t40_1, index=['user_id'],
values=['count1'], aggfunc = 'sum').reset_index()
t40_2 = user_activity[(user_activity.day >= f_start)
& (user_activity.day <= f_end)][['user_id']]
t40_2['count2'] = 1
t40_2 = pd.pivot_table(t40_2, index = ['user_id'],
values = ['count2'], aggfunc = 'sum').reset_index()
t40 = pd.merge(t40_2, t40_1, on = ['user_id'], how = 'left')
t40.fillna(0, inplace = True)
t40['user_%d_before_activity_action_type_%d_rate'
%(f_end - f_start + 1, i)] = t40['count1'] / t40['count2']
t40 = t40[['user_id', 'user_%d_before_activity_action_type_%d_rate'
%(f_end - f_start + 1, i)]]
user_register = pd.merge(user_register, t40, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
for i in range(5):
t40_1 = user_activity[(user_activity.day >= f_start) &
(user_activity.day <= f_end) &
(user_activity.page == i)][['user_id']]
t40_1['count1'] = 1
t40_1 = pd.pivot_table(t40_1, index=['user_id'],
values=['count1'], aggfunc='sum').reset_index()
t40_2 = user_activity[(user_activity.day >= f_start)
& (user_activity.day <= f_end)][['user_id']]
t40_2['count2'] = 1
t40_2 = pd.pivot_table(t40_2, index=['user_id'],
values=['count2'], aggfunc='sum').reset_index()
t40 = pd.merge(t40_2, t40_1, on=['user_id'], how='left')
t40.fillna(0, inplace=True)
t40['user_%d_before_activity_page_%d_rate'
% (f_end - f_start + 1, i)] = t40['count1'] / t40['count2']
t40 = t40[['user_id', 'user_%d_before_activity_page_%d_rate'
% (f_end - f_start + 1, i)]]
user_register = pd.merge(user_register, t40, on=['user_id'], how='left')
user_register.fillna(0, inplace=True)
print('get all time feature have finished!')
return user_register
def getSlideJoin(user_launch, register_have_label, video_create, user_activity, end):
    """Build the feature frame for the window that ends on day ``end``.

    First passes ``register_have_label`` through ``getLastDayFeature`` with
    ``end`` as both bounds, then passes that result through
    ``getAllTimeFeature`` with the bounds ``end - 15`` and ``end``.

    Returns:
        The frame returned by ``getAllTimeFeature``.
    """
    window_start = end - 15
    with_last_day = getLastDayFeature(user_launch, register_have_label,
                                      video_create, user_activity, end, end)
    return getAllTimeFeature(user_launch, with_last_day,
                             video_create, user_activity, window_start, end)
def getDummiesFeature(user_register):
    """One-hot encode ``register_type`` and split the rows on ``flag``.

    Indicator columns prefixed ``register_type`` are appended to the frame.

    Returns:
        A pair ``(flagged, unflagged)``: the rows whose ``flag`` is not null
        and the rows whose ``flag`` is null, each re-indexed from 0.
    """
    dummies = pd.get_dummies(user_register['register_type'], prefix='register_type')
    widened = pd.concat([user_register, dummies], axis=1)
    has_flag = widened['flag'].notnull()
    flagged = widened[has_flag].reset_index(drop=True)
    unflagged = widened[~has_flag].reset_index(drop=True)
    return flagged, unflagged
def getFeature(user_launch, user_register, video_create, user_activity,
               register_have_label1, register_have_label2):
    """Assemble the training and test feature frames.

    ``register_have_label1`` and ``register_have_label2`` are featurized with
    ``getSlideJoin`` for the windows ending on day 23 and day 16 and stacked
    into the training set; ``user_register`` is featurized for the window
    ending on day 30 as the test set. Training rows are marked with
    ``flag = 1`` so that, after both sets are concatenated and one-hot
    encoded together by ``getDummiesFeature``, they can be separated again.

    Returns:
        ``(train, test)``. ``train`` has ``user_id``, ``register_day`` and
        ``flag`` removed; ``test`` has ``register_day`` and ``flag`` removed
        and keeps ``user_id``.
    """
    print('train1')
    train_part1 = getSlideJoin(user_launch, register_have_label1,
                               video_create, user_activity, 23)
    print(train_part1.shape)

    print('train2')
    train_part2 = getSlideJoin(user_launch, register_have_label2,
                               video_create, user_activity, 16)
    print(train_part2.shape)

    train_frame = pd.concat([train_part1, train_part2]).reset_index(drop=True)

    print('test')
    test_frame = getSlideJoin(user_launch, user_register,
                              video_create, user_activity, 30)
    print(test_frame.shape)

    # Only training rows get a flag; test rows end up with a null flag
    # after the concat, which is what getDummiesFeature splits on.
    train_frame['flag'] = 1
    stacked = pd.concat([train_frame, test_frame]).reset_index(drop=True)
    train_frame, test_frame = getDummiesFeature(stacked)
    print(train_frame.shape)
    print(test_frame.shape)

    train_frame = train_frame.drop(['user_id', 'register_day', 'flag'], axis=1)
    test_frame = test_frame.drop(['register_day', 'flag'], axis=1)
    return train_frame, test_frame
def runXGBoost(train, test):
    """Train an XGBoost pairwise ranker on ``train`` and score ``test``.

    Args:
        train: DataFrame with the feature columns and a ``label`` column.
        test: DataFrame with the feature columns and a ``user_id`` column.

    Returns:
        DataFrame with ``user_id`` and ``predicted_pro`` (min-max scaled to
        [0, 1]), sorted by ``predicted_pro`` in descending order and
        re-indexed from 0. If every prediction is identical the scaled
        score is 0.0 for all rows instead of NaN.
    """
    print('run xgboost...')
    # Only columns present on both sides can be fed to the model; the order
    # of the training columns is kept.
    test_feat = {x for x in test.columns if x != 'user_id'}
    feat = [x for x in train.columns if x != 'label' and x in test_feat]
    print('feat:', len(feat))
    train_x = train[feat]
    train_y = train[['label']]
    test_x = test[feat]
    # Copy so that adding the prediction column does not write into a slice
    # of the caller's frame (avoids pandas' SettingWithCopyWarning).
    test_pre = test[['user_id']].copy()
    train_xgb = xgb.DMatrix(train_x, label=train_y)
    test_xgb = xgb.DMatrix(test_x)
    params = {
        'booster': 'gbtree',
        'objective': 'rank:pairwise',
        'eval_metric': 'auc',
        'gamma': 0.1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.03,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12
    }
    # NOTE: both watchlist entries are the training matrix, so the AUC
    # reported under 'val' is a training score, not a held-out score.
    watchlist = [(train_xgb, 'train'), (train_xgb, 'val')]
    model = xgb.train(params, train_xgb, num_boost_round=700, evals=watchlist)
    test_pre['predicted_pro'] = model.predict(test_xgb)
    min_pro = np.min(test_pre['predicted_pro'])
    max_pro = np.max(test_pre['predicted_pro'])
    span = max_pro - min_pro
    if span == 0:
        # All predictions equal: 0/0 would turn every score into NaN.
        test_pre['predicted_pro'] = 0.0
    else:
        test_pre['predicted_pro'] = (test_pre['predicted_pro'] - min_pro) / span
    # DataFrame.sort_index(by=...) was removed from pandas; sort_values is
    # the supported way to order rows by a column.
    result = test_pre.sort_values(by=['predicted_pro'], ascending=False).reset_index(drop=True)
    print('run xgboost have finished!')
    return result
def main():
    """Run the pipeline end to end and write ``mzj_model.csv``.

    Steps: read the raw logs, label the registration log, build the train
    and test features, train ten models on random 90% samples of the
    training set and average their scores, train one model on the full
    training set, blend the two 50/50, and write ``user_id,predicted_pro``
    rows without header or index.
    """
    # read pure txt
    user_launch, user_register, video_create, user_activity = readData()
    # labeling register log
    print('get label...')
    register_have_label1, register_have_label2 =\
        getLabel(user_launch, user_register, video_create, user_activity)
    print('get label have finished!')
    # get feature
    print('get feature...')
    train, test = \
        getFeature(user_launch, user_register, video_create, user_activity,
                   register_have_label1, register_have_label2)
    print('get feature have finished!')
    # run xgboost
    n_rounds = 10
    # Each sampled model contributes equally; deriving the weight from the
    # round count keeps the two from drifting apart.
    round_weight = 1.0 / n_rounds
    # Copy so the accumulator column is not assigned onto a slice of `test`.
    result_10_cv = test[['user_id']].copy()
    result_10_cv['predicted_pro'] = 0
    for i in range(n_rounds):
        train_sample = train.sample(frac=0.9)
        result = runXGBoost(train_sample, test)
        round_col = 'predicted_pro_%d' % (i)
        result.rename(columns={'predicted_pro': round_col}, inplace=True)
        result_10_cv = pd.merge(result_10_cv, result, on=['user_id'], how='left')
        result_10_cv['predicted_pro'] = result_10_cv['predicted_pro'] +\
            round_weight * result_10_cv[round_col]
        result_10_cv = result_10_cv[['user_id', 'predicted_pro']]
    # Blend the sampled-model average with a model trained on all rows.
    result = runXGBoost(train, test)
    result.rename(columns={'predicted_pro': 'predicted_pro_all'}, inplace=True)
    result_10_cv = pd.merge(result_10_cv, result, on=['user_id'], how='left')
    result_10_cv['predicted_pro'] = 0.5 * result_10_cv['predicted_pro'] + 0.5 * result_10_cv['predicted_pro_all']
    result_10_cv = result_10_cv[['user_id', 'predicted_pro']]
    result_10_cv.to_csv("mzj_model.csv", encoding='utf-8', index=None, header=None)


if __name__ == '__main__':
    main()
# Model fusion (blend the predictions of the individual models)
import pandas as pd
import numpy as np
def getMaxMinNormalization(feature):
    """Min-max scale ``feature`` to the range [0, 1].

    Args:
        feature: Array-like that supports elementwise arithmetic (for
            example a pandas Series or a numpy array).

    Returns:
        ``(feature - min) / (max - min)``. When every value is identical the
        range is zero, and zeros are returned instead of the NaN that 0/0
        would produce.
    """
    # Local names chosen so the builtins `min` and `max` are not shadowed.
    low = np.min(feature)
    high = np.max(feature)
    span = high - low
    if span == 0:
        # Multiply by 1.0 to keep a float result, as the division would give.
        return (feature - low) * 1.0
    return (feature - low) / span
# Load the three per-model prediction files; none of them has a header row.
r1 = pd.read_csv(r'yw_model.txt', header = None)
r1.columns = ['user_id', 'label1']
r2 = pd.read_csv(r'xjy_model.txt', header = None)
r2.columns = ['user_id', 'label2']
# Only the second model's scores are rescaled to [0, 1] here.
r2['label2'] = getMaxMinNormalization(r2['label2'])
r3 = pd.read_csv(r'mzj_model.csv', header = None)
r3.columns = ['user_id', 'label3']
print(r1.shape)
print(r2.shape)
print(r3.shape)
# Align all scores on the users of the first file, then take a weighted sum.
result = pd.merge(r1, r2, on = ['user_id'], how = 'left')
result = pd.merge(result, r3, on = ['user_id'], how = 'left')
result['label'] = 0.3 * result['label1'] + 0.4 * result['label2'] + 0.3 * result['label3']
result = result[['user_id', 'label']]
# DataFrame.sort_index(by=...) was removed from pandas; sort_values is the
# supported way to order rows by a column.
result = result.sort_values(by=['label'], ascending=False).reset_index(drop=True)
print(result.shape)
result.to_csv("fafenlousi_result.csv", encoding='utf-8', index=None, header = None)