Частично использованы материалы
import pandas as pd
import numpy as np
def make_s(n_rows):
    """Build a demo frame whose ``price`` column holds strings like ``'42$'``.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``price`` (a random integer
        in ``[0, 100)`` rendered as text with a trailing ``'$'``) and
        ``feature`` (all zeros).
    """
    prices = (100 * np.random.rand(n_rows)).astype(int)
    tmp = pd.DataFrame({'price': prices, 'feature': np.zeros(n_rows)})
    # Turn the numeric price into text with a currency suffix so that the
    # parsing variants exercised later have something to strip.
    tmp['price'] = tmp['price'].astype(str) + '$'
    return tmp
# Small sample to eyeball the results of each parsing variant.
data = make_s(5)
data
# Four ways to turn the text price ('42$') back into an integer.
# v1: slice off the last character and convert inside the lambda.
data['price($)_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# v2: slice inside the lambda, convert the whole column afterwards.
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# v3: plain-Python str.replace per element, then a column-wide conversion.
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)
# v4: vectorised .str accessor. '$' is a regex metacharacter and the default
# of `regex` differs between pandas versions, so request a literal match.
data['price($)_v4'] = data['price'].str.replace('$', '', regex=False).astype(int)
data
data = make_s(10000000)
%%time
data['price($))_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# 4.2-4.33
%%time
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# 2.47 s - 2.52 s
%%time
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)
# 3.14 - 3.31
%%time
data['price($)_v4'] = data['price'].str.replace('$', '').astype(int)
# 3.43 - 4
def make_t(n_rows):
    """Build a demo frame with a random two-valued categorical column.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``type`` (``'A'`` where a
        uniform random draw is below 0.5, otherwise ``'B'``) and ``feature``
        (all zeros).
    """
    labels = np.where(np.random.rand(n_rows) < 0.5, 'A', 'B')
    tmp = pd.DataFrame({'type': labels, 'feature': np.zeros(n_rows)})
    return tmp
# Small sample to compare several ways of encoding 'type' as 1 for 'A' and
# 0 for 'B'.
data = make_t(5)
data
# v1: element-wise Python lambda.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# v3: numpy where on the boolean mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# v4: explicit value mapping.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
# v5: factorize codes; flagged by the author as not matching the target
# encoding (presumably because codes follow order of first appearance).
data['type_v5'] = data['type'].factorize()[0] # incorrect answer
# v6: take the 'A' indicator column of the one-hot encoding.
data['type_v6'] = pd.get_dummies(data['type'])['A'] # uint8!!! (dtype as noted by the author; may depend on the pandas version)
from sklearn import preprocessing
# v7: scikit-learn label encoder; flagged by the author as not matching the
# target encoding (presumably 'A' -> 0, 'B' -> 1 -- TODO confirm).
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer
data
# Larger frame for timing the encoding variants.
data = make_t(10000000)
%%time
# v1: element-wise Python lambda returning ints.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# recorded timing: 2.14 s - 2.18 s
%%time
# v1 (string variant): lambda returns "1"/"0", converted column-wide.
# Note: writes to the same 'type_v1' column as the cell above.
data['type_v1'] = data['type'].apply(lambda x: "1" if x == "A" else "0").astype(int)
# recorded timing: 1.82 s - 1.86 s
%%time
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# recorded timing: 348 - 364 (unit not noted; presumably ms)
%%time
# v3: numpy where on the boolean mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# recorded timing: 380-398 (unit not noted; presumably ms)
%%time
# v4: explicit value mapping.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
# recorded timing: 400-424 (unit not noted; presumably ms)
%%time
# v5: factorize codes.
data['type_v5'] = data['type'].factorize()[0]
# recorded timing: 304-324 (unit not noted; presumably ms)
%%time
# v6: 'A' indicator column of the one-hot encoding.
data['type_v6'] = pd.get_dummies(data['type'])['A']
# recorded timing: 360-392 (unit not noted; presumably ms)
%%time
# v7: scikit-learn label encoder; the import sits inside the timed cell.
from sklearn import preprocessing
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer
# recorded timing: 5.47 s - 5.57 s
def make_ab(n_rows):
    """Build a demo frame with a combined ``'A/B'`` text column.

    Two random integer columns in ``[0, 100)`` are generated, joined into a
    single string column of the form ``'<A>/<B>'``, and then removed.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``feature`` (all zeros) and
        ``A/B`` (text such as ``'17/93'``), in that order.
    """
    tmp = pd.DataFrame({
        'A': (100 * np.random.rand(n_rows)).astype(int),
        'B': (100 * np.random.rand(n_rows)).astype(int),
        'feature': np.zeros(n_rows),
    })
    tmp['A/B'] = tmp['A'].astype(str) + '/' + tmp['B'].astype(str)
    # Only the combined column is kept; the later cells demonstrate how to
    # split it back into its parts.
    del tmp['A']
    del tmp['B']
    return tmp
# Small sample to compare ways of splitting 'A/B' back into two columns.
data = make_ab(5)
data
# v1: split once into lists, then pick each element with a lambda.
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])
# v2: split into lists and rebuild a frame from the list of lists.
# `n` is passed by keyword: newer pandas versions no longer accept it
# positionally.
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', n=1).tolist())
# v3: let pandas expand the split result into columns directly.
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)
# v4: join every row into one long string, split it in one pass, and
# reshape the flat result into two columns.
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))
data
data = make_ab(10000000)
%%time
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])
# 12.5 s-13.4 s
%%time
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())
# 10.2 s - 12.2 s
%%time
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)
# 26.1 s - 29.2 s
%%time
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))
# 3.65 s - 4.54 s
def make_t(n_rows):
    """Build a demo frame with a train/test label and a gappy numeric column.

    Note: this reuses the name ``make_t`` of an earlier helper in this file
    and replaces it from this point on.

    Args:
        n_rows: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with columns ``type`` (``'train'`` where a
        uniform random draw is below 0.5, otherwise ``'test'``), ``feature``
        (a random integer value in ``[0, 100)`` or NaN, chosen by a second
        random draw) and ``feature_v1`` .. ``feature_v4``, each a copy of
        ``feature`` for the imputation variants to work on.
    """
    tmp = pd.DataFrame({
        'type': np.where(np.random.rand(n_rows) < 0.5, 'train', 'test'),
        'feature': np.where(
            np.random.rand(n_rows) < 0.5,
            (100 * np.random.rand(n_rows)).astype(int),
            np.nan,
        ),
    })
    # One working copy per imputation variant so they do not interfere.
    for version in range(1, 5):
        tmp['feature_v%d' % version] = tmp['feature']
    return tmp
# Small sample to compare ways of filling NaNs with the mean of the row's
# 'type' group.
data = make_t(10)
data
# v1: handle each group explicitly -- select the group's rows, fill their
# NaNs with that group's mean, and write the result back through .loc.
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = \
data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = \
data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())
# v2: groupby + transform with a lambda that fills each group with its mean.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))
# v3: compute the per-row group mean with transform('mean') and assign it
# only to the rows where the value is missing.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')
# v4: map each row's type to its group mean and pick it where the value is
# missing, otherwise keep the original value.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])
# Disabled alternative kept from the original notebook: iterate over the
# groups and fill in place (presumably dropped because the in-place fill on
# the per-group frames does not write back to `data` -- unverified).
#name = 'feature_v4'
#gb = data.groupby('type')
#mn = gb.mean()
#for gn, x in gb:
# x[name].fillna(mn.loc[gn], inplace=True)
data
# Larger frame for timing the group-mean imputation variants.
data = make_t(10000000)
%%time
# v1: explicit per-group selection, fill, and write-back through .loc.
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())
# recorded timing: 3.44 s - 3.84 s
%%time
# v2: groupby + transform with a fill-with-mean lambda.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))
# recorded timing: 1.9 s - 2.04 s
%%time
# v3: transform('mean') assigned only to the rows with missing values.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')
# recorded timing: 1.17 - 1.18 s
%%time
# v4: map type -> group mean, selected with np.where on the NaN mask.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])
# recorded timing: 1.26 s - 1.38 (unit of the upper bound not noted; presumably seconds)