Эксперименты по ускорению обработки данных в Python

Частично использованы материалы

In [136]:
import pandas as pd
import numpy as np

Задача 1 - убрать знак доллара из строки с ценой и привести цену к целому числу

In [137]:
def make_s(n_rows):
    """Return a demo frame whose 'price' column holds strings like '17$'.

    Draws ``n_rows`` random integers in [0, 100), renders each one as text
    with a trailing dollar sign, and pairs them with an all-zero 'feature'
    column.
    """
    amounts = (100 * np.random.rand(n_rows)).astype(int)
    frame = pd.DataFrame({'price': amounts, 'feature': np.zeros(n_rows)})
    # Turn the integer prices into text and append the currency mark.
    frame['price'] = frame['price'].astype(str) + '$'
    return frame


# Small sample to eyeball the generated data.
data = make_s(5)
data
Out[137]:
price feature
0 17$ 0.0
1 89$ 0.0
2 39$ 0.0
3 97$ 0.0
4 23$ 0.0
In [139]:
# Four ways to drop the trailing '$' from 'price' and convert the rest to int.
# v1: slice and convert inside the lambda.
data['price($)_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# v2: slice in the lambda, convert the whole column once.
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# v3: Python-level str.replace per element, then one column-wide conversion.
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)
# v4: pandas string accessor. '$' is a regex end-of-string anchor, so ask for a
# literal match explicitly instead of relying on the version-dependent default.
data['price($)_v4'] = data['price'].str.replace('$', '', regex=False).astype(int)
data
Out[139]:
price feature price($)_v1 price($)_v2 price($)_v3 price($)_v4
0 17$ 0.0 17 17 17 17
1 89$ 0.0 89 89 89 89
2 39$ 0.0 39 39 39 39
3 97$ 0.0 97 97 97 97
4 23$ 0.0 23 23 23 23
In [140]:
# Regenerate the frame with 10,000,000 rows for the timed cells that follow.
data = make_s(10000000)
In [141]:
%%time
# v1: slice off the last character and convert to int inside the lambda.
# (Column name fixed: the original key had a stray ')' and did not match the
# 'price($)_v1' name used for the same variant in the demo cell.)
data['price($)_v1'] = data['price'].apply(lambda x: int(x[:-1]))
# author's recorded timing: 4.2-4.33
CPU times: user 4.26 s, sys: 60 ms, total: 4.32 s
Wall time: 4.36 s
In [147]:
%%time
# v2: slice in the lambda, then convert the whole column to int in one call.
data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)
# author's recorded timing: 2.47 s - 2.52 s
CPU times: user 2.5 s, sys: 124 ms, total: 2.62 s
Wall time: 2.65 s
In [144]:
%%time
# v3: per-element str.replace in a lambda, then one column-wide int conversion.
data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int) 
# author's recorded timing: 3.14 - 3.31
CPU times: user 3.19 s, sys: 152 ms, total: 3.34 s
Wall time: 3.35 s
In [145]:
%%time
# v4: pandas string accessor. '$' is a regex end-of-string anchor, so request a
# literal match explicitly rather than relying on the version-dependent default.
data['price($)_v4'] = data['price'].str.replace('$', '', regex=False).astype(int)
# author's recorded timing: 3.43 - 4
CPU times: user 3.43 s, sys: 164 ms, total: 3.6 s
Wall time: 3.63 s

Задача - бинаризовать категориальный признак (A -> 1, B -> 0)

In [150]:
def make_t(n_rows):
    """Return a demo frame with a random 'A'/'B' label in every row.

    A row is labelled 'A' when its uniform draw is below 0.5 and 'B'
    otherwise; the 'feature' column is all zeros.
    """
    draws = np.random.rand(n_rows)
    labels = np.where(draws < 0.5, 'A', 'B')
    return pd.DataFrame({'type': labels, 'feature': np.zeros(n_rows)})


# Small sample to eyeball the generated data.
data = make_t(5)
data
Out[150]:
type feature
0 A 0.0
1 B 0.0
2 A 0.0
3 A 0.0
4 A 0.0
In [155]:
# Seven candidate encodings of 'type'; the intended result is A -> 1, B -> 0.
# v1: Python-level conditional applied element by element.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# v3: numpy where on the comparison mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# v4: explicit dictionary lookup.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
# v5: integer codes from factorize.
data['type_v5'] = data['type'].factorize()[0] # incorrect answer (A gets 0 in the output shown below)
# v6: indicator column for 'A' from one-hot encoding.
data['type_v6'] = pd.get_dummies(data['type'])['A'] # note: dtype is uint8 (author's observation)
from sklearn import preprocessing
# v7: scikit-learn label encoder.
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer (A -> 0, B -> 1 in the output shown below)

data
Out[155]:
type feature type_v1 type_v2 type_v3 type_v4 type_v5 type_v6 type_v7
0 A 0.0 1 1 1 1 0 1 0
1 B 0.0 0 0 0 0 1 0 1
2 A 0.0 1 1 1 1 0 1 0
3 A 0.0 1 1 1 1 0 1 0
4 A 0.0 1 1 1 1 0 1 0
In [156]:
# Regenerate the frame with 10,000,000 rows for the timed cells that follow.
data = make_t(10000000)
In [157]:
%%time
# v1: Python-level conditional returning ints, applied element by element.
data['type_v1'] = data['type'].apply(lambda x: 1 if x == "A" else 0)
# author's recorded timing: 2.14 s - 2.18 s
CPU times: user 2.14 s, sys: 72 ms, total: 2.22 s
Wall time: 2.25 s
In [159]:
%%time
# v1 variant: the lambda returns "1"/"0" strings and the column is cast to int
# afterwards. It overwrites the 'type_v1' column written by the previous cell.
data['type_v1'] = data['type'].apply(lambda x: "1" if x == "A" else "0").astype(int)
# author's recorded timing: 1.82 s - 1.86 s
CPU times: user 1.82 s, sys: 60 ms, total: 1.88 s
Wall time: 1.89 s
In [160]:
%%time
# v2: vectorised comparison, boolean cast to int.
data['type_v2'] = (data['type']=='A').astype(int)
# author's recorded timing (ms, per the output below): 348 - 364
CPU times: user 364 ms, sys: 20 ms, total: 384 ms
Wall time: 386 ms
In [161]:
%%time
# v3: numpy where on the comparison mask.
data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)
# author's recorded timing (ms, per the output below): 380-398
CPU times: user 376 ms, sys: 20 ms, total: 396 ms
Wall time: 398 ms
In [162]:
%%time
# v4: explicit dictionary lookup via Series.map.
data['type_v4'] = data['type'].map({'A': 1, 'B': 0})
# author's recorded timing (ms, per the output below): 400-424
CPU times: user 420 ms, sys: 16 ms, total: 436 ms
Wall time: 443 ms
In [163]:
%%time
# v5: integer codes from factorize (flagged as an incorrect answer in the demo cell).
data['type_v5'] = data['type'].factorize()[0]
# author's recorded timing (ms, per the output below): 304-324
CPU times: user 304 ms, sys: 36 ms, total: 340 ms
Wall time: 357 ms
In [164]:
%%time
# v6: indicator column for 'A' taken from the one-hot encoding.
data['type_v6'] = pd.get_dummies(data['type'])['A']
# author's recorded timing (ms, per the output below): 360-392
CPU times: user 364 ms, sys: 28 ms, total: 392 ms
Wall time: 395 ms
In [168]:
%%time
# v7: scikit-learn label encoder. The import sits inside the timed cell.
from sklearn import preprocessing
data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # incorrect answer
# author's recorded timing: 5.47 s - 5.57 s
CPU times: user 5.5 s, sys: 36 ms, total: 5.53 s
Wall time: 5.52 s

Задача - расщепить строковый столбец 'A/B' на два столбца

In [169]:
def make_ab(n_rows):
    """Return a demo frame with an 'A/B' column of strings like '23/60'.

    Two independent sets of ``n_rows`` random integers in [0, 100) are joined
    with a slash; the helper integer columns are dropped, leaving only the
    all-zero 'feature' column and the combined 'A/B' column.
    """
    left = (100 * np.random.rand(n_rows)).astype(int)
    right = (100 * np.random.rand(n_rows)).astype(int)
    frame = pd.DataFrame({'A': left, 'B': right, 'feature': np.zeros(n_rows)})
    frame['A/B'] = frame['A'].astype(str) + '/' + frame['B'].astype(str)
    # Only the combined column is part of the result.
    return frame.drop(columns=['A', 'B'])


# Small sample to eyeball the generated data.
data = make_ab(5)
data
Out[169]:
feature A/B
0 0.0 23/60
1 0.0 65/76
2 0.0 66/53
3 0.0 57/53
4 0.0 85/18
In [171]:
# Four ways to split the 'A/B' strings into two columns.
# v1: split once into lists, then pick each element with apply.
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])

# v2: split into lists and rebuild a frame from the list of lists.
# `n` is passed by keyword: pandas 2.x accepts only `pat` positionally.
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', n=1).tolist())

# v3: let pandas expand the split result into columns directly.
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)

# v4: join the whole column into one string, split it once, reshape to pairs.
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))

data
Out[171]:
feature A/B A_v1 B_v1 A_v2 B_v2 A_v3 B_v3 A_v4 B_v4
0 0.0 23/60 23 60 23 60 23 60 23 60
1 0.0 65/76 65 76 65 76 65 76 65 76
2 0.0 66/53 66 53 66 53 66 53 66 53
3 0.0 57/53 57 53 57 53 57 53 57 53
4 0.0 85/18 85 18 85 18 85 18 85 18
In [172]:
# Regenerate the frame with 10,000,000 rows for the timed cells that follow.
data = make_ab(10000000)
In [177]:
%%time
# v1: split once into lists, then pick each element with apply.
tmp = data['A/B'].str.split('/')
data['A_v1'] = tmp.apply(lambda x: x[0])
data['B_v1'] = tmp.apply(lambda x: x[1])
# author's recorded timing: 12.5 s-13.4 s
CPU times: user 13.1 s, sys: 1.51 s, total: 14.6 s
Wall time: 14.6 s
In [174]:
%%time
# v2: split into lists and rebuild a frame from the list of lists.
# `n` is passed by keyword: pandas 2.x accepts only `pat` positionally.
data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', n=1).tolist())
# author's recorded timing: 10.2 s - 12.2 s
CPU times: user 9.94 s, sys: 176 ms, total: 10.1 s
Wall time: 10.2 s
In [175]:
%%time
# v3: let pandas expand the split result into columns directly.
data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)
# author's recorded timing: 26.1 s - 29.2 s
CPU times: user 26.1 s, sys: 368 ms, total: 26.5 s
Wall time: 26.5 s
In [176]:
%%time
# v4: join the whole column into one string, split it once, reshape to pairs.
st = '/'.join(data['A/B'])
data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))
# author's recorded timing: 3.65 s - 4.54 s
CPU times: user 3.65 s, sys: 168 ms, total: 3.82 s
Wall time: 3.84 s

Задача - заменить пропуски средним по группе (отдельно для train и test)

In [178]:
def make_t(n_rows):
    """Return a demo frame with 'train'/'test' labels and a gappy feature.

    'type' is 'train' when a uniform draw is below 0.5 and 'test' otherwise.
    'feature' holds a random integer in [0, 100) when a second draw is below
    0.5 and NaN otherwise. 'feature_v1' .. 'feature_v4' are copies of
    'feature', one per imputation variant.
    """
    # Draw order (label, keep-mask, values) matches the original expression.
    split = np.where(np.random.rand(n_rows) < 0.5, 'train', 'test')
    keep = np.random.rand(n_rows) < 0.5
    values = (100 * np.random.rand(n_rows)).astype(int)
    frame = pd.DataFrame({'type': split, 'feature': np.where(keep, values, np.nan)})
    for suffix in ('v1', 'v2', 'v3', 'v4'):
        frame['feature_' + suffix] = frame['feature']
    return frame


# Small sample to eyeball the generated data.
data = make_t(10)
data
Out[178]:
type feature feature_v1 feature_v2 feature_v3 feature_v4
0 test NaN NaN NaN NaN NaN
1 test 43.0 43.0 43.0 43.0 43.0
2 test NaN NaN NaN NaN NaN
3 train 4.0 4.0 4.0 4.0 4.0
4 train 18.0 18.0 18.0 18.0 18.0
5 train NaN NaN NaN NaN NaN
6 train NaN NaN NaN NaN NaN
7 train 25.0 25.0 25.0 25.0 25.0
8 train NaN NaN NaN NaN NaN
9 train NaN NaN NaN NaN NaN
In [182]:
# Four ways to replace NaN with the mean of the row's own 'type' group.

# v1: handle each group by hand - select its rows, fill with that group's mean,
# and write the result back through a boolean .loc mask.
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = \
    data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = \
    data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())

# v2: groupby + transform with a Python lambda that fills each group.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))

# v3: compute the per-row group mean with transform('mean') and assign it only
# to the rows selected by the NaN mask.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')

# v4: map each row's type to its group mean and pick it where the value is missing.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])

# Disabled alternative draft left by the author (kept for reference):
#name = 'feature_v4'
#gb = data.groupby('type')
#mn = gb.mean()
#for gn, x in gb:
#    x[name].fillna(mn.loc[gn], inplace=True)

data
Out[182]:
type feature feature_v1 feature_v2 feature_v3 feature_v4
0 test NaN 43.000000 43.000000 43.000000 43.000000
1 test 43.0 43.000000 43.000000 43.000000 43.000000
2 test NaN 43.000000 43.000000 43.000000 43.000000
3 train 4.0 4.000000 4.000000 4.000000 4.000000
4 train 18.0 18.000000 18.000000 18.000000 18.000000
5 train NaN 15.666667 15.666667 15.666667 15.666667
6 train NaN 15.666667 15.666667 15.666667 15.666667
7 train 25.0 25.000000 25.000000 25.000000 25.000000
8 train NaN 15.666667 15.666667 15.666667 15.666667
9 train NaN 15.666667 15.666667 15.666667 15.666667
In [191]:
# Regenerate the frame with 10,000,000 rows for the timed cells that follow.
data = make_t(10000000)
In [195]:
%%time

# v1: handle each group by hand - select its rows, fill with that group's mean,
# and write the result back through a boolean .loc mask.
name = 'feature_v1'
data.loc[data['type'] == 'test', name] = data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())
data.loc[data['type'] == 'train', name] = data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())

# author's recorded timing: 3.44 s - 3.84 s
CPU times: user 3.66 s, sys: 132 ms, total: 3.79 s
Wall time: 3.81 s
In [194]:
%%time
# v2: groupby + transform with a Python lambda that fills each group.
name = 'feature_v2'
data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))
# author's recorded timing: 1.9 s - 2.04 s
CPU times: user 1.9 s, sys: 152 ms, total: 2.05 s
Wall time: 2.06 s
In [193]:
%%time
# v3: compute the per-row group mean with transform('mean') and assign it only
# to the rows selected by the NaN mask.
name = 'feature_v3'
data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')
# author's recorded timing: 1.17 - 1.18 s
CPU times: user 1.2 s, sys: 128 ms, total: 1.32 s
Wall time: 1.35 s
In [192]:
%%time
# v4: map each row's type to its group mean and pick it where the value is missing.
name = 'feature_v4'
data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])
# author's recorded timing: 1.26 s - 1.38
CPU times: user 1.37 s, sys: 72 ms, total: 1.44 s
Wall time: 1.45 s
In [ ]: