import numpy as np
import pandas as pd
# Load the cleaned, sampled fraud-transactions dataset from the working directory.
df = pd.read_parquet("fraud-cleaned-sample.parquet")
# We're using time-series data, so we'll split based on time.
# Time-based train/test split: with time-series data a random split would
# leak future information into training, so we train on the first 70% of
# the observed time range and test on the remaining 30%.
first = df['timestamp'].min()
last = df['timestamp'].max()
cutoff = first + ((last - first) * 0.7)

train = df[df['timestamp'] <= cutoff]
test = df[df['timestamp'] > cutoff]

# The original notebook displayed these via bare expressions (`len(train)`,
# etc.), which are no-ops in a script — print them explicitly instead.
print("train rows:", len(train))
print("test rows:", len(test))
print("train fraction:", len(train) / (len(train) + len(test)))
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
# NOTE(review): this module-level `stringize` ufunc appears unused — mk_hasher
# below stringizes via `my_func` (= amap) instead; candidate for removal.
stringize = np.frompyfunc(lambda x: "%s" % x, 1, 1)
def mk_stringize(colname):
    """Build a function that wraps each element of an iterable in a
    single-entry dict keyed by *colname* (DictVectorizer-style records)."""
    def wrap_rows(rows):
        return [{colname: row} for row in rows]
    return wrap_rows
def amap(s):
    """Return a copy of the pandas Series *s* with every element
    converted to its ``str`` representation."""
    as_strings = s.map(str)
    return as_strings
# my_func = mk_stringize('merchant_id')
# `amap` (Series -> Series of strings) is the stringizer actually fed to
# FunctionTransformer in mk_hasher; the dict-record mk_stringize variant is
# kept commented out above as an alternative.
my_func = amap
def mk_hasher(features=16384, values=None):
    """Build a two-step sklearn Pipeline: stringize a column, then hash it.

    Parameters
    ----------
    features : int
        Number of hash buckets passed to FeatureHasher as ``n_features``.
    values : unused
        Accepted for call compatibility; not referenced in this body.
    """
    steps = [
        ('stringize', FunctionTransformer(my_func, accept_sparse=True)),
        ('hasher', sklearn.feature_extraction.FeatureHasher(
            n_features=features, input_type='string')),
    ]
    return Pipeline(steps)
# One-hot encode the transaction type over its fixed, known category set;
# with handle_unknown='ignore', unseen values map to an all-zeros vector.
tt_xform = ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore', categories=[['online','contactless','chip_and_pin','manual','swipe']]), ['trans_type'])
# Hash the high-cardinality merchant_id column into 256 features.
mu_xform = ('m_hashing', mk_hasher(256), 'merchant_id')
xform_steps = [tt_xform, mu_xform]
# NOTE(review): `cat_xform` is never used below — `all_xforms` recombines
# `xform_steps` directly; candidate for removal.
cat_xform = ColumnTransformer(transformers=xform_steps, n_jobs=None)
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Numeric feature handling: `interarrival` may contain missing values, so it
# is imputed before scaling; `amount` is robust-scaled directly.
# BUG FIX: the step is named 'median_imputer', but SimpleImputer() defaults
# to strategy='mean' — pass strategy='median' so behavior matches intent.
impute_and_scale = Pipeline([
    ('median_imputer', SimpleImputer(strategy='median')),
    ('interarrival_scaler', RobustScaler()),
])
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])
scale_steps = [ia_scaler, amount_scaler]

# Combine the numeric scalers with the categorical transforms defined above.
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))
# Wrap the combined ColumnTransformer in a single-step Pipeline and fit it on
# the training split only, so encoder/scaler statistics never see test data.
feat_pipeline = Pipeline([
('feature_extraction',all_xforms)
])
feat_pipeline.fit(train)
from mlworkflows import util
# Persist the fitted feature pipeline so downstream model-training/serving
# steps can reuse the exact same feature extraction.
util.serialize_to(feat_pipeline, "feature_pipeline.sav")