In [1]:

import numpy as np
import pandas as pd
# Location of the labelled transaction data, kept in one place so it is easy to repoint.
DATA_PATH = "fraud.parquet"

# Read the whole dataset into memory; later cells read its 'timestamp' and 'label' columns.
df = pd.read_parquet(DATA_PATH)

Train/test split¶

We're using time-series data, so we'll split based on time.

In [2]:

# Chronological split: rows in the first 70% of the observed time span are
# used for training, the remainder is held out for testing. The cutoff is
# computed on the full dataset, before any down-sampling.
first = df['timestamp'].min()
last = df['timestamp'].max()
cutoff = first + ((last - first) * 0.7)

# Work on a 10% sample to keep iteration fast. The sample is seeded so that
# Restart & Run All reproduces the same rows, and it is bound to a new name so
# that re-running this cell does not keep shrinking `df`.
df_sample = df.sample(frac=0.1, random_state=404)

# Take explicit copies: later cells add a "weights" column to `train`, and
# writing into a slice of another frame raises SettingWithCopyWarning.
train = df_sample[df_sample['timestamp'] <= cutoff].copy()
test = df_sample[df_sample['timestamp'] > cutoff].copy()

In [ ]:

import cloudpickle as cp

# Load the previously saved feature-extraction pipeline.
# The context manager guarantees the file handle is closed even if
# unpickling fails.
# NOTE(review): unpickling executes arbitrary code from the file -- only load
# 'feature_pipeline.sav' from a trusted source.
with open('feature_pipeline.sav', 'rb') as pipeline_file:
    feature_pipeline = cp.load(pipeline_file)

Weighting samples¶

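We're going to weight samples inversely to the frequency of their label: each fraud sample gets a weight equal to the fraction of non-fraud samples in the training set, and every other sample gets a weight equal to the fraction of fraud samples.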

In [9]:

# Fraction of training rows labelled "fraud".
is_fraud = train["label"] == "fraud"
fraud_frequency = is_fraud.mean()

# Each row is weighted by the frequency of the *other* class, so the rarer
# class receives the larger weight. `assign` builds a new frame instead of
# writing into `train` in place, which avoids SettingWithCopyWarning when
# `train` is a slice of another frame and keeps this cell safe to re-run.
train = train.assign(
    weights=np.where(is_fraud, 1 - fraud_frequency, fraud_frequency)
)

WIP from here...¶

In [12]:

from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# Deliberately tiny forest (4 shallow trees) with a fixed seed for this
# work-in-progress section.
# NOTE(review): class_weight="balanced_subsample" is combined with the
# explicit per-row sample_weight passed to fit() below, so the class
# imbalance is compensated for twice -- confirm this is intended.
rfc = RandomForestClassifier(
    n_estimators=4,
    max_depth=3,
    random_state=404,
    class_weight="balanced_subsample",
)

# Fit the feature pipeline on the training rows and vectorise them, then
# train the classifier on those vectors using the precomputed row weights.
svecs = feature_pipeline.fit_transform(train)
rfc.fit(
    svecs,
    train["label"],
    sample_weight=train["weights"],
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-a3b16769fd33> in <module>
      2 lr = LogisticRegression(max_iter=500)
      3 
----> 4 svecs = feature_pipeline.fit_transform(smol_train)
      5 lr.fit(svecs, smol_train["label"], sample_weight=smol_train["weights"])

NameError: name 'feature_pipeline' is not defined

In [ ]:

from sklearn.metrics import classification_report

# Vectorise the held-out rows with the pipeline as fitted on the training
# data. Calling fit_transform here would re-fit the pipeline on the test set,
# so the features would no longer match the ones the classifier was trained on.
# NOTE(review): assumes feature_pipeline exposes transform() like a
# scikit-learn Pipeline -- confirm.
predictions = rfc.predict(feature_pipeline.transform(test))

# Compare against the true labels of the same `test` frame that produced the
# predictions (the previous `smol_test` name is not defined in this notebook).
print(classification_report(test["label"].values, predictions))

In [ ]:

from mlworkflows import plot

# binary_confusion_matrix returns a pair, unpacked here as (frame, chart).
# The first element is bound to its own name rather than `df`, so the dataset
# loaded at the top of the notebook is not overwritten and earlier cells can
# still be re-run.
confusion_df, chart = plot.binary_confusion_matrix(test["label"], predictions)

Train/test split¶

Weighting samples¶

WIP from here...¶

save model here¶