import numpy as np
import pandas as pd
# Load the pre-cleaned sample of the fraud dataset.
# NOTE(review): assumes the frame has at least 'timestamp' and 'label'
# columns — both are used below; verify against the parquet schema.
df = pd.read_parquet("fraud-cleaned-sample.parquet")
# We're using time-series data, so we'll split based on time.
# Time-ordered split: everything up to the 70% point of the observed
# time range becomes training data; the remainder is held out for test.
# The cutoff is computed on the full data before subsampling.
first = df['timestamp'].min()
last = df['timestamp'].max()
cutoff = first + ((last - first) * 0.7)

# Work on a 10% subsample to keep training fast.
# (.sample already returns a new frame, so the extra .copy() the
# original chained here was redundant.)
df = df.sample(frac=0.1)

# Materialize independent copies: later code assigns a 'weights' column
# into `train`, and writing through a view of `df` would raise pandas'
# SettingWithCopyWarning (and can silently fail to stick).
train = df[df['timestamp'] <= cutoff].copy()
test = df[df['timestamp'] > cutoff].copy()
import cloudpickle as cp

# Deserialize the previously fitted feature-extraction pipeline.
# Use a context manager so the file handle is closed deterministically —
# the original `cp.load(open(...))` leaked the handle to the GC.
# NOTE(review): unpickling executes arbitrary code; only load this file
# from a trusted source.
with open('feature_pipeline.sav', 'rb') as pipeline_file:
    feature_pipeline = cp.load(pipeline_file)
# We're going to weight samples by the inverse of the frequency of their label.
# Inverse-frequency class weights: the rare "fraud" class gets the large
# weight (1 - fraud_frequency); the common "legitimate" class gets the
# small one (fraud_frequency).
fraud_frequency = train[train["label"] == "fraud"]["timestamp"].count() / train["timestamp"].count()

# Build the weight column in one vectorized step and rebind `train` to
# the resulting frame. This avoids the original pattern of .loc-writing
# into a slice of `df`, which triggers SettingWithCopyWarning. Series.map
# leaves NaN for any label other than these two, matching the original
# two-mask behavior exactly.
label_weights = {"legitimate": fraud_frequency, "fraud": 1 - fraud_frequency}
train = train.assign(weights=train["label"].map(label_weights))
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier on the pipeline-extracted
# features, using the inverse-frequency sample weights computed above.
lr = LogisticRegression(max_iter=500)
svecs = feature_pipeline.fit_transform(train)
lr.fit(svecs, train["label"], sample_weight=train["weights"])

# BUG FIX: the test set must go through `transform`, not `fit_transform`.
# Re-fitting the pipeline on the test set leaks test-set statistics and
# produces a feature space inconsistent with the one the model was
# trained on, invalidating the evaluation.
predictions = lr.predict(feature_pipeline.transform(test))
print(classification_report(test.label.values, predictions))
from mlworkflows import plot
# Render a confusion matrix for the test-set predictions.
# NOTE(review): this rebinds `df`, clobbering the dataset frame loaded
# earlier — rename the result if the raw data is still needed below.
df, chart = plot.binary_confusion_matrix(test["label"], predictions)
# Bare expressions: in a notebook these display the chart and the
# confusion-matrix frame; in a plain script they are no-ops.
chart
df
from mlworkflows import util
# Persist the trained model so it can be served/evaluated later.
util.serialize_to(lr, "lr.sav")