import pandas as pd
import numpy as np
We'll start by reading in our fraud dataset and looking at the column names:
# Load the fraud dataset from a local Parquet file into `df`; the cells that
# follow all read from this frame.
df = pd.read_parquet("fraud-cleaned-sample.parquet")
# Show the available column names.
df.columns
# Count the rows for every (label, trans_type) pair.  `timestamp` is carried
# along only so that `len` has a value column to count; it becomes 'count'.
pt = pd.pivot_table(df[["label", "trans_type", "timestamp"]],
                    index=["label", "trans_type"], aggfunc=len)
pt.columns = ['count']
# Flatten the (label, trans_type) index into ordinary columns.
gdf = pd.DataFrame(pt.to_records())
# Per-label total, broadcast back onto each row.  The string alias "sum" is
# used instead of `np.sum`: passing NumPy callables to groupby transform is
# deprecated in recent pandas and emits a FutureWarning; the result is the same.
gdf['total'] = gdf.groupby('label')['count'].transform("sum")
# Share of each transaction type within its label.
gdf['percentage'] = gdf['count'] / gdf['total']
gdf
import altair as alt
# Bar chart of the per-label share of each transaction type: one facet column
# per `trans_type`, one bar per label, y axis formatted as whole percentages.
alt.Chart(gdf).mark_bar().encode(
    x="label",
    y=alt.Y('percentage:Q', axis=alt.Axis(format='.0%')),
    color='label',
    column='trans_type',
)
# Count the rows for every (label, foreign) pair.  `timestamp` is carried
# along only so that `len` has a value column to count; it becomes 'count'.
pt = pd.pivot_table(df[["label", "foreign", "timestamp"]],
                    index=["label", "foreign"], aggfunc=len)
pt.columns = ['count']
# Flatten the (label, foreign) index into ordinary columns.
gdf = pd.DataFrame(pt.to_records())
# Per-label total, broadcast back onto each row.  The string alias "sum" is
# used instead of `np.sum`: passing NumPy callables to groupby transform is
# deprecated in recent pandas and emits a FutureWarning; the result is the same.
gdf['total'] = gdf.groupby('label')['count'].transform("sum")
# Share of each `foreign` value within its label.
gdf['pctage'] = gdf['count'] / gdf['total']
gdf
# Bar chart of the per-label share of each `foreign` value: one facet column
# per `foreign` value, one bar per label, y axis formatted as whole percentages.
alt.Chart(gdf).mark_bar().encode(
    x="label",
    y=alt.Y('pctage:Q', axis=alt.Axis(format='.0%')),
    color='label',
    column='foreign',
)
%%time
qs = df[['label','amount']].groupby('label').quantile(q=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
qs
# Flatten the per-label quantile table into ordinary columns.  The quantile
# level of the index has no name, and the name pandas auto-generates for it
# when flattening ("level_0" vs "level_1") differs between pandas versions,
# so name it explicitly and let the chart refer to a stable column.
qdf = pd.DataFrame(qs.rename_axis(index=["label", "quantile"]).to_records())
# One line per label: amount (log scale) against the quantile level.
alt.Chart(qdf).mark_line(interpolate="monotone").encode(
    alt.Y("amount", axis=alt.Axis(title='transaction amounts (log scale)'), scale=alt.Scale(type='log')),
    alt.X("quantile", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
)
# Take every fraud row, plus a random sample of legitimate rows of the same
# size, so that both groups contribute equally many transactions.
fraudsamp = df.loc[df["label"] == "fraud"].copy()
legitsamp = df.loc[df["label"] == "legitimate"].sample(n=len(fraudsamp)).copy()

# Dense percentile rank of the interarrival time, computed within each sample.
for samp in (fraudsamp, legitsamp):
    samp['irank'] = samp['interarrival'].rank(pct=True, method="dense")

# One row per distinct (label, interarrival, irank) combination, with the
# number of occurrences as the value; then flatten the index into columns.
qdf = pd.concat([
    samp.groupby(['label', 'interarrival', 'irank']).size()
    for samp in (fraudsamp, legitsamp)
])
qdf = pd.DataFrame(qdf.to_frame().to_records())
# Keep strictly positive interarrival times only (the chart that follows
# plots this column on a log scale).
qdf = qdf[qdf['interarrival'] > 0]
# Plot at most 5000 randomly chosen rows of `qdf`.  The sample size is capped
# at len(qdf) because DataFrame.sample raises ValueError when asked for more
# rows than exist (sampling without replacement).
# One line per label: interarrival time (log scale) against its percentile rank.
alt.Chart(qdf.sample(min(5000, len(qdf)))).mark_line().interactive().encode(
    alt.Y("interarrival", axis=alt.Axis(title='interarrival time'), scale=alt.Scale(type='log')),
    alt.X("irank", axis=alt.Axis(title='cumulative distribution'), scale=alt.Scale(type='linear')),
    color="label"
)