This dataset contains transactions of an online retail store in the UK. The file takes a while to load. I downloaded this file as an .xls from the UCI Machine Learning Repository and used Excel to save it as a CSV.
I will largely follow the process described in a Feature Tools tutorial. Note that the tutorial changed the column names from the original file (which I have not done).
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Toggle: True re-runs the (slow) featuretools DFS and writes the feature
# matrices to ./data/processed/; False loads the cached CSVs instead.
create_data = False
Field info for each column is documented on the UCI machine learning dataset website.
# Load the raw transactions; parse InvoiceDate up front so we can filter and
# resample by time below.
df = pd.read_csv('./data/raw/OnlineRetail.csv', parse_dates=["InvoiceDate"])
# Restrict data to 2011 - from tutorial. not sure why they did this.
df = df[df['InvoiceDate'].dt.year == 2011]
# Column names are kept exactly as in the original UCI file.
df.columns
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country'], dtype='object')
# Row count after restricting to 2011.
len(df)
499428
# label cancellation orders (invoice numbers that start with 'C')
df['Cancelled'] = df['InvoiceNo'].str.startswith('C')
# drop the duplicates
df = df.drop_duplicates()
# NOTE(review): this drops rows with a null in ANY column, not only a null
# CustomerID as the original comment claimed. df.dropna(subset=['CustomerID'])
# would match that intent — confirm which behavior is wanted before changing,
# since the recorded row counts below depend on the current call.
df = df.dropna(axis=0)
# Convert to dollars and create a field representing the total spent
df['UnitPrice'] = df['UnitPrice'] * 1.65  # assumed GBP->USD rate — TODO confirm
df['Total'] = df['UnitPrice'] * df['Quantity']
df.describe()
Quantity | UnitPrice | CustomerID | Total | |
---|---|---|---|---|
count | 375250.000000 | 375250.000000 | 375250.000000 | 375250.000000 |
mean | 12.252296 | 5.762031 | 15265.136168 | 33.972397 |
std | 258.180877 | 119.054102 | 1710.927381 | 733.803756 |
min | -80995.000000 | 0.000000 | 12346.000000 | -277974.840000 |
25% | 2.000000 | 2.062500 | 13901.000000 | 7.012500 |
50% | 5.000000 | 3.217500 | 15108.000000 | 19.305000 |
75% | 12.000000 | 6.187500 | 16767.000000 | 32.670000 |
max | 80995.000000 | 64300.500000 | 18287.000000 | 277974.840000 |
# Peek at the cleaned rows.
df.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | Cancelled | Total | |
---|---|---|---|---|---|---|---|---|---|---|
42481 | 539993 | 22386 | JUMBO BAG PINK POLKADOT | 10 | 2011-01-04 10:00:00 | 3.2175 | 13313.0 | United Kingdom | False | 32.175 |
42482 | 539993 | 21499 | BLUE POLKADOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | False | 17.325 |
42483 | 539993 | 21498 | RED RETROSPOT WRAP | 25 | 2011-01-04 10:00:00 | 0.6930 | 13313.0 | United Kingdom | False | 17.325 |
42484 | 539993 | 22379 | RECYCLING BAG RETROSPOT | 5 | 2011-01-04 10:00:00 | 3.4650 | 13313.0 | United Kingdom | False | 17.325 |
42485 | 539993 | 20718 | RED RETROSPOT SHOPPER BAG | 10 | 2011-01-04 10:00:00 | 2.0625 | 13313.0 | United Kingdom | False | 20.625 |
# Check dtypes (CustomerID shows as float64 — presumably because the raw
# column contained NaNs when parsed; verify if it matters downstream).
df.dtypes
InvoiceNo object StockCode object Description object Quantity int64 InvoiceDate datetime64[ns] UnitPrice float64 CustomerID float64 Country object Cancelled bool Total float64 dtype: object
The following code is a little complex. I compute how much each customer spent each month (including months with no transactions).
def monthly_spend(df):
    """Return each customer's total spend per calendar month.

    Resampling at 'MS' (month start) within each customer group means months
    between a customer's first and last transaction with no purchases appear
    with a 0 total.

    Parameters
    ----------
    df : DataFrame with 'CustomerID', 'InvoiceDate' (datetime64) and 'Total'.

    Returns
    -------
    DataFrame with columns ['CustomerID', 'InvoiceDate', 'Total'], one row per
    customer per month, dated at month starts.
    """
    # set_index returns a new frame, so the caller's DataFrame is untouched.
    # (The original assigned df.index in place, clobbering the caller's index.)
    indexed = df.set_index('InvoiceDate')
    monthly_sums = indexed.groupby('CustomerID').resample('MS')['Total'].sum()
    return monthly_sums.reset_index()
def customer_by_month(monthly_data, df, start='2011-01-01', end='2012-01-01'):
    """Index every customer against every month of the observation window.

    Builds the cross product of all month starts in [start, end] with every
    CustomerID seen in ``df``, then fills months with no recorded spend with 0.

    Parameters
    ----------
    monthly_data : output of monthly_spend (CustomerID/InvoiceDate/Total).
    df : transactions frame; only its 'CustomerID' column is used.
    start, end : str or datetime-like bounds of the month range. Defaults keep
        the previously hard-coded 2011 window for backward compatibility.

    Returns
    -------
    DataFrame with ['InvoiceDate', 'CustomerID', 'Total'], one row per
    customer per month in the window.
    """
    labels = monthly_data.set_index(['InvoiceDate', 'CustomerID'])
    midx = pd.MultiIndex.from_product(
        [pd.date_range(start, end, freq='MS'), df['CustomerID'].unique()],
        names=labels.index.names)
    return labels.reindex(midx, fill_value=0).reset_index()
def monthly_min_date(monthly_data, end='2012-01-01'):
    """List, per customer, every month start from their first transaction
    month through ``end`` (inclusive).

    Parameters
    ----------
    monthly_data : DataFrame with 'CustomerID' and 'InvoiceDate' columns.
    end : str or datetime-like last month to generate. Default keeps the
        previously hard-coded end of the 2011 window for backward
        compatibility.

    Returns
    -------
    DataFrame with columns ['CustomerID', 'InvoiceDate'] — one row for each
    month a customer is considered active.
    """
    min_dates = (monthly_data
                 .groupby('CustomerID')['InvoiceDate']
                 .min()
                 .apply(lambda x: pd.date_range(x, end, freq='MS').tolist())
                 .reset_index()
                 )
    # Flatten the per-customer month lists into long (customer, month) rows.
    return pd.DataFrame([(cid, month)
                         for cid, months in zip(min_dates['CustomerID'],
                                                min_dates['InvoiceDate'])
                         for month in months],
                        columns=['CustomerID', 'InvoiceDate'])
# Join the table where each customer has a record for every month with the
# table of months since each customer's first transaction. The inner join
# ensures customers have no records before their first purchase month.
monthly_df = monthly_spend(df)
labels = customer_by_month(monthly_df, df).merge(monthly_min_date(monthly_df),
                                                 on=['CustomerID', 'InvoiceDate'],
                                                 how='inner')
# tutorial starts with march data.
labels = labels[labels['InvoiceDate'] >= '2011-03-01']
labels.head()
InvoiceDate | CustomerID | Total | |
---|---|---|---|
2065 | 2011-03-01 | 13313.0 | 0.000 |
2066 | 2011-03-01 | 18097.0 | 0.000 |
2067 | 2011-03-01 | 16656.0 | 589.248 |
2068 | 2011-03-01 | 16875.0 | 0.000 |
2069 | 2011-03-01 | 13094.0 | 115.434 |
Notice that I have more samples than the Feature Tools tutorial because they accidentally removed all transactions on the first of the month...
# Distribution of the monthly spend labels across all customer-months.
labels.describe()
CustomerID | Total | |
---|---|---|
count | 35339.000000 | 35339.000000 |
mean | 15276.044399 | 318.284416 |
std | 1720.194897 | 1698.736413 |
min | 12346.000000 | -7074.589500 |
25% | 13791.000000 | 0.000000 |
50% | 15247.000000 | 0.000000 |
75% | 16757.000000 | 260.460750 |
max | 18287.000000 | 115906.725000 |
# Sanity check: the full monthly label history for a single customer.
labels.loc[labels['CustomerID'] == 12347]
InvoiceDate | CustomerID | Total | |
---|---|---|---|
2731 | 2011-03-01 | 12347.0 | 0.0000 |
4543 | 2011-04-01 | 12347.0 | 1049.8125 |
6715 | 2011-05-01 | 12347.0 | 0.0000 |
9204 | 2011-06-01 | 12347.0 | 631.1580 |
11956 | 2011-07-01 | 12347.0 | 0.0000 |
14926 | 2011-08-01 | 12347.0 | 965.1015 |
18080 | 2011-09-01 | 12347.0 | 0.0000 |
21547 | 2011-10-01 | 12347.0 | 2135.6280 |
25385 | 2011-11-01 | 12347.0 | 0.0000 |
29582 | 2011-12-01 | 12347.0 | 370.9530 |
33826 | 2012-01-01 | 12347.0 | 0.0000 |
# Verify the December label for customer 12347 against the raw transactions.
df[(df['CustomerID'] == 12347) & (df['InvoiceDate'] >= '2011-12-01')]['Total'].sum()
370.953
The next series of commands loads the data into Feature Tools and describes the dataset's structure (entities and their relationships) to it.
import featuretools as ft
# An EntitySet holds the related tables (entities) that DFS will aggregate over.
es = ft.EntitySet(id="Online Retail Logs")
# Add the entire data table as an entity
es.entity_from_dataframe("purchases",  # name of entity set
                         dataframe=df,  # data
                         index="purchases_index",  # name of new index (created, since df has none)
                         time_index='InvoiceDate',  # time associated with each row
                         variable_types={'Description': ft.variable_types.Text})  # specify variable type
es['purchases']
2018-09-22 09:06:26,324 featuretools.entityset - WARNING index purchases_index not found in dataframe, creating new integer column
Entity: purchases Variables: purchases_index (dtype: index) InvoiceNo (dtype: categorical) StockCode (dtype: categorical) Quantity (dtype: numeric) InvoiceDate (dtype: datetime_time_index) UnitPrice (dtype: numeric) CustomerID (dtype: numeric) Country (dtype: categorical) Cancelled (dtype: boolean) Total (dtype: numeric) Description (dtype: text) Shape: (Rows: 375250, Columns: 11)
# Normalize out a products entity keyed by StockCode; Description belongs to
# the product, so it moves with it.
es.normalize_entity(new_entity_id="products",  # new entity for products.
                    base_entity_id="purchases",  # built from purchases entity
                    index="StockCode",  # Index with StockCode column from purchases entity. (unique key)
                    additional_variables=["Description"])  # bring in this variable
es['products'].df.head()
StockCode | Description | first_purchases_time | |
---|---|---|---|
20682 | 20682 | RED RETROSPOT CHILDRENS UMBRELLA | 2011-01-04 10:00:00 |
20718 | 20718 | RED RETROSPOT SHOPPER BAG | 2011-01-04 10:00:00 |
21498 | 21498 | RED RETROSPOT WRAP | 2011-01-04 10:00:00 |
21499 | 21499 | BLUE POLKADOT WRAP | 2011-01-04 10:00:00 |
22302 | 22302 | COFFEE MUG PEARS DESIGN | 2011-01-04 10:00:00 |
# Customers and orders get their own entities too; Country and Cancelled are
# properties of an order, so they move to the orders entity.
es.normalize_entity(new_entity_id="customers",  # customer entity
                    base_entity_id="purchases",  # from purchases
                    index="CustomerID")  # unique key is CustomerID
es.normalize_entity(new_entity_id="orders",  # order entity
                    base_entity_id="purchases",  # from purchases
                    index="InvoiceNo",  # unique key is InvoiceNo
                    additional_variables=["Country", 'Cancelled'])  # Include these variables.
# Summarize the entity set and its relationships.
es
Entityset: Online Retail Logs Entities: purchases [Rows: 375250, Columns: 8] products [Rows: 3612, Columns: 3] customers [Rows: 4244, Columns: 2] orders [Rows: 20482, Columns: 4] Relationships: purchases.StockCode -> products.StockCode purchases.CustomerID -> customers.CustomerID purchases.InvoiceNo -> orders.InvoiceNo
es['customers'].df.head()
CustomerID | first_purchases_time | |
---|---|---|
13313.0 | 13313.0 | 2011-01-04 10:00:00 |
18097.0 | 18097.0 | 2011-01-04 10:22:00 |
16656.0 | 16656.0 | 2011-01-04 10:23:00 |
13094.0 | 13094.0 | 2011-01-04 10:37:00 |
16875.0 | 16875.0 | 2011-01-04 10:37:00 |
es['orders'].df.head()
InvoiceNo | Country | Cancelled | first_purchases_time | |
---|---|---|---|---|
539993 | 539993 | United Kingdom | False | 2011-01-04 10:00:00 |
540001 | 540001 | United Kingdom | False | 2011-01-04 10:22:00 |
540002 | 540002 | United Kingdom | False | 2011-01-04 10:23:00 |
540003 | 540003 | United Kingdom | False | 2011-01-04 10:37:00 |
540004 | 540004 | United Kingdom | False | 2011-01-04 10:37:00 |
Here, we use featuretools to create new features.
# Run Deep Feature Synthesis at depth 1 (single-level aggregations) at each
# cutoff time in `labels`, or load the cached matrix from disk.
if create_data:
    feature_matrix, _ = ft.dfs(entityset=es,  # entity
                               target_entity='customers',  # what we're trying to predict
                               agg_primitives=['max', 'min', 'mode', 'mean', 'avg_time_between'],  # requested aggregations
                               trans_primitives=['day', 'month', 'hour', 'weekend'],  # requested transformations
                               cutoff_time=labels,  # define time period of predictions
                               verbose=1,  # how much stdout
                               cutoff_time_in_index=True,  # specify that we've given cutoff times
                               chunk_size=50,  # how much data to give each worker
                               n_jobs=-1,  # how many threads to create
                               max_depth=1)  # how many aggregations
    feature_matrix = feature_matrix.reset_index()
    feature_matrix.to_csv('./data/processed/dfs_depth1.csv')
    feature_matrix.head()
else:
    # load the cached matrix and restore dtypes lost in the CSV round-trip
    feature_matrix = pd.read_csv('./data/processed/dfs_depth1.csv')
    feature_matrix['time'] = pd.to_datetime(feature_matrix['time'])
    feature_matrix = feature_matrix.drop('Unnamed: 0', axis=1)
feature_matrix.columns
Index(['CustomerID', 'time', 'MAX(purchases.Quantity)', 'MAX(purchases.UnitPrice)', 'MAX(purchases.Total)', 'MIN(purchases.Quantity)', 'MIN(purchases.UnitPrice)', 'MIN(purchases.Total)', 'MODE(purchases.InvoiceNo)', 'MODE(purchases.StockCode)', 'MEAN(purchases.Quantity)', 'MEAN(purchases.UnitPrice)', 'MEAN(purchases.Total)', 'AVG_TIME_BETWEEN(purchases.InvoiceDate)', 'DAY(first_purchases_time)', 'MONTH(first_purchases_time)', 'HOUR(first_purchases_time)', 'WEEKEND(first_purchases_time)', 'Total'], dtype='object')
# look at a single output: first three feature columns of the first row
feature_matrix.iloc[0, :3]
CustomerID 12346 time 2011-03-01 00:00:00 MAX(purchases.Quantity) 74215 Name: 0, dtype: object
# demonstrate we understand the output: MAX(Quantity) is computed over this
# customer's purchases strictly before the 2011-03-01 cutoff
df[(df['CustomerID'] == 12346) & (df['InvoiceDate'] < '2011-03-01')]['Quantity'].max()
74215
feature_matrix.shape
(35339, 19)
# create categorical response variable: did the customer spend > $500 that month?
feature_matrix['Total'] = feature_matrix['Total'].apply(lambda x: 1 if x > 500 else 0)
import numpy as np
def create_train_test(month, feature_matrix,
                      drop_cols=('CustomerID', 'time', 'month', 'Total')):
    """Basic cleaning and a time-based train/test split.

    Rows whose cutoff month is earlier than ``month`` form the training set;
    the remaining rows form the test set.

    Parameters
    ----------
    month : int calendar month (1-12) used as the split point.
    feature_matrix : DFS output with a datetime 'time' column and a binary
        'Total' label column.
    drop_cols : columns excluded from the returned feature matrices. The
        default is a tuple rather than a list to avoid the shared
        mutable-default-argument pitfall.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    # remove high-cardinality MODE columns we know will not contribute
    fm = feature_matrix.drop(columns=['MODE(purchases.InvoiceNo)',
                                      'MODE(purchases.StockCode)'])
    # dummy code remaining string columns
    fm = pd.get_dummies(fm)
    # fill nans left by aggregates over empty windows
    fm = fm.fillna(0)
    fm['month'] = fm['time'].dt.month
    # compute the split mask once instead of re-deriving dt.month per subset
    is_train = fm['month'] < month
    # Labels
    y_train = np.asarray(fm.loc[is_train, 'Total']).reshape((-1, ))
    y_test = np.asarray(fm.loc[~is_train, 'Total']).reshape((-1, ))
    # Features
    X_train = fm.loc[is_train].drop(columns=list(drop_cols))
    X_test = fm.loc[~is_train].drop(columns=list(drop_cols))
    return (X_train, X_test, y_train, y_test)
# Split at November; print the positive-class rate for each partition.
X_train, X_test, y_train, y_test = create_train_test(11, feature_matrix)
print(np.mean(y_train))
print(np.mean(y_test))
0.183619599970258 0.16076294277929154
from sklearn.linear_model import LogisticRegression
# Baseline classifier; class_weight='balanced' compensates for the ~18%
# positive rate printed above.
model = LogisticRegression(random_state=0, class_weight='balanced')
model.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=0, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
def print_performance(X, y, model):
    """Print precision, recall, F1 and ROC AUC for *model* on (X, y)."""
    predicted = model.predict(X)
    positive_probs = model.predict_proba(X)[:, 1]
    # Threshold-based metrics share the hard predictions.
    threshold_metrics = [
        ('Precision', precision_score(y, predicted)),
        ('Recall', recall_score(y, predicted)),
        ('F1 Score', f1_score(y, predicted)),
    ]
    for label, value in threshold_metrics:
        print('{}: {}'.format(label, round(value, 5)))
    # AUC uses the positive-class probabilities instead.
    print('ROC AUC: {}'.format(round(roc_auc_score(y, positive_probs), 5)))
# Training-set performance (depth-1 features).
print_performance(X_train, y_train, model)
Precision: 0.44233 Recall: 0.43794 F1 Score: 0.44013 ROC AUC: 0.73158
# Held-out performance (depth-1 features).
print_performance(X_test, y_test, model)
Precision: 0.40087 Recall: 0.33972 F1 Score: 0.36777 ROC AUC: 0.69795
Beware that this takes forever!!!
# Depth-2 DFS stacks aggregations on top of transforms/entities (e.g. MODE of
# MONTH(InvoiceDate), MODE of orders.Country) — much slower than depth 1.
if create_data:
    feature_matrix, _ = ft.dfs(entityset=es,
                               target_entity='customers',  # what we're trying to predict
                               agg_primitives=['max', 'min', 'mode', 'mean', 'avg_time_between'],  # requested aggs
                               trans_primitives=['day', 'month', 'hour', 'weekend'],  # requested transformations
                               n_jobs=-1,
                               chunk_size=50,
                               max_depth=2,  # how many aggregations to do
                               cutoff_time=labels,
                               cutoff_time_in_index=True,
                               verbose=1)
    feature_matrix = feature_matrix.reset_index()
    feature_matrix.to_csv('./data/processed/dfs_depth2.csv')
    feature_matrix.head()
else:
    # load the cached depth-2 matrix and restore dtypes lost in the CSV round-trip
    feature_matrix = pd.read_csv('./data/processed/dfs_depth2.csv')
    feature_matrix['time'] = pd.to_datetime(feature_matrix['time'])
    feature_matrix = feature_matrix.drop('Unnamed: 0', axis=1)
feature_matrix.shape
(35339, 23)
feature_matrix.columns
Index(['CustomerID', 'time', 'MAX(purchases.Quantity)', 'MAX(purchases.UnitPrice)', 'MAX(purchases.Total)', 'MIN(purchases.Quantity)', 'MIN(purchases.UnitPrice)', 'MIN(purchases.Total)', 'MODE(purchases.InvoiceNo)', 'MODE(purchases.StockCode)', 'MEAN(purchases.Quantity)', 'MEAN(purchases.UnitPrice)', 'MEAN(purchases.Total)', 'AVG_TIME_BETWEEN(purchases.InvoiceDate)', 'DAY(first_purchases_time)', 'MONTH(first_purchases_time)', 'HOUR(first_purchases_time)', 'WEEKEND(first_purchases_time)', 'MODE(purchases.DAY(InvoiceDate))', 'MODE(purchases.MONTH(InvoiceDate))', 'MODE(purchases.HOUR(InvoiceDate))', 'MODE(purchases.orders.Country)', 'Total'], dtype='object')
# look at a single output: customer, cutoff time, and mode of purchase month
feature_matrix.iloc[7000, [0, 1, 19]]
CustomerID 13481 time 2011-06-01 00:00:00 MODE(purchases.MONTH(InvoiceDate)) 3 Name: 7000, dtype: object
# demonstrate we understand the output: count this customer's transactions per
# month before the cutoff — month 3 has the most, matching the MODE feature
df['month'] = df['InvoiceDate'].dt.month
df[(df['CustomerID'] == 13481) & (df['InvoiceDate'] < '2011-06-01')].groupby('month')['Total'].count()
month 3 43 4 1 5 17 Name: Total, dtype: int64
# create categorical response variable (> $500 => 1) and refit the same
# baseline model on the depth-2 features
feature_matrix['Total'] = feature_matrix['Total'].apply(lambda x: 1 if x > 500 else 0)
X_train, X_test, y_train, y_test = create_train_test(11, feature_matrix)
model = LogisticRegression(random_state=0, class_weight='balanced')
model.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=0, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Training-set performance (depth-2 features).
print_performance(X_train, y_train, model)
Precision: 0.4444 Recall: 0.44503 F1 Score: 0.44471 ROC AUC: 0.72874
# Held-out performance (depth-2 features) — no improvement over depth 1.
print_performance(X_test, y_test, model)
Precision: 0.41033 Recall: 0.32203 F1 Score: 0.36086 ROC AUC: 0.68288