This notebook explores implementing a multi-touch attribution model using Markov chains.

Our test case is a small journey graph: every journey begins at a Start state, passes through one or more of the channels C1, C2, and C3, and ends in either a Conversion or an Exit.
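If you don't have the data file on hand, here is a minimal sketch that recreates simple_conversion_data.csv from the three journeys used below (the contents are inferred from the dataframe displayed in the next cell; the original file may differ):

# Recreate the sample journey file. The contents are inferred from the
# dataframe displayed below; the original simple_conversion_data.csv may differ.
with open('simple_conversion_data.csv', 'w') as f:
    f.write('C1,C2,C3,Conversion\n'
            'C1,,,\n'
            'C2,C3,,\n')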
import numpy as np
import pandas as pd
# Load the journey dataset and insert a "Start" column at the beginning, to explicitly mark the initial state.
df = pd.read_csv('simple_conversion_data.csv', header=None)
df.insert(0, 'Start', 'Start')
df
|   | Start | 0 | 1 | 2 | 3 |
|---|---|---|---|---|---|
| 0 | Start | C1 | C2 | C3 | Conversion |
| 1 | Start | C1 | NaN | NaN | NaN |
| 2 | Start | C2 | C3 | NaN | NaN |
# Reshape the journey dataset to get a new, two-column dataset that contains each
# transition. That is, each row in the new dataset will contain: (T, T+1)
transitions = list()
for n in range(len(df.columns) - 1):
    df_transition = pd.DataFrame({
        't': df.iloc[:, n],
        't_plus_1': df.iloc[:, n + 1]
    })
    transitions.append(df_transition)
df_transitions = pd.concat(transitions)
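As an aside, the same reshape fits in a single expression. This sketch (equivalent to the loop above) stacks each adjacent column pair:

# Equivalent one-liner: each adjacent column pair (n, n+1) becomes a
# (t, t_plus_1) frame, and the pairs are stacked into one long table.
df_transitions_alt = pd.concat(
    df.iloc[:, [n, n + 1]].set_axis(['t', 't_plus_1'], axis=1)
    for n in range(df.shape[1] - 1)
)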
# We can drop all rows with NaN values in state T (the starting state).
# These represent journeys that have already been completed.
df_transitions.dropna(subset=['t'], inplace=True)
# Let's replace the NaNs in the T+1 column with a value indicating that the user bailed.
# This will avoid problems later when we do groupby functions on the dataframe.
df_transitions.fillna('Exit', inplace=True)
df_transitions
|   | t | t_plus_1 |
|---|---|---|
| 0 | Start | C1 |
| 1 | Start | C1 |
| 2 | Start | C2 |
| 0 | C1 | C2 |
| 1 | C1 | Exit |
| 2 | C2 | C3 |
| 0 | C2 | C3 |
| 2 | C3 | Exit |
| 0 | C3 | Conversion |
# Create a separate lookup table with row counts for each starting state, t, in the
# transitions table. We'll use this to calculate probabilities later, and it will
# help us avoid the slow-performing df.apply() method.
df_initial_state_counts = df_transitions.groupby(by=['t'], as_index=False).count()
df_initial_state_counts.rename(columns={'t_plus_1':'count_of_t'}, inplace=True)
df_initial_state_counts
|   | t | count_of_t |
|---|---|---|
| 0 | C1 | 2 |
| 1 | C2 | 2 |
| 2 | C3 | 2 |
| 3 | Start | 3 |
# Join the lookup table to the transitions table, to pull in the counts for each starting state, t.
df_transitions = pd.merge(df_transitions, df_initial_state_counts, on='t', how='inner')
df_transitions
|   | t | t_plus_1 | count_of_t |
|---|---|---|---|
| 0 | Start | C1 | 3 |
| 1 | Start | C1 | 3 |
| 2 | Start | C2 | 3 |
| 3 | C1 | C2 | 2 |
| 4 | C1 | Exit | 2 |
| 5 | C2 | C3 | 2 |
| 6 | C2 | C3 | 2 |
| 7 | C3 | Exit | 2 |
| 8 | C3 | Conversion | 2 |
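As an aside, the lookup table and merge can be collapsed into one step with groupby().transform, which broadcasts each group's row count back onto the original rows. A sketch equivalent to the two cells above:

# Merge-free alternative: transform('size') attaches each starting state's
# row count directly to the transitions table.
df_transitions['count_of_t'] = df_transitions.groupby('t')['t'].transform('size')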
# Calculate the individual probability for each transition instance
df_transitions['probability'] = 1/df_transitions['count_of_t']
df_transitions
|   | t | t_plus_1 | count_of_t | probability |
|---|---|---|---|---|
| 0 | Start | C1 | 3 | 0.333333 |
| 1 | Start | C1 | 3 | 0.333333 |
| 2 | Start | C2 | 3 | 0.333333 |
| 3 | C1 | C2 | 2 | 0.500000 |
| 4 | C1 | Exit | 2 | 0.500000 |
| 5 | C2 | C3 | 2 | 0.500000 |
| 6 | C2 | C3 | 2 | 0.500000 |
| 7 | C3 | Exit | 2 | 0.500000 |
| 8 | C3 | Conversion | 2 | 0.500000 |
# Calculate the total probability of transitioning from one state to another
df_transition_prob = df_transitions.groupby(by=['t', 't_plus_1'], as_index=False).sum()
df_transition_prob.drop(['count_of_t'], axis=1, inplace=True) # We don't need this column anymore
df_transition_prob
|   | t | t_plus_1 | probability |
|---|---|---|---|
| 0 | C1 | C2 | 0.500000 |
| 1 | C1 | Exit | 0.500000 |
| 2 | C2 | C3 | 1.000000 |
| 3 | C3 | Conversion | 0.500000 |
| 4 | C3 | Exit | 0.500000 |
| 5 | Start | C1 | 0.666667 |
| 6 | Start | C2 | 0.333333 |
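It can help to see this edge list in the more familiar matrix form. This sketch (not in the original notebook) pivots the table into a conventional transition matrix, with zeros for impossible transitions:

# View the edge list as a transition matrix:
# rows are the current state t, columns are the next state t_plus_1.
df_transition_prob.pivot(index='t', columns='t_plus_1', values='probability').fillna(0)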
# Double-check to make sure the total probability for each starting state, t, equals 1.0 (i.e. 100%)
df_test = df_transition_prob.groupby(by='t', as_index=False)['probability'].sum()
df_test
|   | t | probability |
|---|---|---|
| 0 | C1 | 1.0 |
| 1 | C2 | 1.0 |
| 2 | C3 | 1.0 |
| 3 | Start | 1.0 |
# Compare with np.isclose: the probabilities are floats, so an exact
# equality check could fail on rounding error.
if not np.isclose(df_test['probability'].sum(), len(df_test)):
    print('[ERROR]: The probability calculation test failed. :(')
else:
    print('The probability calculation test passed!!! :)')
The probability calculation test passed!!! :)
def print_node(node, debug=False):
    # Print each node as it's processed. For debugging purposes.
    if not debug:
        return
    if node['t_plus_1'] in ['Exit', 'Conversion']:
        node_type = 'Leaf'
    else:
        node_type = 'Parent'
    print('%s > %s' % (node['t'], node['t_plus_1']))
    print('Type: %s' % node_type)
    print('Prob: %0.2f' % node['probability'])
    print('----------------------------')
calculated_node_probabilities = dict()

def calc_conversion_probability(starting_state, df_transitions, cum_probability, calculated_nodes, debug=True):
    # Calculates the cumulative probability of reaching a conversion, given a starting state.
    # Assumes the transition dataframe represents a Directed Acyclic Graph (DAG).
    # Note: cum_probability is threaded through the recursion for debugging only;
    # it does not affect the result.

    # Get the transition probabilities for the starting state we're evaluating
    df_nodes = df_transitions[df_transitions['t'] == starting_state]

    # Loop through the child states and either take the probability of a
    # leaf node, or recurse to keep following the tree.
    child_node_probabilities = []
    for index, row in df_nodes.iterrows():
        # These are leaf nodes: either an exit or a conversion
        if row['t_plus_1'] == 'Exit':
            print_node(row, debug)
            child_node_probabilities.append(0)
        elif row['t_plus_1'] == 'Conversion':
            print_node(row, debug)
            child_node_probabilities.append(row['probability'])
        # This is a parent node: keep following the rabbit hole
        else:
            # Have we cached the total probability for this node?
            if row['t_plus_1'] in calculated_nodes:
                if debug:
                    print('Cache Hit for %s! Cum probability from child: %0.2f' % (row['t_plus_1'], calculated_nodes[row['t_plus_1']]))
                child_probability = calculated_nodes[row['t_plus_1']]
            # No cached value found. We'll walk through the tree to calculate the value.
            else:
                # Recursive call
                child_probability = calc_conversion_probability(row['t_plus_1'],
                                                                df_transitions,
                                                                cum_probability + row['probability'],
                                                                calculated_nodes,
                                                                debug)
            node_conversion_prob = child_probability * row['probability']
            print_node(row, debug)
            child_node_probabilities.append(node_conversion_prob)
            if debug:
                print('%s > %s' % (row['t'], row['t_plus_1']))
                print('Cum Prob from Child : %0.2f' % child_probability)
                print('Prob to Child Node : %0.2f' % row['probability'])
                print('Node Conv Probability: %0.2f' % node_conversion_prob)
                print('----------------------------')
    total_node_probability = sum(child_node_probabilities)
    if debug:
        print('Node Conversion Probability for %s: %0.2f' % (starting_state, total_node_probability))
        print('----------------------------')
    # Cache the calculated total probability for the node, so we don't have to calculate it again.
    calculated_nodes[starting_state] = total_node_probability
    return total_node_probability
starting_node = 'Start'
print('====== START DEBUG PRINT ======')
total_probability = calc_conversion_probability(starting_node, df_transition_prob, 0, calculated_node_probabilities)
print('====== END DEBUG PRINT ======')
print(' ')
print('Total Conversion Probability from %s: %0.2f' % (starting_node, total_probability))
====== START DEBUG PRINT ======
C3 > Conversion
Type: Leaf
Prob: 0.50
----------------------------
C3 > Exit
Type: Leaf
Prob: 0.50
----------------------------
Node Conversion Probability for C3: 0.50
----------------------------
C2 > C3
Type: Parent
Prob: 1.00
----------------------------
C2 > C3
Cum Prob from Child : 0.50
Prob to Child Node : 1.00
Node Conv Probability: 0.50
----------------------------
Node Conversion Probability for C2: 0.50
----------------------------
C1 > C2
Type: Parent
Prob: 0.50
----------------------------
C1 > C2
Cum Prob from Child : 0.50
Prob to Child Node : 0.50
Node Conv Probability: 0.25
----------------------------
C1 > Exit
Type: Leaf
Prob: 0.50
----------------------------
Node Conversion Probability for C1: 0.25
----------------------------
Start > C1
Type: Parent
Prob: 0.67
----------------------------
Start > C1
Cum Prob from Child : 0.25
Prob to Child Node : 0.67
Node Conv Probability: 0.17
----------------------------
Cache Hit for C2! Cum probability from child: 0.50
Start > C2
Type: Parent
Prob: 0.33
----------------------------
Start > C2
Cum Prob from Child : 0.50
Prob to Child Node : 0.33
Node Conv Probability: 0.17
----------------------------
Node Conversion Probability for Start: 0.33
----------------------------
====== END DEBUG PRINT ======

Total Conversion Probability from Start: 0.33
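As a sanity check, 0.33 is what we get by hand: the two converting paths contribute 2/3 × 1/2 × 1 × 1/2 = 1/6 (via C1) and 1/3 × 1 × 1/2 = 1/6 (via C2), summing to 1/3. The sketch below (not part of the original notebook) cross-checks the recursion against the standard absorbing-Markov-chain result B = (I - Q)^-1 R, where Q holds the transient-to-transient probabilities and R the transient-to-absorbing ones:

# Cross-check via absorbing-chain linear algebra. Assumes df_transition_prob
# as built above; the state names come from our test graph.
transient = ['Start', 'C1', 'C2', 'C3']
absorbing = ['Conversion', 'Exit']

# Pivot the edge list into a dense matrix; missing edges become probability 0.
P = (df_transition_prob
     .pivot(index='t', columns='t_plus_1', values='probability')
     .reindex(index=transient, columns=transient + absorbing)
     .fillna(0))

Q = P[transient].to_numpy()    # transient -> transient
R = P[absorbing].to_numpy()    # transient -> absorbing
B = np.linalg.solve(np.eye(len(transient)) - Q, R)  # absorption probabilities

print('P(Conversion | Start) = %0.2f' % B[transient.index('Start'), 0])  # 0.33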
# Let's look at our transition probability table again. We'll tweak it to calculate each
# channel's removal effect: the relative drop in total conversion probability when that
# channel is removed from the graph.
df_transition_prob
|   | t | t_plus_1 | probability |
|---|---|---|---|
| 0 | C1 | C2 | 0.500000 |
| 1 | C1 | Exit | 0.500000 |
| 2 | C2 | C3 | 1.000000 |
| 3 | C3 | Conversion | 0.500000 |
| 4 | C3 | Exit | 0.500000 |
| 5 | Start | C1 | 0.666667 |
| 6 | Start | C2 | 0.333333 |
unique_channels = df_transition_prob[df_transition_prob['t'] != 'Start']['t'].unique()
removal_effects = dict()
# Remove each channel in turn and calculate the impact
for channel in unique_channels:
    # Remove the channel from the transition probability matrix. Working on a
    # .copy() keeps the .loc assignment below from touching the original frame
    # (and from triggering pandas' SettingWithCopyWarning).
    df_reduced_graph = df_transition_prob[df_transition_prob['t'] != channel].copy()
    df_reduced_graph.loc[df_reduced_graph['t_plus_1'] == channel, 't_plus_1'] = 'Exit'
    # Recalculate the total conversion probability on the reduced graph
    calculated_node_probabilities = dict()
    new_total_probability = calc_conversion_probability('Start',
                                                        df_reduced_graph,
                                                        0,
                                                        calculated_node_probabilities,
                                                        debug=False)
    # Calculate the relative difference in conversion probability
    removal_effect = (total_probability - new_total_probability) / total_probability
    removal_effects[channel] = removal_effect
print('Removal effect by channel:')
for key, value in removal_effects.items():
    print('%s: %0.2f' % (key, value))
Removal effect by channel:
C1: 0.50
C2: 1.00
C3: 1.00
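A common final step in Markov-chain attribution (not shown above) is to normalize the removal effects so they sum to one, turning them into fractional attribution credit per channel. A minimal sketch:

# Normalize removal effects into attribution weights that sum to 1.
total_effect = sum(removal_effects.values())
attribution = {channel: effect / total_effect
               for channel, effect in removal_effects.items()}
for channel, credit in sorted(attribution.items()):
    print('%s: %0.2f' % (channel, credit))
# With the removal effects above (0.50, 1.00, 1.00), this prints
# C1: 0.20, C2: 0.40, C3: 0.40.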