Companion notebook to http://pbpython.com/market-basket-analysis.html
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')
df.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
# Clean up spaces in description and remove any rows that don't have a valid invoice
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]
basket = (df[df['Country'] =="France"]
.groupby(['InvoiceNo', 'Description'])['Quantity']
.sum().unstack().reset_index().fillna(0)
.set_index('InvoiceNo'))
basket.head()
Description | 10 COLOUR SPACEBOY PEN | 12 COLOURED PARTY BALLOONS | 12 EGG HOUSE PAINTED WOOD | 12 MESSAGE CARDS WITH ENVELOPES | 12 PENCIL SMALL TUBE WOODLAND | 12 PENCILS SMALL TUBE RED RETROSPOT | 12 PENCILS SMALL TUBE SKULL | 12 PENCILS TALL TUBE POSY | 12 PENCILS TALL TUBE RED RETROSPOT | 12 PENCILS TALL TUBE WOODLAND | ... | WRAP VINTAGE PETALS DESIGN | YELLOW COAT RACK PARIS FASHION | YELLOW GIANT GARDEN THERMOMETER | YELLOW SHARK HELICOPTER | ZINC STAR T-LIGHT HOLDER | ZINC FOLKART SLEIGH BELLS | ZINC HERB GARDEN CONTAINER | ZINC METAL HEART DECORATION | ZINC T-LIGHT HOLDER STAR LARGE | ZINC T-LIGHT HOLDER STARS SMALL |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
InvoiceNo | |||||||||||||||||||||
536370 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
536852 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
536974 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
537065 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
537463 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 1563 columns
# Show a subset of columns
basket.iloc[:,[0,1,2,3,4,5,6, 7]].head()
Description | 10 COLOUR SPACEBOY PEN | 12 COLOURED PARTY BALLOONS | 12 EGG HOUSE PAINTED WOOD | 12 MESSAGE CARDS WITH ENVELOPES | 12 PENCIL SMALL TUBE WOODLAND | 12 PENCILS SMALL TUBE RED RETROSPOT | 12 PENCILS SMALL TUBE SKULL | 12 PENCILS TALL TUBE POSY |
---|---|---|---|---|---|---|---|---|
InvoiceNo | ||||||||
536370 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
536852 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
536974 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
537065 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
537463 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# Convert the units to 1 hot encoded values
def encode_units(x):
if x <= 0:
return 0
if x >= 1:
return 1
basket_sets = basket.applymap(encode_units)
# No need to track postage
basket_sets.drop('POSTAGE', inplace=True, axis=1)
basket_sets.head()
Description | 10 COLOUR SPACEBOY PEN | 12 COLOURED PARTY BALLOONS | 12 EGG HOUSE PAINTED WOOD | 12 MESSAGE CARDS WITH ENVELOPES | 12 PENCIL SMALL TUBE WOODLAND | 12 PENCILS SMALL TUBE RED RETROSPOT | 12 PENCILS SMALL TUBE SKULL | 12 PENCILS TALL TUBE POSY | 12 PENCILS TALL TUBE RED RETROSPOT | 12 PENCILS TALL TUBE WOODLAND | ... | WRAP VINTAGE PETALS DESIGN | YELLOW COAT RACK PARIS FASHION | YELLOW GIANT GARDEN THERMOMETER | YELLOW SHARK HELICOPTER | ZINC STAR T-LIGHT HOLDER | ZINC FOLKART SLEIGH BELLS | ZINC HERB GARDEN CONTAINER | ZINC METAL HEART DECORATION | ZINC T-LIGHT HOLDER STAR LARGE | ZINC T-LIGHT HOLDER STARS SMALL |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
InvoiceNo | |||||||||||||||||||||
536370 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
536852 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
536974 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
537065 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
537463 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 1562 columns
# Build up the frequent items
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
frequent_itemsets.head()
support | itemsets | |
---|---|---|
0 | 0.071429 | [4 TRADITIONAL SPINNING TOPS] |
1 | 0.096939 | [ALARM CLOCK BAKELIKE GREEN] |
2 | 0.102041 | [ALARM CLOCK BAKELIKE PINK] |
3 | 0.094388 | [ALARM CLOCK BAKELIKE RED] |
4 | 0.081633 | [BAKING SET 9 PIECE RETROSPOT] |
# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules
antecedants | consequents | support | confidence | lift | |
---|---|---|---|---|---|
0 | (PLASTERS IN TIN CIRCUS PARADE) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.168367 | 0.606061 | 3.545907 |
1 | (PLASTERS IN TIN WOODLAND ANIMALS) | (PLASTERS IN TIN CIRCUS PARADE) | 0.170918 | 0.597015 | 3.545907 |
2 | (ALARM CLOCK BAKELIKE PINK) | (ALARM CLOCK BAKELIKE GREEN) | 0.102041 | 0.725000 | 7.478947 |
3 | (ALARM CLOCK BAKELIKE GREEN) | (ALARM CLOCK BAKELIKE PINK) | 0.096939 | 0.763158 | 7.478947 |
4 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/6 RED SPOTTY PAPER CUPS) | 0.127551 | 0.960000 | 6.968889 |
5 | (SET/6 RED SPOTTY PAPER CUPS) | (SET/6 RED SPOTTY PAPER PLATES) | 0.137755 | 0.888889 | 6.968889 |
6 | (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... | (SET/6 RED SPOTTY PAPER PLATES) | 0.102041 | 0.975000 | 7.644000 |
7 | (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... | (SET/20 RED RETROSPOT PAPER NAPKINS) | 0.122449 | 0.812500 | 6.125000 |
8 | (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... | (SET/6 RED SPOTTY PAPER CUPS) | 0.102041 | 0.975000 | 7.077778 |
9 | (SET/6 RED SPOTTY PAPER CUPS) | (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... | 0.137755 | 0.722222 | 7.077778 |
10 | (SET/20 RED RETROSPOT PAPER NAPKINS) | (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... | 0.132653 | 0.750000 | 6.125000 |
11 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... | 0.127551 | 0.780000 | 7.644000 |
12 | (PLASTERS IN TIN CIRCUS PARADE) | (PLASTERS IN TIN SPACEBOY) | 0.168367 | 0.530303 | 3.849607 |
13 | (PLASTERS IN TIN SPACEBOY) | (PLASTERS IN TIN CIRCUS PARADE) | 0.137755 | 0.648148 | 3.849607 |
14 | (PLASTERS IN TIN WOODLAND ANIMALS) | (PLASTERS IN TIN SPACEBOY) | 0.170918 | 0.611940 | 4.442233 |
15 | (PLASTERS IN TIN SPACEBOY) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.137755 | 0.759259 | 4.442233 |
16 | (ALARM CLOCK BAKELIKE PINK) | (ALARM CLOCK BAKELIKE RED) | 0.102041 | 0.725000 | 7.681081 |
17 | (ALARM CLOCK BAKELIKE RED) | (ALARM CLOCK BAKELIKE PINK) | 0.094388 | 0.783784 | 7.681081 |
18 | (ALARM CLOCK BAKELIKE GREEN) | (ALARM CLOCK BAKELIKE RED) | 0.096939 | 0.815789 | 8.642959 |
19 | (ALARM CLOCK BAKELIKE RED) | (ALARM CLOCK BAKELIKE GREEN) | 0.094388 | 0.837838 | 8.642959 |
20 | (SET/6 RED SPOTTY PAPER CUPS) | (SET/20 RED RETROSPOT PAPER NAPKINS) | 0.137755 | 0.740741 | 5.584046 |
21 | (SET/20 RED RETROSPOT PAPER NAPKINS) | (SET/6 RED SPOTTY PAPER CUPS) | 0.132653 | 0.769231 | 5.584046 |
22 | (SPACEBOY LUNCH BOX) | (DOLLY GIRL LUNCH BOX) | 0.125000 | 0.571429 | 5.743590 |
23 | (DOLLY GIRL LUNCH BOX) | (SPACEBOY LUNCH BOX) | 0.099490 | 0.717949 | 5.743590 |
24 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/20 RED RETROSPOT PAPER NAPKINS) | 0.127551 | 0.800000 | 6.030769 |
25 | (SET/20 RED RETROSPOT PAPER NAPKINS) | (SET/6 RED SPOTTY PAPER PLATES) | 0.132653 | 0.769231 | 6.030769 |
rules[ (rules['lift'] >= 6) &
(rules['confidence'] >= 0.8) ]
antecedants | consequents | support | confidence | lift | |
---|---|---|---|---|---|
4 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/6 RED SPOTTY PAPER CUPS) | 0.127551 | 0.960000 | 6.968889 |
5 | (SET/6 RED SPOTTY PAPER CUPS) | (SET/6 RED SPOTTY PAPER PLATES) | 0.137755 | 0.888889 | 6.968889 |
6 | (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO... | (SET/6 RED SPOTTY PAPER PLATES) | 0.102041 | 0.975000 | 7.644000 |
7 | (SET/6 RED SPOTTY PAPER PLATES, SET/6 RED SPOT... | (SET/20 RED RETROSPOT PAPER NAPKINS) | 0.122449 | 0.812500 | 6.125000 |
8 | (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET... | (SET/6 RED SPOTTY PAPER CUPS) | 0.102041 | 0.975000 | 7.077778 |
18 | (ALARM CLOCK BAKELIKE GREEN) | (ALARM CLOCK BAKELIKE RED) | 0.096939 | 0.815789 | 8.642959 |
19 | (ALARM CLOCK BAKELIKE RED) | (ALARM CLOCK BAKELIKE GREEN) | 0.094388 | 0.837838 | 8.642959 |
24 | (SET/6 RED SPOTTY PAPER PLATES) | (SET/20 RED RETROSPOT PAPER NAPKINS) | 0.127551 | 0.800000 | 6.030769 |
basket['ALARM CLOCK BAKELIKE GREEN'].sum()
340.0
basket['ALARM CLOCK BAKELIKE RED'].sum()
316.0
basket2 = (df[df['Country'] =="Germany"]
.groupby(['InvoiceNo', 'Description'])['Quantity']
.sum().unstack().reset_index().fillna(0)
.set_index('InvoiceNo'))
basket_sets2 = basket2.applymap(encode_units)
basket_sets2.drop('POSTAGE', inplace=True, axis=1)
frequent_itemsets2 = apriori(basket_sets2, min_support=0.05, use_colnames=True)
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)
rules2
antecedants | consequents | support | confidence | lift | |
---|---|---|---|---|---|
0 | (WOODLAND CHARLOTTE BAG) | (ROUND SNACK BOXES SET OF4 WOODLAND) | 0.126915 | 0.500000 | 2.040179 |
1 | (ROUND SNACK BOXES SET OF4 WOODLAND) | (WOODLAND CHARLOTTE BAG) | 0.245077 | 0.258929 | 2.040179 |
2 | (PLASTERS IN TIN CIRCUS PARADE) | (ROUND SNACK BOXES SET OF4 WOODLAND) | 0.115974 | 0.490566 | 2.001685 |
3 | (ROUND SNACK BOXES SET OF4 WOODLAND) | (PLASTERS IN TIN CIRCUS PARADE) | 0.245077 | 0.232143 | 2.001685 |
4 | (PLASTERS IN TIN WOODLAND ANIMALS) | (PLASTERS IN TIN SPACEBOY) | 0.137856 | 0.444444 | 4.145125 |
5 | (PLASTERS IN TIN SPACEBOY) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.107221 | 0.571429 | 4.145125 |
6 | (WOODLAND CHARLOTTE BAG) | (RED RETROSPOT CHARLOTTE BAG) | 0.126915 | 0.465517 | 6.648168 |
7 | (RED RETROSPOT CHARLOTTE BAG) | (WOODLAND CHARLOTTE BAG) | 0.070022 | 0.843750 | 6.648168 |
8 | (ROUND SNACK BOXES SET OF4 WOODLAND) | (ROUND SNACK BOXES SET OF 4 FRUITS) | 0.245077 | 0.535714 | 3.400298 |
9 | (ROUND SNACK BOXES SET OF 4 FRUITS) | (ROUND SNACK BOXES SET OF4 WOODLAND) | 0.157549 | 0.833333 | 3.400298 |
10 | (PLASTERS IN TIN WOODLAND ANIMALS) | (ROUND SNACK BOXES SET OF4 WOODLAND) | 0.137856 | 0.539683 | 2.202098 |
11 | (ROUND SNACK BOXES SET OF4 WOODLAND) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.245077 | 0.303571 | 2.202098 |
12 | (PLASTERS IN TIN CIRCUS PARADE) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.115974 | 0.584906 | 4.242887 |
13 | (PLASTERS IN TIN WOODLAND ANIMALS) | (PLASTERS IN TIN CIRCUS PARADE) | 0.137856 | 0.492063 | 4.242887 |
14 | (PLASTERS IN TIN CIRCUS PARADE) | (ROUND SNACK BOXES SET OF 4 FRUITS) | 0.115974 | 0.433962 | 2.754455 |
15 | (ROUND SNACK BOXES SET OF 4 FRUITS) | (PLASTERS IN TIN CIRCUS PARADE) | 0.157549 | 0.319444 | 2.754455 |
16 | (SPACEBOY LUNCH BOX) | (ROUND SNACK BOXES SET OF4 WOODLAND) | 0.102845 | 0.680851 | 2.778116 |
17 | (ROUND SNACK BOXES SET OF4 WOODLAND) | (SPACEBOY LUNCH BOX) | 0.245077 | 0.285714 | 2.778116 |
rules2[ (rules2['lift'] >= 4) &
(rules2['confidence'] >= 0.5) ]
antecedants | consequents | support | confidence | lift | |
---|---|---|---|---|---|
5 | (PLASTERS IN TIN SPACEBOY) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.107221 | 0.571429 | 4.145125 |
7 | (RED RETROSPOT CHARLOTTE BAG) | (WOODLAND CHARLOTTE BAG) | 0.070022 | 0.843750 | 6.648168 |
12 | (PLASTERS IN TIN CIRCUS PARADE) | (PLASTERS IN TIN WOODLAND ANIMALS) | 0.115974 | 0.584906 | 4.242887 |