#!/usr/bin/env python
# coding: utf-8

# # Market Basket Analysis
#
# Association analysis of the Instacart grocery orders.
#
# Get pandas and MLxtend imported and read the data:

# In[1]:


# imports
import pandas as pd  # Python data analysis library
import numpy as np   # Python scientific computing
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# Citation: "The Instacart Online Grocery Shopping Dataset 2017", accessed from https://www.instacart.com/datasets/grocery-shopping-2017

# In[2]:


# import the datasets
trainDf = pd.read_csv("../data/raw/order_products__train.csv")
orderDf = pd.read_csv("../data/raw/orders.csv")
productDf = pd.read_csv("../data/raw/products.csv")


# We merge the prior and train order-product files to get the complete order dataset, using pd.concat() (DataFrame.append() was deprecated and then removed in pandas 2.0).

# In[3]:


priorDf = pd.read_csv("../data/raw/order_products__prior.csv")
trainDf = pd.concat([trainDf, priorDf], ignore_index=True)


# To count each product, we can set the reordered column to 1 so it acts as a per-row indicator:

# In[4]:


trainDf['reordered'] = 1


# In[5]:


productCountDf = trainDf.groupby("product_id", as_index=False)["order_id"].count()


# In[6]:


# Top 100 most frequently purchased products.
# After the groupby, order_id holds the count, so we sort the frame by order_id.
topLev = 100
productCountDf = productCountDf.sort_values("order_id", ascending=False)
topProdFrame = productCountDf.iloc[0:topLev, :]
topProdFrame = topProdFrame.merge(productDf, on="product_id")
productId = topProdFrame.loc[:, ["product_id"]]


# Now we filter the orders down to the rows containing the most frequently purchased products:

# In[7]:


# A single isin() filter keeps all topLev products (a per-product append loop
# over range(0, 99) would silently drop the 100th product, and is much slower).
df = trainDf[trainDf.product_id.isin(productId['product_id'])]


# In[8]:


df.head()


# Now we consolidate the items into one transaction per row, with each product one-hot encoded. Each row represents an order and each column a product_id; if cell (i, j) is 1, then the ith order contains the jth product.

# In[9]:


basket = df.groupby(['order_id', 'product_id'])['reordered'].sum().unstack().reset_index().fillna(0).set_index('order_id')


# In[10]:


# Convert the counts to one-hot encoded values
def encode_units(x):
    if x <= 0:
        return 0
    return 1


# In[11]:


# applymap is deprecated since pandas 2.1; use basket.map(encode_units) there
basket_sets = basket.applymap(encode_units)


# In[12]:


basket_sets.head()


# In[13]:


basket_sets.size


# Now that the data is structured properly, we can generate frequent itemsets that have a support of at least 1%:

# In[14]:


# Build up the frequent itemsets
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)


# In[15]:


frequent_itemsets


# The final step is to generate the rules with their corresponding support, confidence and lift:

# In[16]:


# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules


# There are quite a few rules with a high lift value, which means those combinations occur more frequently than would be expected from the individual product frequencies alone, and several rules where the confidence is high as well.
#
# We can filter the dataframe using standard pandas code. In this case, look for a large lift (>= 2) and high confidence (>= 0.1):

# In[17]:


rules[(rules['lift'] >= 2) & (rules['confidence'] >= 0.1)]
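
# As a quick sanity check on those metrics, we can recompute support, confidence and lift by hand for a single rule and compare the numbers with the association_rules output. This is a minimal sketch: it just takes the first rule in the frame, but any antecedent/consequent pair from rules would do.

# In[18]:


# 'antecedents'/'consequents' in the mlxtend output are frozensets of product_ids
antecedent = list(rules.iloc[0]['antecedents'])
consequent = list(rules.iloc[0]['consequents'])

n = len(basket_sets)
has_a = basket_sets[antecedent].all(axis=1)  # orders containing every antecedent item
has_c = basket_sets[consequent].all(axis=1)  # orders containing every consequent item

support_a = has_a.sum() / n
support_c = has_c.sum() / n
support_ac = (has_a & has_c).sum() / n       # support of the combined itemset

confidence = support_ac / support_a          # P(consequent | antecedent)
lift = confidence / support_c                # > 1 means the items co-occur more than chance

print(support_ac, confidence, lift)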
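
# The rules are expressed in terms of product_id, which is hard to read. Below is a minimal sketch for translating the itemsets back into product names via productDf; it assumes products.csv carries a product_name column, as the Instacart files do.

# In[19]:


# Map product_id -> product_name once, then apply it to the frozenset columns
idToName = productDf.set_index('product_id')['product_name'].to_dict()

def name_items(itemset):
    # Render a frozenset of product_ids as a comma-separated string of names
    return ', '.join(str(idToName[i]) for i in sorted(itemset))

readable = rules.copy()
readable['antecedents'] = readable['antecedents'].apply(name_items)
readable['consequents'] = readable['consequents'].apply(name_items)
readable.head()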