#!/usr/bin/env python
# coding: utf-8

# # Market Basket Analysis
#
# Association analysis of the Instacart grocery orders.
#
# Get pandas and MLxtend imported and read the data:

# In[1]:


# imports
import pandas as pd  # Python data analysis library
import numpy as np   # Python scientific computing
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


# Citation: "The Instacart Online Grocery Shopping Dataset 2017", accessed from https://www.instacart.com/datasets/grocery-shopping-2017

# In[2]:


# import the datasets
trainDf = pd.read_csv("../data/raw/order_products__train.csv")
orderDf = pd.read_csv("../data/raw/orders.csv")
productDf = pd.read_csv("../data/raw/products.csv")


# We merge the prior and train order-product files to get the complete order dataset, using pd.concat() (DataFrame.append() was deprecated and then removed in pandas 2.0).

# In[3]:


priorDf = pd.read_csv("../data/raw/order_products__prior.csv")
trainDf = pd.concat([trainDf, priorDf], ignore_index=True)


# To count each product, we can set the reordered column to 1 so it acts as a per-row indicator:

# In[4]:


trainDf['reordered'] = 1


# In[5]:


productCountDf = trainDf.groupby("product_id", as_index=False)["order_id"].count()


# In[6]:


# Top 100 most frequently purchased products.
# After the groupby, order_id holds the count, so we sort the frame by order_id.
topLev = 100
productCountDf = productCountDf.sort_values("order_id", ascending=False)
topProdFrame = productCountDf.iloc[0:topLev, :]
topProdFrame = topProdFrame.merge(productDf, on="product_id")
productId = topProdFrame.loc[:, ["product_id"]]


# Now we filter the orders down to the rows containing the most frequently purchased products:

# In[7]:


# A single isin() filter keeps all topLev products (a per-product append loop
# over range(0, 99) would silently drop the 100th product, and is much slower).
df = trainDf[trainDf.product_id.isin(productId['product_id'])]


# In[8]:


df.head()


# Now we consolidate the items into one transaction per row, with each product one-hot encoded. Each row represents an order and each column a product_id; if cell (i, j) is 1, then the ith order contains the jth product.

# In[9]:


basket = df.groupby(['order_id', 'product_id'])['reordered'].sum().unstack().reset_index().fillna(0).set_index('order_id')


# In[10]:


# Convert the counts to one-hot encoded values
def encode_units(x):
    if x <= 0:
        return 0
    return 1


# In[11]:


# applymap is deprecated since pandas 2.1; use basket.map(encode_units) there
basket_sets = basket.applymap(encode_units)


# In[12]:


basket_sets.head()


# In[13]:


basket_sets.size


# Now that the data is structured properly, we can generate frequent itemsets that have a support of at least 1%:

# In[14]:


# Build up the frequent itemsets
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)


# In[15]:


frequent_itemsets


# The final step is to generate the rules with their corresponding support, confidence and lift:

# In[16]:


# Create the rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules


# There are quite a few rules with a high lift value, which means those combinations occur more frequently than would be expected from the individual product frequencies alone, and several rules where the confidence is high as well.
#
# We can filter the dataframe using standard pandas code. In this case, look for a large lift (>= 2) and high confidence (>= 0.1):

# In[17]:


rules[(rules['lift'] >= 2) & (rules['confidence'] >= 0.1)]
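
# As a quick sanity check on those metrics, we can recompute support, confidence and lift by hand for a single rule and compare the numbers with the association_rules output. This is a minimal sketch: it just takes the first rule in the frame, but any antecedent/consequent pair from rules would do.

# In[18]:


# 'antecedents'/'consequents' in the mlxtend output are frozensets of product_ids
antecedent = list(rules.iloc[0]['antecedents'])
consequent = list(rules.iloc[0]['consequents'])

n = len(basket_sets)
has_a = basket_sets[antecedent].all(axis=1)  # orders containing every antecedent item
has_c = basket_sets[consequent].all(axis=1)  # orders containing every consequent item

support_a = has_a.sum() / n
support_c = has_c.sum() / n
support_ac = (has_a & has_c).sum() / n       # support of the combined itemset

confidence = support_ac / support_a          # P(consequent | antecedent)
lift = confidence / support_c                # > 1 means the items co-occur more than chance

print(support_ac, confidence, lift)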
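
# The rules are expressed in terms of product_id, which is hard to read. Below is a minimal sketch for translating the itemsets back into product names via productDf; it assumes products.csv carries a product_name column, as the Instacart files do.

# In[19]:


# Map product_id -> product_name once, then apply it to the frozenset columns
idToName = productDf.set_index('product_id')['product_name'].to_dict()

def name_items(itemset):
    # Render a frozenset of product_ids as a comma-separated string of names
    return ', '.join(str(idToName[i]) for i in sorted(itemset))

readable = rules.copy()
readable['antecedents'] = readable['antecedents'].apply(name_items)
readable['consequents'] = readable['consequents'].apply(name_items)
readable.head()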