#!/usr/bin/env python
# coding: utf-8

# # Data Analysis - Instacart

# Before carrying out Market Basket Analysis (MBA) on the given dataset, we first study the dataset and its attributes and summarize it for a better understanding.

# In[2]:


# imports
import pandas as pd               # Python data analysis library
import numpy as np                # Python scientific computing
import matplotlib.pyplot as plt   # for plotting
import seaborn as sns             # Python visualization library
from scipy.optimize import curve_fit
from IPython.display import display, HTML

# plots inline
get_ipython().run_line_magic('matplotlib', 'inline')


# Citation - "The Instacart Online Grocery Shopping Dataset 2017", accessed from https://www.instacart.com/datasets/grocery-shopping-2017

# In[3]:


# import the dataset
trainDf = pd.read_csv("../data/raw/order_products__train.csv")
orderDf = pd.read_csv("../data/raw/orders.csv")
depDf = pd.read_csv("../data/raw/departments.csv")
aisleDf = pd.read_csv("../data/raw/aisles.csv")
productDf = pd.read_csv("../data/raw/products.csv")


# In[4]:


orderDf.shape


# There are 3,421,083 orders, roughly 3.4 million.

# In[5]:


productDf.shape


# There are 49,688 products, roughly 50,000.

# In[27]:


# distribution of the number of orders per customer
sns.set_style('whitegrid')
customerNumOrderFrame = orderDf.groupby("user_id", as_index=False)["order_number"].max()

num_bins = 10
n, bins, patches = plt.hist(customerNumOrderFrame["order_number"], num_bins,
                            density=True, color='blue', alpha=0.5)
mu = customerNumOrderFrame["order_number"].mean()
sigma = customerNumOrderFrame["order_number"].std()


# The histogram is clearly right-skewed, so an exponential function is an appropriate distribution to fit.

# In[29]:


n, bins, patches = plt.hist(customerNumOrderFrame["order_number"], num_bins,
                            density=True, facecolor='yellow', alpha=0.5)
bins = np.delete(bins, 10)  # drop the last bin edge so the x and y arrays have equal length
bins = bins + 5             # shift each edge to the central value of its bar

def exponential_func(x, a, b, c):
    return a * np.exp(-b * x) + c

popt, pcov = curve_fit(exponential_func, bins, n, p0=(1, 1e-6, 1))

xx = np.linspace(8, 100, 30)
yy = exponential_func(xx, *popt)
plt.plot(xx, yy, 'r--')
plt.xlabel("No. of Orders")
plt.ylabel("Density")
plt.title("Number of Orders per Customer Distribution")
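# As a quick sanity check on the exponential fit, we can report the fitted parameters and a simple R^2 score computed against the binned densities (a minimal sketch; R^2 over only ten bins is a rough indicator, not a formal goodness-of-fit test).

# In[ ]:


# goodness of fit: R^2 of the fitted exponential curve against the bin densities
residuals = n - exponential_func(bins, *popt)
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((n - np.mean(n)) ** 2)
r_squared = 1 - ss_res / ss_tot
print("Fitted parameters (a, b, c):", popt)
print("R^2 of the exponential fit:", r_squared)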
# Now, to find the most frequently purchased products, we combine the prior and train order-product sets into one complete order dataset using pd.concat().

# In[7]:


priorDf = pd.read_csv("../data/raw/order_products__prior.csv")
trainDf = pd.concat([trainDf, priorDf], ignore_index=True)

# a product count data frame can now be created by counting order_id for each product_id
productCountDf = trainDf.groupby("product_id", as_index=False)["order_id"].count()


# In[8]:


productCountDf.shape


# In[17]:


# top 20 most frequently purchased products
topLev = 20

# here the order_id column holds the count, so we sort the data frame by order_id
productCountDf = productCountDf.sort_values("order_id", ascending=False)
topProdFrame = productCountDf.iloc[0:topLev, :]
topProdFrame = topProdFrame.merge(productDf, on="product_id")
display(topProdFrame.loc[:, ["product_name"]])


# A density plot of the product counts gives an idea of the extent to which we can smooth the dataset.

# In[10]:


# calculate the density (each product's share of all purchases)
productCountDf["density"] = productCountDf["order_id"] / np.sum(productCountDf["order_id"])

# calculate the rank (0 = most purchased, since the frame is sorted)
productCountDf["rank"] = range(productCountDf.shape[0])

plt.plot(productCountDf["rank"], productCountDf["density"])
plt.title("Density Plot for Product Counts")
plt.xlabel("Rank")
plt.ylabel("Density")


# It is hard to interpret much from this graph because the ranks span such a wide range (0-50000), so we switch to a logarithmic scale for better visualization.

# In[11]:


# calculate log(rank); we add 1 to avoid log(0)
productCountDf["logRank"] = np.log(productCountDf["rank"] + 1)
plt.title("Density Plot for Product Counts")
plt.xlabel(r"$\log(Rank)$")
plt.ylabel("Density")
plt.plot(productCountDf["logRank"], productCountDf["density"])


# The distribution is very steep, so we can smooth the sparse region of it. Roughly the top e^6 ≈ 403 products account for most of the distribution; products with log-rank beyond ~6 (the remaining ~49,000 products) are insignificant, since their individual densities are very low.

# Orders by day of the week:

# In[33]:


# orders.csv has one row per order_id, so counting order_dow values
# gives the number of orders placed on each day of the week
grouped = orderDf["order_dow"].value_counts()
sns.barplot(x=grouped.index, y=grouped.values)
plt.ylabel('Number of orders', fontsize=13)
plt.xlabel('Day of the week', fontsize=13)
plt.show()


# Number of unique customers in the whole dataset:

# In[36]:


orderDf["user_id"].nunique()
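# To make the earlier "top e^6 ≈ 403 products" claim concrete, we can sum the density column over the first 403 rows of the sorted product-count frame (a minimal sketch; the cutoff of 403 is just the e^6 estimate read off the log-rank plot above).

# In[ ]:


# cumulative share of all purchases covered by the ~400 most popular products
topShare = productCountDf["density"].iloc[:403].sum()
print("Share of purchases covered by the top 403 products:", topShare)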