#!/usr/bin/env python
# coding: utf-8

# ## Learning More About Pandas by Building and Using a Weighted Average Function

# This notebook is based on the article on [Pbpython.com](http://pbpython.com/weighted-average.html). Please reference it for the background and additional details

# In[1]:


import pandas as pd
import numpy as np


# Read in our sample sales data that includes projected price for our new product launch

# In[2]:


sales = pd.read_excel("https://github.com/chris1610/pbpython/blob/master/data/sales-estimate.xlsx?raw=True", sheetname="projections")
sales


# Show the mean for our current and new product price

# In[3]:


print(sales["Current_Price"].mean())
print(sales["New_Product_Price"].mean())


# Calculate the weighted average using the long form

# In[4]:


print((sales["Current_Price"] * sales["Quantity"]).sum() / sales["Quantity"].sum())
print((sales["New_Product_Price"] * sales["Quantity"]).sum() / sales["Quantity"].sum())


# Use np.average to simplify the formula

# In[5]:


print(np.average(sales["Current_Price"], weights=sales["Quantity"]))
print(np.average(sales["New_Product_Price"], weights=sales["Quantity"]))


# For maximum flexibility, build our own weighted average function

# In[6]:


def wavg(group, avg_name, weight_name):
    """ http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
    In rare instance, we may not have weights, so just return the mean
    """
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return d.mean()


# Call the weighted average on all of the data

# In[7]:


print(wavg(sales, "Current_Price", "Quantity"))
print(wavg(sales, "New_Product_Price", "Quantity"))


# Group the data by manager

# In[8]:


sales.groupby("Manager").apply(wavg, "Current_Price", "Quantity")


# In[9]:


sales.groupby("Manager").apply(wavg, "New_Product_Price", "Quantity")


# You can also group by state

# In[10]:


sales.groupby("State").apply(wavg, "New_Product_Price", "Quantity")


# You can also group by multiple criteria and the function will work correctly.

# In[11]:


sales.groupby(["Manager", "State"]).apply(wavg, "New_Product_Price", "Quantity")


# Example of applying multiple aggregation functions

# In[12]:


f = {'New_Product_Price': ['mean'],'Current_Price': ['median'], 'Quantity': ['sum', 'mean']}
sales.groupby("Manager").agg(f)


# Similar method to group multiple custom functions together into a single DataFrame

# In[13]:


data_1 = sales.groupby("Manager").apply(wavg, "New_Product_Price", "Quantity")
data_2 = sales.groupby("Manager").apply(wavg, "Current_Price", "Quantity")


# In[14]:


summary = pd.DataFrame(data=dict(s1=data_1, s2=data_2))
summary.columns = ["New Product Price","Current Product Price"]
summary.head()


# Finally, numpy has an average function that can be used:

# In[15]:


np.average(sales["Current_Price"], weights=sales["Quantity"])


# Use a lambda function for it to work with grouped data

# In[16]:


sales.groupby("Manager").apply(lambda x: np.average(x['New_Product_Price'], weights=x['Quantity']))


# In[ ]: