#!/usr/bin/env python # coding: utf-8 # ## Learning More About Pandas by Building and Using a Weighted Average Function # This notebook is based on the article on [Pbpython.com](http://pbpython.com/weighted-average.html). Please reference it for the background and additional details # In[1]: import pandas as pd import numpy as np # Read in our sample sales data that includes projected price for our new product launch # In[2]: sales = pd.read_excel("https://github.com/chris1610/pbpython/blob/master/data/sales-estimate.xlsx?raw=True", sheetname="projections") sales # Show the mean for our current and new product price # In[3]: print(sales["Current_Price"].mean()) print(sales["New_Product_Price"].mean()) # Calculate the weighted average using the long form # In[4]: print((sales["Current_Price"] * sales["Quantity"]).sum() / sales["Quantity"].sum()) print((sales["New_Product_Price"] * sales["Quantity"]).sum() / sales["Quantity"].sum()) # Use np.average to simplify the formula # In[5]: print(np.average(sales["Current_Price"], weights=sales["Quantity"])) print(np.average(sales["New_Product_Price"], weights=sales["Quantity"])) # For maximum flexibility, build our own weighted average function # In[6]: def wavg(group, avg_name, weight_name): """ http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns In rare instance, we may not have weights, so just return the mean """ d = group[avg_name] w = group[weight_name] try: return (d * w).sum() / w.sum() except ZeroDivisionError: return d.mean() # Call the weighted average on all of the data # In[7]: print(wavg(sales, "Current_Price", "Quantity")) print(wavg(sales, "New_Product_Price", "Quantity")) # Group the data by manager # In[8]: sales.groupby("Manager").apply(wavg, "Current_Price", "Quantity") # In[9]: sales.groupby("Manager").apply(wavg, "New_Product_Price", "Quantity") # You can also group by state # In[10]: sales.groupby("State").apply(wavg, "New_Product_Price", "Quantity") # You can also group by multiple criteria and the function will work correctly. # In[11]: sales.groupby(["Manager", "State"]).apply(wavg, "New_Product_Price", "Quantity") # Example of applying multiple aggregation functions # In[12]: f = {'New_Product_Price': ['mean'],'Current_Price': ['median'], 'Quantity': ['sum', 'mean']} sales.groupby("Manager").agg(f) # Similar method to group multiple custom functions together into a single DataFrame # In[13]: data_1 = sales.groupby("Manager").apply(wavg, "New_Product_Price", "Quantity") data_2 = sales.groupby("Manager").apply(wavg, "Current_Price", "Quantity") # In[14]: summary = pd.DataFrame(data=dict(s1=data_1, s2=data_2)) summary.columns = ["New Product Price","Current Product Price"] summary.head() # Finally, numpy has an average function that can be used: # In[15]: np.average(sales["Current_Price"], weights=sales["Quantity"]) # Use a lambda function for it to work with grouped data # In[16]: sales.groupby("Manager").apply(lambda x: np.average(x['New_Product_Price'], weights=x['Quantity'])) # In[ ]: