https://stackoverflow.com/questions/33569265/retain-function-in-python
import numpy as np
import pandas as pd
aapl = pd.read_csv('AAPL_exmaple.csv', parse_dates=['Date'])
aapl['Total Shares'] = aapl['Quantity'].cumsum()
aapl['Cost'] = aapl['Quantity']*aapl['Price']
aapl
Date | Quantity | Price | Total Shares | Cost | |
---|---|---|---|---|---|
0 | 2017-01-10 | 1000 | 117.2249 | 1000 | 117224.90 |
1 | 2017-02-10 | -500 | 130.5928 | 500 | -65296.40 |
2 | 2017-03-10 | 1500 | 137.5316 | 2000 | 206297.40 |
3 | 2017-04-10 | -2000 | 141.5150 | 0 | -283030.00 |
4 | 2017-05-10 | 500 | 151.4884 | 500 | 75744.20 |
5 | 2017-06-09 | 500 | 147.8657 | 1000 | 73932.85 |
6 | 2017-07-10 | 500 | 143.9750 | 1500 | 71987.50 |
7 | 2017-08-10 | -1000 | 154.7636 | 500 | -154763.60 |
8 | 2017-09-11 | -200 | 160.9215 | 300 | -32184.30 |
9 | 2017-10-10 | 1000 | 155.3416 | 1300 | 155341.60 |
def get_ave_cost(df):
for index, row in df.iterrows():
if index == 0:
df.loc[index,'Ave Cost'] = row['Price']
elif row['Total Shares'] == 0:
df.loc[index,'Ave Cost'] = 0.0
else:
if row['Quantity'] > 0:
df.loc[index,'Ave Cost'] = ((df.loc[index - 1,'Ave Cost'] * df.loc[index - 1,'Total Shares']) \
+ row['Cost'])/row['Total Shares']
else:
df.loc[index,'Ave Cost'] = df.loc[index - 1,'Ave Cost']
return df
aapl_ave_cost = get_ave_cost(aapl)
aapl_ave_cost
Date | Quantity | Price | Total Shares | Cost | Ave Cost | |
---|---|---|---|---|---|---|
0 | 2017-01-10 | 1000 | 117.2249 | 1000 | 117224.90 | 117.224900 |
1 | 2017-02-10 | -500 | 130.5928 | 500 | -65296.40 | 117.224900 |
2 | 2017-03-10 | 1500 | 137.5316 | 2000 | 206297.40 | 132.454925 |
3 | 2017-04-10 | -2000 | 141.5150 | 0 | -283030.00 | 0.000000 |
4 | 2017-05-10 | 500 | 151.4884 | 500 | 75744.20 | 151.488400 |
5 | 2017-06-09 | 500 | 147.8657 | 1000 | 73932.85 | 149.677050 |
6 | 2017-07-10 | 500 | 143.9750 | 1500 | 71987.50 | 147.776367 |
7 | 2017-08-10 | -1000 | 154.7636 | 500 | -154763.60 | 147.776367 |
8 | 2017-09-11 | -200 | 160.9215 | 300 | -32184.30 | 147.776367 |
9 | 2017-10-10 | 1000 | 155.3416 | 1300 | 155341.60 | 153.595777 |
%timeit get_ave_cost(aapl)
100 loops, best of 3: 4.81 ms per loop
The root problem is using for loops instead of a vectorized solution.