In [51]:
import datetime
import pandas as pd
from pandas import DataFrame
from pandas.io.data import DataReader
import matplotlib.pyplot as plt
import seaborn
import numpy as np
In [23]:
symbols = pd.read_csv('/Users/erikrood/desktop/ipython_datasets/sp_symbols.csv')
Out[23]:
Symbol
0 MMM
1 ABT
2 ABBV
3 ACN
4 ATVI
In [24]:
#converting from df to list i.e. symbols = ['symbol1', 'symbol2', 'etc']
symbols_list = symbols["Symbol"].tolist()
In [25]:
#checking length
len(symbols_list)
Out[25]:
503
In [26]:
#reading in symbols list (csv -> df -> list)
symbols=[]
#reading in yahoo data
for ticker in symbols_list: 
    r = DataReader(ticker, "yahoo", 
                   start=datetime.datetime(2015, 8, 10))
    # add a symbol column
    r['Symbol'] = ticker 
    symbols.append(r)
    
# concatenate all the dfs
df = pd.concat(symbols)
In [28]:
df = df.reset_index()
df['Date'] = df['Date'].dt.strftime('%Y%m%d')
df.head()
Out[28]:
Date Open High Low Close Volume Adj Close Symbol
0 20150810 150.000000 151.399994 149.889999 151.240005 1571800 146.252782 MMM
1 20150811 150.000000 150.000000 148.259995 148.479996 1810800 143.583786 MMM
2 20150812 147.240005 148.710007 146.070007 148.520004 2036700 143.622475 MMM
3 20150813 148.000000 148.559998 147.080002 147.399994 1793100 142.539397 MMM
4 20150814 147.660004 148.509995 147.320007 148.279999 1401900 143.390384 MMM
In [29]:
#adding year/month/day to eventually only pull last day of month, by symbol
df['year'] = df['Date'].str[:-4]
df['month'] = df['Date'].str[4:6]
df['day'] = df['Date'].str[-2:]
df.head()
Out[29]:
Date Open High Low Close Volume Adj Close Symbol year month day
0 20150810 150.000000 151.399994 149.889999 151.240005 1571800 146.252782 MMM 2015 08 10
1 20150811 150.000000 150.000000 148.259995 148.479996 1810800 143.583786 MMM 2015 08 11
2 20150812 147.240005 148.710007 146.070007 148.520004 2036700 143.622475 MMM 2015 08 12
3 20150813 148.000000 148.559998 147.080002 147.399994 1793100 142.539397 MMM 2015 08 13
4 20150814 147.660004 148.509995 147.320007 148.279999 1401900 143.390384 MMM 2015 08 14
In [30]:
df.groupby(
    ['year', 'month']
).apply(lambda x: x.sort('day', ascending=True))
df.head(5)
/Users/erikrood/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()
Out[30]:
Date Open High Low Close Volume Adj Close Symbol year month day
0 20150810 150.000000 151.399994 149.889999 151.240005 1571800 146.252782 MMM 2015 08 10
1 20150811 150.000000 150.000000 148.259995 148.479996 1810800 143.583786 MMM 2015 08 11
2 20150812 147.240005 148.710007 146.070007 148.520004 2036700 143.622475 MMM 2015 08 12
3 20150813 148.000000 148.559998 147.080002 147.399994 1793100 142.539397 MMM 2015 08 13
4 20150814 147.660004 148.509995 147.320007 148.279999 1401900 143.390384 MMM 2015 08 14
In [32]:
for i in range(1,len(df)-1):
    if df['month'].loc[i] == df['month'].loc[i+1]:
        df['Date'].loc[i+1] = 10000
    else: 
        pass
/Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:128: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
In [34]:
df = df[df.Date != 10000]
df = df[df.Date != '20150810']
df = df[df.Date != '20150811']
In [36]:
#type conversions
df_clean[['Symbol']] = df_clean[['Symbol']].astype(str)
df_clean[['Close']] = df_clean[['Close']].astype(float)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
Date      13 non-null object
Symbol    13 non-null object
Close     13 non-null float64
dtypes: float64(1), object(2)
memory usage: 384.0+ bytes
In [37]:
df_clean = df[['Date','Symbol','Close']]
df_clean.head()
Out[37]:
Date Symbol Close
16 20150901 MMM 138.500000
37 20151001 MMM 140.800003
59 20151102 MMM 159.919998
79 20151201 MMM 156.899994
101 20160104 MMM 146.820007
In [38]:
#switching/transposing row of dates to columns
df_pivot2 = df_clean.pivot('Date','Symbol','Close').reset_index()
df_pivot2.rename(columns={'Date':'Date'}, inplace=True)
In [40]:
corr_df = df_pivot2.corr(method='pearson')
In [46]:
corr_df.reset_index().head(10)
Out[46]:
Symbol Symbol A AA AAL AAP AAPL ABBV ABC ABT ACN ... XLNX XOM XRAY XRX XYL YHOO YUM ZBH ZION ZTS
0 A 1.000000 0.459216 -0.575283 -0.409230 -0.293964 0.708519 -0.619220 0.052206 0.797355 ... 0.570966 0.798568 0.808590 0.169917 0.947214 0.839686 0.565778 0.911500 0.121616 0.886955
1 AA 0.459216 1.000000 -0.289380 -0.021417 0.001143 0.579619 -0.120382 0.256079 0.314193 ... -0.171139 0.459299 0.172325 0.265727 0.491626 0.619373 0.535206 0.540549 0.433958 0.641208
2 AAL -0.575283 -0.289380 1.000000 0.386389 0.754325 -0.374976 0.788206 0.529788 -0.490708 ... 0.131172 -0.567111 -0.152170 0.312714 -0.645627 -0.490982 -0.728467 -0.675147 0.131930 -0.502695
3 AAP -0.409230 -0.021417 0.386389 1.000000 0.665447 0.173504 0.383483 0.469664 -0.335253 ... -0.275148 -0.180028 -0.345072 -0.244096 -0.348343 -0.126654 -0.082786 -0.167855 0.433747 -0.346692
4 AAPL -0.293964 0.001143 0.754325 0.665447 1.000000 0.057007 0.670206 0.852227 -0.275719 ... 0.094758 -0.301675 -0.088269 0.454961 -0.369949 -0.114511 -0.324972 -0.294330 0.548260 -0.179122
5 ABBV 0.708519 0.579619 -0.374976 0.173504 0.057007 1.000000 -0.313558 0.406501 0.441159 ... 0.174020 0.645130 0.469381 -0.073805 0.662391 0.819199 0.552599 0.867349 0.493484 0.755435
6 ABC -0.619220 -0.120382 0.788206 0.383483 0.670206 -0.313558 1.000000 0.605800 -0.751508 ... -0.231818 -0.700939 -0.463421 0.184379 -0.774217 -0.599750 -0.655745 -0.656081 0.336540 -0.321025
7 ABT 0.052206 0.256079 0.529788 0.469664 0.852227 0.406501 0.605800 1.000000 -0.192174 ... 0.195823 -0.164454 0.078213 0.471521 -0.096590 0.149059 -0.066473 0.068116 0.622769 0.256204
8 ACN 0.797355 0.314193 -0.490708 -0.335253 -0.275719 0.441159 -0.751508 -0.192174 1.000000 ... 0.536608 0.851239 0.814440 0.230174 0.877496 0.791925 0.451595 0.733508 -0.003222 0.607123
9 ADBE 0.882918 0.345301 -0.443852 -0.412987 -0.265353 0.429599 -0.638848 -0.112900 0.935845 ... 0.638457 0.801199 0.861766 0.265262 0.887285 0.723040 0.375690 0.729266 0.027395 0.708086

10 rows × 504 columns

In [47]:
#Outputting raw data to csv
corr_df.to_csv('raw_Pearson_corr_sp500.csv')
In [52]:
# Create a mask to display only the lower triangle of the matrix (since it's mirrored around its 
# top-left to bottom-right diagonal).
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
In [53]:
# Create the heatmap using seaborn library. 
# List if colormaps (parameter 'cmap') is available here: http://matplotlib.org/examples/color/colormaps_reference.html
seaborn.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x11aa35f50>
In [54]:
plt.yticks(rotation=0) 
plt.xticks(rotation=90) 
plt.show()

old stuff/references -- did not use

In [385]:
#stopped here, just need to add 6 price percent columns drop rest
df_pivot2.rename(columns={'20160104':'January'}, inplace=True)
df_pivot2.rename(columns={'20151231':'December'}, inplace=True)

df_pivot2['Jan_dec'] = (df_pivot2.January / df_pivot2.December)
In [ ]:
#http://gouthamanbalaraman.com/blog/calculating-stock-beta.html
#http://dogdogfish.com/python-2/stock-prices-and-python-pandas-to-the-rescue/
#correl matrix - http://stackoverflow.com/questions/28932065/how-do-you-calculate-correlation-between-all-columns-in-a-dataframe-and-all-colu

#this is the example to use: correlation matrix using forex
# http://www.tradinggeeks.net/2015/08/calculating-correlation-in-python/

#Some caveats about the sample code. The returns are calculated using the adjusted close from Yahoo finance data. This is because the adjusted close accounts for dividends and splits etc. In my personal experience I have found the returns calculated this way to be a reasonably close estimate but not always accurate.