#!/usr/bin/env python # coding: utf-8 # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib import matplotlib.pyplot as plt import pandas as pd plt.style.use('ggplot') # ## Getting the data # # From Yahoo Finance's Historical Prices: [Apple Inc. (AAPL) stock](http://finance.yahoo.com/q/hp?s=AAPL): http://finance.yahoo.com/q/hp?s=AAPL # # Direct link to __CSV for 2012-01-01 to 2015-05-01__: # # http://real-chart.finance.yahoo.com/table.csv?s=AAPL&a=00&b=1&c=2013&d=04&e=1&f=2015&g=d # # # We can use [__pandas.io.parsers.read_csv()__](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.parsers.read_csv.html) to download the CSV file from the URL, then read the data into a DataFrame. # # The `parse_dates` argument takes in a __list__ of column numbers, in which we can use to tell pandas which columns are meant to be dates. In this case, it's the _first_ column, i.e. the _0th_: # # In[20]: csvurl = "http://real-chart.finance.yahoo.com/table.csv?s=AAPL&a=00&b=1&c=2010&d=04&e=1&f=2015&g=d" prices = pd.read_csv(csvurl, parse_dates = [0]) # In[4]: prices.head() # ## Making a chart # In[21]: fig, ax = plt.subplots() plt.plot(prices['Date'], prices['Adj Close']) # ## Reshaping the data # # Use the DataFrame method `set_index()` to create a DataFrame that uses the `"Date"` column as an index: # ### Tidy the chart # # Use the pandas Timeseries to generate ticks for the x-axis. Here's a [list of aliases for time periods](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) -- `'BQ'` stands for "business quarter end", for example. I've decided to use `'6m'` for "6 months". # # In[68]: # fig, ax = plt.subplots() # ax.plot(prices['Date'], prices['Adj Close']) # redo the axes # fix the ticks # ax.set_xticks(pd.date_range('2010-01-01', '2015-05-01', freq = '6m')) # ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y')) # In[39]: dprices = prices.set_index('Date') dprices.head() # This doesn't change anything about the number of rows or what they contain (other than that the `"Date"` column has been moved out of the columns and into the index). # # With `"Date"` no longer an actual column, the `plot()` call will refer to the `index` attribute of the DataFrame. But the visual output should be exactly the same as before. # In[40]: fig, ax = plt.subplots() ax.plot(dprices.index, dprices['Adj Close']) # ### Resampling # # In the timespan of 2010 to 2015, there are more than 1,300 datapoints. When looking at stock trends over a half-decade period, is it necessary to see _each day_ represented on the graph? If we reduce the granularity of the data, we can still see the same trends in a clearer, less jagged graph, and the loss in _precision_ is irrelevant. # # Let's see what the data looks like when charted as _weekly_ periods. # # In terms of programmatic logic, this is a kind of [__groupby__](http://pandas.pydata.org/pandas-docs/stable/groupby.html) call, in which we group the index (i.e. the `"Date"`) by its year-month representation, e.g. `"2014-02"`. # # However, switching up the frequency in a time-series is such a common scenario that _pandas_ has a helpful convenience function named [__resample()__](http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.resample.html). # # The only required argument for `resample()` is a string that specifies the frequency of sampling, such as `'W'` for _weekly_ and `'A'` for _annual_ (a [list of aliases can be found here](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases)). We can specify an optional `how` argument, for example, a string such as `'max'` or `'first'`, to get the _max_ value or the _first_ value, respectively, of every week. The default value of `how` is `'mean'`. # # To resample the data such that we get the _weekly average_ of the stock price data: # In[47]: wkprices = dprices.resample('W', how = 'mean') # 'mean' is default, but we'll be explicit here wkprices.head() # Let's chart the `wkprices` data to see what it looks like; the result is slightly less jagged with no real loss in being able to interpret the trends: # In[51]: fig, ax = plt.subplots() ax.plot(wkprices.index, wkprices['Adj Close']) # Let's break the data down by _monthly averages_. Then let's display the three data series on the same chart to see how the trendlines compare: # In[67]: mthprices = dprices.resample('M', how = 'mean') fig, ax = plt.subplots() ax.plot(mthprices.index, mthprices['Adj Close'], linewidth = 4, color = 'yellow', ) ax.plot(wkprices.index, wkprices['Adj Close'], linewidth = 4, color = 'blue', alpha = 0.4, linestyle = '--') ax.plot(dprices.index, dprices['Adj Close'], linewidth = 1, color = 'red', linestyle = ':') # It's an ugly graph, but the point is that when looking at 5+ year timespan, daily values end up being noise. The monthly averages seem to work just fine for our purposes so we'll stick to that. # ## Making a nice chart # # - Set the y-axis to a start at __0__ # - Add some labels # - Clean up tick marks # # # In[75]: fig, ax = plt.subplots() ax.plot(mthprices.index, mthprices['Adj Close']) ax.set_ylim(ymin = 0) ax.set_ylabel('Adjusted closing price, monthly average', fontsize=10) ax.set_title('Apple (AAPL) stock') # remove the ticks that are along the top and right side of the chart ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_ticks_position('left') # ## Adding context with another data series # # # Let's add computing rivals Dell (DELL), Hewlett-Packard (HPQ), Microsoft (MSFT), and Sony (SNE) to the analysis. And let's expand it to 1990. # In[169]: from collections import OrderedDict NAMES = ['AAPL', 'DELL', 'HPQ', 'MSFT', 'SNE'] COLORS = ['red', 'gray', 'orange', 'green', 'blue'] data = OrderedDict({}) # #### Getting the data # # We'll have to redo the initial data collection from the Yahoo Finance historical price data: http://finance.yahoo.com/q/hp?s=AAPL. # # This is the same pattern as before, except in a for-loop. I've decided to create a dictionary called `data` that will hold a DataFrame for each company label. I could've arranged it to have one large DataFrame and then added a new column for the company label. But this works just fine. # In[170]: for n in NAMES: csvurl = "http://real-chart.finance.yahoo.com/table.csv?s=%s&a=00&b=1&c=1990&d=04&e=1&f=2015&g=d" % n prices = pd.read_csv(csvurl, parse_dates = [0], index_col = 0) data[n] = prices.resample('M') # ### Each series gets its own chart # In[171]: for name, prices in data.items(): fig, ax = plt.subplots() ax.set_title(name) ax.plot(prices.index, prices['Adj Close']) # This is the laziest, most un-useful way to compare the data: make one chart for each company. One of the main problems with making separate charts is that _pandas_ will autoscale the axes for each series; the y-scale is different for each company, which is severely misleading as the top value for __HPQ__ is not at all the same as it is for __AAPL__, but you wouldn't know it without looking at the fine print of the labels. # ### Multiple series on one chart # # In[214]: fig, ax = plt.subplots() for i, (name, prices) in enumerate(data.items()): ax.plot(prices.index, prices['Adj Close'], label = name, color = COLORS[i]) ax.legend(loc = 'upper left') # By specifying the `label` arugment in each `plot()` call, the subsequent `legend()` call can properly match up each series to its color. The matplotlib docs have much more detail [on how to position and format legends](http://matplotlib.org/api/legend_api.html). # ### Using subplots # # So far, we've been using the __subplots()__ call even though we've only constructed and rendered one chart at a time. By passing in multiple arguments, we can specify a grid of axes to plot on. # # (Check out this [tutorial example from the docs of how to use __subplots()__](http://matplotlib.org/examples/pylab_examples/subplots_demo.html)) # In[174]: fig, axes = plt.subplots(5) for i, (name, prices) in enumerate(data.items()): ax = axes[i] ax.set_title(name) ax.plot(prices.index, prices['Adj Close'], label = name, color = COLORS[i]) # In[175]: fig, axes = plt.subplots(5, sharex=True) for i, (name, prices) in enumerate(data.items()): ax = axes[i] ax.set_title(name) ax.plot(prices.index, prices['Adj Close'], color = COLORS[i]) # In[176]: fig, axes = plt.subplots(5, sharex=True, sharey=True) for i, (name, prices) in enumerate(data.items()): ax = axes[i] ax.set_title(name) ax.plot(prices.index, prices['Adj Close'], color = COLORS[i]) # In[177]: fig, axes = plt.subplots(1, 5, sharex=True, sharey=True) for i, (name, prices) in enumerate(data.items()): ax = axes[i] ax.set_title(name) ax.plot(prices.index, prices['Adj Close'], color = COLORS[i]) # In[204]: fig, axes = plt.subplots(2, 3, sharex=True, sharey=True) for i, (name, prices) in enumerate(data.items()): row_num = i / 2 col_num = i % 2 ax = axes[col_num][row_num] ax.set_title(name) ax.plot(prices.index, prices['Adj Close'], color = COLORS[i]) # pct_change http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.pct_change.html # In[205]: fig, axes = plt.subplots(2, 3, sharex=True, sharey=True) for i, (name, prices) in enumerate(data.items()): row_num = i / 2 col_num = i % 2 ax = axes[col_num][row_num] ax.set_title(name) ppchange = prices.pct_change() ax.scatter(ppchange.index, ppchange['Adj Close'] * 100, color = COLORS[i], s=1) # In[208]: fig, ax = plt.subplots() for i, (name, prices) in enumerate(data.items()): xprices = prices.resample('A') ppchange = xprices.pct_change() ax.scatter(ppchange.index.year, ppchange['Adj Close'] * 100, color = COLORS[i], s=10, label = name) plt.legend(loc = 'upper left') # In[ ]: