#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') # # Analysing structured data with data frames # # (c) 2019 [Steve Phelps](mailto:sphelps@sphelps.net) # # ## Data frames # # - The `pandas` module provides a powerful data-structure called a data frame. # # - It is similar, but not identical to: # - a table in a relational database, # - an Excel spreadsheet, # - a dataframe in R. # # ### Types of data # # Data frames can be used to represent: # # - [Panel data](https://en.wikipedia.org/wiki/Panel_data) # - [Time series](https://en.wikipedia.org/wiki/Time_series) data # - [Relational data](https://en.wikipedia.org/wiki/Relational_model) # # ### Loading data # # - Data frames can be read and written to/from: # - financial web sites # - database queries # - database tables # - CSV files # - json files # # - Beware that data frames are memory resident; # - If you read a large amount of data your PC might crash # - With big data, typically you would read a subset or summary of the data via e.g. a select statement. # ## Importing pandas # # - The pandas module is usually imported with the alias `pd`. # # In[2]: import pandas as pd # ## Series # # - A Series contains a one-dimensional array of data, *and* an associated sequence of labels called the *index*. # # - The index can contain numeric, string, or date/time values. # # - When the index is a time value, the series is a [time series](https://en.wikipedia.org/wiki/Time_series). # # - The index must be the same length as the data. # # - If no index is supplied it is automatically generated as `range(len(data))`. # ### Creating a series from an array # # # In[3]: import numpy as np data = np.random.randn(5) data # In[4]: my_series = pd.Series(data, index=['a', 'b', 'c', 'd', 'e']) my_series # ### Plotting a series # # - We can plot a series by invoking the `plot()` method on an instance of a `Series` object. # # - The x-axis will autimatically be labelled with the series index. # In[5]: import matplotlib.pyplot as plt my_series.plot() plt.show() # ### Creating a series with automatic index # # - In the following example the index is creating automatically: # In[6]: pd.Series(data) # ### Creating a Series from a `dict` # # # In[7]: d = {'a' : 0., 'b' : 1., 'c' : 2.} my_series = pd.Series(d) my_series # ### Indexing a series with `[]` # # - Series can be accessed using the same syntax as arrays and dicts. # # - We use the labels in the index to access each element. # # # In[8]: my_series['b'] # - We can also use the label like an attribute: # In[9]: my_series.b # ### Slicing a series # # # - We can specify a range of labels to obtain a slice: # In[10]: my_series[['b', 'c']] # ## Arithmetic and vectorised functions # # - `numpy` vectorization works for series objects too. # # # In[11]: d = {'a' : 0., 'b' : 1., 'c' : 2.} squared_values = pd.Series(d) ** 2 squared_values # In[12]: x = pd.Series({'a' : 0., 'b' : 1., 'c' : 2.}) y = pd.Series({'a' : 3., 'b' : 4., 'c' : 5.}) x + y # ## Time series # In[13]: dates = pd.date_range('1/1/2000', periods=5) dates # In[14]: time_series = pd.Series(data, index=dates) time_series # ### Plotting a time-series # In[15]: ax = time_series.plot() # ## Missing values # # - Pandas uses `nan` to represent missing data. # # - So `nan` is used to represent missing, invalid or unknown data values. # # - It is important to note that this only convention only applies within pandas. # - Other frameworks have very different semantics for these values. # # ## DataFrame # # - A data frame has multiple columns, each of which can hold a *different* type of value. # # - Like a series, it has an index which provides a label for each and every row. # # - Data frames can be constructed from: # - dict of arrays, # - dict of lists, # - dict of dict # - dict of Series # - 2-dimensional array # - a single Series # - another DataFrame # # ## Creating a dict of series # In[16]: series_dict = { 'x' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'y' : pd.Series([4., 5., 6., 7.], index=['a', 'b', 'c', 'd']), 'z' : pd.Series([0.1, 0.2, 0.3, 0.4], index=['a', 'b', 'c', 'd']) } series_dict # ## Converting the dict to a data frame # In[17]: df = pd.DataFrame(series_dict) df # ## Plotting data frames # # - When plotting a data frame, each column is plotted as its own series on the same graph. # # - The column names are used to label each series. # # - The row names (index) is used to label the x-axis. # In[18]: ax = df.plot() # ## Indexing # - The outer dimension is the column index. # # - When we retrieve a single column, the result is a Series # In[19]: df['x'] # In[20]: df['x']['b'] # In[21]: df.x.b # ## Projections # # - Data frames can be sliced just like series. # - When we slice columns we call this a *projection*, because it is analogous to specifying a subset of attributes in a relational query, e.g. `SELECT x FROM table`. # - If we project a single column the result is a series: # In[22]: slice = df['x'][['b', 'c']] slice # In[23]: type(slice) # ## Projecting multiple columns # # - When we include multiple columns in the projection the result is a DataFrame. # In[24]: slice = df[['x', 'y']] slice # In[25]: type(slice) # ## Vectorization # # - Vectorized functions and operators work just as with series objects: # In[26]: df['x'] + df['y'] # In[27]: df ** 2 # ## Logical indexing # # - We can use logical indexing to retrieve a subset of the data. # # # In[28]: df['x'] >= 2 # In[29]: df[df['x'] >= 2] # ## Descriptive statistics # - To quickly obtain descriptive statistics on numerical values use the `describe` method. # In[30]: df.describe() # ## Accessing a single statistic # # - The result is itself a DataFrame, so we can index a particular statistic like so: # In[31]: df.describe()['x']['mean'] # ## Accessing the row and column labels # # - The row labels (index) and column labels can be accessed: # # In[32]: df.index # In[33]: df.columns # ## Head and tail # # - Data frames have `head()` and `tail()` methods which behave analgously to the Unix commands of the same name. # ## Financial data # # - Pandas was originally developed to analyse financial data. # # - We can download tabulated data in a portable format called [Comma Separated Values (CSV)](https://www.loc.gov/preservation/digital/formats/fdd/fdd000323.shtml). # In[34]: import pandas as pd googl = pd.read_csv('data/GOOGL.csv') # ### Examining the first few rows # # - When working with large data sets it is useful to view just the first/last few rows in the dataset. # # - We can use the `head()` method to retrieve the first rows: # In[35]: googl.head() # ### Examining the last few rows # In[36]: googl.tail() # ### Converting to datetime values # # - So far, the `Date` attribute is of type string. # In[37]: googl.Date[0] # In[38]: type(googl.Date[0]) # - In order to work with time-series data, we need to construct an index containing time values. # # - Time values are of type `datetime` or `Timestamp`. # # - We can use the function `to_datetime()` to convert strings to time values. # In[39]: pd.to_datetime(googl['Date']).head() # ### Setting the index # # - Now we need to set the index of the data-frame so that it contains the sequence of dates. # # In[40]: googl.set_index(pd.to_datetime(googl['Date']), inplace=True) googl.index[0] # In[41]: type(googl.index[0]) # ### Plotting series # # - We can plot a series in a dataframe by invoking its `plot()` method. # # - Here we plot a time-series of the daily traded volume: # In[42]: ax = googl['Volume'].plot() plt.show() # ### Adjusted closing prices as a time series # In[43]: googl['Adj Close'].plot() plt.show() # ### Slicing series using date/time stamps # # - We can slice a time series by specifying a range of dates or times. # # - Date and time stamps are specified strings representing dates in the required format. # In[44]: googl['Adj Close']['1-1-2016':'1-1-2017'].plot() plt.show() # ### Resampling # # - We can *resample* to obtain e.g. weekly or monthly prices. # # - In the example below the `'W'` denotes weekly. # # - See [the documentation](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) for other frequencies. # # - We group data into weeks, and then take the last value in each week. # # - For details of other ways to resample the data, see [the documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html). # #### Resampled time-series plot # In[45]: weekly_prices = googl['Adj Close'].resample('W').last() weekly_prices.head() # In[46]: weekly_prices.plot() plt.title('Prices for GOOGL sampled at weekly frequency') plt.show() # ### Converting prices to log returns # In[47]: weekly_rets = np.diff(np.log(weekly_prices)) plt.plot(weekly_rets) plt.xlabel('t'); plt.ylabel('$r_t$') plt.title('Weekly log-returns for GOOGL') plt.show() # ### Converting the returns to a series # # - Notice that in the above plot the time axis is missing the dates. # # - This is because the `np.diff()` function returns an array instead of a data-frame. # # In[48]: type(weekly_rets) # - We can convert it to a series thus: # In[49]: weekly_rets_series = pd.Series(weekly_rets, index=weekly_prices.index[1:]) weekly_rets_series.head() # #### Plotting with the correct time axis # Now when we plot the series we will obtain the correct time axis: # In[50]: plt.plot(weekly_rets_series) plt.title('GOOGL weekly log-returns'); plt.xlabel('t'); plt.ylabel('$r_t$') plt.show() # ### Plotting a return histogram # In[51]: weekly_rets_series.hist() plt.show() # In[52]: weekly_rets_series.describe()