#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')


# # Analysing structured data with data frames 
# 
# (c) 2019 [Steve Phelps](mailto:sphelps@sphelps.net)
# 

# ## Data frames
# 
# - The `pandas` module provides a powerful data-structure called a data frame.
# 
# - It is similar, but not identical to:
#     - a table in a relational database,
#     - an Excel spreadsheet,
#     - a dataframe in R.
#     

# ### Types of data
# 
# Data frames can be used to represent:
# 
# - [Panel data](https://en.wikipedia.org/wiki/Panel_data)
# - [Time series](https://en.wikipedia.org/wiki/Time_series) data
# - [Relational data](https://en.wikipedia.org/wiki/Relational_model)
#     

# ### Loading data
# 
# - Data frames can be read and written to/from:
#     - financial web sites
#     - database queries
#     - database tables
#     - CSV files
#     - json files
#     
# - Beware that data frames are memory resident;
#     - If you read a large amount of data your PC might crash
#     - With big data, typically you would read a subset or summary of the data via e.g. a select statement.

# ## Importing pandas
# 
# - The pandas module is usually imported with the alias `pd`.
# 

# In[2]:


import pandas as pd


# ## Series
# 
# - A Series contains a one-dimensional array of data, *and* an associated sequence of labels called the *index*.
# 
# - The index can contain numeric, string, or date/time values.
# 
# - When the index is a time value, the series is a [time series](https://en.wikipedia.org/wiki/Time_series).
# 
# - The index must be the same length as the data.
# 
# - If no index is supplied it is automatically generated as `range(len(data))`.

# ### Creating a series from an array
# 
# 

# In[3]:


import numpy as np
data = np.random.randn(5)
data


# In[4]:


my_series = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
my_series


# ### Plotting a series
# 
# - We can plot a series by invoking the `plot()` method on an instance of a `Series` object.
# 
# - The x-axis will autimatically be labelled with the series index.

# In[5]:


import matplotlib.pyplot as plt
my_series.plot()
plt.show()


# ### Creating a series with automatic index
# 
# - In the following example the index is creating automatically:

# In[6]:


pd.Series(data)


# ### Creating a Series from a `dict`
# 
# 

# In[7]:


d = {'a' : 0., 'b' : 1., 'c' : 2.}
my_series = pd.Series(d)
my_series


# ### Indexing a series with `[]`
# 
# - Series can be accessed using the same syntax as arrays and dicts.
# 
# - We use the labels in the index to access each element.
# 
# 

# In[8]:


my_series['b']


# - We can also use the label like an attribute:

# In[9]:


my_series.b


# ### Slicing a series
# 
# 
# - We can specify a range of labels to obtain a slice:

# In[10]:


my_series[['b', 'c']]


# ## Arithmetic and vectorised functions
# 
# - `numpy` vectorization works for series objects too.
# 
# 

# In[11]:


d = {'a' : 0., 'b' : 1., 'c' : 2.}
squared_values = pd.Series(d) ** 2
squared_values


# In[12]:


x = pd.Series({'a' : 0., 'b' : 1., 'c' : 2.})
y = pd.Series({'a' : 3., 'b' : 4., 'c' : 5.})
x + y


# ## Time series

# In[13]:


dates = pd.date_range('1/1/2000', periods=5)
dates


# In[14]:


time_series = pd.Series(data, index=dates)
time_series


# ### Plotting a time-series

# In[15]:


ax = time_series.plot()


# ## Missing values
# 
# - Pandas uses `nan` to represent missing data.
# 
# - So `nan` is used to represent missing, invalid or unknown data values.
# 
# - It is important to note that this only convention only applies within pandas.
#     - Other frameworks have very different semantics for these values.
# 

# ## DataFrame
# 
# - A data frame has multiple columns, each of which can hold a *different* type of value.
# 
# - Like a series, it has an index which provides a label for each and every row. 
# 
# - Data frames can be constructed from:
#     - dict of arrays,
#     - dict of lists,
#     - dict of dict
#     - dict of Series
#     - 2-dimensional array
#     - a single Series
#     - another DataFrame

# 
# ## Creating a dict of series

# In[16]:


series_dict = {
        'x' : 
            pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'y' : 
            pd.Series([4., 5., 6., 7.], index=['a', 'b', 'c', 'd']),
        'z' :
            pd.Series([0.1, 0.2, 0.3, 0.4], index=['a', 'b', 'c', 'd'])
}

series_dict


# ## Converting the dict to a data frame

# In[17]:


df = pd.DataFrame(series_dict)
df


# ## Plotting data frames
# 
# - When plotting a data frame, each column is plotted as its own series on the same graph.
# 
# - The column names are used to label each series.
# 
# - The row names (index) is used to label the x-axis.

# In[18]:


ax = df.plot()


# ## Indexing

# - The outer dimension is the column index.
# 
# - When we retrieve a single column, the result is a Series

# In[19]:


df['x']


# In[20]:


df['x']['b']


# In[21]:


df.x.b


# ## Projections
# 
# - Data frames can be sliced just like series.
# - When we slice columns we call this a *projection*, because it is analogous to specifying a subset of attributes in a relational query, e.g. `SELECT x FROM table`.
# - If we project a single column the result is a series:

# In[22]:


slice = df['x'][['b', 'c']]
slice


# In[23]:


type(slice)


# ## Projecting multiple columns
# 
# - When we include multiple columns in the projection the result is a DataFrame.

# In[24]:


slice = df[['x', 'y']]
slice


# In[25]:


type(slice)


# ## Vectorization
# 
# - Vectorized functions and operators work just as with series objects:

# In[26]:


df['x'] + df['y']


# In[27]:


df ** 2


# ## Logical indexing
# 
# - We can use logical indexing to retrieve a subset of the data.
# 
# 

# In[28]:


df['x'] >= 2


# In[29]:


df[df['x'] >= 2]


# ## Descriptive statistics

# - To quickly obtain descriptive statistics on numerical values use the `describe` method.

# In[30]:


df.describe()


# ## Accessing a single statistic
# 
# - The result is itself a DataFrame, so we can index a particular statistic like so:

# In[31]:


df.describe()['x']['mean']


# ## Accessing the row and column labels
# 
# - The row labels (index) and column labels can be accessed:
# 

# In[32]:


df.index


# In[33]:


df.columns


# ## Head and tail
# 
# - Data frames have `head()` and `tail()` methods which behave analgously to the Unix commands of the same name.

# ## Financial data
# 
# - Pandas was originally developed to analyse financial data.
# 
# - We can download tabulated data in a portable format called [Comma Separated Values (CSV)](https://www.loc.gov/preservation/digital/formats/fdd/fdd000323.shtml).

# In[34]:


import pandas as pd
googl = pd.read_csv('data/GOOGL.csv')


# ### Examining the first few rows
# 
# - When working with large data sets it is useful to view just the first/last few rows in the dataset.
# 
# - We can use the `head()` method to retrieve the first rows:

# In[35]:


googl.head()


# ### Examining the last few rows

# In[36]:


googl.tail()


# ### Converting to datetime values
# 
# - So far, the `Date` attribute is of type string.

# In[37]:


googl.Date[0]


# In[38]:


type(googl.Date[0])


# - In order to work with time-series data, we need to construct an index containing time values.
# 
# - Time values are of type `datetime` or `Timestamp`.
# 
# - We can use the function `to_datetime()` to convert strings to time values.

# In[39]:


pd.to_datetime(googl['Date']).head()


# ### Setting the index
# 
# - Now we need to set the index of the data-frame so that it contains the sequence of dates.
# 

# In[40]:


googl.set_index(pd.to_datetime(googl['Date']), inplace=True)
googl.index[0]


# In[41]:


type(googl.index[0])


# ### Plotting series
# 
# - We can plot a series in a dataframe by invoking its `plot()` method.
# 
# - Here we plot a time-series of the daily traded volume:

# In[42]:


ax = googl['Volume'].plot()
plt.show()


# ### Adjusted closing prices as a time series

# In[43]:


googl['Adj Close'].plot()
plt.show()


# ### Slicing series using date/time stamps
# 
# - We can slice a time series by specifying a range of dates or times.
# 
# - Date and time stamps are specified strings representing dates in the required format.

# In[44]:


googl['Adj Close']['1-1-2016':'1-1-2017'].plot()
plt.show()


# ### Resampling 
# 
# - We can *resample* to obtain e.g. weekly or monthly prices.
# 
# - In the example below the `'W'` denotes weekly.
# 
# - See [the documentation](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) for other frequencies.
# 
# - We group data into weeks, and then take the last value in each week.
# 
# - For details of other ways to resample the data, see [the documentation](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html).

# #### Resampled time-series plot

# In[45]:


weekly_prices = googl['Adj Close'].resample('W').last()
weekly_prices.head()


# In[46]:


weekly_prices.plot()
plt.title('Prices for GOOGL sampled at weekly frequency')
plt.show()


# ### Converting prices to log returns

# In[47]:


weekly_rets = np.diff(np.log(weekly_prices))
plt.plot(weekly_rets)
plt.xlabel('t'); plt.ylabel('$r_t$')
plt.title('Weekly log-returns for GOOGL')
plt.show()


# ### Converting the returns to a series
# 
# - Notice that in the above plot the time axis is missing the dates.
# 
# - This is because the `np.diff()` function returns an array instead of a data-frame.
# 

# In[48]:


type(weekly_rets)


# - We can convert it to a series thus:

# In[49]:


weekly_rets_series = pd.Series(weekly_rets, index=weekly_prices.index[1:])
weekly_rets_series.head()


# #### Plotting with the correct time axis

# Now when we plot the series we will obtain the correct time axis:

# In[50]:


plt.plot(weekly_rets_series)
plt.title('GOOGL weekly log-returns'); plt.xlabel('t'); plt.ylabel('$r_t$')
plt.show()


# ### Plotting a return histogram

# In[51]:


weekly_rets_series.hist()
plt.show()


# In[52]:


weekly_rets_series.describe()