#!/usr/bin/env python
# coding: utf-8

# # Pandas Time series

# In[1]:


import addutils.toc ; addutils.toc.js(ipy_notebook=True)


# For further documentation see [Time Series / Date functionality](http://pandas.pydata.org/pandas-docs/stable/timeseries.html) on the [pandas](http://pandas.pydata.org/) documentation.

# In[2]:


import numpy as np
import pandas as pd
from numpy import NaN
from IPython.display import (display, HTML)
from pandas.tseries.offsets import *
from addutils import side_by_side2
from addutils import css_notebook
css_notebook()


# ## 1 Timestamps and DatetimeIndex

# `to_datetime` convertsa list of date-like objects to Time Stamps. If the list is homegeneous `infer_datetime_format=True` can give a great speed-up. Otherwise with `format` is possible to define the format of the strings a-priori.

# In[3]:


pd.to_datetime(['10-11-2000 17:52:13.7731+02:00', '3-5-2014 11:57:18.7754+01:00'], dayfirst=True, utc=True)


# It’s also possible to convert integer or float epoch times. The float value is interpreted as a Unix timestamp and the default unit of measure is nanoseconds bu a different unit (D,s,ms,us,ns) can be specified:

# In[4]:


print (pd.to_datetime([0]))
print (pd.to_datetime([1]))
print (pd.to_datetime([1], unit='D'))


# Index of Timestamps: use `date_range` and `bdate_range` to create regular frequency timestamp indexes. These functions return **`DatetimeIndex`** objects that are **array of Timestamps**:

# In[5]:


rng = pd.date_range('1/1/2012 15:53:25.335',
                    periods=6,
                    freq='2d3h35min42s115ms',
                    tz='Europe/Rome')
for date in rng:
    print (date)


# In[6]:


rng[0]


# This can be used as an index for Series, Dataframes and Panels:

# In[7]:


data = np.random.randint(0,99,(len(rng),3))
d1 = pd.DataFrame(data, index=rng, columns=list('ABC'))
display(d1)


# Pandas provides a timezone conversion. Here we produce new data by converting the previous timeseries form the Rome timezone to the Eastern US timezone. When data with different timezones are combined toghether (as in `d3 = d1+d2`) the results are given in **UTC** time which can be in turn converted in any timezone. In this example the UTC timezone is shown with the notation **`+00:00`**

# In[8]:


d2 = d1.tz_convert('US/Eastern')
HTML(side_by_side2(d1, d2))


# In[9]:


d3 = d1+d2
d3


# ## 2 DateOffsets objects

# In the previous example we used a time frequency string (with Offset Aliases) `freq='2d3h35min42s115ms` with `date_range` to create a `DatetimeIndex`. These frequency strings are being translated into an instance of pandas `DateOffset`, which represents a regular frequency increment. Specific offset logic like “month”, “business day”, or “one hour” is represented in its various subclasses.
# 
# The key features of a `DateOffset` object are:
# * it can be used to shift a datetime object
# * it can be multiplied by an integer
# * it has rollforward and rollback methods for moving a date forward or backward

# In[10]:


from pandas.tseries.offsets import *


# In[11]:


print (pd.datetime(2014, 1, 1) + Week() + Hour(13) + Minute()*15 + Milli()*350)
print (pd.datetime(2014, 1, 1) + MonthEnd())
print (pd.datetime(2014, 1, 1) + MonthBegin())
print (pd.datetime(2014, 1, 1) + BQuarterBegin(2))


# Offset Aliases can be used to define to define time series frequencies:
# * B - Business Day
# * D - Calendar Day
# * H - Hour
# * T - Minute
# * (see the Pandas documentation for a complete list)

# In[12]:


print (pd.date_range('2014/1/1', periods=2, freq=Minute(4)))
print (pd.date_range('2014/1/1', periods=2, freq='4T'))
print (pd.date_range('2014/1/1', periods=2, freq='4min'))


# In[13]:


start = pd.datetime(2014, 1, 1, 12, 30)
end = pd.datetime(2014, 1, 12)
idx2 = pd.date_range(start, end, freq='1B1H10T')
for i in idx2:
    print (i)


# ## 3 Indexing with a DateTime index

# As the name can suggest Datetime Index objects can be used to index a DataFrames. Try uncommenting some of the following lines to see alternative ways for selecting dates:

# In[14]:


data = np.random.randint(0,99,(len(idx2),3))
d2 = pd.DataFrame(data, index=idx2, columns=list('ABC'))
d2


# In[15]:


# Remember that in Pandas endpoints are included
d2.loc['2014/01/02':'2014/01/08', ['A', 'C']]


# In[16]:


d2.loc['2014/01/06 12:00':'2014/01/08 12:00', ['A', 'C']]


# It is possible to mangle with indexes using offset objects. BDay means business day (be aware that xmas is considered a business day).

# ## 4 Frequency conversion

# We can convert all TimeSeries to specified frequency using DateOffset objects. Optionally we can provide a fill method to handle missing values.

# In[17]:


# TODO: Fix
HTML(side_by_side2(d2,
                  d2.asfreq(Day(), method='ffill'),
                  d2.asfreq(Day(), method=None)))


# ## 5 Filling gaps

# We are going to see some ways to let pandas fill `NaN` values on a dataframe.
# 
# The first method is called forward filling and consists on using the first element above that isn't `NaN`.

# In[18]:


d3 = d2.copy()
d3.iloc[2:5, 0] = np.nan
d3.iloc[0:4, 1] = np.nan
d3.iloc[4:, 2] = np.nan
cols = ['A', 'B', 'C']
HTML(side_by_side2(d3, d3[cols].fillna(method='ffill')))


# Notice that on column 'B' the `NaN` values are at the beginning. So the method hasn't been able to fill those holes. The backward filling methods is complementary to the one above. It fills gaps using the first non `NaN` value below the cell.

# In[19]:


HTML(side_by_side2(d3, d3[cols].fillna(method='bfill')))


# ---
# 
# Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.
# 
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.