#!/usr/bin/env python
# coding: utf-8

# # Changepoint Analysis with PELT and MongoDB using nag4py
# ---
#
# This notebook outlines the steps required to perform changepoint analysis on a set of data using the PELT algorithm.
# The PELT algorithm is included in the NAG Library, and we will use NAG's Python wrappers (nag4py) to access it.
# As many potential users access their data from a database system such as MongoDB, this example starts from that basis.
#
# The example examines the Volkswagen stock price downloaded from Yahoo's historical stock prices.

# ### Import Python modules

# In[1]:

import pandas as pd
import numpy as np
import pymongo
import matplotlib.pyplot as plt
import nag4py.util as util
import nag4py.e01 as e01
import nag4py.g13 as g13
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Set the name of the stock

# In[2]:

stock_name = 'volkswagen'


# ### Read in the CSV file downloaded from Yahoo historical stock prices (filename: stock_name.csv)

# In[3]:

data = pd.read_csv('{}.csv'.format(stock_name))


# ### Convert the date to a datetime datatype and set it as the index

# In[4]:

data['Date'] = pd.to_datetime(data['Date'], format="%d/%m/%Y")
data = data.set_index('Date')


# ### Using MongoDB
# MongoDB must be running for this example to work.
#
# From the command line:
#
#     $ mongod

# ### Open a connection to the Mongo client. The server (mongod) must be running.

# In[5]:

client = pymongo.MongoClient()


# ### Select the database

# In[6]:

db = client.stock


# ### Load the specific collection

# In[7]:

collection = db['{}'.format(stock_name)]


# ### Populate the collection (first we drop the collection so that it is empty)

# In[8]:

collection.drop()
for date in data.index.values:
    post = {"Date": "{}".format(date),
            "Close": float(data.loc[date, 'Close']),
            "Adj Close": float(data.loc[date, 'Adj Close']),
            "High": float(data.loc[date, 'High']),
            "Low": float(data.loc[date, 'Low']),
            "Open": float(data.loc[date, 'Open']),
            "Volume": float(data.loc[date, 'Volume'])}
    collection.insert_one(post)


# ### We now have our data stored in the MongoDB database.

# ### Put the collection into a DataFrame and set 'Date' as the index

# In[9]:

data = pd.DataFrame(list(collection.find())).set_index('Date')


# ### Change some datatypes

# In[10]:

data['_id'] = data['_id'].astype(str)


# In[11]:

data = data.set_index(data.index.astype('datetime64[ns]'))


# ### Rename a column

# In[12]:

data = data.rename(columns={'_id': 'ID'})


# ## Calculate Changepoints
# The PELT routine operates on a univariate time series and, for this analysis, we require the observations to be evenly spaced in time.
#
# The stock prices have gaps for weekends, holidays and other non-trading days.
#
# Therefore, the gaps must be filled with interpolated values before performing the changepoint analysis.

# In[13]:

known_dates = data.index.sort_values()
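
# ### (Aside) A quick look at the gaps using pandas only
# Before calling the NAG interpolation routines below, it can be useful to see how many calendar
# dates are missing. The following cell is a minimal sketch using only pandas: it reindexes one
# column ('Close', purely for illustration) onto a full daily calendar, counts the gaps, and
# applies pandas' own time-based interpolation as a rough cross-check of the NAG monotonic
# interpolant used in the analysis proper, not as a replacement for it.

# In[ ]:

# Reindex onto a full daily calendar and count the resulting gaps.
daily = data.reindex(pd.date_range(known_dates[0], known_dates[-1]))
print('Missing calendar dates:', daily['Close'].isnull().sum())

# Fill the gaps with pandas' time-weighted linear interpolation (rough cross-check only).
pandas_filled = daily['Close'].interpolate(method='time')
print(pandas_filled.head())
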
# ### Select the price to perform the changepoint analysis on, i.e. High, Close, Low or Open

# In[14]:

price_type = 'High'
known_prices = getattr(data, price_type).sort_index().values


# ### Determine all the dates between the first and last piece of data

# In[15]:

all_dates = pd.date_range(known_dates[0], known_dates[-1])


# ### Reindex the data with all the dates

# In[16]:

all_data = data.reindex(all_dates)


# ### Determine the missing dates

# In[17]:

missing_dates = all_dates.difference(known_dates)


# ### Define a wrapper function for the NAG monotonicity-preserving interpolation routine

# In[18]:

def mono_inter(x, y, missing_dates):
    """Calculate the missing prices from the known dates, known prices and missing dates.

    x is the known dates,
    y is the known prices,
    missing_dates is the dates for which prices need to be interpolated."""
    # Compute the derivatives of the monotonicity-preserving interpolant.
    d = np.zeros(len(x))
    fail_int = util.noisy_fail()
    e01.nag_monotonic_interpolant(len(x), x, y, d, fail_int)
    if fail_int.iflag != 0:
        print(fail_int.message)
    # Evaluate the interpolant at the missing dates.
    missing_prices = np.zeros(len(missing_dates))
    fail_eva = util.noisy_fail()
    e01.nag_monotonic_evaluate(len(x), x, y, d, len(missing_dates), missing_dates,
                               missing_prices, fail_eva)
    if fail_eva.iflag != 0:
        print(fail_eva.message)
    return missing_prices


# ### Calculate the missing prices

# In[19]:

missing_prices = mono_inter(known_dates.values.astype(float),
                            known_prices,
                            missing_dates.values.astype(float))


# ### Merge the missing prices and the known data

# In[20]:

missing_prices_list = missing_prices.tolist()
for row in all_data.iterrows():
    if pd.isnull(row[1][price_type]):
        all_data.loc[row[0], price_type] = missing_prices_list.pop(0)


# ### Plot the complete dataset

# In[21]:

fig = plt.figure()
fig.set_size_inches(16, 10)
plt.plot(all_data.index, all_data[price_type], 'g', label='Volkswagen')
plt.title('Volkswagen Stock Price with Interpolated Values')
plt.legend(loc=2)
plt.ylabel('Euro ' + u"(\u20AC)")
plt.xlabel('Date')
plt.show()


# ## Comparison between known data and interpolated data

# In[22]:

start = 4000
end = 4100
s = np.where(all_dates == known_dates[start])[0][0]
e = np.where(all_dates == known_dates[end])[0][0]
fig = plt.figure()
fig.set_size_inches(16, 10)
plt.plot(all_data.index[s:e], all_data[price_type][s:e], 'gs-', label='Interpolated')
plt.plot(known_dates[start:end], known_prices[start:end], 'rs-', label='Known Values')
plt.title('Volkswagen Stock Price with Interpolated Values and Known Values')
plt.legend(loc=2)
plt.ylabel('Euro ' + u"(\u20AC)")
plt.xlabel('Date')
plt.show()


# # We will now apply the PELT algorithm to the complete data set

# ### Define some parameters

# In[23]:

dist_change_point_type_dict = {'Nag_NormalMean': util.Nag_NormalMean,
                               'Nag_NormalStd': util.Nag_NormalStd,
                               'Nag_NormalMeanStd': util.Nag_NormalMeanStd,
                               'Nag_GammaScale': util.Nag_GammaScale,
                               'Nag_ExponentialLambda': util.Nag_ExponentialLambda,
                               'Nag_PoissonLambda': util.Nag_PoissonLambda}
dist_change_point_type = 'Nag_NormalMean'
dist = dist_change_point_type_dict[dist_change_point_type]


# ### Define the penalty term based on the documentation

# In[24]:

def penalty_term(dist_change_point_type, time_series_len, penalty_method='SIC'):
    """Return the penalty (beta) for the chosen information criterion."""
    # Two parameters change per segment when both the mean and the standard deviation change.
    if dist_change_point_type == 'Nag_NormalMeanStd':
        p = 2
    else:
        p = 1
    penalty_method_dict = {'SIC': p * np.log(time_series_len),
                           'BIC': p * np.log(time_series_len),
                           'AIC': p * 2,
                           'Hannan-Quinn': 2 * p * np.log(np.log(time_series_len))}
    return penalty_method_dict[penalty_method]
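
# ### (Aside) Comparing the penalty values
# As a quick illustration of the `penalty_term` helper defined above, the following cell prints
# the penalty for each supported criterion at the length of the series used here. This is only a
# sanity check; the analysis below uses the SIC penalty.

# In[ ]:

series_len = len(all_data[price_type])
for method in ('SIC', 'BIC', 'AIC', 'Hannan-Quinn'):
    print('{:>12}: {:.3f}'.format(method,
                                  penalty_term(dist_change_point_type, series_len, method)))
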
# ### Calculate the standard deviation and the mean using nag_tsa_mean_range

# In[25]:

n = len(all_data[price_type])
m = n
std = np.zeros(1, dtype=np.float64)
mean = np.zeros(1, dtype=np.float64)
fail = util.noisy_fail()
g13.nag_tsa_mean_range(len(all_data[price_type]), np.array(all_data[price_type].values),
                       m, util.Nag_UseSD, std, mean, fail)
if fail.iflag != 0:
    print(fail.message)


# ### Calculate the changepoints

# In[26]:

beta = penalty_term(dist_change_point_type, n, 'SIC')
print('Penalty Term', beta)
minss = 2
param = np.zeros(1)
if dist_change_point_type == 'Nag_NormalMean':
    param[0] = std[0]
elif dist_change_point_type == 'Nag_NormalMeanStd':
    param[0] = mean[0]
else:
    param[0] = 0
ntau = np.zeros(n, dtype=util.nag_int_type)
tau = np.zeros(n, dtype=util.nag_int_type)
sparam = np.zeros(2*n + 2)
fail = util.noisy_fail()
g13.nag_tsa_cp_pelt(dist, n, np.array(all_data[price_type].values), beta, minss, param,
                    ntau, tau, sparam, fail)
if fail.iflag != 0:
    print(fail.message)


# ### Store the mean value between the changepoints

# In[27]:

m = ntau[0]
change_points = [0] + tau[0:m-1].tolist()
mean = []
std = []
for i in range(0, m):
    mean.append(sparam[2*(i+1) - 2])
    std.append(sparam[2*(i+1) - 1])
# Repeat the final values so the last segment is included when plotting the mean as a step.
change_points.append(len(all_data[price_type].values) - 1)
std.append(std[-1])
mean.append(mean[-1])


# ### Plot the changepoints and the mean value between them

# In[28]:

fig = plt.figure()
fig.set_size_inches(16, 10)
plt.plot(all_dates, all_data[price_type].values, 'g',
         label='Stock Price - {}'.format(price_type))
for change_point in change_points:
    plt.axvline(all_dates[change_point], color='r')
# Re-draw the last changepoint with a label so the legend contains a single entry.
plt.axvline(all_dates[change_point], color='r', label='Changepoints')
plt.step(all_dates[change_points], mean, where='post', label='Mean')
plt.title('Volkswagen Stock Price with Interpolated Values and Changepoints')
plt.legend()
plt.ylabel('Euro ' + u"(\u20AC)")
plt.xlabel('Date')
plt.show()


# ## A plot showing the changepoint that occurred at the time of the 2015 emissions scandal

# In[29]:

fig = plt.figure()
fig.set_size_inches(16, 10)
plt.plot(all_dates[change_points[-3]:], all_data[price_type].values[change_points[-3]:],
         'g', label='Stock Price - {}'.format(price_type))
for change_point in change_points[-3:]:
    plt.axvline(all_dates[change_point], color='r')
# Re-draw the last changepoint with a label so the legend contains a single entry.
plt.axvline(all_dates[change_point], color='r', label='Changepoints')
plt.step(all_dates[change_points][-3:], mean[-3:], where='post', label='Mean')
plt.title('Volkswagen Stock Price with Interpolated Values and Changepoints')
plt.legend()
plt.ylabel('Euro ' + u"(\u20AC)")
plt.xlabel('Date')
plt.show()


# In[30]:

print("The change point occurs on {}.".format(all_dates[change_points][-3:][1]))
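
# ### (Aside) Summarising the detected segments
# Finally, the segment boundaries and the per-segment mean and standard deviation returned by
# nag_tsa_cp_pelt can be collected into a small DataFrame for inspection. This is a minimal
# sketch using only the variables computed above: it drops the duplicated final entries that
# were appended purely for plotting, and the dates shown are approximate segment boundaries
# based on the same indexing used in the plots.

# In[ ]:

segments = pd.DataFrame({
    'segment_start': all_dates[change_points[:-1]],  # approximate first date of each segment
    'segment_end': all_dates[change_points[1:]],     # approximate last date of each segment
    'mean': mean[:-1],                               # per-segment mean from sparam
    'std': std[:-1],                                 # per-segment standard deviation from sparam
})
print(segments.tail())
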