In this lecture we will cover
# Imports
import warnings
warnings.filterwarnings('ignore')
from io import BytesIO
import requests
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.integrate import quad
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# Fix NumPy's global RNG so every run of the notebook produces identical samples
np.random.seed(0) # Reproducibility
A time series X is weakly stationary if:
def run_sequence_plot(x, y, title, xlabel="Time", ylabel="Values", ax=None):
    """Draw a run-sequence (value vs. time) line plot.

    A new 10 x 3.5 inch figure is created when no axis is supplied.
    Returns the matplotlib Axes so callers can add more artists to it.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(10, 3.5))
    ax.plot(x, y, 'k-')
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    ax.grid(alpha=0.3)
    return ax
First, we create an array `time` that defines equally-spaced time intervals.
T = 200
# Time axis and a pure white-noise (hence stationary) series of length T
time = np.arange(T)
stationary = np.random.normal(loc=0, scale=1.0, size=(T))

# Plot the series together with its (constant) mean and a 95% band around it
ax = run_sequence_plot(time, stationary, title="Stationary TS")
mu = stationary.mean()
sd = stationary.std()
ax.plot(time, np.full(time.shape, mu), linewidth=2, color='tab:red', label='Mean');
ax.fill_between(time, np.full(time.shape, mu - 1.96 * sd),
                np.full(time.shape, mu + 1.96 * sd),
                color='tab:red', alpha=0.2, label='std')
plt.legend();
The covariance of X(t1) and X(t2) is called autocovariance and is slightly different from the autocorrelation.
Example: constant autocorrelation
Example: time-varying autocorrelation
# ArmaProcess expects *lag-polynomial* coefficients, so the AR part is given
# as [1, -phi]: ar1 = [1, -0.8] defines X_t = 0.8*X_{t-1} + e_t.
ar1 = np.array([1, -0.8])
ma1 = np.array([1])  # MA polynomial = 1 -> no moving-average component
AR_object1 = ArmaProcess(ar1, ma1)
constant_autocorr_ts = AR_object1.generate_sample(nsample=200)
run_sequence_plot(time, constant_autocorr_ts, title="Time series with constant autocorrelation ($X$)");
# Two different AR(1) processes, 100 samples each
ar2 = np.array([1, -0.9])
AR_object2 = ArmaProcess(ar2, ma1)
ar3 = np.array([1, 0.3])
AR_object3 = ArmaProcess(ar3, ma1)
time_dependent_autocorr_ts_1 = AR_object2.generate_sample(nsample=100)
time_dependent_autocorr_ts_2 = AR_object3.generate_sample(nsample=100)
# Concatenating the halves yields a series whose autocorrelation changes at t=100
time_dependent_autocorr_ts = np.concatenate([time_dependent_autocorr_ts_1,
                                             time_dependent_autocorr_ts_2])
run_sequence_plot(time, time_dependent_autocorr_ts, title="Time series with time-dependent autocorrelation ($Y$)");
A run-sequence plot, such as the one produced by `run_sequence_plot`, is an excellent starting point.
run_sequence_plot(time, stationary, title="White noise");
# Starting level of the series
seed = 3.14
# Random walk: each observation equals the previous one plus Gaussian noise
rand_walk = np.empty_like(time, dtype='float')
for t in time:
    step = np.random.normal(loc=0, scale=2.5, size=1)[0]
    rand_walk[t] = seed + step
    seed = rand_walk[t]
run_sequence_plot(time, rand_walk, title="Random Walk");
Dependence on Initial Value
Changing variance
At t=2 we have:
At a general time t we have:
# A deterministic linear trend on top of the white-noise component
trend = 2.75 * time + stationary
run_sequence_plot(time, trend, title="Nonstationary data with trend");

# First half has unit standard deviation, second half ten times as much
level_1 = np.random.normal(loc=0, scale=1.0, size=100)
level_2 = np.random.normal(loc=0, scale=10.0, size=100)
heteroscedasticity = np.concatenate([level_1, level_2])
run_sequence_plot(time, heteroscedasticity, title="Heteroscedastic time series");

# Sinusoid with period 12, oscillating around 20 with amplitude 20
seasonality = 20 + 20 * np.sin(2 * np.pi * time / 12)
run_sequence_plot(time, seasonality, title="Time series with seasonality");
The seasonal series oscillates around a constant level (20 in our case) with a constant amplitude (also 20).
trend_seasonality = trend + seasonality + stationary
# Run-sequence plot of the combined trend + seasonality + noise series
run_sequence_plot(time, trend_seasonality, title="Time series with trend and seasonality");
# Coefficients chosen arbitrarily for the demo
R = 1.3
lamb = 3.5
t = 4

def func(psi):
    """Integrand: the sinusoid evaluated at phase offset psi."""
    return R * np.sin(lamb * t + psi)

# Integrate over one full period of the phase, [-pi, pi]; the positive and
# negative halves of the sine cancel, so the result is numerically zero.
mean, _ = quad(func, -np.pi, np.pi) # Change the interval to get an expectation != 0
print(mean)
6.975736996017264e-16
R = 3
lamb = 6.5
t = 4

def func_sin2(psi):
    """Integrand for the variance: squared sinusoid scaled by 1/(2*pi)."""
    return ((R * np.sin(lamb * t + psi)) ** 2) / (2 * np.pi)

# E[X^2] under a uniform phase on [-pi, pi]; analytically this is R**2 / 2
variance, _ = quad(func_sin2, -np.pi, np.pi)
print(variance)
4.5
In conclusion, the time series $X(t) = \mu + R\sin(\lambda t + \psi)$ is weakly stationary:
What does $X(t) = \mu + R\sin(\lambda t + \psi)$ look like?
R = 1.3
lamb = 3.5
# NOTE(review): this draws a *new* phase for every time step (len(time) draws),
# while the theoretical X(t) = mu + R*sin(lambda*t + psi) uses a single random
# psi per realization -- confirm the per-step draw is intentional for the demo.
phi = np.random.uniform(-np.pi, np.pi, len(time))
cyc = R * np.sin(lamb * time + phi)
run_sequence_plot(time, cyc, title="Sinusoidal time series with random phase");
📨 Take-away message
Next, we will review practical techniques used to identify if a time series is stationary or not.
Specifically, we will cover:
A visual inspection with `run_sequence_plot` is the first check.
This is what a stationary time series looks like.
This is what a non-stationary time series looks like.
Let's compute summary statistics on the `trend` time series that we created before.
# split data into 10 chunks
# trend has T=200 points, so each of the 10 chunks holds 20 consecutive values
chunks = np.split(trend, indices_or_sections=10)
Summary statistics:
# Tabulate per-chunk mean and variance; for a stationary series both stay flat
header = "{}\t | {}\t\t | {}".format("Chunk", "Mean", "Variance")
print(header)
print("-" * 35)
for i, chunk in enumerate(chunks, 1):
    print(f"{i:2}\t | {np.mean(chunk):.5}\t | {np.var(chunk):.5}")
Chunk | Mean | Variance ----------------------------------- 1 | 26.694 | 241.23 2 | 81.181 | 255.62 3 | 135.73 | 258.47 4 | 190.77 | 257.08 5 | 246.55 | 257.82 6 | 301.62 | 259.15 7 | 356.19 | 249.28 8 | 411.09 | 244.11 9 | 466.14 | 245.37 10 | 520.99 | 275.11
Example: p=0.0001→ reject H0→ the time series is stationary
Example: p=0.43→ fail to reject H0→ the time series is non-stationary
# Augmented Dickey-Fuller test on the white-noise series. adfuller returns,
# in order: test statistic, p-value, lags used, number of observations,
# dict of critical values, and the maximized information criterion (icbest).
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(stationary)
print(f"ADF: {adf:.2f}")
ADF: -13.56
`adf` is the value of the test statistic.
`pvalue` is interpreted as any p-value should be.
print(f"p-value: {pvalue}")
p-value: 2.300172138070656e-25
`pvalue` is interpreted like any p-value: it should be compared with the chosen significance level (e.g., α=0.05 or α=0.01). Here the `pvalue` is very close to zero, so we reject H0 in favor of HA and conclude that the time series is stationary.
print(f"nobs: {nobs}")
nobs: 199
`nobs` is simply the number of observations in the time series.
print(f"critical values: {critical_values}")
critical values: {'1%': np.float64(-3.4636447617687436), '5%': np.float64(-2.8761761179270766), '10%': np.float64(-2.57457158581854)}
`critical_values` are the test statistic thresholds for common significance levels. We will not use `usedlag` and `icbest` here.
Next, let's apply the test to the `trend` time series.
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(trend, regression='c')
# Positive statistic, p close to 1 -> fail to reject H0: `trend` is nonstationary
print(f"ADF: {adf:.2f}")
print(f"p-value: {pvalue:.3f}")
ADF: 0.82 p-value: 0.992
As expected, the test indicates that the `trend` time series is nonstationary.
Next, let's test the `rand_walk` time series.
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(rand_walk, regression='c')
# p well above 0.05 -> fail to reject H0: the random walk is nonstationary
print(f"ADF: {adf:.2f}")
print(f"p-value: {pvalue:.3f}")
ADF: -1.03 p-value: 0.741
Transformation | Effect |
---|---|
Subtract trend | Constant mean |
Apply log | Constant variance |
Differencing | Remove autocorrelation |
Seasonal differencing | Remove periodic component |
# ADF test on the raw trend + seasonality series, before any transformation
adf_b4, pvalue_b4, _, _, _, _ = adfuller(trend_seasonality)
print(f"ADF: {adf_b4:.2f}")
print(f"p-value: {pvalue_b4:.3f}")
ADF: 1.01 p-value: 0.994
# Additive decomposition: series = trend + seasonal + residual,
# with one full seasonal cycle every 12 samples.
ss_decomposition = seasonal_decompose(x=trend_seasonality, model='additive', period=12)
est_trend = ss_decomposition.trend
est_seasonal = ss_decomposition.seasonal
est_residual = ss_decomposition.resid
run_sequence_plot(time, est_trend, title="Trend", ylabel="series");
run_sequence_plot(time, est_seasonal, title="Seasonality", ylabel="series");
run_sequence_plot(time, est_residual, title="Residuals", ylabel="series");
# The first and last 6 residuals are NaN: the trend is estimated with a
# centered moving average that is undefined at the edges (see output below).
print(est_residual)
[ nan nan nan nan nan nan 1.24368173 -2.62651313 -0.10234455 -0.73531189 -0.1829664 2.04130761 -0.65755141 -0.46722156 0.39903068 0.38172731 2.54722749 -0.46031003 1.15158029 -2.95123128 -3.8243952 0.80129202 2.15850427 -1.56043208 3.0359116 -3.14443182 -0.42647195 -0.79241057 2.89305425 3.06029956 0.96135096 -0.50108732 -0.85817676 -4.86262436 -0.47675426 0.62469458 1.86733165 3.19296905 -0.18983981 -0.16068445 -1.78136305 -2.23271451 -2.10183683 3.79310173 1.39263034 -0.34341253 -1.17970353 2.45314096 -3.79055414 0.35619929 -0.97123041 1.50295994 -0.48073187 -1.50897278 1.33062156 0.40333605 2.04519325 0.82223149 -0.03721313 0.02072604 -1.83124467 0.28260905 -0.58713007 -2.29219567 1.34745638 0.30238082 -1.82829117 0.50552599 0.09712559 -0.03657677 2.21367333 0.75259989 1.3771748 -2.08987195 1.19900202 -1.00537537 -1.50116216 -0.48861264 0.67380781 -0.65481532 -1.06292965 1.09813623 1.09779453 -3.38842903 1.23096044 3.31408846 1.62739389 -1.19160617 -3.04549638 1.29505225 -1.16583223 0.44363104 0.87222682 0.58400786 -0.05579167 0.23568169 -2.18513392 2.81637873 -0.40616749 0.19770027 3.06858161 -2.93997013 -2.30549491 0.26624964 -1.77828607 2.40403475 -1.22835495 -2.30177972 1.38285927 1.78578503 2.35625379 0.45059171 -3.06972949 2.51600513 -1.33226534 -0.62157147 2.37390149 -1.59898418 0.78567536 1.19671763 -1.01040306 -2.6779518 0.01217197 2.06368639 -1.98478575 -0.47973971 -0.41317317 2.37486029 2.36460554 0.10283292 -1.40829286 0.74603065 -3.04663694 -0.10097141 -1.05166821 1.76970428 1.41428445 0.02640765 1.4903752 -3.5927001 -2.08195289 -0.09839805 0.37255568 1.12512109 3.38001307 1.6649115 -2.38660082 1.73553203 -3.0777606 -0.97087869 0.54685379 2.60933743 0.24150924 -1.63684448 0.77593821 -0.9844951 1.01053918 -1.99450958 -1.99487784 -0.65557883 -0.91511512 4.35653519 2.98276586 -0.6365906 -1.19121881 0.84886207 -2.12084827 -3.47078436 1.07403998 0.80507599 1.88740267 0.7949761 1.87859292 -1.04476392 -1.2947557 0.63231715 0.29378516 -1.00038758 0.59418834 
1.27579846 -0.68317918 -1.30938778 0.15957658 -3.17218197 2.33379231 -1.80387318 -0.33741986 0.1581179 0.82029446 3.25311048 -1.70643665 0.83606969 -1.55215866 -2.8317032 nan nan nan nan nan nan]
The residuals contain `nan` values at both ends because the trend is estimated with a centered moving average of width `period=12`. We drop the first and last 6 values before applying the ADF test.
adf_after, pvalue_after, _, _, _, _ = adfuller(est_residual[6:-6])
print(f"ADF: {adf_after:.2f}")
print(f"p-value: {pvalue_after:.3f}")  # p ~ 0 -> the residuals are stationary
ADF: -6.61 p-value: 0.000
# Replot the heteroscedastic series and test it with ADF; the p-value printed
# below (> 0.05) means we cannot conclude stationarity for this series.
run_sequence_plot(time, heteroscedasticity, title="Heteroskedastic time series");
adf_b4, pvalue_b4, _, _, _, _ = adfuller(heteroscedasticity)
print(f"ADF: {adf_b4:.2f}")
print(f"p-value: {pvalue_b4:.3f}")
ADF: -2.49 p-value: 0.118
⚠ Attention
# Ensure all data are positive
# (log is defined only for positive values; shifting so the minimum becomes
# exactly 1.0 guarantees this without changing the shape of the series)
new_hetero = heteroscedasticity - heteroscedasticity.min() + 1.0
# Apply the log
log_new_hetero = np.log(new_hetero)
run_sequence_plot(time, log_new_hetero, title="Log heteroskedastic time series");
📨 Takeaway message:
We saw that the `rand_walk` time series is nonstationary. We can transform `rand_walk` into a stationary time series by applying differencing. Since `rand_walk` was created with a lag of 1, we difference with lag 1 as well.
difference = rand_walk[:-1] - rand_walk[1:]
# NOTE(review): `difference` above is rand_walk[:-1] - rand_walk[1:], i.e. the
# *negated* first difference (x_t - x_{t+1}); the ADF statistic is unaffected
# by a sign flip, but the conventional form is x_t - x_{t-1} -- confirm.
run_sequence_plot(time[:-1], difference, title="Random walk after differencing");
adf_after, pvalue_after, _, _, _, _ = adfuller(difference)
print(f"ADF: {adf_after:.2f}")
print(f"p-value: {pvalue_after:.3f}")
ADF: -14.60 p-value: 0.000
# Toy data: a linear column and its square, used to illustrate differencing
df = pd.DataFrame({'original': list(range(1, 11)),
                   'squared': [n ** 2 for n in range(1, 11)]})
df
original | squared | |
---|---|---|
0 | 1 | 1 |
1 | 2 | 4 |
2 | 3 | 9 |
3 | 4 | 16 |
4 | 5 | 25 |
5 | 6 | 36 |
6 | 7 | 49 |
7 | 8 | 64 |
8 | 9 | 81 |
9 | 10 | 100 |
We apply first-order differencing to the linear column (`original`) so that mean and variance don't change across sub-windows.
df.original.diff().to_frame("1st order difference")
1st order difference | |
---|---|
0 | NaN |
1 | 1.0 |
2 | 1.0 |
3 | 1.0 |
4 | 1.0 |
5 | 1.0 |
6 | 1.0 |
7 | 1.0 |
8 | 1.0 |
9 | 1.0 |
💡 NOTE: This is similar to taking a first-order derivative.
# Differencing twice turns the quadratic `squared` column into a constant 2.0,
# analogous to taking the second derivative of x**2
df.squared.diff().diff().to_frame("2nd order difference")
2nd order difference | |
---|---|
0 | NaN |
1 | NaN |
2 | 2.0 |
3 | 2.0 |
4 | 2.0 |
5 | 2.0 |
6 | 2.0 |
7 | 2.0 |
8 | 2.0 |
9 | 2.0 |
We can stabilize the growth of the quadratic column (`squared`) with a log transform.
np.log(df.squared).to_frame("log")
log | |
---|---|
0 | 0.000000 |
1 | 1.386294 |
2 | 2.197225 |
3 | 2.772589 |
4 | 3.218876 |
5 | 3.583519 |
6 | 3.891820 |
7 | 4.158883 |
8 | 4.394449 |
9 | 4.605170 |
Finally, we combine seasonal differencing and first-order differencing on the `trend_seasonality` time series.
trend_seasonality_d12 = trend_seasonality[:-12] - trend_seasonality[12:] # remove seasonality
# NOTE(review): as with the seasonal step above, this computes x_t - x_{t+1},
# the *negated* first difference; the stationarity conclusion is unaffected.
trend_seasonality_d12_d1 = trend_seasonality_d12[:-1] - trend_seasonality_d12[1:] # remove trend
# Plot the differenced time series
run_sequence_plot(time[:-13], trend_seasonality_d12_d1, title="Seasonal + 1st order differencing");
# ADF on the doubly differenced series: p ~ 0 -> stationary
adf_after, pvalue_after, _, _, _, _ = adfuller(trend_seasonality_d12_d1)
print(f"ADF: {adf_after:.2f}")
print(f"p-value: {pvalue_after:.3f}")
ADF: -5.28 p-value: 0.000
In this lecture we learned:
Download the two time series `stationarity_ts1` and `stationarity_ts2` by running the code below.
# Load the first time series
# Download each .npy file and load it into a NumPy array.
# A timeout is set so a stalled connection raises instead of hanging forever.
response = requests.get(
    "https://zenodo.org/records/10897398/files/stationarity_ts1.npy?download=1",
    timeout=30,
)
response.raise_for_status()  # fail loudly on HTTP errors (404, 500, ...)
stationarity_ts1 = np.load(BytesIO(response.content))
print(len(stationarity_ts1))

# Load the second time series
response = requests.get(
    "https://zenodo.org/records/10897398/files/stationarity_ts2.npy?download=1",
    timeout=30,
)
response.raise_for_status()
stationarity_ts2 = np.load(BytesIO(response.content))
print(len(stationarity_ts2))
100 100
Use the following tools to determine if the time series stationarity_ts1
and stationarity_ts2
are stationary or not.
Discuss the result obtained with each method.
If either or both datasets from exercises one and two are nonstationary, apply the transformations you learned in this section to make them so. Then apply the methods you learned to ensure stationarity.