%pylab inline
%load_ext rmagic
%R library(astsa)
%R library(forecast)
Populating the interactive namespace from numpy and matplotlib
This is forecast 4.8
where $w_t$ is the random noise (e.g., following iid, but not necessarily), and $z_{t1}, z_{t2}, ..., z_{tq}$ are possible inputs or indepdent series
%%R
## find linear trend in temprature
data(gtemp)
fit <- lm(gtemp ~ time(gtemp))
plot(gtemp, type="o", ylab="Global Temp Deviation")
abline(fit)
print(summary(fit))
Call: lm(formula = gtemp ~ time(gtemp)) Residuals: Min 1Q Median 3Q Max -0.31946 -0.09722 0.00084 0.08245 0.29383 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.120e+01 5.689e-01 -19.69 <2e-16 *** time(gtemp) 5.749e-03 2.925e-04 19.65 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.1251 on 128 degrees of freedom Multiple R-squared: 0.7511, Adjusted R-squared: 0.7492 F-statistic: 386.3 on 1 and 128 DF, p-value: < 2.2e-16
%%R -w 500 -h 500 -u px
## Polution, Temperature and Mortality
data(cmort); data(tempr); data(part)
par(mfrow=c(3, 1))
plot(cmort, main = "Cardiovascular Motality")
plot(tempr, main = "Temperature")
plot(part, main = "particulates")
pairs(data.frame(Mortality=cmort,
Temperature=tempr,
Particulates=part))
fit <- lm(cmort ~ time(cmort) + tempr + tempr^2 + part)
print(summary(fit))
plot(residuals(fit), type="l")
Acf(residuals(fit))
Call: lm(formula = cmort ~ time(cmort) + tempr + tempr^2 + part) Residuals: Min 1Q Median 3Q Max -19.3902 -4.6295 -0.7917 3.8028 27.5423 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2931.13282 211.32125 13.87 <2e-16 *** time(cmort) -1.42873 0.10703 -13.35 <2e-16 *** tempr -0.45219 0.03343 -13.53 <2e-16 *** part 0.26811 0.01993 13.46 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 6.772 on 504 degrees of freedom Multiple R-squared: 0.544, Adjusted R-squared: 0.5413 F-statistic: 200.4 on 3 and 504 DF, p-value: < 2.2e-16
%%R
## Regression with Lagged Variables
data(soi); data(rec)
## ccf reveals that SOI is 6 months leading of REcuritment
par(mfrow=c(1, 1))
r <- ccf(soi, rec, lag.max = 50)
## use ts.intersect to align data prior to regression
rec.soi <- ts.intersect(rec, soi.lag6 = lag(soi, -6), dframe = T)
fit <- lm(rec ~ soi.lag6, data = rec.soi)
print(summary(fit))
plot.ts(rec[-(1:6)], col = 1, lty = 1)
lines(predict(fit, rec.soi), col = 2, lty = 2)
legend("topleft", c("real", "predict"), col=1:2, lty=1:2)
Call: lm(formula = rec ~ soi.lag6, data = rec.soi) Residuals: Min 1Q Median 3Q Max -65.187 -18.234 0.354 16.580 55.790 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 65.790 1.088 60.47 <2e-16 *** soi.lag6 -44.283 2.781 -15.92 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 22.5 on 445 degrees of freedom Multiple R-squared: 0.3629, Adjusted R-squared: 0.3615 F-statistic: 253.5 on 1 and 445 DF, p-value: < 2.2e-16
The crucial assumption for many time series analysis is that their mean and autocovariance/crosscovariance functions satisfy the conditions of weakly stationarity (for at least some reasonalbe strech of time)
If this is not the case, there are some methods for playing down the effects of nonstartionarity so the stationary properties of the series may be studied.
EDA is a way of looking at the data to validate their assumptions
Things to consider to make nonstationary process stationary
first order difference eliminates a linear trend, whereas a second difference can eliminate a quardatic trend, to see why, define backshift operator $B$ as $$Bx_t = x_{t-1}; {\triangledown}x_t = (1-B)x_t$$ $$so\ B^kx_t=x_{t-k}\ and\ $$ $${\triangledown}^2x_t=(1-B)^2x_t=(1-2B+B^2)x_t=x_t-2x_{t-1}+x_{t-2}$$
log
transformation is particularly useful because it tends to suppress larger fluctuations that occur over portions of the series where the underlying values are largerAcf & Scatterplot Matrices: The ACF gives a profile of the linear correlation at all possible lags and shows which values of $h$ lead to the best predictability. The restriction of this idea to linear predictability, however, may mask a possible nonlinear relation between current values, $x_t$, and past values, $x_{t−h}$. This idea extends to two series where one may be interested in examining scatterplots of yt versus $x_{t−h}$
Frequency domain such as Periodogram: try the frequency
so the corresponding components are $\{cos(2\pi{\omega}t), sin(2\pi\omega t)\}$. The point, however, is that if the data contain any cyclic behavior we are likely to catch it by performing these saturated regressions.
%%R
## examples of nonstartionary series from real world example
data(jj)
plot(jj, xlab = "")
mtext(side = 3, line = 1, "JJ series (NONSTATIONARY): \n
Exponentially Increasing Trend and Increasing Variance")
%%R
## detrend the global temperature
## find the linear trend
fit <- lm(gtemp ~ time(gtemp))
par(mfrow = c(3, 1))
plot(gtemp, type = 'o', main = 'original gtemp')
plot(residuals(fit), type = 'o', main = 'detrended')
plot(diff(gtemp), type = 'o', main = 'first difference')
par(mfrow = c(3, 1))
Acf(gtemp, lag.max=50, main = "gtemp")
Acf(resid(fit), 50, main = 'detrended')
Acf(diff(gtemp), 50, main = "first difference")
%%R
## Acf and Scatterplot matrix of SOI
Acf(soi, lag.max=50, main = "acf finds linearly predictability only")
lag.plot(soi, 12, diag.col = "red")
%%R
## ccf and Scatterplot matrix of Rec ~ SOI_lags
%%R
## Decomposition by triangles to recover signal and noise
set.seed(1)
## assume omega(frequency) = 1/500 is known, but phase phi
## altitude A and noise level are unknown
x <- 2*cos(2*pi*(1:500)/50 + .6*pi) + rnorm(500, 0, 5)
z1 <- cos(2*pi*(1:500)/50); z2 <- sin(2*pi*(1:500)/50)
fit <- lm(x ~ z1 + z2 + 0) # zero to exclude the intercept
plot.ts(x, lty = 2)
lines(fitted(fit), lwd=2)
%%R
## use peridogram to discover signal in noise
## this time we dont know the frequency omega
set.seed(1)
x <- 2*cos(2*pi*(1:500)/50 + .6*pi) + rnorm(500, 0, 5)
I = abs(fft(x))^2/500 # the periodogram
P = (1/500)*I[1:250] # the scaled periodogram
f = (0:249)/500
plot(f, P, type = 'h')
%%R
## moving average smoother to find trend and seansonal components
x <- cmort ## weekly data
## monthly
x5 <- filter(x, sides = 2,
method = "convolution", filter = rep(1, 5)/5)
## yearly
x53 <- filter(x, sides = 2,
method = "convolution", filter = rep(1, 53)/53)
plot(x, col = 1, type = 'p')
lines(x5, col = 2)
lines(x53, col = 3)