import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.tsa.api as smt
from arch import arch_model
# Load the train/test log-return series as pd.Series
# (read_csv's squeeze argument was removed in pandas 2.x, so squeeze the single column instead)
train = pd.read_csv('train.csv', index_col=0, parse_dates=[0]).squeeze("columns")
test = pd.read_csv('test.csv', index_col=0, parse_dates=[0]).squeeze("columns")
def tsplot(y, title, lags=None, figsize=(15, 10), style='bmh'):
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    with plt.style.context(style):
        fig = plt.figure(figsize=figsize)
        layout = (2, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.05)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.05, method="yw")
        acf_ax.set_title('Autocorrelation')
        pacf_ax.set_title('Partial Autocorrelation')
        plt.tight_layout()
    return
import statsmodels.api as sm
# QQ-plot of the standardized log returns against the normal distribution
std = np.std(train)
sm.qqplot(train / std, line='45');
tsplot(train, "Log Returns, ACF, and PACF")
smt.adfuller(train)
(-23.36845488643965, 0.0, 9, 4517, {'1%': -3.4317985322285707, '5%': -2.862180079546209, '10%': -2.567110717704876}, -26345.66930578957)
The test statistic is -23.37 with a p-value of essentially zero, so we reject the null hypothesis of a unit root: the series is already stationary and needs no differencing (d = 0).
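The raw tuple above is hard to read. As a small sketch (the helper `print_adf` below is not part of the original notebook), we can label its elements, since adfuller returns the test statistic, p-value, number of lags used, number of observations, the critical values, and the information criterion of the selected lag order:

def print_adf(result):
    """Pretty-print the tuple returned by statsmodels' adfuller."""
    stat, pvalue, usedlag, nobs, crit, icbest = result
    print(f"ADF statistic: {stat:.4f}")
    print(f"p-value:       {pvalue:.4g}")
    print(f"lags used:     {usedlag}, observations: {nobs}")
    for level, cv in crit.items():
        print(f"critical value ({level}): {cv:.4f}")

print_adf(smt.adfuller(train))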
import warnings
warnings.filterwarnings('ignore')
order_sel = sm.tsa.arma_order_select_ic(train, ic=["aic", "bic"], max_ar=2, max_ma=2)
p_bic, q_bic = order_sel.bic_min_order
p_aic, q_aic = order_sel.aic_min_order
d = 0  # no differencing (the series is already stationary)
print('BIC Order', (p_bic,d,q_bic))
print('AIC Order', (p_aic,d,q_aic))
BIC Order (0, 0, 0)
AIC Order (0, 0, 2)

BIC favors a pure white-noise model, while AIC selects two MA terms; we proceed with the AIC order below.
print('AIC Model')
model_aic = smt.ARIMA(train, order=(p_aic, d, q_aic))
results_aic = model_aic.fit()
print(results_aic.summary())
AIC Model
                               SARIMAX Results
==============================================================================
Dep. Variable:             LogReturns   No. Observations:                 4527
Model:                 ARIMA(0, 0, 2)   Log Likelihood               13208.475
Date:                Thu, 28 Apr 2022   AIC                         -26408.950
Time:                        17:58:55   BIC                         -26383.279
Sample:                             0   HQIC                        -26399.907
                               - 4527
Covariance Type:                  opg
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0002      0.000      1.129      0.259      -0.000       0.001
ma.L1         -0.0116      0.008     -1.450      0.147      -0.027       0.004
ma.L2         -0.0616      0.008     -7.526      0.000      -0.078      -0.046
sigma2         0.0002   1.69e-06    101.494      0.000       0.000       0.000
===================================================================================
Ljung-Box (L1) (Q):                   0.07   Jarque-Bera (JB):             13723.11
Prob(Q):                              0.79   Prob(JB):                         0.00
Heteroskedasticity (H):               0.28   Skew:                            -0.03
Prob(H) (two-sided):                  0.00   Kurtosis:                        11.53
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
tsplot(results_aic.resid, title="ARIMA Residuals", lags=5)
# Ljung-Box and Box-Pierce tests for autocorrelation in the ARIMA residuals
sm.stats.acorr_ljungbox(results_aic.resid, lags=5, boxpierce=True, return_df=True)
| lag | lb_stat  | lb_pvalue | bp_stat  | bp_pvalue |
|-----|----------|-----------|----------|-----------|
| 1   | 0.069021 | 0.792767  | 0.068976 | 0.792834  |
| 2   | 0.127238 | 0.938363  | 0.127141 | 0.938408  |
| 3   | 0.306741 | 0.958756  | 0.306446 | 0.958812  |
| 4   | 2.168965 | 0.704715  | 2.166203 | 0.705221  |
| 5   | 3.122665 | 0.681081  | 3.118428 | 0.681733  |
Up to lag 5, the ACF and PACF of the residuals show no significant coefficients. The Ljung-Box and Box-Pierce tests also have large p-values, so we cannot reject the null of no autocorrelation and can treat the residual sequence as white noise.
# Repeat the tests on the squared residuals to check for ARCH effects
sm.stats.acorr_ljungbox(results_aic.resid ** 2, lags=10, boxpierce=True, return_df=True)
| lag | lb_stat     | lb_pvalue     | bp_stat     | bp_pvalue     |
|-----|-------------|---------------|-------------|---------------|
| 1   | 281.246936  | 4.016698e-63  | 281.060639  | 4.410279e-63  |
| 2   | 603.699672  | 8.096210e-132 | 603.228586  | 1.024653e-131 |
| 3   | 762.998708  | 4.578316e-165 | 762.351756  | 6.324205e-165 |
| 4   | 899.407470  | 2.238930e-193 | 898.579804  | 3.383507e-193 |
| 5   | 1008.370434 | 9.261532e-216 | 1007.374355 | 1.521724e-215 |
| 6   | 1123.685389 | 1.565177e-239 | 1122.485618 | 2.845536e-239 |
| 7   | 1201.343367 | 3.617657e-255 | 1199.989275 | 7.099737e-255 |
| 8   | 1304.572639 | 2.414423e-276 | 1302.990617 | 5.305990e-276 |
| 9   | 1354.352501 | 5.625355e-286 | 1352.649575 | 1.312279e-285 |
| 10  | 1484.584041 | 0.000000e+00  | 1482.536054 | 0.000000e+00  |
# Engle's ARCH-LM test for conditional heteroskedasticity in the residuals
etest = sm.stats.diagnostic.het_arch(results_aic.resid, nlags=10)
print('pval Lagrange: ' + str(np.round(etest[1], 4)))
print('pval F-test: ' + str(np.round(etest[3], 4)))
etest
pval Lagrange: 0.0
pval F-test: 0.0
(622.7549183324542, 2.3389558452228067e-127, 72.05847611430839, 2.766030135813962e-137)
The Ljung-Box test on the squared residuals suggests they are strongly autocorrelated, and the Engle test confirms that the residuals exhibit conditional heteroskedasticity. There are ARCH effects, so a volatility model is needed.
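For reference (this equation is not stated in the original notebook; it is the standard textbook definition of the model fitted below), a constant-mean GARCH(2, 2) assumes

\varepsilon_t = \sigma_t z_t, \quad z_t \sim N(0, 1)

\sigma_t^2 = \omega + \alpha_1 \varepsilon_{t-1}^2 + \alpha_2 \varepsilon_{t-2}^2 + \beta_1 \sigma_{t-1}^2 + \beta_2 \sigma_{t-2}^2

where \varepsilon_t are the demeaned (here, scaled ARIMA residual) innovations and \sigma_t^2 is the conditional variance.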
Note: we fit the model on residuals scaled by 100; GARCH optimizers tend to converge more reliably when the data are not too close to zero.
# GARCH(2, 2) with a constant mean, fitted to the scaled ARIMA residuals
gm = arch_model(results_aic.resid * 100, p=2, q=2)
res_scaled = gm.fit()
Optimization terminated successfully    (Exit mode 0)
            Current function value: 6785.9600735354
            Iterations: 18
            Function evaluations: 142
            Gradient evaluations: 18
print(res_scaled.summary())
                     Constant Mean - GARCH Model Results
==============================================================================
Dep. Variable:                   None   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:               -6785.96
Distribution:                  Normal   AIC:                           13583.9
Method:            Maximum Likelihood   BIC:                           13622.4
                                        No. Observations:                 4527
Date:                Thu, Apr 28 2022   Df Residuals:                     4526
Time:                        18:03:35   Df Model:                            1
                                 Mean Model
============================================================================
                 coef    std err          t      P>|t|      95.0% Conf. Int.
----------------------------------------------------------------------------
mu             0.0301  1.483e-02      2.030  4.233e-02 [1.043e-03,5.918e-02]
                              Volatility Model
=============================================================================
                 coef    std err          t      P>|t|       95.0% Conf. Int.
-----------------------------------------------------------------------------
omega          0.0153  6.743e-03      2.264  2.360e-02  [2.047e-03,2.848e-02]
alpha[1]       0.0724  2.899e-02      2.497  1.252e-02    [1.557e-02,  0.129]
alpha[2]   5.3297e-12  1.778e-02  2.998e-10      1.000 [-3.484e-02,3.484e-02]
beta[1]        0.7434      0.303      2.452  1.421e-02      [  0.149,  1.338]
beta[2]        0.1746      0.304      0.575      0.565      [ -0.420,  0.770]
=============================================================================

Covariance estimator: robust
# Standardize the GARCH residuals by the fitted conditional volatility
garch_resid = res_scaled.resid
st_garch_resid = garch_resid / res_scaled.conditional_volatility
fig = res_scaled.plot()
# Ljung-Box / Box-Pierce tests on the squared standardized residuals
sm.stats.acorr_ljungbox(st_garch_resid ** 2, lags=10, boxpierce=True, return_df=True)
| lag | lb_stat   | lb_pvalue | bp_stat   | bp_pvalue |
|-----|-----------|-----------|-----------|-----------|
| 1   | 0.807515  | 0.368856  | 0.806981  | 0.369014  |
| 2   | 7.344085  | 0.025424  | 7.337777  | 0.025505  |
| 3   | 7.415846  | 0.059761  | 7.409458  | 0.059931  |
| 4   | 8.148070  | 0.086302  | 8.140712  | 0.086558  |
| 5   | 8.151305  | 0.148092  | 8.143943  | 0.148479  |
| 6   | 8.511495  | 0.202972  | 8.503497  | 0.203486  |
| 7   | 8.743913  | 0.271569  | 8.735453  | 0.272212  |
| 8   | 9.002884  | 0.342053  | 8.993852  | 0.342815  |
| 9   | 9.747039  | 0.371349  | 9.736200  | 0.372260  |
| 10  | 10.212696 | 0.422035  | 10.200623 | 0.423072  |
etest_garch = sm.stats.diagnostic.het_arch(st_garch_resid, nlags=10)
print('pval Lagrange: ' + str(np.round(etest_garch[1], 4)))
print('pval F-test: ' + str(np.round(etest_garch[3], 4)))
etest_garch
pval Lagrange: 0.4263
pval F-test: 0.4267
(10.162580243357679, 0.4263473083946894, 1.0160691924636058, 0.4267201886626978)
Apart from a marginal rejection at lag 2, the Ljung-Box statistics on the squared standardized residuals are insignificant, and the Engle test no longer rejects the null of homoskedasticity: the GARCH(2, 2) model has captured the ARCH effects in the residuals.
log_returns = pd.concat([train, test])
n_train = len(train)
n_test = len(test)
train_start, train_end = train.index[0].strftime("%Y-%m-%d"), train.index[-1].strftime("%Y-%m-%d")
test_start, test_end = test.index[0].strftime("%Y-%m-%d"), test.index[-1].strftime("%Y-%m-%d")
# Multi-step mean forecast from the ARIMA(0,0,2); beyond two steps the MA terms
# drop out, so the forecast settles at the estimated constant.
arima_predictions = pd.Series(
    data=results_aic.forecast(n_test).to_numpy(),
    index=test.index,
)
PLOT_STYLE = "bmh"
with plt.style.context(PLOT_STYLE):
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
log_returns.iloc[-100:].plot(ax=ax, label="Actual Log Returns")
arima_predictions.plot(ax=ax, label="ARIMA(0,0,2) Predictions")
ax.axvline(train_end, color="gray", alpha=0.9, linestyle="dashed")
ax.set(
title="Daily Log-Returns Forecast",
ylabel="Log(Return)",
ylim=(-0.05, 0.05),
)
ax.legend()
None
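As a quick sanity check (not part of the original analysis), we can compare the ARIMA mean forecast with the realised test returns using simple error metrics:

# RMSE and MAE of the ARIMA(0,0,2) mean forecast over the test window
errors = test - arima_predictions
rmse = np.sqrt(np.mean(errors ** 2))
mae = np.mean(np.abs(errors))
print(f"RMSE: {rmse:.6f}, MAE: {mae:.6f}")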
# Fixed-window (analytic) volatility forecast for the whole test horizon
forecasts_vol_scaled = res_scaled.forecast(horizon=len(test), reindex=False)
forecasts_df = pd.DataFrame()
forecasts_df['ActualReturn'] = test.values
forecasts_df['PredictedVolScaled'] = np.sqrt(forecasts_vol_scaled.variance.values[-1,:])
forecasts_df['Date'] = test.index
# previous returns and volatilities
pre_df = pd.DataFrame()
pre_df['ActualReturn'] = train.values
pre_df['ConditionalVolScaled'] = res_scaled.conditional_volatility.values
pre_df['Date'] = train.index
fig = plt.figure(figsize=(10, 8))
return_ax = fig.add_subplot(2, 1, 1)
return_ax.set_title("Returns and Volatility using fixed window (Scaled)")
plt.plot(train * 100, label="Actual Returns (2000 - 2017)")
plt.plot(test * 100, label="Actual Returns (Jan 2018)", color='orange')
plt.legend();
vol_ax = fig.add_subplot(2, 1, 2)
vol_ax.set_title("Volatility (Scaled)")
plt.plot(pre_df['Date'], pre_df['ConditionalVolScaled'], label="Conditional Vol (2000 - 2017)")
plt.plot(forecasts_df['Date'], forecasts_df['PredictedVolScaled'], label="Predicted Vol (Jan 2018)", color='orange')
plt.legend();
To zoom in on the predictions for the test window, we re-plot only January 2018:
fig = plt.figure(figsize=(10, 8))
return_ax = fig.add_subplot(2, 1, 1)
return_ax.set_title("Returns and Volatility using fixed window (Scaled)")
# plt.plot(train * 100, label="Actual Returns (2000 - 2017)")
plt.plot(test * 100, label="Actual Returns (Jan 2018)")
plt.legend();
vol_ax = fig.add_subplot(2, 1, 2)
vol_ax.set_title("Volatility (Scaled)")
# plt.plot(pre_df['Date'], pre_df['ConditionalVolScaled'], label="Conditional Vol (2000 - 2017)")
plt.plot(forecasts_df['Date'], forecasts_df['PredictedVolScaled'], label="Predicted Vol (Jan 2018)")
plt.legend();
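The volatility forecast above comes from a single fixed estimation window. As a hedged alternative sketch (not the approach used above), one could refit the GARCH each day on an expanding window and collect one-step-ahead volatility forecasts. For simplicity this sketch fits the GARCH directly to the scaled log returns rather than to re-estimated ARIMA residuals, which is only an approximation, albeit a mild one here since the estimated mean dynamics are very small:

# Expanding-window, one-step-ahead GARCH(2, 2) volatility forecasts
rolling_vol = []
for i in range(n_test):
    window = log_returns.iloc[: n_train + i] * 100   # all data up to day i of the test set
    res_i = arch_model(window, p=2, q=2).fit(disp="off")
    fc = res_i.forecast(horizon=1, reindex=False)
    rolling_vol.append(np.sqrt(fc.variance.values[-1, 0]))
rolling_vol = pd.Series(rolling_vol, index=test.index, name="RollingVolScaled")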