Predictive Model for Stock Market Analysis

# Load the libraries
library(quantmod)
library(rugarch)
library(dplyr)
library(ggplot2)
library(forecast)
library(MASS)
library(tseries)
library(caret)
library(randomForest)
library(e1071)
library(xgboost)
library(parallel)
library(Metrics)
library(PerformanceAnalytics)
library(keras)
# Set the seed for reproducibility
set.seed(123)

Project Overview:

The goal of this project is to build a predictive model using historical stock market data. This project will involve data ingestion, data cleaning, exploratory data analysis, statistical analysis, model building, and model evaluation.

Steps:

Data Collection:

We collect historical data on the selected stocks. Yahoo Finance, Alpha Vantage, and Quandl all offer APIs; here we use the quantmod package in R, which pulls from Yahoo Finance by default.

We’ll collect daily stock data for Google (GOOG), Apple (AAPL), and Coca-Cola (KO).

# stock names
tickers <- c("GOOG","AAPL","KO")
start_date <- as.Date("2010-01-01")
end_date <- Sys.Date()
# download stock
getSymbols(tickers,
           from =  start_date,
           to = end_date,
           periodicity = "daily")
## [1] "GOOG" "AAPL" "KO"

Data Cleaning and Preprocessing:

After collecting the data, we clean it to handle any missing or irregular values. We use dplyr (part of the tidyverse) for data manipulation.

Data cleaning

# extract the closing price
closing_prices_ko <- Cl(get("KO"))
# convert to time series object
ts_ko <- ts(coredata(closing_prices_ko), frequency = 365.25, start = c(2010, 1))
plot(ts_ko, main="Coca-Cola Stock")

# extract the closing price

closing_prices_goog <- Cl(get("GOOG"))
# convert to time series object
ts_goog <- ts(coredata(closing_prices_goog), frequency = 365.25, start = c(2010, 1))
plot(ts_goog, main="Google Stock")

# extract the closing price

closing_prices_aapl <- Cl(get("AAPL"))
# convert to time series object
ts_aapl <- ts(coredata(closing_prices_aapl), frequency = 365.25, start = c(2010, 1))
plot(ts_aapl, main="Apple Stock")

# combine the closing prices into a single data frame
df <- data.frame(Date = index(closing_prices_ko), ko = coredata(closing_prices_ko),
                 aapl = coredata(closing_prices_aapl),
                 goog = coredata(closing_prices_goog))
head(df)

Handle any missing values

We’ll now check for missing values

sapply(closing_prices_ko, function(x){ sum(is.na(x))})
## KO.Close 
##        0
sapply(closing_prices_aapl, function(x){ sum(is.na(x))})
## AAPL.Close 
##          0
sapply(closing_prices_goog, function(x){ sum(is.na(x))})
## GOOG.Close 
##          0

We can see that the KO, AAPL, and GOOG series don’t have any missing values, so no missing-value handling is needed.
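
Had there been gaps, a simple remedy is to carry the last observation forward (a sketch using zoo::na.locf; zoo loads as a dependency of quantmod):

# (sketch) fill gaps by carrying the last price forward, then back-fill
# any NAs remaining at the very start of the series
closing_prices_ko <- na.locf(closing_prices_ko)
closing_prices_ko <- na.locf(closing_prices_ko, fromLast = TRUE)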

Irregular values

We’ll use the quantile function to flag irregular values, defined here as observations outside the 5th and 95th percentile cut points.

# find irregular values in AAPL stock
irregular_values_aapl <- df %>%
  dplyr::filter(AAPL.Close > quantile(AAPL.Close, 0.95) | AAPL.Close < quantile(AAPL.Close, 0.05)) %>% 
  dplyr::select(Date,AAPL.Close)

# head the identified irregular values
head(irregular_values_aapl)
# find irregular values in GOOG stock
irregular_values_goog <- df %>%
  dplyr::filter(GOOG.Close > quantile(GOOG.Close, 0.95) | GOOG.Close < quantile(GOOG.Close, 0.05)) %>% 
  dplyr::select(Date,GOOG.Close)

# head the identified irregular values
head(irregular_values_goog)
# find irregular values in KO stock
irregular_values_ko <- df %>%
  dplyr::filter(KO.Close > quantile(KO.Close, 0.95) | KO.Close < quantile(KO.Close, 0.05)) %>% 
  dplyr::select(Date,KO.Close)

# head the identified irregular values
head(irregular_values_ko)

Based on the 5th and 95th percentile cut points, the tables above show extreme values in the KO, AAPL, and GOOG series.

To reduce the impact of these outliers, we’ll apply a Box-Cox transformation to the KO data.

# Box-Cox profile log-likelihood over candidate lambdas
boxcox(df$KO.Close ~ 1, lambda = seq(0, 0.5, length = 100))

# taking lambda = 0.4, apply the Box-Cox power transformation
yl <- ts_ko^0.4
df$KO <- df$KO.Close^0.4

ggplot(df, aes(x = Date, y = KO.Close)) +
  geom_line() +
  labs(x = "Date", y = "Price") +
  ggtitle("KO Stock Market Data")+theme_bw()

ggplot(df, aes(x = Date, y = KO)) +
  geom_line() +
  labs(x = "Date", y = "Transformed price") +
  ggtitle("Transformed KO Stock Market Data")+theme_bw()

Exploratory Data Analysis (EDA):

In the following code chunks we perform an EDA, which is important because it lets us understand trends, seasonality, and correlation with other stocks/indices, and can reveal other important patterns or characteristics. We use the ggplot2 package for data visualization.
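
A quick decomposition gives a first look at trend and seasonality (a sketch; treating a year as roughly 252 trading days is an assumption, and any seasonal component in stock prices should be read cautiously):

# (sketch) STL decomposition of KO closing prices into trend, seasonal,
# and remainder components; frequency = 252 assumes ~252 trading days/year
ko_stl <- stl(ts(coredata(closing_prices_ko)[, 1], frequency = 252),
              s.window = "periodic")
plot(ko_stl, main = "STL decomposition of KO closing prices")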

Correlation

# Print the correlation matrix of the closing prices
cat("Correlation matrix of closing prices:\n")
## Correlation matrix of closing prices:
cor(df[, c("KO.Close","AAPL.Close","GOOG.Close")])
##             KO.Close AAPL.Close GOOG.Close
## KO.Close   1.0000000  0.8742653  0.8865882
## AAPL.Close 0.8742653  1.0000000  0.9626542
## GOOG.Close 0.8865882  0.9626542  1.0000000

KO stock has a strong positive correlation with both AAPL stock (0.87) and GOOG stock (0.89).

Feature Engineering:

Based on the EDA, we can generate new features that might help in predicting the stock prices. These include technical indicators (moving averages, RSI, MACD), derived features (returns, log returns), and external data such as market indices and sector performance.

# technical indicators
# Moving Averages (MA)
df$sma_20 <- SMA(df$KO, n = 20)
df$sma_50 <- SMA(df$KO, n = 50)

# Relative Strength Index (RSI)
df$rsi <- RSI(df$KO)

# Moving Average Convergence Divergence (MACD)
df$macd <- as.data.frame(MACD(df$KO))$macd

# derived features
# Returns (computed on the transformed KO series)
df$returns <- as.data.frame(dailyReturn(closing_prices_ko^0.4))$daily.returns

# Log Returns
df$log_returns <- log(1 + df$returns)

# head of data with generated features
head(df)

Sector data

# download sector and benchmark index data
getSymbols(c("^GSPC", "XLK"), from = start_date, to = end_date, periodicity="daily")
## [1] "GSPC" "XLK"
index_closing_price <- Cl(get("GSPC"))
sector_closing_price <- Cl(get("XLK"))

# add index and sector closing price
df$market_index <- as.data.frame(index_closing_price)$GSPC.Close
df$sector_performance <- as.data.frame(sector_closing_price)$XLK.Close
head(df,10)

Model Building:

We start with a simple linear regression model as a baseline, then move on to time-series models such as ARIMA and GARCH. Machine learning models like random forest, SVM, or gradient boosting are also options, and with enough data an LSTM (a type of recurrent neural network) can be tried. The forecast, rugarch, randomForest, e1071, xgboost, and keras packages cover these models, respectively.

We will now fit several of these models to predict the KO stock data.

Split the data into train and test

# convert date to numeric for model input
df$date_num <- as.numeric(as.POSIXct(df$Date))
index <- floor(0.8 * nrow(df))


# Split the data into train and test sets
train_df <- df[1:index, ]
test_df <- df[(index + 1):nrow(df), ]

dim(train_df)
## [1] 2712   14
dim(test_df)
## [1] 679  14

Linear Regression model (baseline model)

# Build the linear regression model
lm.model <- lm(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi+ macd + returns + log_returns+
                 market_index+sector_performance + date_num, data = train_df)

# Summarize the model
summary(lm.model)
## 
## Call:
## lm(formula = KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + 
##     rsi + macd + returns + log_returns + market_index + sector_performance + 
##     date_num, data = train_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.208009 -0.006880  0.000143  0.006943  0.068318 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.860e-02  2.123e-02   2.289 0.022143 *  
## AAPL.Close          5.244e-05  7.910e-05   0.663 0.507390    
## GOOG.Close         -3.375e-04  1.344e-04  -2.511 0.012109 *  
## sma_20              2.072e-01  1.169e-02  17.729  < 2e-16 ***
## sma_50              7.835e-01  1.227e-02  63.879  < 2e-16 ***
## rsi                 1.209e-03  4.499e-05  26.877  < 2e-16 ***
## macd                1.052e-01  1.634e-03  64.415  < 2e-16 ***
## returns            -1.269e+02  9.390e+00 -13.517  < 2e-16 ***
## log_returns         1.291e+02  9.368e+00  13.776  < 2e-16 ***
## market_index        2.825e-05  3.322e-06   8.505  < 2e-16 ***
## sector_performance -9.099e-05  1.460e-04  -0.623 0.533170    
## date_num           -7.849e-11  2.133e-11  -3.679 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01418 on 2651 degrees of freedom
##   (49 observations deleted due to missingness)
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
## F-statistic: 9.675e+04 on 11 and 2651 DF,  p-value: < 2.2e-16
# Make predictions on the train and test data
predictions_train_lm <- predict(lm.model, newdata = train_df)
predictions_test_lm <- predict(lm.model, newdata = test_df)
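
As a quick check of the baseline, we can compute the test-set RMSE (a sketch using Metrics::rmse; rows whose engineered features are NA are dropped so that actual and predicted values align):

# (sketch) RMSE of the baseline model on complete test rows only
complete_test <- test_df[complete.cases(test_df), ]
Metrics::rmse(complete_test$KO, predict(lm.model, newdata = complete_test))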

ARIMA

plot.ts(yl, main="KO stock transformed data")

Test for series stationarity

# test for level stationary
adf.test(yl)
## Warning in adf.test(yl): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  yl
## Dickey-Fuller = -4.3859, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(yl)
## Warning in kpss.test(yl): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  yl
## KPSS Level = 29.515, Truncation lag parameter = 9, p-value = 0.01

Since the p-value of the Augmented Dickey-Fuller test is < 0.05 at the 5% level of significance, we reject the unit-root null hypothesis, which suggests the transformed KO series is stationary.

The KPSS test, however, has the opposite null: since its p-value is also < 0.05, we reject the null of level stationarity and conclude that the series is not level stationary. The two tests disagree, which is common for trending price series, so we difference the series before fitting an ARIMA model.

# take the 1st-order difference of the KO stock
ys <- diff(yl,1)
plot.ts(ys)

So the non-seasonal difference order is d = 1.

tsdisplay(ys)

Taking candidate values p = 0, 1, 2; q = 0, 1, 2; d = 1; P = 0, 1, 2; Q = 0, 1, 2; and D = 0, 1, we’ll see which model gives the lowest BIC.

The model with the lowest BIC is the preferred one.

# fit multiple Arima models
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10571.41
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10577.79
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic 
## [1] -10577.64
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic 
## [1] -10564.51
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10564.87
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7987.758
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7995.886
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic 
## [1] -7995.881
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic 
## [1] -7986.126
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7987.291
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10457.65
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10464.05
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic 
## [1] -10463.91
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic 
## [1] -10450.64
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10451.01
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic
## [1] -10462.96
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10469.51
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10469.35
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10455.68
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic
## [1] -10455.94
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10462.97
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10469.54
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic 
## [1] -10469.38
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic 
## [1] -10455.76
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10456.04
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic
## [1] -10454.77
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10461.43
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10461.27
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10447.63
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic
## [1] -10447.91
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10454.86
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10461.44
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic 
## [1] -10461.28
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic 
## [1] -10452.3
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10447.91
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic
## [1] -10446.82
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic 
## [1] -10453.15
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic 
## [1] -10439.64
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic
## [1] -10439.82
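
For reference, the same grid search can be scripted instead of written out call by call (a sketch; it refits every combination above, so it is slow on a series this long):

# (sketch) loop over the candidate orders and keep the BIC of each fit
orders <- expand.grid(p = 0:2, q = 0:2, P = 0:2, Q = 0:2, D = 0:1)
bics <- apply(orders, 1, function(o) {
  fit <- try(Arima(ts_ko, order = c(o["p"], 1, o["q"]),
                   seasonal = list(order = c(o["P"], o["D"], o["Q"]), period = 12),
                   lambda = 0.4), silent = TRUE)
  if (inherits(fit, "try-error")) NA else fit$bic
})
orders[which.min(bics), ]   # order combination with the lowest BIC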

Based on the BIC criterion, the best model is p = 1, d = 1, q = 0, P = 0, D = 0, Q = 0 with period = 12 (BIC = -10577.79).

Final ARIMA model

# Calculate the number of observations for the train set
train_size <- floor(0.8 * length(ts_ko))

# Perform the train-test split
train <- head(ts_ko,train_size)
test <- tail(ts_ko, length(ts_ko) - train_size)

model.arima<- Arima(train,order=c(1,1,0),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)
summary(model.arima)
## Series: train 
## ARIMA(1,1,0) 
## Box Cox transformation: lambda= 0.4 
## 
## Coefficients:
##           ar1
##       -0.0447
## s.e.   0.0192
## 
## sigma^2 = 0.002448:  log likelihood = 4303.76
## AIC=-8603.51   AICc=-8603.51   BIC=-8591.7
## 
## Training set error measures:
##                       ME     RMSE       MAE        MPE      MAPE       MASE
## Training set 0.008666953 0.475879 0.3079847 0.01606859 0.7448756 0.08603564
##                      ACF1
## Training set -0.004900167
# plot 30 day forecast
plot(forecast(model.arima,30))

# plot actual vs fitted values
fitted_values <- fitted(model.arima)
residuals <- residuals(model.arima)
plot(ts_ko, main = "Daily Stock Data")
lines(fitted_values, col = "red")

plot(residuals, main = "Residuals")

# In-sample predictions on the train set are the fitted values
predictions_train_ar <- fitted(model.arima)
# Out-of-sample forecast over the test horizon
predictions_test_ar <- forecast(model.arima, h = length(test))
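
A quick way to score this forecast is forecast::accuracy, which compares it with the held-out observations (a sketch):

# (sketch) error measures of the test-horizon forecast vs the held-out series
accuracy(predictions_test_ar, test)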

GARCH

Fit multiple GARCH models

model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(0, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("0, 1 order aic")
## [1] "0, 1 order aic"
aic
## [1] 1.004197
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 0)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("1, 0 order aic")
## [1] "1, 0 order aic"
aic
## [1] 0.9419476
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]

print("1, 1 order aic")
## [1] "1, 1 order aic"
aic
## [1] 0.8602891
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(2, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("2, 1 order aic")
## [1] "2, 1 order aic"
aic
## [1] 0.8610766
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 2)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("1, 2 order aic")
## [1] "1, 2 order aic"
aic
## [1] 0.8696863
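
The comparison above can also be scripted (a sketch that loops over the same candidate orders):

# (sketch) loop over candidate GARCH orders and collect the AIC of each fit
garch_orders <- list(c(0, 1), c(1, 0), c(1, 1), c(2, 1), c(1, 2))
aics <- sapply(garch_orders, function(o) {
  spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = o),
                     mean.model = list(armaOrder = c(1, 0)),
                     distribution.model = "std")
  infocriteria(ugarchfit(spec = spec, data = train))[1]
})
garch_orders[[which.min(aics)]]   # order with the lowest AIC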

Here the GARCH model with order p = 1 and q = 1 has the lowest AIC value (0.8603). It’s the best of the candidate models.

Fit final GARCH Model

# fit model with p = 1, q = 1
model.spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
model.garch <- ugarchfit(spec = model.spec, data = train)
# Print the model summary
show(model.garch)
## 
## *---------------------------------*
## *          GARCH Model Fit        *
## *---------------------------------*
## 
## Conditional Variance Dynamics    
## -----------------------------------
## GARCH Model  : sGARCH(1,1)
## Mean Model   : ARFIMA(1,0,0)
## Distribution : std 
## 
## Optimal Parameters
## ------------------------------------
##         Estimate  Std. Error   t value Pr(>|t|)
## mu     28.519999    0.320401   89.0134 0.000000
## ar1     1.000000    0.000490 2040.7573 0.000000
## omega   0.004032    0.001379    2.9236 0.003460
## alpha1  0.075322    0.015724    4.7904 0.000002
## beta1   0.904633    0.019945   45.3569 0.000000
## shape   4.624631    0.405428   11.4068 0.000000
## 
## Robust Standard Errors:
##         Estimate  Std. Error   t value Pr(>|t|)
## mu     28.519999    0.019199 1485.5257 0.000000
## ar1     1.000000    0.000465 2150.2840 0.000000
## omega   0.004032    0.002106    1.9147 0.055531
## alpha1  0.075322    0.029414    2.5607 0.010445
## beta1   0.904633    0.036009   25.1222 0.000000
## shape   4.624631    0.456689   10.1264 0.000000
## 
## LogLikelihood : -1160.552 
## 
## Information Criteria
## ------------------------------------
##                     
## Akaike       0.86029
## Bayes        0.87335
## Shibata      0.86028
## Hannan-Quinn 0.86501
## 
## Weighted Ljung-Box Test on Standardized Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                      1.667  0.1967
## Lag[2*(p+q)+(p+q)-1][2]     2.029  0.2035
## Lag[4*(p+q)+(p+q)-1][5]     3.082  0.4110
## d.o.f=1
## H0 : No serial correlation
## 
## Weighted Ljung-Box Test on Standardized Squared Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                    0.02041  0.8864
## Lag[2*(p+q)+(p+q)-1][5]   0.38389  0.9740
## Lag[4*(p+q)+(p+q)-1][9]   0.81231  0.9927
## d.o.f=2
## 
## Weighted ARCH LM Tests
## ------------------------------------
##             Statistic Shape Scale P-Value
## ARCH Lag[3]   0.02911 0.500 2.000  0.8645
## ARCH Lag[5]   0.70636 1.440 1.667  0.8215
## ARCH Lag[7]   0.94371 2.315 1.543  0.9225
## 
## Nyblom stability test
## ------------------------------------
## Joint Statistic:  2.7729
## Individual Statistics:              
## mu     0.73520
## ar1    1.25282
## omega  0.41483
## alpha1 0.54513
## beta1  0.37069
## shape  0.09633
## 
## Asymptotic Critical Values (10% 5% 1%)
## Joint Statistic:          1.49 1.68 2.12
## Individual Statistic:     0.35 0.47 0.75
## 
## Sign Bias Test
## ------------------------------------
##                    t-value   prob sig
## Sign Bias           0.1916 0.8481    
## Negative Sign Bias  0.7238 0.4692    
## Positive Sign Bias  0.6095 0.5422    
## Joint Effect        1.1982 0.7534    
## 
## 
## Adjusted Pearson Goodness-of-Fit Test:
## ------------------------------------
##   group statistic p-value(g-1)
## 1    20     38.99     0.004433
## 2    30     47.42     0.016849
## 3    40     58.80     0.021763
## 4    50     75.57     0.008746
## 
## 
## Elapsed time : 0.307487

GARCH Prediction

# In-sample conditional volatility for the train period comes from the fit itself
predictions_train_garch <- sigma(model.garch)

# Out-of-sample forecast of conditional volatility over the test horizon
test_forecast <- ugarchforecast(model.garch, n.ahead = length(test))

# Extract the predicted conditional volatilities
predictions_test_garch <- sigma(test_forecast)
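
A fixed-origin forecast over the whole test horizon degrades quickly, so a fairer test-set evaluation is rugarch’s rolling re-estimation via ugarchroll (a sketch; refit.every = 50 is an arbitrary trade-off between accuracy and run time):

# (sketch) rolling one-step-ahead evaluation over the test period
roll <- ugarchroll(model.spec, data = closing_prices_ko, n.ahead = 1,
                   forecast.length = length(test),
                   refit.every = 50, refit.window = "moving")
head(as.data.frame(roll))   # columns include Mu, Sigma and the realized value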

Random Forest

df_no_na <- na.omit(df)

index <- floor(0.8 * nrow(df_no_na))


# Split the data into train and test sets
train_df_na <- df_no_na[1:index, ]
test_df_na <- df_no_na[(index + 1):nrow(df_no_na), ]

dim(train_df_na)
## [1] 2673   14
dim(test_df_na)
## [1] 669  14
X <- train_df_na %>% 
  dplyr::select(AAPL.Close , GOOG.Close , sma_20 , sma_50 , rsi, macd , returns , log_returns,
                 market_index,sector_performance , date_num)
y <- train_df_na$KO
# Algorithm Tune (tuneRF)

bestmtry <- tuneRF(X, y, stepFactor=1.5, improve=1e-5, ntree=500)
## mtry = 3  OOB error = 0.0001357392 
## Searching left ...
## mtry = 2     OOB error = 0.0001704267 
## -0.2555454 1e-05 
## Searching right ...
## mtry = 4     OOB error = 0.0001254123 
## 0.07607853 1e-05 
## mtry = 6     OOB error = 0.00011442 
## 0.08764921 1e-05 
## mtry = 9     OOB error = 0.0001249605 
## -0.09212059 1e-05
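
With the tuning done, the natural next step is to fit the forest with the best mtry and predict the held-out test set (a sketch; mtry = 6 gave the lowest OOB error in the run above, and rf.model / X_test are names introduced here):

# (sketch) fit the random forest with the tuned mtry and predict the test set
rf.model <- randomForest(x = X, y = y, mtry = 6, ntree = 500, importance = TRUE)

X_test <- test_df_na %>%
  dplyr::select(AAPL.Close, GOOG.Close, sma_20, sma_50, rsi, macd, returns,
                log_returns, market_index, sector_performance, date_num)
predictions_test_rf <- predict(rf.model, newdata = X_test)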