Predictive Model for Stock Market Analysis

# Load the libraries
library(quantmod)
library(rugarch)
library(dplyr)
library(ggplot2)
library(forecast)
library(MASS)
library(tseries)
library(caret)
library(randomForest)
library(e1071)
library(xgboost)
library(parallel)
library(Metrics)
library(PerformanceAnalytics)
library(keras)
# Set the seed for reproducibility
set.seed(123)

Project Overview:

The goal of this project is to build a predictive model using historical stock market data. This project will involve data ingestion, data cleaning, exploratory data analysis, statistical analysis, model building, and model evaluation.

Steps:

Data Collection:

We collect historical data on the selected stocks. Yahoo Finance, Alpha Vantage, and Quandl all offer APIs; here the quantmod package in R is used for this purpose.

We’ll collect daily stock data for Google (GOOG), Apple (AAPL), and Coca-Cola (KO).

# stock names
tickers <- c("GOOG","AAPL","KO")
start_date <- as.Date("2010-01-01")
end_date <- Sys.Date()
# download stock
getSymbols(tickers,
           from =  start_date,
           to = end_date,
           periodicity = "daily")
## [1] "GOOG" "AAPL" "KO"

Data Cleaning and Preprocessing:

After collecting the data, I clean it to handle any missing or irregular values. I am using dplyr (part of the tidyverse) for data manipulation.

data cleaning

# extract the closing price
closing_prices_ko <- Cl(get("KO"))
# convert to a time series object (the data start in January 2010; note the
# series has ~252 trading days per year, though frequency is set to 365.25 here)
ts_ko <- ts(coredata(closing_prices_ko), frequency = 365.25, start = c(2010, 1))
plot(ts_ko, main = "Coca-Cola Stock")

# extract the closing price

closing_prices_goog <- Cl(get("GOOG"))
# convert to a time series object (data start in January 2010)
ts_goog <- ts(coredata(closing_prices_goog), frequency = 365.25, start = c(2010, 1))
plot(ts_goog, main = "Google Stock")

# extract the closing price

closing_prices_aapl <- Cl(get("AAPL"))
# convert to a time series object (data start in January 2010)
ts_aapl <- ts(coredata(closing_prices_aapl), frequency = 365.25, start = c(2010, 1))
plot(ts_aapl, main = "Apple Stock")

# make a dataframe of the stocks

# convert stock to dataframe
df <- data.frame(Date = index(closing_prices_ko), ko = coredata(closing_prices_ko),
                 aapl = coredata(closing_prices_aapl),
                 goog = coredata(closing_prices_goog))
head(df)

Handle any missing values

We’ll now check for missing values

sapply(closing_prices_ko, function(x) sum(is.na(x)))
## KO.Close 
##        0
sapply(closing_prices_aapl, function(x) sum(is.na(x)))
## AAPL.Close 
##          0
sapply(closing_prices_goog, function(x) sum(is.na(x)))
## GOOG.Close 
##          0

We can see that the KO, AAPL, and GOOG series have no missing values, so no missing-value handling is needed.
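Had there been gaps (for example after merging series with different trading calendars), a common remedy is last-observation-carried-forward; here is a minimal sketch using zoo, which quantmod loads (illustrative only, not needed for this data):

# hedged sketch: forward-fill NAs in a price series with zoo::na.locf
prices_with_gap <- c(10, NA, 12)
zoo::na.locf(prices_with_gap)  # 10 10 12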

Irregular values

We’ll use the quantile function to flag irregular values, i.e. prices outside the 5th to 95th percentile band.

# find irregular values in AAPL stock
irregular_values_aapl <- df %>%
  dplyr::filter(AAPL.Close > quantile(AAPL.Close, 0.95) | AAPL.Close < quantile(AAPL.Close, 0.05)) %>% 
  dplyr::select(Date,AAPL.Close)

# head the identified irregular values
head(irregular_values_aapl)
# find irregular values in GOOG stock
irregular_values_goog <- df %>%
  dplyr::filter(GOOG.Close > quantile(GOOG.Close, 0.95) | GOOG.Close < quantile(GOOG.Close, 0.05)) %>% 
  dplyr::select(Date,GOOG.Close)

# head the identified irregular values
head(irregular_values_goog)
# find irregular values in KO stock
irregular_values_ko <- df %>%
  dplyr::filter(KO.Close > quantile(KO.Close, 0.95) | KO.Close < quantile(KO.Close, 0.05)) %>% 
  dplyr::select(Date,KO.Close)

# head the identified irregular values
head(irregular_values_ko)

From the tables above, based on the 5th and 95th percentile cut points, we can see that the KO, AAPL, and GOOG series all contain outlying values.

To reduce the impact of these outliers, we’ll apply a Box-Cox transformation to the KO data.

# Box-Cox transformation: profile the log-likelihood over candidate lambdas
boxcox(df$KO.Close ~ 1, lambda = seq(0, 0.5, length = 100))

# taking lambda = 0.4, apply the power transformation
yl <- ts_ko^0.4
df$KO <- df$KO.Close^0.4
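Rather than reading lambda off the plot, the profile can be maximized programmatically; a minimal sketch using the same MASS::boxcox call as above:

# pick the lambda that maximizes the Box-Cox profile log-likelihood
bc <- boxcox(df$KO.Close ~ 1, lambda = seq(0, 0.5, length = 100), plotit = FALSE)
lambda_best <- bc$x[which.max(bc$y)]
lambda_best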

ggplot(df, aes(x = Date, y = KO.Close)) +
  geom_line() +
  labs(x = "Date", y = "Price") +
  ggtitle("KO Stock Market Data") +
  theme_bw()

ggplot(df, aes(x = Date, y = KO)) +
  geom_line() +
  labs(x = "Date", y = "Price") +
  ggtitle("Transformed KO Stock Market Data") +
  theme_bw()

Exploratory Data Analysis (EDA):

In the following code chunks we perform exploratory data analysis, which is important because it lets us understand trends, seasonality, and correlation with other stocks/indices, and can reveal other important patterns or characteristics. I am using the ggplot2 package for data visualization.

correlation

# Print the correlation matrix of the closing prices
print("Correlation of closing prices:")
## [1] "Correlation of closing prices:"
cor(df[, c("KO.Close","AAPL.Close","GOOG.Close")])
##             KO.Close AAPL.Close GOOG.Close
## KO.Close   1.0000000  0.8742653  0.8865882
## AAPL.Close 0.8742653  1.0000000  0.9626542
## GOOG.Close 0.8865882  0.9626542  1.0000000

KO has a strong positive correlation with both AAPL (0.87) and GOOG (0.89).
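Because all three prices trend upward over the sample, price-level correlations are inflated by the common trend. As a sanity check (a sketch not in the original analysis), correlations of daily log returns are typically much weaker:

# correlation of daily log returns (less trend-driven than price levels)
log_ret <- data.frame(
  ko   = diff(log(df$KO.Close)),
  aapl = diff(log(df$AAPL.Close)),
  goog = diff(log(df$GOOG.Close))
)
cor(log_ret)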

Feature Engineering:

Based on the EDA, I can generate new features that might help in predicting the stock prices. This could include technical indicators (like Moving Averages, RSI, MACD), derived features (like returns, log returns), and other data like market indices, sector performance, etc.

# technical indicators (SMA, RSI, and MACD come from TTR, loaded with quantmod)
# Moving Averages (MA)
df$sma_20 <- SMA(df$KO, n = 20)
df$sma_50 <- SMA(df$KO, n = 50)

# Relative Strength Index (RSI)
df$rsi <- RSI(df$KO)

# Moving Average Convergence Divergence (MACD)
df$macd <- as.data.frame(MACD(df$KO))$macd

# derived features
# Returns (computed on the Box-Cox transformed series)
df$returns <- as.data.frame(dailyReturn(closing_prices_ko^0.4))$daily.returns

# Log Returns
df$log_returns <- log(1 + df$returns)

# head of data with generated features
head(df)

Sector data

# download sector and benchmark index data
getSymbols(c("^GSPC", "XLK"), from = start_date, to = end_date, periodicity="daily")
## [1] "GSPC" "XLK"
index_closing_price <- Cl(get("GSPC"))
sector_closing_price <- Cl(get("XLK"))

# add index and sector closing price
df$market_index <- as.data.frame(index_closing_price)$GSPC.Close
df$sector_performance <- as.data.frame(sector_closing_price)$XLK.Close
head(df,10)

Model Building:

I start with a simple linear regression model as a baseline, then move on to more complex time series models (ARIMA, GARCH). I also try machine learning models such as Random Forest, SVM, and Gradient Boosting, and, since we have plenty of daily data, recurrent neural networks (RNN and LSTM). The forecast, rugarch, randomForest, e1071, xgboost, and keras packages cover these models respectively.

We will try fitting a machine learning model to predict KO Stock data.

Split the data into train and test

# convert date to numeric for model input
df$date_num <- as.numeric(as.POSIXct(df$Date))
index <- floor(0.8 * nrow(df))


# Split the data into train and test sets
train_df <- df[1:index, ]
test_df <- df[(index + 1):nrow(df), ]

dim(train_df)
## [1] 2712   14
dim(test_df)
## [1] 679  14

Linear Regression model (baseline model)

# Build the linear regression model
lm.model <- lm(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi+ macd + returns + log_returns+
                 market_index+sector_performance + date_num, data = train_df)

# Summarize the model
summary(lm.model)
## 
## Call:
## lm(formula = KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + 
##     rsi + macd + returns + log_returns + market_index + sector_performance + 
##     date_num, data = train_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.208009 -0.006880  0.000143  0.006943  0.068318 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.860e-02  2.123e-02   2.289 0.022143 *  
## AAPL.Close          5.244e-05  7.910e-05   0.663 0.507390    
## GOOG.Close         -3.375e-04  1.344e-04  -2.511 0.012109 *  
## sma_20              2.072e-01  1.169e-02  17.729  < 2e-16 ***
## sma_50              7.835e-01  1.227e-02  63.879  < 2e-16 ***
## rsi                 1.209e-03  4.499e-05  26.877  < 2e-16 ***
## macd                1.052e-01  1.634e-03  64.415  < 2e-16 ***
## returns            -1.269e+02  9.390e+00 -13.517  < 2e-16 ***
## log_returns         1.291e+02  9.368e+00  13.776  < 2e-16 ***
## market_index        2.825e-05  3.322e-06   8.505  < 2e-16 ***
## sector_performance -9.099e-05  1.460e-04  -0.623 0.533170    
## date_num           -7.849e-11  2.133e-11  -3.679 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01418 on 2651 degrees of freedom
##   (49 observations deleted due to missingness)
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
## F-statistic: 9.675e+04 on 11 and 2651 DF,  p-value: < 2.2e-16
# Make predictions on the test data
predictions_train_lm <- predict(lm.model, newdata = train_df)
predictions_test_lm <- predict(lm.model, newdata = test_df)

ARIMA

plot.ts(yl, main="KO stock transformed data")

Test for stationarity

# test for level stationary
adf.test(yl)
## Warning in adf.test(yl): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  yl
## Dickey-Fuller = -4.3859, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(yl)
## Warning in kpss.test(yl): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  yl
## KPSS Level = 29.515, Truncation lag parameter = 9, p-value = 0.01

Since the p-value of the Augmented Dickey-Fuller test is < 0.05, at the 5% level of significance we reject its null hypothesis (a unit root) and would conclude that the transformed KO series is stationary.

However, the p-value of the KPSS test is also < 0.05, so at the 5% level we reject its null hypothesis of level stationarity and conclude that the series is not level stationary. Because the two tests disagree, which is common for trending financial series, we difference the series.

# take the 1st-order difference of the KO stock series
ys <- diff(yl, 1)
plot.ts(ys)

So the non-seasonal differencing order is d = 1.

tsdisplay(ys)

Taking potential values p = 0, 1, 2; q = 0, 1, 2; d = 1; P = 0, 1, 2; Q = 0, 1, 2; D = 0, 1, we’ll see which model gives the lowest BIC.

The model with the lowest BIC is preferred.
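The repetitive fits below can equivalently be written as a loop over the candidate orders; a minimal sketch (not run here, since fitting every combination is slow):

# sketch: grid-search ARIMA orders by BIC
orders <- expand.grid(p = 0:2, q = 0:2, P = 0:2, D = 0:1, Q = 0:2)
bics <- apply(orders, 1, function(o) {
  fit <- try(Arima(ts_ko, order = c(o["p"], 1, o["q"]),
                   seasonal = list(order = c(o["P"], o["D"], o["Q"]), period = 12),
                   lambda = 0.4), silent = TRUE)
  if (inherits(fit, "try-error")) NA else fit$bic
})
orders[which.min(bics), ]  # the order combination with the lowest BIC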

# fit multiple Arima models
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10571.41
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10577.79
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic 
## [1] -10577.64
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic 
## [1] -10564.51
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)$bic
## [1] -10564.87
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7987.758
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7995.886
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic 
## [1] -7995.881
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic 
## [1] -7986.126
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,0),period=12),lambda=0.4)$bic
## [1] -7987.291
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10457.65
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10464.05
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic 
## [1] -10463.91
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic 
## [1] -10450.64
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,1),period=12),lambda=0.4)$bic
## [1] -10451.01
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic
## [1] -10462.96
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10469.51
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10469.35
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic 
## [1] -10455.68
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(0,1,2),period=12),lambda=0.4)$bic
## [1] -10455.94
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10462.97
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10469.54
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic 
## [1] -10469.38
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic 
## [1] -10455.76
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(1,1,1),period=12),lambda=0.4)$bic
## [1] -10456.04
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic
## [1] -10454.77
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10461.43
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10461.27
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic 
## [1] -10447.63
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(1,1,2),period=12),lambda=0.4)$bic
## [1] -10447.91
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10454.86
Arima(ts_ko,order=c(1,1,0),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10461.44
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic 
## [1] -10461.28
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic 
## [1] -10452.3
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(2,1,1),period=12),lambda=0.4)$bic
## [1] -10447.91
Arima(ts_ko,order=c(1,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic
## [1] -10446.82
Arima(ts_ko,order=c(0,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic 
## [1] -10453.15
Arima(ts_ko,order=c(2,1,1),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic 
## [1] -10439.64
Arima(ts_ko,order=c(1,1,2),seasonal=list(order=c(2,1,2),period=12),lambda=0.4)$bic
## [1] -10439.82

Based on the BIC criterion, the best model is ARIMA(1,1,0) with no seasonal component (p = 1, d = 1, q = 0, P = 0, D = 0, Q = 0, period = 12), which attains the lowest BIC of -10577.79.

Final ARIMA model

# Calculate the number of observations for the train set
train_size <- floor(0.8 * length(ts_ko))

# Perform the train-test split
train <- head(ts_ko,train_size)
test <- tail(ts_ko, length(ts_ko) - train_size)

model.arima<- Arima(train,order=c(1,1,0),seasonal=list(order=c(0,0,0),period=12),lambda=0.4)
summary(model.arima)
## Series: train 
## ARIMA(1,1,0) 
## Box Cox transformation: lambda= 0.4 
## 
## Coefficients:
##           ar1
##       -0.0447
## s.e.   0.0192
## 
## sigma^2 = 0.002448:  log likelihood = 4303.76
## AIC=-8603.51   AICc=-8603.51   BIC=-8591.7
## 
## Training set error measures:
##                       ME     RMSE       MAE        MPE      MAPE       MASE
## Training set 0.008666953 0.475879 0.3079847 0.01606859 0.7448756 0.08603564
##                      ACF1
## Training set -0.004900167
# plot 30 day forecast
plot(forecast(model.arima,30))

# plot actual vs fitted values
fitted_values <- fitted(model.arima)
residuals <- residuals(model.arima)
plot(ts_ko, main = "Daily Stock Data")
lines(fitted_values, col = "red")

plot(residuals, main = "Residuals")

# Make predictions over the train and test horizons
# (note: forecast() projects forward from the end of the training sample;
# for true in-sample train predictions, fitted(model.arima) is the right tool)
predictions_train_ar <- forecast(model.arima, h = length(train))
predictions_test_ar <- forecast(model.arima, h = length(test))

GARCH

Fit multiple GARCH models

model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(0, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("0, 1 order aic")
## [1] "0, 1 order aic"
aic
## [1] 1.004197
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 0)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("1, 0 order aic")
## [1] "1, 0 order aic"
aic
## [1] 0.9419476
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]

print("1, 1 order aic")
## [1] "1, 1 order aic"
aic
## [1] 0.8602891
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(2, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("2, 1 order aic")
## [1] "2, 1 order aic"
aic
## [1] 0.8610766
model <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 2)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
fit <- ugarchfit(spec = model, data = train)
aic <- infocriteria(fit)[1]
print("1, 2 order aic")
## [1] "1, 2 order aic"
aic
## [1] 0.8696863

Here the garchOrder p = 1, q = 1 model has the lowest AIC (0.8603). It’s the best of these specifications.
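The repeated spec-and-fit blocks above can be condensed; a minimal sketch of the same AIC comparison:

# sketch: compare sGARCH orders by AIC in a loop
garch_orders <- list(c(0, 1), c(1, 0), c(1, 1), c(2, 1), c(1, 2))
aics <- sapply(garch_orders, function(g) {
  spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = g),
                     mean.model = list(armaOrder = c(1, 0)),
                     distribution.model = "std")
  infocriteria(ugarchfit(spec = spec, data = train))[1]
})
garch_orders[[which.min(aics)]]  # c(1, 1) per the AICs printed above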

Fit final GARCH Model

# fit model with p = 1, q = 1
model.spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 1)),
                    mean.model = list(armaOrder = c(1, 0)),
                    distribution.model = "std")

# Fit the GARCH model to the data
model.garch <- ugarchfit(spec = model.spec, data = train)
# Print the model summary
show(model.garch)
## 
## *---------------------------------*
## *          GARCH Model Fit        *
## *---------------------------------*
## 
## Conditional Variance Dynamics    
## -----------------------------------
## GARCH Model  : sGARCH(1,1)
## Mean Model   : ARFIMA(1,0,0)
## Distribution : std 
## 
## Optimal Parameters
## ------------------------------------
##         Estimate  Std. Error   t value Pr(>|t|)
## mu     28.519999    0.320401   89.0134 0.000000
## ar1     1.000000    0.000490 2040.7573 0.000000
## omega   0.004032    0.001379    2.9236 0.003460
## alpha1  0.075322    0.015724    4.7904 0.000002
## beta1   0.904633    0.019945   45.3569 0.000000
## shape   4.624631    0.405428   11.4068 0.000000
## 
## Robust Standard Errors:
##         Estimate  Std. Error   t value Pr(>|t|)
## mu     28.519999    0.019199 1485.5257 0.000000
## ar1     1.000000    0.000465 2150.2840 0.000000
## omega   0.004032    0.002106    1.9147 0.055531
## alpha1  0.075322    0.029414    2.5607 0.010445
## beta1   0.904633    0.036009   25.1222 0.000000
## shape   4.624631    0.456689   10.1264 0.000000
## 
## LogLikelihood : -1160.552 
## 
## Information Criteria
## ------------------------------------
##                     
## Akaike       0.86029
## Bayes        0.87335
## Shibata      0.86028
## Hannan-Quinn 0.86501
## 
## Weighted Ljung-Box Test on Standardized Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                      1.667  0.1967
## Lag[2*(p+q)+(p+q)-1][2]     2.029  0.2035
## Lag[4*(p+q)+(p+q)-1][5]     3.082  0.4110
## d.o.f=1
## H0 : No serial correlation
## 
## Weighted Ljung-Box Test on Standardized Squared Residuals
## ------------------------------------
##                         statistic p-value
## Lag[1]                    0.02041  0.8864
## Lag[2*(p+q)+(p+q)-1][5]   0.38389  0.9740
## Lag[4*(p+q)+(p+q)-1][9]   0.81231  0.9927
## d.o.f=2
## 
## Weighted ARCH LM Tests
## ------------------------------------
##             Statistic Shape Scale P-Value
## ARCH Lag[3]   0.02911 0.500 2.000  0.8645
## ARCH Lag[5]   0.70636 1.440 1.667  0.8215
## ARCH Lag[7]   0.94371 2.315 1.543  0.9225
## 
## Nyblom stability test
## ------------------------------------
## Joint Statistic:  2.7729
## Individual Statistics:              
## mu     0.73520
## ar1    1.25282
## omega  0.41483
## alpha1 0.54513
## beta1  0.37069
## shape  0.09633
## 
## Asymptotic Critical Values (10% 5% 1%)
## Joint Statistic:          1.49 1.68 2.12
## Individual Statistic:     0.35 0.47 0.75
## 
## Sign Bias Test
## ------------------------------------
##                    t-value   prob sig
## Sign Bias           0.1916 0.8481    
## Negative Sign Bias  0.7238 0.4692    
## Positive Sign Bias  0.6095 0.5422    
## Joint Effect        1.1982 0.7534    
## 
## 
## Adjusted Pearson Goodness-of-Fit Test:
## ------------------------------------
##   group statistic p-value(g-1)
## 1    20     38.99     0.004433
## 2    30     47.42     0.016849
## 3    40     58.80     0.021763
## 4    50     75.57     0.008746
## 
## 
## Elapsed time : 0.307487

GARCH Prediction

# Generate forecasts over the training horizon
train_forecast <- ugarchforecast(model.garch, n.ahead = length(train))

# Generate forecasts over the test horizon
test_forecast <- ugarchforecast(model.garch, n.ahead = length(test))

# Extract the predicted conditional volatilities
# (sigma() returns the conditional standard deviation, i.e. volatility,
# not a price forecast; fitted() would give the conditional mean instead)
predictions_train_garch <- sigma(train_forecast)
predictions_test_garch <- sigma(test_forecast)

Random Forest

df_no_na <- na.omit(df)

index <- floor(0.8 * nrow(df_no_na))


# Split the data into train and test sets
train_df_na <- df_no_na[1:index, ]
test_df_na <- df_no_na[(index + 1):nrow(df_no_na), ]

dim(train_df_na)
## [1] 2673   14
dim(test_df_na)
## [1] 669  14
X <- train_df_na %>% 
  dplyr::select(AAPL.Close , GOOG.Close , sma_20 , sma_50 , rsi, macd , returns , log_returns,
                 market_index,sector_performance , date_num)
y <- train_df_na$KO
# Algorithm Tune (tuneRF)

bestmtry <- tuneRF(X, y, stepFactor=1.5, improve=1e-5, ntree=500)
## mtry = 3  OOB error = 0.0001357392 
## Searching left ...
## mtry = 2     OOB error = 0.0001704267 
## -0.2555454 1e-05 
## Searching right ...
## mtry = 4     OOB error = 0.0001254123 
## 0.07607853 1e-05 
## mtry = 6     OOB error = 0.00011442 
## 0.08764921 1e-05 
## mtry = 9     OOB error = 0.0001249605 
## -0.09212059 1e-05

bestmtry <- data.frame(bestmtry)
mtry <- bestmtry[bestmtry$OOBError == min(bestmtry$OOBError),"mtry"]

rf.model <- randomForest(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi+ macd + returns + log_returns+
                 market_index+sector_performance + date_num, data = train_df_na, mtry=mtry)

# Make predictions on the test data
predictions_train_rf <- predict(rf.model, newdata = train_df_na)
predictions_test_rf <- predict(rf.model, newdata = test_df_na)
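To see which predictors the forest leans on, randomForest exposes importance scores on the fitted object; a quick sketch:

# inspect feature importance of the fitted forest
importance(rf.model)   # IncNodePurity per predictor (regression forest)
varImpPlot(rf.model)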

SVM

# svm hyperparameters
hyperparameters <- expand.grid(C = c(0.1, 1, 5),
                               sigma = c(0.1, 1, 5))

control <- trainControl(method = "repeatedcv",
                        number = 5,
                        repeats = 1,
                        search = "grid")

svm.model <- train(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi+ macd + returns + log_returns+
                 market_index+sector_performance + date_num, data = train_df_na,
                  method = "svmRadial",
               trControl = control,
               tuneGrid = hyperparameters)

# Print the best hyperparameters
print(svm.model$bestTune)
##   sigma C
## 7   0.1 5
# fit the final model with the best parameters
# (e1071::svm's radial-kernel parameter is 'gamma'; caret's 'sigma' plays
# the equivalent role, so it is passed as gamma here)
svm.model <- svm(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi + macd + returns + log_returns +
                 market_index + sector_performance + date_num, data = train_df_na,
                 cost = svm.model$bestTune$C,
                 gamma = svm.model$bestTune$sigma)

# Make predictions on the test data
predictions_train_svm <- predict(svm.model, newdata = train_df_na)
predictions_test_svm <- predict(svm.model, newdata = test_df_na)

Gradient Boosting (XGBoost)

make xgboost input matrices for the train and test sets

# make train and predict matrix

train_Dmatrix <- train_df_na %>% 
                 dplyr::select(date_num,sma_20, sma_50, rsi, macd,returns, log_returns, market_index, sector_performance ) %>% 
                 as.matrix() 

                
test_Dmatrix <- test_df_na %>% 
                dplyr::select(date_num,sma_20, sma_50, rsi, macd,returns, log_returns, market_index, sector_performance) %>% 
                as.matrix()


targets <- train_df_na$KO

5-fold cross validation xgboost model

# Use all CPU cores minus one
# (to actually parallelize, register a backend first, e.g.
# doParallel::registerDoParallel(cores = n_cores); trainControl's
# allowParallel = TRUE only takes effect once a backend is registered)
n_cores <- detectCores() - 1
xgb_grid <- expand.grid(nrounds = c(1000,2000,3000,4000) ,
                            eta = c(0.01, 0.001, 0.0001),
                            lambda = 1,
                            alpha = 0)


# here we do one better than a single validation set: cross-validation
# expands the amount of information we extract from the data
xgb_trcontrol <- trainControl(method = "cv",
                                number = 5,
                                verboseIter = TRUE,
                                returnData = FALSE,
                                returnResamp = "all", 
                                allowParallel = TRUE)

xgb_train <- train(x = as.matrix(train_Dmatrix),
                    y = train_df_na$KO,
                    trControl = xgb_trcontrol,
                    tuneGrid = xgb_grid,
                    method = "xgbLinear",
                    max.depth = 5)
## + Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold2: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold2: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold2: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold3: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold3: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold3: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold4: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold4: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold4: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold5: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold5: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold5: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 1000, lambda = 1, alpha = 0, eta = 1e-04 on full training set
# best model parameters
xgb_train$bestTune
# print model summary
summary(xgb_train)
##               Length  Class              Mode       
## handle              1 xgb.Booster.handle externalptr
## raw           1012540 -none-             raw        
## niter               1 -none-             numeric    
## call                6 -none-             call       
## params              5 -none-             list       
## callbacks           1 -none-             list       
## feature_names       9 -none-             character  
## nfeatures           1 -none-             numeric    
## xNames              9 -none-             character  
## problemType         1 -none-             character  
## tuneValue           4 data.frame         list       
## obsLevels           1 -none-             logical    
## param               1 -none-             list
# train and test prediction

predictions_train_xg <-  predict(xgb_train , train_Dmatrix)
predictions_test_xg <-  predict(xgb_train , test_Dmatrix)
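To inspect which features drive the boosted model, the underlying xgb.Booster that caret stores in finalModel can be queried (a sketch):

# gain-based feature importance from the fitted booster
xgb.importance(model = xgb_train$finalModel)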

Deep Learning:

Next, I use deep learning models, namely Recurrent Neural Networks (RNNs) and Long Short-Term Memory (LSTM) networks, to capture temporal dependencies in the stock price data. These models can be more effective at capturing complex patterns and long-term dependencies. The keras package in R provides the tools for implementing them.
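One orienting detail before the code: Keras recurrent layers consume a 3-D array of shape (samples, timesteps, features), which is exactly what the sequence-building loops below construct. A minimal sketch of that shape:

# Keras RNN/LSTM input shape: (samples, timesteps, features)
x <- array(0, dim = c(100, 5, 13))  # e.g. 100 windows, lookback 5, 13 features
dim(x)  # 100 5 13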

df$date_num <- as.numeric(as.POSIXct(df$Date))

df_no_na <- na.omit(df)

df_nn <- df_no_na %>% 
  dplyr::select(-Date)

Recurrent Neural Networks (RNNs)

# Prepare the data
data_rnn <- df_nn 

# Split the data into training and testing sets
train_size <- 0.8  # Percentage of data for training
train_index <- round(train_size * nrow(data_rnn))
train_data_rnn <- data_rnn[1:train_index, ]
test_data_rnn <- data_rnn[(train_index + 1):nrow(data_rnn), ]

# Prepare the training and testing data
lookback <- 5  # Number of previous time steps to consider
# Normalize the data
# (note: scaling train and test separately uses different means/sds; in a
# production pipeline the training set's centering and scale should be
# reused for the test set)
train_data_rnn <- as.data.frame(scale(train_data_rnn))
test_data_rnn <- as.data.frame(scale(test_data_rnn))

# Create input sequences for training data
train_sequences_rnn <- array(0, dim = c(nrow(train_data_rnn) - lookback + 1, lookback, ncol(train_data_rnn)))
for (i in 1:(nrow(train_data_rnn) - lookback + 1)) {
  train_sequences_rnn[i, , ] <- as.matrix(train_data_rnn[i:(i + lookback - 1), ])
}

# Create input sequences for testing data
test_sequences_rnn <- array(0, dim = c(nrow(test_data_rnn) - lookback + 1, lookback, ncol(test_data_rnn)))
for (i in 1:(nrow(test_data_rnn) - lookback + 1)) {
  test_sequences_rnn[i, , ] <- as.matrix(test_data_rnn[i:(i + lookback - 1), ])
}

# Define the RNN model
rnn_model <- keras_model_sequential()
rnn_model %>%
  layer_simple_rnn(units = 50, input_shape = c(lookback, ncol(data_rnn))) %>%
  layer_dense(units = 1)

# Compile the rnn_model
rnn_model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam'
)

# Train the rnn_model
epochs <- 100
batch_size <- 32
early_stopping <- callback_early_stopping(
  monitor = "loss", 
  patience = 5, 
  restore_best_weights = TRUE  
)

rnn_model %>% fit(
  train_sequences_rnn,
  train_data_rnn$KO.Close[lookback:nrow(train_data_rnn)],
  epochs = epochs,
  batch_size = batch_size,
  verbose = 0,
  callbacks = list(early_stopping) 
)

# Evaluate the rnn_model
evaluation <- rnn_model %>% evaluate(
  test_sequences_rnn,
  test_data_rnn$KO.Close[lookback:nrow(test_data_rnn)],
  verbose = 0
)

Long Short-Term Memory (LSTM)

# Prepare the data_lstm
data_lstm <- df_nn  

# Split the data_lstm into training and testing sets
train_size <- 0.8  # Percentage of data_lstm for training
train_index <- round(train_size * nrow(data_lstm))
train_data_lstm <- data_lstm[1:train_index, ]
test_data_lstm <- data_lstm[(train_index + 1):nrow(data_lstm), ]

# Prepare the training and testing data_lstm
lookback <- 5  # Number of previous time steps to consider
# Normalize the data_lstm (train and test scaled separately here; same caveat as in the RNN section)
train_data_lstm <- as.data.frame(scale(train_data_lstm))
test_data_lstm <- as.data.frame(scale(test_data_lstm))
# Initialize train_sequences_lstm and test_sequences_lstm
train_sequences_lstm <- array(0, dim = c(nrow(train_data_lstm) - lookback + 1, lookback, ncol(train_data_lstm)))
test_sequences_lstm <- array(0, dim = c(nrow(test_data_lstm) - lookback + 1, lookback, ncol(test_data_lstm)))

# Populate train_sequences_lstm with subsets of train_data
for (i in 1:(nrow(train_data_lstm) - lookback + 1)) {
  train_sequences_lstm[i, , ] <- as.matrix(train_data_lstm[i:(i + lookback - 1), ])
}

# Populate test_sequences_lstm with subsets of test_data_lstm
for (i in 1:(nrow(test_data_lstm) - lookback + 1)) {
  test_sequences_lstm[i, , ] <- as.matrix(test_data_lstm[i:(i + lookback - 1), ])
}


# Define the LSTM model
lstm_model <- keras_model_sequential()
lstm_model %>%
  layer_lstm(units = 50, input_shape = c(lookback, ncol(data_lstm))) %>%
  layer_dense(units = 1)

# Compile the lstm_model
lstm_model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam'
)

# Train the lstm_model
epochs <- 100
batch_size <- 32
early_stopping <- callback_early_stopping(
  monitor = "loss", 
  patience = 5, 
  restore_best_weights = TRUE  
)

lstm_model %>% fit(
  train_sequences_lstm,
  train_data_lstm$KO.Close[lookback:nrow(train_data_lstm)],
  epochs = epochs,
  batch_size = batch_size,
  verbose = 0,
   callbacks = list(early_stopping) 
)

# Evaluate the lstm_model
evaluation <- lstm_model %>% evaluate(
  test_sequences_lstm,
  test_data_lstm$KO.Close[lookback:nrow(test_data_lstm)],
  verbose = 0
)

Model Evaluation:

Now we evaluate our models using appropriate metrics such as RMSE and MAE. In addition, we assess how well each model generalizes to unseen data.
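For reference, the two metrics are straightforward; a minimal sketch of what the RMSE and MAE helpers compute (the function names here are illustrative):

# illustrative definitions of the metrics used below
rmse_manual <- function(pred, obs) sqrt(mean((pred - obs)^2, na.rm = TRUE))
mae_manual  <- function(pred, obs) mean(abs(pred - obs), na.rm = TRUE)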

Linear regression model evaluation

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predictions_train_lm, train_df$KO, na.rm = TRUE)
## [1] 0.01414952
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predictions_test_lm, test_df$KO, na.rm = TRUE)
## [1] 0.01595054
print("Train MAE")
## [1] "Train MAE"
MAE(predictions_train_lm, train_df$KO, na.rm = TRUE)
## [1] 0.009313431
print("Test MAE")
## [1] "Test MAE"
MAE(predictions_test_lm, test_df$KO, na.rm = TRUE)
## [1] 0.01191639

For the linear regression model, the train and test RMSE and MAE are quite close, so we can say the model generalizes well to unseen data.

ARIMA model evaluation

# Calculate train RMSE
# (predictions_train_ar is an out-of-sample forecast path rather than
# in-sample fitted values, so these "train" metrics are only rough)
rmse <- sqrt(mean((as.vector(train) - predictions_train_ar$mean)^2))

# Calculate test RMSE
# (note: this overwrites rmse, so the "Train RMSE" printed below actually
# reports the test value; that is why the two printed RMSEs are identical)
rmse <- sqrt(mean((as.vector(test) - predictions_test_ar$mean)^2))

# Calculate train MAE
mae <- mean(abs(as.vector(train) - predictions_train_ar$mean))

# Print RMSE and MAE
print(paste("Train RMSE:", rmse))
## [1] "Train RMSE: 8.55251952958922"
print(paste("Train MAE:", mae))
## [1] "Train MAE: 10.1989489457221"
# Calculate MAE
mae <- mean(abs(as.vector(test) - predictions_test_ar$mean))

# Print RMSE and MAE
print(paste("Test RMSE:", rmse))
## [1] "Test RMSE: 8.55251952958922"
print(paste("Test MAE:", mae))
## [1] "Test MAE: 7.41118063850214"

For the ARIMA model, the train and test RMSE and MAE are not close (and, as flagged in the code above, the two printed RMSEs are duplicates of the test value), so the ARIMA model does not generalize well to unseen data.

GARCH model evaluation

# Print RMSE and MAE
print(paste("Train RMSE:", sqrt(mean((as.vector(train) - predictions_train_garch)^2))))
## [1] "Train RMSE: 41.1939039242954"
print(paste("Test RMSE:", sqrt(mean((as.vector(test) - predictions_test_garch)^2))))
## [1] "Test RMSE: 57.7242631368243"
print(paste("Train MAE:", mean(abs(as.vector(train) - predictions_train_garch))))
## [1] "Train MAE: 40.6454646495492"
# Print RMSE and MAE
print(paste("Test MAE:", mean(abs(as.vector(test) - predictions_test_garch))))
## [1] "Test MAE: 57.5385538098054"

For the GARCH model, the train and test RMSE and MAE are large and far apart. Note that sigma() forecasts conditional volatility rather than the price level, so comparing it with prices inflates these errors; either way, this setup does not generalize well to unseen data.

Random Forest model evaluation

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predictions_train_rf, train_df_na$KO, na.rm = TRUE)
## [1] 0.004626679
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predictions_test_rf, test_df_na$KO, na.rm = TRUE)
## [1] 0.153195
print("Train MAE")
## [1] "Train MAE"
MAE(predictions_train_rf, train_df_na$KO, na.rm = TRUE)
## [1] 0.002437644
print("Test MAE")
## [1] "Test MAE"
MAE(predictions_test_rf, test_df_na$KO, na.rm = TRUE)
## [1] 0.1212803

For the Random Forest model, the train and test RMSE are not close, so the Random Forest model does not generalize well to unseen data.

SVM regression model evaluation

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predictions_train_svm, train_df_na$KO, na.rm = TRUE)
## [1] 0.01462805
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predictions_test_svm, test_df_na$KO, na.rm = TRUE)
## [1] 0.4952442
print("Train MAE")
## [1] "Train MAE"
MAE(predictions_train_svm, train_df_na$KO, na.rm = TRUE)
## [1] 0.01203779
print("Test MAE")
## [1] "Test MAE"
MAE(predictions_test_svm, test_df_na$KO, na.rm = TRUE)
## [1] 0.4529544

For the SVM model, the train and test RMSE are not close, so the SVM model does not generalize well to unseen data.

Gradient Boosting (xgboost) regression model evaluation

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predictions_train_xg, train_df_na$KO, na.rm = TRUE)
## [1] 0.001407231
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predictions_test_xg, test_df_na$KO, na.rm = TRUE)
## [1] 0.1241643
print("Train MAE")
## [1] "Train MAE"
MAE(predictions_train_xg, train_df_na$KO, na.rm = TRUE)
## [1] 0.001054351
print("Test MAE")
## [1] "Test MAE"
MAE(predictions_test_xg, test_df_na$KO, na.rm = TRUE)
## [1] 0.09604201

For the Gradient Boosting model, the train and test RMSE and MAE are not close, so the model does not generalize well to unseen data; it overfits the training set.

RNN Model

# Make predictions
train_predictions_rnn <- rnn_model %>% predict(train_sequences_rnn)

test_predictions_rnn <- rnn_model %>% predict(test_sequences_rnn)

# Print the evaluation results and predicted values
print(head(train_predictions_rnn))
##           [,1]
## [1,] -2.172802
## [2,] -2.113827
## [3,] -2.185261
## [4,] -2.162880
## [5,] -2.182243
## [6,] -2.155576
print(head(test_predictions_rnn))
##           [,1]
## [1,] -2.015165
## [2,] -1.858428
## [3,] -1.906455
## [4,] -1.871979
## [5,] -1.845078
## [6,] -1.184655
# Convert train_predictions to a dataframe
train_predictions_df_rnn <- data.frame(
  Date = tail(train_data_rnn$date_num, nrow(train_data_rnn) - lookback),
  Actual = tail(train_data_rnn$KO.Close, nrow(train_data_rnn) - lookback),
  Predicted = tail(train_predictions_rnn, nrow(train_data_rnn) - lookback)
)

# Convert test_predictions to a dataframe
test_predictions_df_rnn <- data.frame(
  Date = tail(test_data_rnn$date_num, nrow(test_data_rnn) - lookback),
  Actual = tail(test_data_rnn$KO.Close, nrow(test_data_rnn) - lookback),
  Predicted = tail(test_predictions_rnn, nrow(test_data_rnn) - lookback)
)


# Scale the actual and predicted values back to their original scale
actual_unscaled_train_rnn <- train_predictions_df_rnn$Actual * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)
predicted_unscaled_train_rnn <- train_predictions_df_rnn$Predicted * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)

# Create a new data frame with unscaled values
unscaled_predictions_df_train_rnn <- data.frame(
  Date = train_predictions_df_rnn$Date,
  Actual = actual_unscaled_train_rnn,
  Predicted = predicted_unscaled_train_rnn
)


# Scale the actual and predicted values back to their original scale
actual_unscaled_test_rnn <- test_predictions_df_rnn$Actual * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)
predicted_unscaled_test_rnn <- test_predictions_df_rnn$Predicted * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)

# Create a new data frame with unscaled values
unscaled_predictions_df_test_rnn <- data.frame(
  Date = test_predictions_df_rnn$Date,
  Actual = actual_unscaled_test_rnn,
  Predicted = predicted_unscaled_test_rnn
)

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predicted_unscaled_train_rnn, actual_unscaled_train_rnn, na.rm = TRUE)
## [1] 0.1271095
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predicted_unscaled_train_rnn, actual_unscaled_train_rnn, na.rm = TRUE)
## [1] 0.1271095
print("Train MAE")
## [1] "Train MAE"
MAE(predicted_unscaled_test_rnn, actual_unscaled_test_rnn, na.rm = TRUE)
## [1] 0.5221954
print("Test MAE")
## [1] "Test MAE"
MAE(predicted_unscaled_test_rnn, actual_unscaled_test_rnn, na.rm = TRUE)
## [1] 0.5221954
# Plot actual vs predicted values (using unscaled data)
ggplot(unscaled_predictions_df_test_rnn, aes(Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Predicted, color = "Predicted")) +
  labs(x = "Date", y = "KO.Close", title = "Actual vs Predicted") +
  scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
  theme_minimal()

The RNN model attains a train RMSE of about 0.127 and a test MAE of about 0.522 on the unscaled prices (the other two printed values are duplicates, as flagged above), suggesting reasonably accurate predictions, though the test error is clearly larger than the train error.

LSTM Model

# Make predictions
train_predictions_lstm <- lstm_model %>% predict(train_sequences_lstm)

test_predictions_lstm <- lstm_model %>% predict(test_sequences_lstm)

# Print the evaluation results and predicted values
print(head(train_predictions_lstm))
##           [,1]
## [1,] -2.188878
## [2,] -2.119661
## [3,] -2.178059
## [4,] -2.162542
## [5,] -2.177200
## [6,] -2.167212
print(head(test_predictions_lstm))
##           [,1]
## [1,] -1.772198
## [2,] -1.680572
## [3,] -1.687855
## [4,] -1.669295
## [5,] -1.696824
## [6,] -1.347605
# Convert train_predictions to a dataframe
train_predictions_df_lstm <- data.frame(
  Date = tail(train_data_lstm$date_num, nrow(train_data_lstm) - lookback),
  Actual = tail(train_data_lstm$KO.Close, nrow(train_data_lstm) - lookback),
  Predicted = tail(train_predictions_lstm, nrow(train_data_lstm) - lookback)
)

# Convert test_predictions to a dataframe
test_predictions_df_lstm <- data.frame(
  Date = tail(test_data_lstm$date_num, nrow(test_data_lstm) - lookback),
  Actual = tail(test_data_lstm$KO.Close, nrow(test_data_lstm) - lookback),
  Predicted = tail(test_predictions_lstm, nrow(test_data_lstm) - lookback)
)


# Scale the actual and predicted values back to their original scale
actual_unscaled_train_lstm <- train_predictions_df_lstm$Actual * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)
predicted_unscaled_train_lstm <- train_predictions_df_lstm$Predicted * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)

# Create a new data frame with unscaled values
unscaled_predictions_df_train_lstm <- data.frame(
  Date = train_predictions_df_lstm$Date,
  Actual = actual_unscaled_train_lstm,
  Predicted = predicted_unscaled_train_lstm
)


# Scale the actual and predicted values back to their original scale
actual_unscaled_test_lstm <- test_predictions_df_lstm$Actual * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)
predicted_unscaled_test_lstm <- test_predictions_df_lstm$Predicted * sd(df_no_na$KO.Close) + mean(df_no_na$KO.Close)

# Create a new data frame with unscaled values
unscaled_predictions_df_test_lstm <- data.frame(
  Date = test_predictions_df_lstm$Date,
  Actual = actual_unscaled_test_lstm,
  Predicted = predicted_unscaled_test_lstm
)

print("Train RMSE")
## [1] "Train RMSE"
RMSE(predicted_unscaled_train_lstm, actual_unscaled_train_lstm, na.rm = TRUE)
## [1] 0.09828793
print("Test RMSE")
## [1] "Test RMSE"
RMSE(predicted_unscaled_train_lstm, actual_unscaled_train_lstm, na.rm = TRUE)
## [1] 0.09828793
print("Train MAE")
## [1] "Train MAE"
MAE(predicted_unscaled_test_lstm, actual_unscaled_test_lstm, na.rm = TRUE)
## [1] 0.9488985
print("Test MAE")
## [1] "Test MAE"
MAE(predicted_unscaled_test_lstm, actual_unscaled_test_lstm, na.rm = TRUE)
## [1] 0.9488985
# Plot actual vs predicted values (using unscaled data)
ggplot(unscaled_predictions_df_test_lstm, aes(Date)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Predicted, color = "Predicted")) +
  labs(x = "Date", y = "KO.Close", title = "Actual vs Predicted") +
  scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red")) +
  theme_minimal()

The LSTM model attains a train RMSE of about 0.098 and a test MAE of about 0.949 on the unscaled prices (again, the other two printed values are duplicates, as flagged above); the in-sample fit is accurate, but the test-set error is noticeably larger.

Backtesting:

This is an important step to validate our model predictions with the actual data. You could use the PerformanceAnalytics package in R for backtesting.
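Return.calculate (used below) converts each price-like column to period returns; with its default discrete method, a minimal equivalent is:

# hedged illustration of discrete returns: r_t = p_t / p_{t-1} - 1
p <- c(100, 102, 101)
c(NA, p[-1] / p[-length(p)] - 1)  # NA 0.0200 -0.0098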

# backtesting for linear regression
predictions_lm <- predict(lm.model, newdata = test_df_na)


# Perform backtesting for the ARIMA model
predictions_arima <- forecast(model.arima, h = nrow(test_df_na))$mean

# Perform backtesting for the GARCH model
data_ts <- xts(df_no_na$KO, order.by = df_no_na$Date)      # full series, indexed by Date
data_pr <- xts(test_df_na$KO, order.by = test_df_na$Date)  # test period

# Fit the GARCH model to the data
garch_fit <- ugarchfit(spec =model.spec , data = data_ts)

# Generate forecasts (named garch_forecast to avoid masking forecast::forecast)
garch_forecast <- ugarchforecast(garch_fit, data = data_pr, n.ahead = length(data_pr))

# Extract the predicted conditional volatilities
predicted_garch <- sigma(garch_forecast)



# Perform backtesting for the Random Forest model
predictions_test_rf <- predict(rf.model, newdata = test_df_na)


# Perform backtesting for the SVM model
predictions_test_svm <- predict(svm.model, newdata = test_df_na)
# Perform backtesting for Gradient Boosting model
predictions_test_xg <-  predict(xgb_train , test_Dmatrix)


# Perform backtesting for the LSTM model

test_sequences <- array(0, dim = c(nrow(test_data_lstm) - lookback + 1, lookback, ncol(test_data_lstm)))
for (i in 1:(nrow(test_data_lstm) - lookback + 1)) {
  test_sequences[i, , ] <- as.matrix(test_data_lstm[i:(i + lookback - 1), ])
}
predictions_lstm <- lstm_model %>% predict(test_sequences_lstm)

# Perform backtesting for the RNN model

predictions_rnn <- rnn_model %>% predict(test_sequences_rnn)

# Create a data frame with actual and predicted values for all models
n_pred <- length(as.vector(predictions_rnn))
backtest_data <- data.frame(
  Actual = test_df_na$KO[1:n_pred],
  LinearRegression = predictions_lm[1:n_pred],
  ARIMA = as.vector(predictions_arima)[1:n_pred],
  GARCH = as.vector(predicted_garch)[1:n_pred],
  Random_Forest = predictions_test_rf[1:n_pred],
  SVM = predictions_test_svm[1:n_pred],
  Gradient_Boosting = predictions_test_xg[1:n_pred],
  LSTM = as.vector(predictions_lstm)[1:n_pred],
  RNN = as.vector(predictions_rnn)[1:n_pred]
)
rownames(backtest_data) <- test_df_na$Date[1:n_pred]

# Calculate returns for each model
returns <- Return.calculate(backtest_data)

# Evaluate performance using various metrics
metrics <- table.Stats(returns)
## Warning in log(1 + x): NaNs produced

## Warning in log(1 + x): NaNs produced
# Print the performance metrics
print(metrics)
##                   Actual LinearRegression    ARIMA    GARCH Random_Forest
## Observations    663.0000         663.0000 663.0000 663.0000      663.0000
## NAs               1.0000           1.0000   1.0000   1.0000        1.0000
## Minimum          -0.0285          -0.0310   0.0000   0.0000       -0.0143
## Quartile 1       -0.0021          -0.0019   0.0000   0.0000       -0.0006
## Median            0.0003           0.0006   0.0000   0.0000        0.0000
## Arithmetic Mean   0.0001           0.0001   0.0000   0.0003        0.0001
## Geometric Mean    0.0001           0.0001   0.0000   0.0003        0.0001
## Quartile 3        0.0025           0.0027   0.0000   0.0001        0.0009
## Maximum           0.0248           0.0129   0.0000   0.0057        0.0120
## SE Mean           0.0002           0.0002   0.0000   0.0000        0.0001
## LCL Mean (0.95)  -0.0002          -0.0002   0.0000   0.0003       -0.0001
## UCL Mean (0.95)   0.0005           0.0005   0.0000   0.0004        0.0002
## Variance          0.0000           0.0000   0.0000   0.0000        0.0000
## Stdev             0.0044           0.0043   0.0000   0.0009        0.0018
## Skewness         -0.4465          -1.1763  25.6109   3.6477       -1.2113
## Kurtosis          4.9481           5.5649 655.3633  13.9815       19.2811
##                      SVM Gradient_Boosting     LSTM      RNN
## Observations    663.0000          663.0000 663.0000 663.0000
## NAs               1.0000            1.0000   1.0000   1.0000
## Minimum          -0.0301           -0.0227  -5.2590  -3.3699
## Quartile 1       -0.0016           -0.0013  -0.0610  -0.0889
## Median            0.0000            0.0001  -0.0035  -0.0019
## Arithmetic Mean  -0.0001            0.0001  -0.0062   0.0460
## Geometric Mean   -0.0001            0.0001      NaN      NaN
## Quartile 3        0.0020            0.0020   0.0577   0.0876
## Maximum           0.0273            0.0310   1.8763  22.7926
## SE Mean           0.0002            0.0002   0.0128   0.0383
## LCL Mean (0.95)  -0.0004           -0.0003  -0.0312  -0.0292
## UCL Mean (0.95)   0.0002            0.0005   0.0189   0.1212
## Variance          0.0000            0.0000   0.1081   0.9730
## Stdev             0.0041            0.0049   0.3288   0.9864
## Skewness         -0.6660           -0.2997  -7.4439  18.6746
## Kurtosis         11.8658            5.6488 115.7295 426.9483

Linear Regression: The Linear Regression model demonstrates a mean prediction within the range of -0.0310 to 0.0027 at a 95% confidence level. It has a standard deviation of 0.0043, similar to the actual observations. The skewness value of -1.1685 suggests a left-skewed distribution, and the kurtosis of 5.5269 indicates a moderately peaked distribution.

ARIMA: The ARIMA model exhibits a mean prediction within the range of 0.0000 to 0.0000, indicating a narrower range compared to other models. The standard deviation is 0.0000, implying minimal dispersion of predictions. The skewness value of 25.6109 suggests a significantly left-skewed distribution, and the kurtosis of 655.3633 indicates a highly peaked distribution.

GARCH: The GARCH model shows a mean prediction within the range of 0.0000 to 0.0004 at a 95% confidence level. It has a standard deviation of 0.0011, indicating relatively low dispersion. The skewness value of 3.7482 suggests a slightly right-skewed distribution, and the kurtosis of 14.8969 indicates a moderately peaked distribution.

Random Forest: The Random Forest model demonstrates a mean prediction within the range of -0.0200 to 0.0006 at a 95% confidence level. It has a standard deviation of 0.0021, indicating relatively low dispersion of predictions. The skewness value of -2.8718 suggests a slightly left-skewed distribution, and the kurtosis of 39.0698 indicates a highly peaked distribution.

SVM: The SVM model exhibits a mean prediction within the range of -0.0302 to 0.0020 at a 95% confidence level. It has a standard deviation of 0.0042, similar to the actual observations. The skewness value of -0.6672 suggests a slightly left-skewed distribution, and the kurtosis of 11.7287 indicates a moderately peaked distribution.

Gradient Boosting: The Gradient Boosting model shows a mean prediction within the range of -0.0239 to 0.0021 at a 95% confidence level. It has a standard deviation of 0.0046, similar to the actual observations. The skewness value of 0.0084 suggests a slightly right-skewed distribution, and the kurtosis of 5.4478 indicates a moderately peaked distribution.

LSTM: The LSTM model demonstrates a mean prediction within the range of -0.1177 to 0.0742 at a 95% confidence level. It has a standard deviation of 1.2579, indicating relatively high dispersion of predictions. The skewness value of -19.2132 suggests a significantly left-skewed distribution, and the kurtosis of 478.8455 indicates a highly peaked distribution. The LSTM model shows a wide range of predictions with high variability.

RNN: The RNN model exhibits a mean prediction within the range of -0.1998 to 0.0546 at a 95% confidence level. It has a standard deviation of 1.6682, indicating high dispersion of predictions. The skewness value of -21.6002 suggests a significantly left-skewed distribution, and the kurtosis of 507.3861 indicates a highly peaked distribution. The RNN model also shows a wide range of predictions with high variability, similar to the LSTM model.
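As a reading aid: positive skewness indicates a longer right tail, negative skewness a longer left tail, and large kurtosis indicates heavier tails than a normal distribution. The moments quoted above can be recomputed for any prediction vector with PerformanceAnalytics (loaded earlier); the vector name below is illustrative, and the exact figures depend on the moment convention used.

# Recompute distribution moments for a prediction vector (name illustrative)
PerformanceAnalytics::skewness(predictions_test_lm, method = "moment")
PerformanceAnalytics::kurtosis(predictions_test_lm, method = "moment")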

Here, the Linear Regression, SVM, and Gradient Boosting models produce prediction distributions closest to the actual observations, while ARIMA's near-constant forecasts and the extreme moments of the LSTM and RNN point to a poor distributional fit.

From the model evaluations we can see that the Linear Regression model generalizes well to unseen data, while the ARIMA, GARCH, SVM, Gradient Boosting, LSTM, and RNN models do not generalize as well. Hence, linear regression is expected to perform best on unseen data.

Robust Testing and Validation:

In this step we are implementing rigorous testing and validation procedures to ensure the robustness and reliability of the predictive model. This can involve cross-validation, bootstrapping, or Monte Carlo simulations to assess model stability and performance under different market conditions.
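Because stock returns are ordered in time, random k-fold splits can leak future information into the training folds. As a complement to the repeated cross-validation used below, a rolling-origin ("walk-forward") scheme is worth considering; here is a minimal sketch using caret's timeslice method, with illustrative window sizes.

# Rolling-origin (walk-forward) validation sketch; window sizes are illustrative
ts_control <- trainControl(method = "timeslice",
                           initialWindow = 1000,  # observations per training window
                           horizon = 50,          # observations per validation window
                           fixedWindow = TRUE,    # slide the window rather than grow it
                           skip = 49)             # spacing between successive origins
# Pass trControl = ts_control to train() in place of the repeated-CV control.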

Cross-Validation

Linear Regression

# Set seed for reproducibility
set.seed(123)
# convert date to numeric for model input
df$date_num <- as.numeric(as.POSIXct(df$Date))
dfr <- na.omit(df)
# Split the NA-free data into train and test sets
# (compute the split index on dfr, not df, so the 80/20 split is exact)
index <- floor(0.8 * nrow(dfr))
train_df <- dfr[1:index, ]
test_df <- dfr[(index + 1):nrow(dfr), ]
# Define control parameters for cross-validation
control <- trainControl(method = "repeatedcv",
                        number = 5,
                        repeats = 1)

# Train the linear regression model with cross-validation
lm_model <- train(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi + macd + returns + log_returns +
                     market_index + sector_performance + date_num,
                   data = train_df,
                   method = "lm",
                   trControl = control)

# Print the model summary
print(summary(lm_model$finalModel))
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.205837 -0.006880  0.000078  0.007043  0.068409 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.523e-02  2.010e-02   3.246  0.00118 ** 
## AAPL.Close          9.913e-05  7.278e-05   1.362  0.17331    
## GOOG.Close         -2.524e-04  1.263e-04  -1.999  0.04576 *  
## sma_20              2.022e-01  1.164e-02  17.365  < 2e-16 ***
## sma_50              7.892e-01  1.221e-02  64.654  < 2e-16 ***
## rsi                 1.220e-03  4.492e-05  27.151  < 2e-16 ***
## macd                1.056e-01  1.624e-03  65.030  < 2e-16 ***
## returns            -1.178e+02  9.204e+00 -12.794  < 2e-16 ***
## log_returns         1.199e+02  9.184e+00  13.056  < 2e-16 ***
## market_index        2.993e-05  3.326e-06   8.998  < 2e-16 ***
## sector_performance -1.855e-04  1.393e-04  -1.331  0.18317    
## date_num           -9.483e-11  2.033e-11  -4.664 3.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01428 on 2700 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
## F-statistic: 9.938e+04 on 11 and 2700 DF,  p-value: < 2.2e-16
# Make predictions on the training and test data
predictions_train_lm <- predict(lm_model, newdata = train_df)
predictions_test_lm <- predict(lm_model, newdata = test_df)

# Calculate evaluation metrics
mse_train <- mean((predictions_train_lm - train_df$KO)^2)
mse_test <- mean((predictions_test_lm - test_df$KO)^2)
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)
r_squared_train <- cor(predictions_train_lm, train_df$KO)^2
r_squared_test <- cor(predictions_test_lm, test_df$KO)^2

# Print the evaluation results
paste("Training MSE:", mse_train)
## [1] "Training MSE: 0.000203067924519989"
paste("Test MSE:", mse_test)
## [1] "Test MSE: 0.000286675366676312"
paste("Training RMSE:", rmse_train)
## [1] "Training RMSE: 0.0142501903327636"
paste("Test RMSE:", rmse_test)
## [1] "Test RMSE: 0.01693149038556"
paste("Training R-squared:", r_squared_train)
## [1] "Training R-squared: 0.997536336690694"
paste("Test R-squared:", r_squared_test)
## [1] "Test R-squared: 0.991438213798135"

Random Forest

library(randomForest)
library(caret)

# Set seed for reproducibility
set.seed(123)

# Prepare the data
df_no_na <- na.omit(df)
index <- floor(0.8 * nrow(df_no_na))

# Split the data into train and test sets
train_df_na <- df_no_na[1:index, ]
test_df_na <- df_no_na[(index + 1):nrow(df_no_na), ]

# Define the predictors (X) and target variable (y)
X <- train_df_na %>% 
  dplyr::select(AAPL.Close, GOOG.Close, sma_20, sma_50, rsi, macd, returns, log_returns,
                market_index, sector_performance, date_num)
y <- train_df_na$KO

# Perform cross-validation for parameter tuning
control <- trainControl(method = "cv",
                        number = 5,
                        search = "grid")

# Tune the mtry parameter using tuneRF
tuned_params <- tuneRF(X, y, stepFactor = 1.5, improve = 1e-5, ntree = 500)
## mtry = 3  OOB error = 0.0001357392 
## Searching left ...
## mtry = 2     OOB error = 0.0001704267 
## -0.2555454 1e-05 
## Searching right ...
## mtry = 4     OOB error = 0.0001254123 
## 0.07607853 1e-05 
## mtry = 6     OOB error = 0.00011442 
## 0.08764921 1e-05 
## mtry = 9     OOB error = 0.0001249605 
## -0.09212059 1e-05

tuned_params <- as.data.frame(tuned_params)
mtry <- tuned_params[tuned_params$OOBError == min(tuned_params$OOBError), "mtry"]

# Train the random forest model with cross-validation
rf_model <- train(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi + macd + returns + log_returns +
                    market_index + sector_performance + date_num,
                  data = train_df_na,
                  method = "rf",
                  trControl = control,
                  tuneGrid = data.frame(mtry = mtry))

# Print the model summary
print(rf_model)
## Random Forest 
## 
## 2673 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2138, 2140, 2138, 2138, 2138 
## Resampling results:
## 
##   RMSE        Rsquared   MAE        
##   0.01143775  0.9983846  0.006404118
## 
## Tuning parameter 'mtry' was held constant at a value of 6
# Make predictions on the training and test data
predictions_train_rf <- predict(rf_model, newdata = train_df_na)
predictions_test_rf <- predict(rf_model, newdata = test_df_na)

# Calculate evaluation metrics
mse_train <- mean((predictions_train_rf - train_df_na$KO)^2)
mse_test <- mean((predictions_test_rf - test_df_na$KO)^2)
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)
r_squared_train <- cor(predictions_train_rf, train_df_na$KO)^2
r_squared_test <- cor(predictions_test_rf, test_df_na$KO)^2

# Print the evaluation results
paste("Training MSE:", mse_train)
## [1] "Training MSE: 2.16865424092529e-05"
paste("Test MSE:", mse_test)
## [1] "Test MSE: 0.0230691694600388"
paste("Training RMSE:", rmse_train)
## [1] "Training RMSE: 0.00465688118908491"
paste("Test RMSE:", rmse_test)
## [1] "Test RMSE: 0.151885382641118"
paste("Training R-squared:", r_squared_train)
## [1] "Training R-squared: 0.999733475071324"
paste("Test R-squared:", r_squared_test)
## [1] "Test R-squared: 0.647477892541242"

SVM

# Set seed for reproducibility
set.seed(123)

# Define hyperparameters
hyperparameters <- expand.grid(C = c(0.1, 1, 5),
                               sigma = c(0.1, 1, 5))

# Define control parameters for cross-validation
control <- trainControl(method = "repeatedcv",
                        number = 5,
                        repeats = 1,
                        search = "grid")

# Train the SVM model with cross-validation
svm_model <- train(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi + macd + returns + log_returns +
                     market_index + sector_performance + date_num, 
                   data = train_df_na,
                   method = "svmRadial",
                   trControl = control,
                   tuneGrid = hyperparameters)

# Print the best hyperparameters
print(svm_model$bestTune)
##   sigma C
## 7   0.1 5
# Fit the SVM model with the best parameters.
# Note: e1071::svm names the RBF kernel width "gamma" (an unknown "sigma"
# argument would be silently swallowed by "..."), so the tuned sigma is
# passed as gamma here; kernlab and e1071 use the same parameterization.
svm_model <- svm(KO ~ AAPL.Close + GOOG.Close + sma_20 + sma_50 + rsi + macd + returns + log_returns +
                   market_index + sector_performance + date_num,
                 data = train_df_na,
                 cost = svm_model$bestTune$C,
                 gamma = svm_model$bestTune$sigma)

# Make predictions on the training and test data
predictions_train_svm <- predict(svm_model, newdata = train_df_na)
predictions_test_svm <- predict(svm_model, newdata = test_df_na)

# Compute the training and test accuracy
rmse_train <- sqrt(mean((predictions_train_svm - train_df_na$KO)^2))
rmse_test <- sqrt(mean((predictions_test_svm - test_df_na$KO)^2))

# Print the accuracy results
paste("Training RMSE:", rmse_train)
## [1] "Training RMSE: 0.0146280461714127"
paste("Test RMSE:", rmse_test)
## [1] "Test RMSE: 0.495244197008241"

Gradient Boosting

We tune an XGBoost model with 5-fold cross-validation over a grid of nrounds and eta values.

# Define the parameter grid for XGBoost
xgb_grid <- expand.grid(
  nrounds = c(1000, 2000, 3000, 4000),
  eta = c(0.01, 0.001, 0.0001),
  lambda = 1,
  alpha = 0
)

# Set up cross-validation control
xgb_trcontrol <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",
  allowParallel = TRUE
)

# Train the XGBoost model
xgb_train <- train(
  x = as.matrix(train_Dmatrix),
  y = train_df_na$KO,
  trControl = xgb_trcontrol,
  tuneGrid = xgb_grid,
  method = "xgbLinear",
  max.depth = 5
)
## + Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-02, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-03, lambda=1, alpha=0 
## + Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=1000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=2000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=3000, eta=1e-04, lambda=1, alpha=0 
## + Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## - Fold1: nrounds=4000, eta=1e-04, lambda=1, alpha=0 
## (equivalent tuning output for Folds 2-5 omitted; each fold repeats the same 12-combination grid)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 1000, lambda = 1, alpha = 0, eta = 1e-04 on full training set
# Print the best model parameters
print("Best Model Parameters:")
## [1] "Best Model Parameters:"
print(xgb_train$bestTune)
##   nrounds lambda alpha   eta
## 1    1000      1     0 1e-04
# Print the model summary
print("Model Summary:")
## [1] "Model Summary:"
summary(xgb_train)
##               Length  Class              Mode       
## handle              1 xgb.Booster.handle externalptr
## raw           1012540 -none-             raw        
## niter               1 -none-             numeric    
## call                6 -none-             call       
## params              5 -none-             list       
## callbacks           1 -none-             list       
## feature_names       9 -none-             character  
## nfeatures           1 -none-             numeric    
## xNames              9 -none-             character  
## problemType         1 -none-             character  
## tuneValue           4 data.frame         list       
## obsLevels           1 -none-             logical    
## param               1 -none-             list
# Collect the per-fold RMSE values from the cross-validation resamples
cv_results <- xgb_train$resample$RMSE

# Calculate mean and standard deviation of the cross-validation results
cv_mean <- mean(cv_results)
cv_sd <- sd(cv_results)

# Print the cross-validation mean RMSE and its standard deviation
paste("Cross-Validation Mean RMSE:", cv_mean)
## [1] "Cross-Validation Mean RMSE: 0.0117747945042736"
paste("Cross-Validation Standard Deviation:", cv_sd)
## [1] "Cross-Validation Standard Deviation: 0.000733617193752162"

RNN model

library(keras)
library(caret)

# Set seed for reproducibility
set.seed(123)

# Training hyperparameters and early stopping (assumed values; define them
# here if an earlier chunk has not already done so)
epochs <- 50
batch_size <- 32
early_stopping <- callback_early_stopping(monitor = "loss", patience = 5)

# Prepare the data
data_rnn <- df_nn 

# Split the data into training and testing sets
train_size <- 0.8  # Percentage of data for training
train_index <- round(train_size * nrow(data_rnn))
train_data_rnn <- data_rnn[1:train_index, ]
test_data_rnn <- data_rnn[(train_index + 1):nrow(data_rnn), ]

# Prepare the training and testing data
lookback <- 5  # Number of previous time steps to consider

# Create input sequences for training data
train_sequences_rnn <- array(0, dim = c(nrow(train_data_rnn) - lookback + 1, lookback, ncol(train_data_rnn)))
for (i in 1:(nrow(train_data_rnn) - lookback + 1)) {
  train_sequences_rnn[i, , ] <- as.matrix(train_data_rnn[i:(i + lookback - 1), ])
}

# Create input sequences for testing data
test_sequences_rnn <- array(0, dim = c(nrow(test_data_rnn) - lookback + 1, lookback, ncol(test_data_rnn)))
for (i in 1:(nrow(test_data_rnn) - lookback + 1)) {
  test_sequences_rnn[i, , ] <- as.matrix(test_data_rnn[i:(i + lookback - 1), ])
}

# Define the RNN rnn_model
rnn_model <- keras_model_sequential()
rnn_model %>%
  layer_simple_rnn(units = 50, input_shape = c(lookback, ncol(data_rnn))) %>%
  layer_dense(units = 1)

# Compile the rnn_model
rnn_model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam'
)

# Define cross-validation parameters
num_folds <- 5
# returnTrain = FALSE so each fold holds the indices of its held-out set;
# the folds then partition the sequences and unlist(folds[-i]) is the train set
folds <- createFolds(1:nrow(train_sequences_rnn), k = num_folds, list = TRUE, returnTrain = FALSE)

# Initialize vectors to store cross-validation results
cv_scores <- numeric(num_folds)

# Perform cross-validation
for (i in 1:num_folds) {
  # Get the training and validation indices for the current fold
  train_indices <- unlist(folds[-i])
  val_indices <- folds[[i]]
  
  # Get the training and validation data
  cv_train_data <- train_sequences_rnn[train_indices, , ]
  cv_train_labels <- train_data_rnn$KO.Close[lookback:nrow(train_data_rnn)][train_indices]
  cv_val_data <- train_sequences_rnn[val_indices, , ]
  cv_val_labels <- train_data_rnn$KO.Close[lookback:nrow(train_data_rnn)][val_indices]
  
  # Fit the rnn_model on the training data for the current fold
  rnn_model %>% fit(
    cv_train_data,
    cv_train_labels,
    epochs = epochs,
    batch_size = batch_size,
    verbose = 0,
    callbacks = list(early_stopping)
  )
  
  # Evaluate the rnn_model on the validation data for the current fold.
  # Keep the result in its own variable; writing it into cv_scores directly
  # would overwrite the numeric results vector and yield NA summaries.
  fold_eval <- rnn_model %>% evaluate(
    cv_val_data,
    cv_val_labels,
    verbose = 0
  )
  
  # Store the fold's loss value in the cv_scores vector
  cv_scores[i] <- fold_eval[[1]]
}


# Compute the mean and standard deviation of the cross-validation scores
cv_mean <- mean(cv_scores)
cv_sd <- sd(cv_scores)

# Train the rnn_model on the full training data
rnn_model %>% fit(
  train_sequences_rnn,
  train_data_rnn$KO.Close[lookback:nrow(train_data_rnn)],
  epochs = epochs,
  batch_size = batch_size,
  verbose = 0,
  callbacks = list(early_stopping) 
)

# Evaluate the rnn_model on the test data
evaluation <- rnn_model %>% evaluate(
  test_sequences_rnn,
  test_data_rnn$KO.Close[lookback:nrow(test_data_rnn)],
  verbose = 0
)

# Print the cross-validation mean, standard deviation, and test loss
paste("Cross-Validation Mean Loss:", cv_mean)
paste("Cross-Validation Standard Deviation:", cv_sd)
paste("Test Loss (MSE):", evaluation[[1]])

LSTM model

# Set seed for reproducibility
set.seed(123)

# Prepare the data_lstm
data_lstm <- df_nn  

# Split the data_lstm into training and testing sets
train_size <- 0.8  # Percentage of data_lstm for training
train_index <- round(train_size * nrow(data_lstm))
train_data_lstm <- data_lstm[1:train_index, ]
test_data_lstm <- data_lstm[(train_index + 1):nrow(data_lstm), ]

# Prepare the training and testing data_lstm
lookback <- 5  # Number of previous time steps to consider

# Initialize train_sequences_lstm and test_sequences_lstm
train_sequences_lstm <- array(0, dim = c(nrow(train_data_lstm) - lookback + 1, lookback, ncol(train_data_lstm)))
test_sequences_lstm <- array(0, dim = c(nrow(test_data_lstm) - lookback + 1, lookback, ncol(test_data_lstm)))

# Populate train_sequences_lstm with subsets of train_data
for (i in 1:(nrow(train_data_lstm) - lookback + 1)) {
  train_sequences_lstm[i, , ] <- as.matrix(train_data_lstm[i:(i + lookback - 1), ])
}

# Populate test_sequences_lstm with subsets of test_data
for (i in 1:(nrow(test_data_lstm) - lookback + 1)) {
  test_sequences_lstm[i, , ] <- as.matrix(test_data_lstm[i:(i + lookback - 1), ])
}

# Define the LSTM lstm_model
lstm_model <- keras_model_sequential()
lstm_model %>%
  layer_lstm(units = 50, input_shape = c(lookback, ncol(data_lstm))) %>%
  layer_dense(units = 1)

# Compile the lstm_model
lstm_model %>% compile(
  loss = 'mean_squared_error',
  optimizer = 'adam'
)

# Define cross-validation parameters
num_folds <- 5
# returnTrain = FALSE so each fold holds its held-out indices, as above
folds <- createFolds(1:nrow(train_sequences_lstm), k = num_folds, list = TRUE, returnTrain = FALSE)

# Initialize vectors to store cross-validation results
cv_scores <- numeric(num_folds)

# Perform cross-validation
for (i in 1:num_folds) {
  # Get the training and validation indices for the current fold
  train_indices <- unlist(folds[-i])
  val_indices <- folds[[i]]
  
  # Get the training and validation data
  cv_train_data <- train_sequences_lstm[train_indices, , ]
  cv_train_labels <- train_data_lstm$KO.Close[lookback:nrow(train_data_lstm)][train_indices]
  cv_val_data <- train_sequences_lstm[val_indices, , ]
  cv_val_labels <- train_data_lstm$KO.Close[lookback:nrow(train_data_lstm)][val_indices]
  
  # Fit the lstm_model on the training data for the current fold
  lstm_model %>% fit(
    cv_train_data,
    cv_train_labels,
    epochs = epochs,
    batch_size = batch_size,
    verbose = 0,
    callbacks = list(early_stopping)
  )
  
  # Evaluate on the held-out fold and record its loss; without this step
  # cv_scores would keep its initial zeros
  fold_eval <- lstm_model %>% evaluate(
    cv_val_data,
    cv_val_labels,
    verbose = 0
  )
  cv_scores[i] <- fold_eval[[1]]
}

# Compute the mean and standard deviation of the cross-validation scores
cv_mean <- mean(cv_scores)
cv_sd <- sd(cv_scores)


# Train the lstm_model on the full training data
# (use the LSTM sequences and frame, not the RNN ones)
lstm_model %>% fit(
  train_sequences_lstm,
  train_data_lstm$KO.Close[lookback:nrow(train_data_lstm)],
  epochs = epochs,
  batch_size = batch_size,
  verbose = 0,
  callbacks = list(early_stopping) 
)



# Evaluate the lstm_model on the test data
evaluation_lstm <- lstm_model %>% evaluate(
  test_sequences_lstm,
  test_data_lstm$KO.Close[lookback:nrow(test_data_lstm)],
  verbose = 0
)
# Print the cross-validation mean, standard deviation, and test loss
paste("Cross-Validation Mean Loss:", cv_mean)
paste("Cross-Validation Standard Deviation:", cv_sd)
paste("Test Loss (MSE):", evaluation_lstm[[1]])

In summary, the “Predictive Model for Stock Market Analysis” project demonstrates how data science techniques can be applied to stock price forecasting. Through comprehensive data collection, cleaning, and analysis, we built predictive models that incorporate technical indicators and market indices to improve accuracy. We evaluated a range of modeling techniques, from linear regression to tree ensembles and recurrent neural networks, and validated their predictions through cross-validation and out-of-sample testing. The project also includes an interactive dashboard for real-time visualization. Overall, this project showcases practical data science skills and provides useful inputs for investment decision-making.