data_KFA.R

library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ecoforecastR)  # Ensure this package is installed and loaded

# Define DLM function
DLM_function <- function() {
  # Load target data
  target <- read_csv("https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz", guess_max = 1e6)
  site_data <- read_csv("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv")

  # Filter for aquatic sites
  site_data <- site_data %>% filter(aquatics == 1)
  site <- "BARC"
  oxygen <- target %>% filter(variable == "oxygen", site_id == site) %>% arrange(datetime)

  # Prepare data for DLM
  y <- oxygen$observation
  data <- list(y = y, n = length(y))

  # Define the DLM model; simplified model since `X` isn't used
  model <- list(obs = "y", fixed = "~ 1", n.iter = 10000)

  # Run the DLM
  ef.out <- ecoforecastR::fit_dlm(model = model, data = data)

  # Extract and process results from DLM
  params <- window(ef.out$params, start = 1000)
  summary_params <- summary(params)

  # Initial conditions for Kalman Filter
  mu0 <- as.numeric(head(y, 1))  # First observation of y
  P0 <- diag(10, length(mu0))    # Large initial covariance

  # Return relevant data and DLM results
  return(list(
    Y = y,
    mu0 = mu0,
    P0 = P0,
    time = oxygen$datetime,
    DLM_results = ef.out,
    params = summary_params
  ))
}


# Kalman Filter functions
KalmanAnalysis <- function(mu.f, P.f, Y, R, H, I){
  obs = !is.na(Y)
  if (any(obs)) {
    K <- P.f * H / (H * P.f * H + R)
    mu.a <- mu.f + K * (Y - H * mu.f)
    P.a <- (I - K * H) * P.f
  } else {
    mu.a = mu.f
    P.a = P.f
  }
  return(list(mu.a=mu.a, P.a=P.a))
}

KalmanForecast <- function(mu.a, P.a, M, Q){
  mu.f = M * mu.a
  P.f = Q + M * P.a * M
  return(list(mu.f=mu.f, P.f=P.f))
}

# Running the Kalman Filter
KalmanFilter <- function(M, mu0, P0, Q, R, Y){
  nt = length(Y)
  mu.f = numeric(nt + 1)
  mu.a = numeric(nt)
  P.f = numeric(nt + 1)
  P.a = numeric(nt)

  # Initialization
  mu.f[1] = mu0
  P.f[1] = P0
  H = 1
  I = 1

  # Sequential updates
  for(t in 1:nt){
    KA <- KalmanAnalysis(mu.f[t], P.f[t], Y[t], R, H, I)
    mu.a[t] <- KA$mu.a
    P.a[t] <- KA$P.a
    
    KF <- KalmanForecast(mu.a[t], P.a[t], M, Q)
    mu.f[t + 1] <- KF$mu.f
    P.f[t + 1] <- KF$P.f
  }
  
  return(list(mu.f=mu.f, mu.a=mu.a, P.f=P.f, P.a=P.a))
}


# Run the DLM function to get initial parameters
results <- DLM_function()

# Define a simple state transition matrix M, assuming simple evolution without spatial interactions
alpha <- 0.05
M <- diag(1 - alpha, length(results$Y))

# Define process and observation error
tau_proc <- rep(0.01, length(results$Y))
Q <- diag(diag(tau_proc))
tau_obs <- var(results$Y, na.rm = TRUE)
R <- diag(tau_obs, length(results$Y))

# Prepare the observation vector Y
Y <- matrix(results$Y, ncol = 1)

# Run the Kalman Filter
KF_results <- KalmanFilter(M, results$mu0, results$P0, Q, R, Y)

# Forecasting for the next 16 days
last_mu <- tail(KF_results$mu.a, 1)
last_P <- tail(KF_results$P.a, 1)
forecast_length <- 30
forecast_mu <- numeric(forecast_length)
forecast_P <- numeric(forecast_length)
for (i in 1:forecast_length) {
  forecast_result <- KalmanForecast(last_mu, last_P, M, Q)
  forecast_mu[i] <- forecast_result$mu.f
  last_mu <- forecast_result$mu.f
  last_P <- forecast_result$P.f
}
forecast_dates <- seq(max(results$time) + 1, by = "day", length.out = forecast_length)

# Combining actual and forecasted results for plotting
full_mu <- c(KF_results$mu.a, forecast_mu)
full_time <- c(results$time, forecast_dates)
full_p <- c(sqrt(KF_results$P.a), sqrt(forecast_P)) * 1.96  # 95% CI
ci <- cbind(full_mu - full_p, full_mu + full_p)

# Plotting actual and forecasted results with confidence intervals
plot(as.Date(full_time), full_mu, type = 'n', xlab = "Time", ylab = "Oxygen Level", main = "Forecast and Analysis with Confidence Intervals")
ecoforecastR::ciEnvelope(as.Date(full_time), ci[, 1], ci[, 2], col = "lightBlue")
lines(as.Date(full_time), full_mu, lwd = 2)
points(as.Date(results$time), results$Y, pch = 19)  # Actual observations
points(as.Date(forecast_dates), forecast_mu, pch = 19, col = 'red')  # Forecasted points