From cb12820682e27578ef9db3adb311c5801f1dc2fa Mon Sep 17 00:00:00 2001 From: Aaron Jomy <75925957+maximusron@users.noreply.github.com> Date: Sun, 17 Sep 2023 15:15:41 +0530 Subject: [PATCH] Add rich text to kalman-filter CUDA demo (#58) --- .../{kalman.hpp => kalman.h++} | 2 +- notebooks/kalman_CUDA_demo/run_kf.ipynb | 1203 ++++++++++++----- notebooks/kalman_CUDA_demo/run_kf_CUDA.ipynb | 928 ------------- 3 files changed, 835 insertions(+), 1298 deletions(-) rename notebooks/kalman_CUDA_demo/{kalman.hpp => kalman.h++} (96%) delete mode 100644 notebooks/kalman_CUDA_demo/run_kf_CUDA.ipynb diff --git a/notebooks/kalman_CUDA_demo/kalman.hpp b/notebooks/kalman_CUDA_demo/kalman.h++ similarity index 96% rename from notebooks/kalman_CUDA_demo/kalman.hpp rename to notebooks/kalman_CUDA_demo/kalman.h++ index 73faf6fa..0390ded7 100644 --- a/notebooks/kalman_CUDA_demo/kalman.hpp +++ b/notebooks/kalman_CUDA_demo/kalman.h++ @@ -42,7 +42,7 @@ class KalmanFilter { * Update the estimated state based on measured values. The * time step is assumed to remain constant. */ - void update(const std::vector& y); + std::vector update(const std::vector& y); /** * Update the estimated state based on measured values, diff --git a/notebooks/kalman_CUDA_demo/run_kf.ipynb b/notebooks/kalman_CUDA_demo/run_kf.ipynb index 020fd954..a31dab2c 100644 --- a/notebooks/kalman_CUDA_demo/run_kf.ipynb +++ b/notebooks/kalman_CUDA_demo/run_kf.ipynb @@ -1,107 +1,476 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "0aa31d74-5546-4990-9dd4-c44b552443a3", + "metadata": {}, + "source": [ + "## Running a 1 Dimensional Kalman Filter using std::vector and CUDA" + ] + }, { "cell_type": "code", "execution_count": 1, - "id": "afdccef0-6aaa-43d6-ac29-7ea950356e19", + "id": "9f1f08b0-9d14-46c0-af75-4ea72af2d350", "metadata": {}, "outputs": [], "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include \n", "#include \"kalman.hpp\"" ] }, + { + "cell_type": "markdown", + "id": "dc67046f-5dcb-48c4-844b-b7f248cc8cea", + "metadata": {}, + "source": [ + "### Defining CUDA Kernel functions for parallel matrix and vector operations:" + ] + }, { "cell_type": "code", "execution_count": 2, - "id": "6feb6a99-9580-4bf7-aa48-0a7d175494a4", + "id": "ebcc8af4-32b2-418b-bfbe-9ffb4ff9b0ff", "metadata": {}, "outputs": [], "source": [ - "#include \n", - "#include \n", - "#include \n" + "__global__ void matAddKernel(double* a, double* b, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx = row * cols + col;\n", + " c[idx] = a[idx] + b[idx];\n", + " }\n", + "}" ] }, { "cell_type": "code", "execution_count": 3, - "id": "ebcc8af4-32b2-418b-bfbe-9ffb4ff9b0ff", + "id": "3f7f95d9-2f3d-4600-8896-0ed4dcca1ed6", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matSubKernel(double* a, double* b, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx = row * cols + col;\n", + " c[idx] = a[idx] - b[idx];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "00183ab5-0380-4773-a34c-053fef7782f3", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matTransposeKernel(double* a, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int 
row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx_in = row * cols + col;\n", + " int idx_out = col * rows + row;\n", + " c[idx_out] = a[idx_in];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d0eff8fe-34a5-4370-b6ff-118cd4d7cb22", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matvecmulKernel(double* d_mat, double* d_vec, double* d_result, int rows, int cols) {\n", + " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", + "\n", + " if (tid < rows) {\n", + " double sum = 0.0;\n", + " for (int j = 0; j < cols; j++) {\n", + " sum += d_mat[tid * cols + j] * d_vec[j];\n", + " }\n", + " d_result[tid] = sum;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "21718df0-5f56-4122-bc1a-a368b306fdaf", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matmulKernel(double* d_a, double* d_b, double* d_result, int rowsA, int colsA, int colsB) {\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + "\n", + " if (row < rowsA && col < colsB) {\n", + " double value = 0.0;\n", + " for (int k = 0; k < colsA; k++) {\n", + " value += d_a[row * colsA + k] * d_b[k * colsB + col];\n", + " }\n", + " d_result[row * colsB + col] = value;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "18f843c9-9aee-4055-aae7-473c8f121037", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void vecsubKernel(const double* a, const double* b, double* result, int len) {\n", + " int idx = blockIdx.x * blockDim.x + threadIdx.x;\n", + " if (idx < len) {\n", + " result[idx] = a[idx] - b[idx];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d8bb436d-238b-488f-a8a8-b8e1a4a0150d", + "metadata": {}, + "source": [ + "### Defining functions to run CUDA kernels and obtain the results:\n", + "This allows us to run calls to all matrix-matrix, matrix-vector and vector-vector CUDA operations without dealing with CUDA syntax" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6ff06da4-1430-4720-affb-b9c8b1dc7a26", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix addition\n", - "std::vector> matadd(const std::vector>& a, const std::vector>& b) {\n", + "std::vector> mataddCUDA(const std::vector>& a, const std::vector>& b) {\n", " int rows = a.size();\n", " int cols = a[0].size();\n", " \n", - " std::vector> result(rows, std::vector(cols));\n", + " double* h_a = new double[rows*cols];\n", + " double* h_b = new double[rows*cols];\n", + " double* h_c = new double[rows*cols];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " h_b[i*cols + j] = b[i][j];\n", + " }\n", + " }\n", + " \n", + " double* d_a, * d_b, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", "\n", + " dim3 dimBlock(16, 16);\n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matAddKernel<<>>(d_a, d_b, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), 
cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_c);\n", + "\n", + " std::vector> result(rows, std::vector(cols));\n", " for (int i = 0; i < rows; i++) {\n", " for (int j = 0; j < cols; j++) {\n", - " result[i][j] = a[i][j] + b[i][j];\n", + " result[i][j] = h_c[i*cols + j];\n", " }\n", " }\n", "\n", + " delete[] h_a;\n", + " delete[] h_b;\n", + " delete[] h_c;\n", + "\n", " return result;\n", "}" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "id": "b70f7f32-b7c2-4a1c-96a0-aea42b15f2aa", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix subtraction\n", - "std::vector> matsub(const std::vector>& a, const std::vector>& b) {\n", + "std::vector> matsubCUDA(const std::vector>& a, const std::vector>& b) {\n", " int rows = a.size();\n", " int cols = a[0].size();\n", " \n", - " std::vector> result(rows, std::vector(cols));\n", + " double* h_a = new double[rows*cols];\n", + " double* h_b = new double[rows*cols];\n", + " double* h_c = new double[rows*cols];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " h_b[i*cols + j] = b[i][j];\n", + " }\n", + " }\n", + "\n", + " double* d_a, * d_b, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", "\n", + " dim3 dimBlock(16, 16); \n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matSubKernel<<>>(d_a, d_b, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_c);\n", + "\n", + " std::vector> result(rows, std::vector(cols));\n", " for (int i = 0; i < rows; i++) {\n", " for (int j = 0; j < cols; j++) {\n", - " result[i][j] = a[i][j] - b[i][j];\n", + " result[i][j] = h_c[i*cols + j];\n", " }\n", " }\n", "\n", + " delete[] h_a;\n", + " delete[] h_b;\n", + " delete[] h_c;\n", + "\n", " return result;\n", "}" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "id": "86b4a452-f04c-45a4-b69b-d87bcb2e5954", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix transpose\n", - "std::vector> mattranspose(const std::vector>& a) {\n", + "std::vector> mattransposeCUDA(const std::vector>& a) {\n", " int rows = a.size();\n", " int cols = a[0].size();\n", " \n", + " double* h_a = new double[rows*cols];\n", + " double* h_c = new double[cols*rows];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " }\n", + " }\n", + "\n", + " double* d_a, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, cols*rows*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 dimBlock(16, 16);\n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matTransposeKernel<<>>(d_a, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, cols*rows*sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_c);\n", + "\n", " 
std::vector> result(cols, std::vector(rows));\n", + " for (int i = 0; i < cols; i++) {\n", + " for (int j = 0; j < rows; j++) {\n", + " result[i][j] = h_c[i*rows + j];\n", + " }\n", + " }\n", + "\n", + " delete[] h_a;\n", + " delete[] h_c;\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3a43038a-89d4-4d90-9b2c-bc3bf87ca9ab", + "metadata": {}, + "source": [ + "### Function to obtain 2x2 and 3x3 matrix inverse using adjoint:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0d925dce-377c-4d23-96d6-9c4fead297cb", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector matvecmulCUDA(const std::vector>& a, const std::vector& b) {\n", + " int rows = a.size();\n", + " int cols = a[0].size();\n", "\n", + " if (cols != b.size()) {\n", + " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", + " }\n", + "\n", + " std::vector h_mat(rows * cols);\n", " for (int i = 0; i < rows; i++) {\n", " for (int j = 0; j < cols; j++) {\n", - " result[j][i] = a[i][j];\n", + " h_mat[i * cols + j] = a[i][j];\n", " }\n", " }\n", "\n", + " double *d_mat, *d_vec, *d_result;\n", + " cudaMalloc((void**)&d_mat, rows * cols * sizeof(double));\n", + " cudaMalloc((void**)&d_vec, cols * sizeof(double));\n", + " cudaMalloc((void**)&d_result, rows * sizeof(double));\n", + "\n", + " cudaMemcpy(d_mat, h_mat.data(), rows * cols * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_vec, b.data(), cols * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " int blockSize = 256;\n", + " int gridSize = (rows + blockSize - 1) / blockSize;\n", + "\n", + " matvecmulKernel<<>>(d_mat, d_vec, d_result, rows, cols);\n", + "\n", + " std::vector result(rows);\n", + " cudaMemcpy(result.data(), d_result, rows * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_mat);\n", + " cudaFree(d_vec);\n", + " cudaFree(d_result);\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "69714460-9a3e-4966-9de7-5e4111fc40af", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> matmulCUDA(const std::vector>& a, const std::vector>& b) {\n", + " int rowsA = a.size();\n", + " int colsA = a[0].size();\n", + " int rowsB = b.size();\n", + " int colsB = b[0].size();\n", + "\n", + " if (colsA != rowsB) {\n", + " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", + " }\n", + "\n", + " std::vector h_a(rowsA * colsA);\n", + " std::vector h_b(rowsB * colsB);\n", + "\n", + " for (int i = 0; i < rowsA; i++) {\n", + " for (int j = 0; j < colsA; j++) {\n", + " h_a[i * colsA + j] = a[i][j];\n", + " }\n", + " }\n", + "\n", + " for (int i = 0; i < rowsB; i++) {\n", + " for (int j = 0; j < colsB; j++) {\n", + " h_b[i * colsB + j] = b[i][j];\n", + " }\n", + " }\n", + "\n", + " double *d_a, *d_b, *d_result;\n", + " \n", + " cudaMalloc((void**)&d_a, rowsA * colsA * sizeof(double));\n", + " cudaMalloc((void**)&d_b, rowsB * colsB * sizeof(double));\n", + " cudaMalloc((void**)&d_result, rowsA * colsB * sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a.data(), rowsA * colsA * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b.data(), rowsB * colsB * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 blockSize(16, 16);\n", + " dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);\n", + "\n", + " matmulKernel<<>>(d_a, d_b, d_result, rowsA, colsA, colsB);\n", + "\n", + " std::vector 
h_result(rowsA * colsB);\n", + " cudaMemcpy(h_result.data(), d_result, rowsA * colsB * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_result);\n", + " \n", + " std::vector> result(rowsA, std::vector(colsB));\n", + " for (int i = 0; i < rowsA; i++) {\n", + " for (int j = 0; j < colsB; j++) {\n", + " result[i][j] = h_result[i * colsB + j];\n", + " }\n", + " }\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7295e929-dcfc-43be-9b3b-82e9f9fbf9db", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector vecsubCUDA(const std::vector& a, const std::vector& b) {\n", + " int len = a.size();\n", + "\n", + " double* d_a;\n", + " double* d_b;\n", + " double* d_result;\n", + "\n", + " cudaMalloc((void**)&d_a, len * sizeof(double));\n", + " cudaMalloc((void**)&d_b, len * sizeof(double));\n", + " cudaMalloc((void**)&d_result, len * sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, a.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, b.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " int blockSize = 256; \n", + " int gridSize = (len + blockSize - 1) / blockSize;\n", + "\n", + " vecsubKernel<<>>(d_a, d_b, d_result, len);\n", + "\n", + " std::vector result(len);\n", + " cudaMemcpy(result.data(), d_result, len * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_result);\n", + "\n", " return result;\n", "}\n" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "bf413652-f71a-4b91-9e6e-5520812dc3cb", + "execution_count": 14, + "id": "cb968a24-75c4-491f-9930-0f16a48f7560", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix inversion\n", "std::vector> matinverse(const std::vector>& a) {\n", " size_t n = a.size();\n", " \n", @@ -113,7 +482,7 @@ " // Handle 1x1 matrix\n", " if (n == 1) {\n", " if (a[0][0] == 0) {\n", - " throw std::runtime_error(\"Matrix is singular and cannot be inverted.\");\n", + " throw std::runtime_error(\"singular matrix\");\n", " }\n", " return {{1.0 / a[0][0]}};\n", " }\n", @@ -121,7 +490,7 @@ " if (n == 2) {\n", " double determinant = a[0][0] * a[1][1] - a[0][1] * a[1][0];\n", " if (determinant == 0) {\n", - " throw std::runtime_error(\"Matrix is singular and cannot be inverted.\");\n", + " throw std::runtime_error(\"singular matrix\");\n", " }\n", "\n", " std::vector> result(2, std::vector(2));\n", @@ -139,7 +508,7 @@ " + a[0][2]*(a[1][0]*a[2][1]-a[1][1]*a[2][0]);\n", " \n", " if (determinant == 0) {\n", - " throw std::runtime_error(\"Matrix is singular and cannot be inverted.\");\n", + " throw std::runtime_error(\"singular matrix\");\n", " }\n", "\n", " std::vector> result(3, std::vector(3));\n", @@ -157,118 +526,117 @@ " return result;\n", " }\n", "\n", - " throw std::runtime_error(\"Only 2x2 and 3x3 matrices supported for inversion in this function.\");\n", - "}\n", - "\n" + " throw std::runtime_error(\"Only 2x2 and 3x3 matrices supported for inversion\");\n", + "}\n" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "8c4b2db6-0847-4c0f-a8f7-88339d8257d7", + "execution_count": 15, + "id": "84f29c1e-d4c2-4680-a579-14d63a2bde08", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix-vector subtraction\n", - "std::vector> matvecsub(const std::vector>& mat, const std::vector& vec) {\n", - " std::vector> result(mat.size(), std::vector(vec.size()));\n", - "\n", - " for (size_t i = 0; i < 
mat.size(); i++) {\n", - " for (size_t j = 0; j < vec.size(); j++) {\n", - " result[i][j] = mat[i][j] - vec[j];\n", + "void printMatrix(const std::vector>& matrix) {\n", + " for (size_t i = 0; i < matrix.size(); i++) {\n", + " for (size_t j = 0; j < matrix[i].size(); j++) {\n", + " std::cout << matrix[i][j] << \" \";\n", " }\n", + " std::cout << std::endl;\n", " }\n", - "\n", - " return result;\n", "}" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "0d925dce-377c-4d23-96d6-9c4fead297cb", + "execution_count": 16, + "id": "fa143e89-d1c5-4b54-b22d-9921d7e1000d", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix-vector multiplication\n", - "std::vector matvecmul(const std::vector>& a, const std::vector& b) {\n", - " int rows = a.size();\n", - " int cols = a[0].size();\n", - "\n", - " if (cols != b.size()) {\n", - " throw std::runtime_error(\"Matrix dimensions do not match for multiplication.\");\n", - " }\n", - "\n", - " std::vector result(rows, 0.0);\n", - "\n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " result[i] += a[i][j] * b[j];\n", + "void printMatrix(const std::vector& vec) {\n", + " for (size_t i = 0; i < vec.size(); i++) {\n", + " std::cout << vec[i] << \" \";\n", " }\n", - " }\n", - "\n", - " return result;\n", - "}" + " std::cout << std::endl;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "79ebaf27-693f-4213-800d-8e8b5c9abaa3", + "metadata": {}, + "source": [ + "### Test for matrix-matrix multiplication using CUDA" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "69714460-9a3e-4966-9de7-5e4111fc40af", + "execution_count": 17, + "id": "6d718c47-3bc6-4c7a-893c-4fedc399926d", "metadata": {}, "outputs": [], "source": [ - "// helper function: matrix-matrix multiplication\n", - "std::vector> matmul(const std::vector>& a, const std::vector>& b) {\n", - " int rowsA = a.size();\n", - " int colsA = a[0].size();\n", - " int rowsB = b.size();\n", - " int colsB = b[0].size();\n", - "\n", - " if (colsA != rowsB) {\n", - " throw std::runtime_error(\"Matrix dimensions do not match for multiplication.\");\n", - " }\n", + "std::vector> matrixA = {\n", + " {1.0, 2.0},\n", + " {3.0, 4.0},\n", + " {5.0, 6.0}\n", + " };\n", "\n", - " std::vector> result(rowsA, std::vector(colsB, 0));\n", + "std::vector> matrixB = {\n", + " {1.0, 2.0, 3.0, 4.0},\n", + " {5.0, 6.0, 7.0, 8.0}\n", + " };\n", "\n", - " for (int i = 0; i < rowsA; i++) {\n", - " for (int j = 0; j < colsB; j++) {\n", - " for (int k = 0; k < colsA; k++) {\n", - " result[i][j] += a[i][k] * b[k][j];\n", - " }\n", - " }\n", - " }\n", - "\n", - " return result;\n", - "}" + "std::vector> result = matmulCUDA(matrixA, matrixB);" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "7295e929-dcfc-43be-9b3b-82e9f9fbf9db", + "execution_count": 18, + "id": "3e1c925e-c164-46af-8e0f-65c05f0ee1c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result matrix:\n", + "11 14 17 20 \n", + "23 30 37 44 \n", + "35 46 57 68 \n" + ] + } + ], "source": [ - "// helper function: vector subtraction\n", - "std::vector vecsub(const std::vector& a, const std::vector& b) {\n", - " size_t len = a.size();\n", - " std::vector result(len);\n", - " for (size_t i = 0; i < len; i++) {\n", - " result[i] = a[i] - b[i];\n", - " }\n", - " return result;\n", - "}" + "std::cout << \"Result matrix:\" << std::endl;\n", + "printMatrix(result);" + ] + }, + { + "cell_type": "markdown", + "id": 
"9312a581-d000-4da6-b257-b22a7da942f6", + "metadata": {}, + "source": [ + "### Set up the `KalmanFilter` class and constructor\n", + "\n", + " Here we create a Kalman filter with the specified matrices:\n", + " \n", + " - A - System dynamics matrix, here we model 1D motion with a *(Position x, Velocity v, Acceleration a)* model\n", + " - C - Output matrix\n", + " - Q - Covariance matrix of the process noise random variable *p(w) ~ N(0, Q)*\n", + " - R - Covariance matrix of the measurement noise random variable *p(v) ~ N(0, R)*\n", + " - P - Estimate error covariance\n", + " " ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 19, "id": "f4c9238f-674a-4cf8-8128-840cc7f61e0f", "metadata": {}, "outputs": [], "source": [ - "//set up class and constructor\n", "KalmanFilter::KalmanFilter(\n", " double dt,\n", " const std::vector>& A,\n", @@ -288,76 +656,121 @@ "KalmanFilter::KalmanFilter() {}" ] }, + { + "cell_type": "markdown", + "id": "cb0ecf07-b676-4a2f-bb91-4d0ebfe4fc17", + "metadata": {}, + "source": [ + "#### Describe the `KalmanFilter::init` function to initialize the filter with initial states as zero." + ] + }, { "cell_type": "code", - "execution_count": 12, - "id": "bd953ec6-ceed-489c-8e33-8eb7ae6bb32f", + "execution_count": 20, + "id": "ed99dd28-5959-409a-8226-7e292bc4b0de", "metadata": {}, "outputs": [], "source": [ - "// init the kf states + measurements \n", - "void KalmanFilter::init(double t0, const std::vector& x0) {\n", - " x_hat = x0;\n", + "void KalmanFilter::init() {\n", + " std::fill(x_hat.begin(), x_hat.end(), 0.0);\n", " P = P0;\n", - " this->t0 = t0;\n", + " t0 = 0;\n", " t = t0;\n", " initialized = true;\n", "}" ] }, + { + "cell_type": "markdown", + "id": "f4291823-8a85-40bf-baf7-c0686388e435", + "metadata": {}, + "source": [ + "#### Describe the `KalmanFilter::init` function to initialize the filter with a guess for initial states." 
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
-   "id": "2840f0f7-52de-4f47-a9dc-e257235df002",
+   "execution_count": 21,
+   "id": "bd953ec6-ceed-489c-8e33-8eb7ae6bb32f",
   "metadata": {},
   "outputs": [],
   "source": [
-    "void KalmanFilter::init() {\n",
-    "    std::fill(x_hat.begin(), x_hat.end(), 0.0);\n",
+    "void KalmanFilter::init(double t0, const std::vector<double>& x0) {\n",
+    "    x_hat = x0;\n",
     "    P = P0;\n",
-    "    t0 = 0;\n",
+    "    this->t0 = t0;\n",
     "    t = t0;\n",
     "    initialized = true;\n",
     "}"
   ]
  },
+ {
+   "cell_type": "markdown",
+   "id": "3b71922a-0448-4fa0-8196-08142c2f64b8",
+   "metadata": {},
+   "source": [
+    "#### Running the Kalman Filter Algorithm:\n",
+    "\n",
+    "The `update` function runs the two stages of the ongoing discrete Kalman filter cycle:\n",
+    "\n",
+    " - Discrete Kalman filter time update equations:\n",
+    " \n",
+    "   $\begin{aligned} & \hat{x}_k^{-}=A \hat{x}_{k-1} \\ & P_k^{-}=A P_{k-1} A^T+Q\end{aligned}$\n",
+    " \n",
+    "   They project the state and covariance estimates forward from time step *k* - 1 to step *k*.\n",
+    " \n",
+    " - Discrete Kalman filter measurement update equations:\n",
+    "\n",
+    "   $\begin{gathered}K_k=P_k^{-} H^T\left(H P_k^{-} H^T+R\right)^{-1} \\ \hat{x}_k=\hat{x}_k^{-}+K_k\left(z_k-H \hat{x}_k^{-}\right) \\ P_k=\left(I-K_k H\right) P_k^{-}\end{gathered}$\n",
+    " \n",
+    "   The Kalman gain $K_k$ is computed in this step; $H$ and $z_k$ above correspond to `C` and `y` in the code."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 22,
   "id": "ea057f20-9097-4dd3-946e-ab7a383afe69",
   "metadata": {},
   "outputs": [],
   "source": [
-    "\n",
-    "\n",
-    "void KalmanFilter::update(const std::vector<double>& y) {\n",
+    "std::vector<double> KalmanFilter::update(const std::vector<double>& y) {\n",
     "    if (!initialized)\n",
     "        throw std::runtime_error(\"Filter is not initialized!\");\n",
-    "\n",
-    "    x_hat_new = matvecmul(A, x_hat);\n",
-    "\n",
-    "    P = matadd(matmul(matmul(A, P), mattranspose(A)), Q);\n",
-    "\n",
-    "    std::vector<std::vector<double>> inv = matinverse(matadd(matmul(matmul(C, P), mattranspose(C)), R));\n",
-    "\n",
-    "    K = matmul(matmul(P, mattranspose(C)), inv);\n",
-    "    std::vector<double> temp = matvecmul(C, x_hat_new);\n",
-    "    std::vector<double> difference = vecsub(y, temp);\n",
-    "\n",
+    "    \n",
+    "    // Discrete Kalman filter time update\n",
+    "    x_hat_new = matvecmulCUDA(A, x_hat);\n",
+    "    P = mataddCUDA(matmulCUDA(matmulCUDA(A, P), mattransposeCUDA(A)), Q);\n",
+    "    \n",
+    "    // Discrete Kalman filter measurement update\n",
+    "    std::vector<std::vector<double>> inv = matinverse(mataddCUDA(matmulCUDA(matmulCUDA(C, P), mattransposeCUDA(C)), R));\n",
+    "    K = matmulCUDA(matmulCUDA(P, mattransposeCUDA(C)), inv);\n",
+    "    std::vector<double> temp = matvecmulCUDA(C, x_hat_new);\n",
+    "    std::vector<double> difference = vecsubCUDA(y, temp);\n",
+    "    std::vector<double> gain = K[0];\n",
     "    for (size_t i = 0; i < x_hat_new.size(); i++) {\n",
-    "        x_hat_new[i] += matvecmul(K, difference)[i];\n",
+    "        x_hat_new[i] += matvecmulCUDA(K, difference)[i];\n",
     "    }\n",
     "\n",
-    "    P = matmul(matsub(I, matmul(K, C)), P);\n",
+    "    P = matmulCUDA(matsubCUDA(I, matmulCUDA(K, C)), P);\n",
     "\n",
     "    x_hat = x_hat_new;\n",
     "    t += dt;\n",
+    "    \n",
+    "    return gain;\n",
     "}\n"
   ]
  },
+ {
+   "cell_type": "markdown",
+   "id": "2b1d8963-599a-4034-80de-e7c92c59c321",
+   "metadata": {},
+   "source": [
+    "#### Update the estimated state based on measured values, using the given time step and dynamics matrix:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 23,
   "id": "08d28163-a086-46a9-bf41-044f48530929",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -370,41 +783,41 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 16,
-
"id": "80d35cbe-7baa-4bc2-a232-4b5631af59b2", + "cell_type": "markdown", + "id": "54fb13b2-51ca-4dda-8503-151aea8dbd68", "metadata": {}, - "outputs": [], "source": [ - "std::vector generateExpandedData(const std::vector& baseData, int repetitions) {\n", - " std::vector expandedData;\n", - " for (int r = 0; r < repetitions; r++) {\n", - " for (auto val : baseData) {\n", - " // Add slight random perturbation to the data.\n", - " double perturbedValue = val + ((double)rand() / RAND_MAX - 0.5);\n", - " expandedData.push_back(perturbedValue);\n", - " }\n", - " }\n", - " return expandedData;\n", - "}" + "### Putting it all together:\n", + "We define the `run_kf` function that initialises the following:\n", + "- Number of states\n", + "- Dimensions of eacch measurement\n", + "- Time steps used $\\begin{aligned} & {dt}\\end{aligned}$\n", + "- Covariance matrices\n", + "- Passes the measurement data to $\\begin{aligned} & \\hat{x}_0^{-}\\end{aligned}$\n", + "\n", + "Then the measurements are fed into filter, and output estimated states\n", + "\n", + "The algorithm then runs by looping over the measurements:\n", + "\n", + "$\\begin{gathered} & for\\:y\\:in\\:measurements\\::\\end{gathered}$\n", + "$\\begin{gathered} & kf.update(y)\\end{gathered}$" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 24, "id": "8938a288-fc2c-4104-82c6-added4466c8c", "metadata": {}, "outputs": [], "source": [ - "std::vector> run_kf(int expansion_factor, bool verbose) {\n", + "std::vector> run_kf(bool verbose) {\n", " \n", - " int n = 3; // Number of states\n", - " int m = 1; // Number of measurements\n", + " int n = 3; \n", + " int m = 1; \n", "\n", - " double dt = 1.0 / 40; // Time step\n", + " double dt = 1.0 / 30; // Time step\n", " \n", " std::vector g_preds;\n", - " std::vector g_est;\n", " \n", " std::vector> A(n, std::vector(n));\n", " std::vector> C(m, std::vector(n));\n", @@ -421,47 +834,58 @@ " KalmanFilter kf(dt, A, C, Q, R, P);\n", " \n", " std::vector measurements = {\n", - " 1.04202710058, 1.10726790452, 1.2913511148, 1.48485250951, 1.72825901034,\n", - " 1.74216489744, 2.11672039768, 2.14529225112, 2.16029641405, 2.21269371128,\n", - " 2.57709350237, 2.6682215744, 2.51641839428, 2.76034056782, 2.88131780617,\n", - " 2.88373786518, 2.9448468727, 2.82866600131, 3.0006601946, 3.12920591669,\n", - " 2.858361783, 2.83808170354, 2.68975330958, 2.66533185589, 2.81613499531,\n", - " 2.81003612051, 2.88321849354, 2.69789264832, 2.4342229249, 2.23464791825,\n", - " 2.30278776224, 2.02069770395, 1.94393985809, 1.82498398739, 1.52526230354,\n", - " 1.86967808173, 1.18073207847, 1.10729605087, 0.916168349913, 0.678547664519,\n", - " 0.562381751596, 0.355468474885, -0.155607486619, -0.287198661013, -0.602973173813,\n", - " -0.873817307503, -1.144661441193, -1.415505574883, -1.686349708573, -1.957193842263, \n", - " -2.228037975953, -2.498882109643, -2.769726243333, -3.040570377023, -3.311414510713, \n", - " -3.582258644403, -3.853102778093, -4.123946911783, -4.394791045473, -4.665635179163, \n", - " -4.936479312853, -5.207323446543, -5.478167580233, -5.749011713923, -6.019855847613,\n", - " -6.290699981303, -6.561544114993, -6.832388248683, -7.103232382373, -7.374076516063,\n", - " -7.644920649753, -7.915764783443, -8.186608917133, -8.457453050823, -8.728297184513, \n", - " -8.999141318203, -9.269985451893, -9.540829585583, -9.811673719273, -10.082517852963,\n", - " -10.353361986653, -10.624206120343, -10.895050254033, -11.165894387723, -11.436738521413,\n", - " -11.707582655103, 
-11.978426788793, -12.249270922483, -12.520115056173\n", - " };\n", - " \n", - " // std::vector expandedMeasurements = generateExpandedData(measurements, expansion_factor);\n", - " \n", - " std::vector x0 = {measurements[0], 0, 0};\n", + " 1.04202710058,1.10726790452,1.2913511148,1.48485250951,1.72825901034,1.74216489744,\n", + " 2.11672039768,2.14529225112,2.16029641405,2.21269371128,2.57709350237,2.6682215744,\n", + " 2.51641839428,2.76034056782,2.88131780617,2.88373786518,2.9448468727,2.82866600131,\n", + " 3.0006601946,3.12920591669,2.858361783,2.83808170354,2.68975330958,2.66533185589,\n", + " 2.81613499531,2.81003612051,2.88321849354,2.69789264832,2.4342229249,2.23464791825,\n", + " 2.30278776224,2.02069770395,1.94393985809,1.82498398739,1.52526230354,1.86967808173,\n", + " 1.18073207847,1.10729605087,0.916168349913,0.678547664519,0.562381751596,0.355468474885,\n", + " -0.155607486619,-0.287198661013,-0.602973173813,-0.292648661013,-0.602973173813,\n", + " -0.924197686613,-1.2563221994129998,-1.5993467122129998,-1.9532712250129998,-2.3180957378129996,\n", + " -2.6938202506129993,-3.0804447634129994,-3.4779692762129994,-3.8863937890129994,-4.305718301812999,\n", + " -4.735942814612999,-5.177067327412999,-5.629091840212999,-6.0920163530129985,-6.565840865812999,\n", + " -7.0505653786129985,-7.546189891412999,-8.052714404212999,-8.570138917012999,-9.098463429812998,\n", + " -9.637687942612999,-10.187812455412999,-10.748836968212998,-11.320761481013,-11.903585993813,\n", + " -12.497310506613,-13.101935019413,-13.717459532213,-14.343884045013,-14.981208557813002,\n", + " -15.629433070613002,-16.288557583413002,-16.958582096213004,-17.639506609013004,\n", + " -18.331331121813005,-19.034055634613004,-19.747680147413003,-20.472204660213006,\n", + " -21.207629173013007,-21.95395368581301,-22.71117819861301,-23.47930271141301,\n", + " -24.25832722421301,-25.04825173701301,-25.849076249813013,-26.660800762613015,\n", + " -27.483425275413015,-28.316949788213016,-29.161374301013016,-30.01669881381302,\n", + " -30.88292332661302,-31.760047839413023,-32.64807235221303,-33.54699686501303,\n", + " -34.45682137781303,-35.37754589061303,-36.30917040341303,-37.251694916213026,\n", + " -38.20511942901303,-39.169443941813036,-40.14466845461304,-41.13079296741304,\n", + " -42.12781748021305,-43.13574199301305,-44.15456650581305,-45.18429101861305,\n", + " -46.22491553141305,-47.276440044213054,-48.33886455701305,-49.41218906981305,\n", + " -50.49641358261306,-51.591538095413064,-52.69756260821307,-53.81448712101307,\n", + " -54.94231163381308,-56.08103614661308,-57.23066065941308,-58.391185172213085,\n", + " -59.562609685013086,-60.74493419781309,-61.93815871061309,-63.14228322341309,\n", + " -64.35730773621309,-65.5832322490131,-66.82005676181309,-68.0677812746131,\n", + " -69.3264057874131,-70.5959303002131,-71.8763548130131,-73.1676793258131,\n", + " -74.46990383861309,-75.7830283514131,-77.1070528642131,-78.4419773770131,\n", + " -79.78780188981311,-81.14452640261311,-82.51215091541312,-83.89067542821311\n", + "\n", + " };\n", + " \n", + " std::vector x0 = {measurements[0], 0, -15};\n", + " std::vector> gain;\n", " kf.init(0, x0);\n", "\n", - " // Feed measurements into filter, output estimated states\n", + " \n", " std::vector y(m);\n", " if(verbose) {\n", " std::cout << \"t = \" << 0 << \", \" << \"x_hat[0]: \";\n", " for (auto& val : kf.state()){\n", " std::cout << val << \" \";\n", " }\n", - " g_est.push_back(kf.state()[2]);\n", - " \n", " std::cout << std::endl;\n", " }\n", " \n", " int i;\n", " 
for (i = 0; i < measurements.size(); i++) {\n", " y[0] = measurements[i];\n", - " kf.update(y);\n", + " gain.push_back(kf.update(y));\n", " if(verbose) {\n", " std::cout << \"t = \" << (i + 1) * dt << \", y[\" << i << \"] = \" << y[0] << \", x_hat[\" << i << \"] = \";\n", " for (auto& val : kf.state()) {\n", @@ -475,10 +899,10 @@ " std::cout<<\"Exec Success, Final kf states:\";\n", " for (auto& val : kf.state()) std::cout << val << \" \";\n", " std::cout << std::endl;\n", - " \n", + "\n", " std::vector> g_res;\n", " for (size_t i = 0; i < g_preds.size(); ++i) {\n", - " std::vector pair = {g_preds[i], g_est[i]};\n", + " std::vector pair = {g_preds[i], gain[i][0]};\n", " g_res.push_back(pair);\n", " }\n", " \n", @@ -488,167 +912,183 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "1e77f6cc-410a-4cd4-acbc-94677b77c968", - "metadata": {}, - "outputs": [], - "source": [ - "#include \n", - "auto start = std::chrono::high_resolution_clock::now();\n", - "auto stop = std::chrono::high_resolution_clock::now();\n", - "auto duration = std::chrono::duration_cast(stop - start);" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "7ea068c2-fc1f-4ad4-a2c9-aecd66cbbf71", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector> pyrun_sim(int exp_factor, bool verbose) {\n", - " start = std::chrono::high_resolution_clock::now();\n", - " std::vector> g_res = run_kf(exp_factor, verbose);\n", - " stop = std::chrono::high_resolution_clock::now();\n", - " duration = std::chrono::duration_cast(stop - start);\n", - " return g_res;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6026ea4e-c42b-42e5-97a7-12af37e121a9", + "execution_count": 25, + "id": "7a910405-c4ff-41fa-aec5-69aa24598f9c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "t = 0, x_hat[0]: 1.04203 0 0 \n", - "t = 0.025, y[0] = 1.04203, x_hat[0] = 1.04203 0 0 \n", - "t = 0.05, y[1] = 1.10727, x_hat[1] = 1.08709 0.898405 0.00110609 \n", - "t = 0.075, y[2] = 1.29135, x_hat[2] = 1.22062 2.38572 0.00356059 \n", - "t = 0.1, y[3] = 1.48485, x_hat[3] = 1.3876 3.46905 0.00712492 \n", - "t = 0.125, y[4] = 1.72826, x_hat[4] = 1.58997 4.40683 0.013793 \n", - "t = 0.15, y[5] = 1.74216, x_hat[5] = 1.71702 4.52169 0.0154301 \n", - "t = 0.175, y[6] = 2.11672, x_hat[6] = 1.93313 5.12414 0.031192 \n", - "t = 0.2, y[7] = 2.14529, x_hat[7] = 2.08865 5.26575 0.0374157 \n", - "t = 0.225, y[8] = 2.1603, x_hat[8] = 2.20235 5.18421 0.0316618 \n", - "t = 0.25, y[9] = 2.21269, x_hat[9] = 2.29893 5.04726 0.0173069 \n", - "t = 0.275, y[10] = 2.57709, x_hat[10] = 2.46442 5.19829 0.039685 \n", - "t = 0.3, y[11] = 2.66822, x_hat[11] = 2.61234 5.26324 0.0527055 \n", - "t = 0.325, y[12] = 2.51642, x_hat[12] = 2.69148 5.08935 0.00552738 \n", - "t = 0.35, y[13] = 2.76034, x_hat[13] = 2.80588 5.04887 -0.00849014 \n", - "t = 0.375, y[14] = 2.88132, x_hat[14] = 2.92141 5.01625 -0.0224249 \n", - "t = 0.4, y[15] = 2.88374, x_hat[15] = 3.0137 4.91887 -0.0729322 \n", - "t = 0.425, y[16] = 2.94485, x_hat[16] = 3.09895 4.80966 -0.139272 \n", - "t = 0.45, y[17] = 2.82867, x_hat[17] = 3.14447 4.59753 -0.288577 \n", - "t = 0.475, y[18] = 3.00066, x_hat[18] = 3.21103 4.45686 -0.396929 \n", - "t = 0.5, y[19] = 3.12921, x_hat[19] = 3.287 4.34976 -0.484796 \n", - "t = 0.525, y[20] = 2.85836, x_hat[20] = 3.29866 4.07181 -0.747944 \n", - "t = 0.55, y[21] = 2.83808, x_hat[21] = 3.30007 3.77743 -1.04221 \n", - "t = 0.575, y[22] = 2.68975, x_hat[22] = 3.26982 3.40694 -1.43333 \n", - "t = 0.6, y[23] = 
2.66533, x_hat[23] = 3.23376 3.03344 -1.83638 \n", - "t = 0.625, y[24] = 2.81613, x_hat[24] = 3.2232 2.74466 -2.13797 \n", - "t = 0.65, y[25] = 2.81004, x_hat[25] = 3.20765 2.45227 -2.44388 \n", - "t = 0.675, y[26] = 2.88322, x_hat[26] = 3.20161 2.19814 -2.69673 \n", - "t = 0.7, y[27] = 2.69789, x_hat[27] = 3.15898 1.84844 -3.07251 \n", - "t = 0.725, y[28] = 2.43422, x_hat[28] = 3.07035 1.3784 -3.6016 \n", - "t = 0.75, y[29] = 2.23465, x_hat[29] = 2.95232 0.840664 -4.20755 \n", - "t = 0.775, y[30] = 2.30279, x_hat[30] = 2.85557 0.387859 -4.67894 \n", - "t = 0.8, y[31] = 2.0207, x_hat[31] = 2.7166 -0.169629 -5.27548 \n", - "t = 0.825, y[32] = 1.94394, x_hat[32] = 2.57681 -0.704099 -5.81837 \n", - "t = 0.85, y[33] = 1.82498, x_hat[33] = 2.42945 -1.23525 -6.3351 \n", - "t = 0.875, y[34] = 1.52526, x_hat[34] = 2.24401 -1.85275 -6.94502 \n", - "t = 0.9, y[35] = 1.86968, x_hat[35] = 2.13957 -2.19864 -7.17158 \n", - "t = 0.925, y[36] = 1.18073, x_hat[36] = 1.92439 -2.85128 -7.78709 \n", - "t = 0.95, y[37] = 1.1073, x_hat[37] = 1.72092 -3.43475 -8.28638 \n", - "t = 0.975, y[38] = 0.916168, x_hat[38] = 1.50771 -4.01434 -8.75833 \n", - "t = 1, y[39] = 0.678548, x_hat[39] = 1.2784 -4.60797 -9.22649 \n", - "t = 1.025, y[40] = 0.562382, x_hat[40] = 1.05708 -5.14471 -9.60338 \n", - "t = 1.05, y[41] = 0.355468, x_hat[41] = 0.827473 -5.67368 -9.95378 \n", - "t = 1.075, y[42] = -0.155607, x_hat[42] = 0.537759 -6.3418 -10.4545 \n", - "t = 1.1, y[43] = -0.287199, x_hat[43] = 0.262434 -6.93117 -10.8401 \n", - "t = 1.125, y[44] = -0.602973, x_hat[44] = -0.0317108 -7.53829 -11.2291 \n", - "t = 1.15, y[45] = -0.873817, x_hat[45] = -0.333878 -8.13198 -11.5854 \n", - "t = 1.175, y[46] = -1.14466, x_hat[46] = -0.64242 -8.70817 -11.9065 \n", - "t = 1.2, y[47] = -1.41551, x_hat[47] = -0.955917 -9.2638 -12.1908 \n", - "t = 1.225, y[48] = -1.68635, x_hat[48] = -1.27315 -9.79661 -12.4381 \n", - "t = 1.25, y[49] = -1.95719, x_hat[49] = -1.5931 -10.3051 -12.6488 \n", - "t = 1.275, y[50] = -2.22804, x_hat[50] = -1.91487 -10.7882 -12.8241 \n", - "t = 1.3, y[51] = -2.49888, x_hat[51] = -2.23773 -11.2455 -12.9653 \n", - "t = 1.325, y[52] = -2.76973, x_hat[52] = -2.56107 -11.6768 -13.0743 \n", - "t = 1.35, y[53] = -3.04057, x_hat[53] = -2.88438 -12.0825 -13.1531 \n", - "t = 1.375, y[54] = -3.31141, x_hat[54] = -3.20725 -12.4629 -13.2039 \n", - "t = 1.4, y[55] = -3.58226, x_hat[55] = -3.52932 -12.8188 -13.2289 \n", - "t = 1.425, y[56] = -3.8531, x_hat[56] = -3.85034 -13.1508 -13.2301 \n", - "t = 1.45, y[57] = -4.12395, x_hat[57] = -4.17008 -13.46 -13.2098 \n", - "t = 1.475, y[58] = -4.39479, x_hat[58] = -4.48837 -13.7472 -13.17 \n", - "t = 1.5, y[59] = -4.66564, x_hat[59] = -4.8051 -14.0134 -13.1126 \n", - "t = 1.525, y[60] = -4.93648, x_hat[60] = -5.12016 -14.2597 -13.0395 \n", - "t = 1.55, y[61] = -5.20732, x_hat[61] = -5.4335 -14.4872 -12.9525 \n", - "t = 1.575, y[62] = -5.47817, x_hat[62] = -5.74508 -14.6968 -12.853 \n", - "t = 1.6, y[63] = -5.74901, x_hat[63] = -6.05487 -14.8895 -12.7428 \n", - "t = 1.625, y[64] = -6.01986, x_hat[64] = -6.36288 -15.0664 -12.623 \n", - "t = 1.65, y[65] = -6.2907, x_hat[65] = -6.66912 -15.2283 -12.4951 \n", - "t = 1.675, y[66] = -6.56154, x_hat[66] = -6.97361 -15.3763 -12.3601 \n", - "t = 1.7, y[67] = -6.83239, x_hat[67] = -7.27637 -15.5111 -12.2191 \n", - "t = 1.725, y[68] = -7.10323, x_hat[68] = -7.57746 -15.6336 -12.0731 \n", - "t = 1.75, y[69] = -7.37408, x_hat[69] = -7.87691 -15.7446 -11.9229 \n", - "t = 1.775, y[70] = -7.64492, x_hat[70] = -8.17476 -15.8449 -11.7694 \n", - "t = 1.8, 
y[71] = -7.91576, x_hat[71] = -8.47108 -15.9351 -11.6132 \n", - "t = 1.825, y[72] = -8.18661, x_hat[72] = -8.7659 -16.0159 -11.4551 \n", - "t = 1.85, y[73] = -8.45745, x_hat[73] = -9.05929 -16.0881 -11.2954 \n", - "t = 1.875, y[74] = -8.7283, x_hat[74] = -9.3513 -16.152 -11.1349 \n", - "t = 1.9, y[75] = -8.99914, x_hat[75] = -9.64199 -16.2084 -10.9739 \n", - "t = 1.925, y[76] = -9.26999, x_hat[76] = -9.9314 -16.2578 -10.8128 \n", - "t = 1.95, y[77] = -9.54083, x_hat[77] = -10.2196 -16.3005 -10.652 \n", - "t = 1.975, y[78] = -9.81167, x_hat[78] = -10.5066 -16.3372 -10.4918 \n", - "t = 2, y[79] = -10.0825, x_hat[79] = -10.7925 -16.3682 -10.3325 \n", - "t = 2.025, y[80] = -10.3534, x_hat[80] = -11.0774 -16.3939 -10.1743 \n", - "t = 2.05, y[81] = -10.6242, x_hat[81] = -11.3612 -16.4148 -10.0175 \n", - "t = 2.075, y[82] = -10.8951, x_hat[82] = -11.6441 -16.4311 -9.86215 \n", - "t = 2.1, y[83] = -11.1659, x_hat[83] = -11.9261 -16.4432 -9.70853 \n", - "t = 2.125, y[84] = -11.4367, x_hat[84] = -12.2072 -16.4514 -9.55674 \n", - "t = 2.15, y[85] = -11.7076, x_hat[85] = -12.4874 -16.4559 -9.4069 \n", - "t = 2.175, y[86] = -11.9784, x_hat[86] = -12.7669 -16.4572 -9.25909 \n", - "t = 2.2, y[87] = -12.2493, x_hat[87] = -13.0456 -16.4553 -9.1134 \n", - "t = 2.225, y[88] = -12.5201, x_hat[88] = -13.3236 -16.4505 -8.96991 \n", + "t = 0, x_hat[0]: 1.04203 0 -15 \n", + "t = 0.0333333, y[0] = 1.04203, x_hat[0] = 1.04203 -0.5 -15 \n", + "t = 0.0666667, y[1] = 1.10727, x_hat[1] = 1.08556 -0.0966619 -14.9988 \n", + "t = 0.1, y[2] = 1.29135, x_hat[2] = 1.21317 0.720024 -14.9952 \n", + "t = 0.133333, y[3] = 1.48485, x_hat[3] = 1.36865 1.21707 -14.9881 \n", + "t = 0.166667, y[4] = 1.72826, x_hat[4] = 1.55548 1.60875 -14.9732 \n", + "t = 0.2, y[5] = 1.74216, x_hat[5] = 1.66278 1.38374 -14.9637 \n", + "t = 0.233333, y[6] = 2.11672, x_hat[6] = 1.85606 1.53382 -14.9229 \n", + "t = 0.266667, y[7] = 2.14529, x_hat[7] = 1.98512 1.34018 -14.8908 \n", + "t = 0.3, y[8] = 2.1603, x_hat[8] = 2.06901 0.981678 -14.8682 \n", + "t = 0.333333, y[9] = 2.21269, x_hat[9] = 2.13267 0.58593 -14.8441 \n", + "t = 0.366667, y[10] = 2.57709, x_hat[10] = 2.26319 0.425535 -14.7321 \n", + "t = 0.4, y[11] = 2.66822, x_hat[11] = 2.37389 0.210228 -14.6096 \n", + "t = 0.433333, y[12] = 2.51642, x_hat[12] = 2.41279 -0.189016 -14.56 \n", + "t = 0.466667, y[13] = 2.76034, x_hat[13] = 2.4865 -0.459522 -14.4115 \n", + "t = 0.5, y[14] = 2.88132, x_hat[14] = 2.56093 -0.701772 -14.2172 \n", + "t = 0.533333, y[15] = 2.88374, x_hat[15] = 2.61135 -0.980106 -14.0346 \n", + "t = 0.566667, y[16] = 2.94485, x_hat[16] = 2.65526 -1.24377 -13.8225 \n", + "t = 0.6, y[17] = 2.82867, x_hat[17] = 2.65812 -1.58494 -13.6876 \n", + "t = 0.633333, y[18] = 3.00066, x_hat[18] = 2.68613 -1.81973 -13.4217 \n", + "t = 0.666667, y[19] = 3.12921, x_hat[19] = 2.72799 -1.9816 -13.0631 \n", + "t = 0.7, y[20] = 2.85836, x_hat[20] = 2.70186 -2.30405 -12.9167 \n", + "t = 0.733333, y[21] = 2.83808, x_hat[21] = 2.66841 -2.61016 -12.7523 \n", + "t = 0.766667, y[22] = 2.68975, x_hat[22] = 2.60351 -2.971 -12.6666 \n", + "t = 0.8, y[23] = 2.66533, x_hat[23] = 2.53742 -3.29663 -12.5373 \n", + "t = 0.833333, y[24] = 2.81613, x_hat[24] = 2.50742 -3.47892 -12.2229 \n", + "t = 0.866667, y[25] = 2.81004, x_hat[25] = 2.47782 -3.63091 -11.8847 \n", + "t = 0.9, y[26] = 2.88322, x_hat[26] = 2.46572 -3.70491 -11.4628 \n", + "t = 0.933333, y[27] = 2.69789, x_hat[27] = 2.41597 -3.86954 -11.1819 \n", + "t = 0.966667, y[28] = 2.43422, x_hat[28] = 2.31755 -4.15264 -11.068 \n", + "t = 1, y[29] = 2.23465, 
x_hat[29] = 2.19065 -4.48803 -11.0261 \n", + "t = 1.03333, y[30] = 2.30279, x_hat[30] = 2.09529 -4.69914 -10.8344 \n", + "t = 1.06667, y[31] = 2.0207, x_hat[31] = 1.95562 -5.01191 -10.7763 \n", + "t = 1.1, y[32] = 1.94394, x_hat[32] = 1.82057 -5.28093 -10.6701 \n", + "t = 1.13333, y[33] = 1.82498, x_hat[33] = 1.68155 -5.53372 -10.5515 \n", + "t = 1.16667, y[34] = 1.52526, x_hat[34] = 1.50284 -5.86969 -10.5338 \n", + "t = 1.2, y[35] = 1.86968, x_hat[35] = 1.42124 -5.91291 -10.1937 \n", + "t = 1.23333, y[36] = 1.18073, x_hat[36] = 1.2154 -6.27594 -10.2188 \n", + "t = 1.26667, y[37] = 1.1073, x_hat[37] = 1.02642 -6.56369 -10.1629 \n", + "t = 1.3, y[38] = 0.916168, x_hat[38] = 0.829169 -6.84703 -10.1055 \n", + "t = 1.33333, y[39] = 0.678548, x_hat[39] = 0.616214 -7.14521 -10.0663 \n", + "t = 1.36667, y[40] = 0.562382, x_hat[40] = 0.41403 -7.39117 -9.97735 \n", + "t = 1.4, y[41] = 0.355468, x_hat[41] = 0.204015 -7.63475 -9.89085 \n", + "t = 1.43333, y[42] = -0.155607, x_hat[42] = -0.0706529 -8.01301 -9.93708 \n", + "t = 1.46667, y[43] = -0.287199, x_hat[43] = -0.328135 -8.32148 -9.91585 \n", + "t = 1.5, y[44] = -0.602973, x_hat[44] = -0.605038 -8.65089 -9.91483 \n", + "t = 1.53333, y[45] = -0.292649, x_hat[45] = -0.781097 -8.72417 -9.68453 \n", + "t = 1.56667, y[46] = -0.602973, x_hat[46] = -0.985005 -8.85119 -9.51267 \n", + "t = 1.6, y[47] = -0.924198, x_hat[47] = -1.21467 -9.02335 -9.38793 \n", + "t = 1.63333, y[48] = -1.25632, x_hat[48] = -1.46826 -9.2333 -9.30101 \n", + "t = 1.66667, y[49] = -1.59935, x_hat[49] = -1.74413 -9.4748 -9.24427 \n", + "t = 1.7, y[50] = -1.95327, x_hat[50] = -2.04085 -9.74255 -9.21144 \n", + "t = 1.73333, y[51] = -2.3181, x_hat[51] = -2.35716 -10.032 -9.19743 \n", + "t = 1.76667, y[52] = -2.69382, x_hat[52] = -2.69196 -10.3394 -9.19807 \n", + "t = 1.8, y[53] = -3.08044, x_hat[53] = -3.04427 -10.6615 -9.20997 \n", + "t = 1.83333, y[54] = -3.47797, x_hat[54] = -3.41323 -10.9956 -9.23039 \n", + "t = 1.86667, y[55] = -3.88639, x_hat[55] = -3.7981 -11.3393 -9.25711 \n", + "t = 1.9, y[56] = -4.30572, x_hat[56] = -4.19822 -11.6907 -9.28834 \n", + "t = 1.93333, y[57] = -4.73594, x_hat[57] = -4.61301 -12.0482 -9.32265 \n", + "t = 1.96667, y[58] = -5.17707, x_hat[58] = -5.04196 -12.4105 -9.3589 \n", + "t = 2, y[59] = -5.62909, x_hat[59] = -5.48464 -12.7763 -9.39618 \n", + "t = 2.03333, y[60] = -6.09202, x_hat[60] = -5.94065 -13.1448 -9.43379 \n", + "t = 2.06667, y[61] = -6.56584, x_hat[61] = -6.40966 -13.5151 -9.47117 \n", + "t = 2.1, y[62] = -7.05057, x_hat[62] = -6.89136 -13.8865 -9.5079 \n", + "t = 2.13333, y[63] = -7.54619, x_hat[63] = -7.38549 -14.2587 -9.54365 \n", + "t = 2.16667, y[64] = -8.05271, x_hat[64] = -7.89184 -14.6309 -9.57819 \n", + "t = 2.2, y[65] = -8.57014, x_hat[65] = -8.41019 -15.003 -9.61135 \n", + "t = 2.23333, y[66] = -9.09846, x_hat[66] = -8.94038 -15.3747 -9.64301 \n", + "t = 2.26667, y[67] = -9.63769, x_hat[67] = -9.48225 -15.7456 -9.67311 \n", + "t = 2.3, y[68] = -10.1878, x_hat[68] = -10.0357 -16.1156 -9.70161 \n", + "t = 2.33333, y[69] = -10.7488, x_hat[69] = -10.6005 -16.4845 -9.7285 \n", + "t = 2.36667, y[70] = -11.3208, x_hat[70] = -11.1767 -16.8523 -9.75379 \n", + "t = 2.4, y[71] = -11.9036, x_hat[71] = -11.7642 -17.2188 -9.77751 \n", + "t = 2.43333, y[72] = -12.4973, x_hat[72] = -12.3628 -17.5841 -9.7997 \n", + "t = 2.46667, y[73] = -13.1019, x_hat[73] = -12.9725 -17.9479 -9.82041 \n", + "t = 2.5, y[74] = -13.7175, x_hat[74] = -13.5932 -18.3104 -9.83969 \n", + "t = 2.53333, y[75] = -14.3439, x_hat[75] = -14.225 -18.6716 -9.8576 \n", + "t 
= 2.56667, y[76] = -14.9812, x_hat[76] = -14.8677 -19.0313 -9.87422 \n", + "t = 2.6, y[77] = -15.6294, x_hat[77] = -15.5214 -19.3897 -9.88959 \n", + "t = 2.63333, y[78] = -16.2886, x_hat[78] = -16.1859 -19.7467 -9.90379 \n", + "t = 2.66667, y[79] = -16.9586, x_hat[79] = -16.8613 -20.1025 -9.91688 \n", + "t = 2.7, y[80] = -17.6395, x_hat[80] = -17.5475 -20.4569 -9.92892 \n", + "t = 2.73333, y[81] = -18.3313, x_hat[81] = -18.2446 -20.8101 -9.93998 \n", + "t = 2.76667, y[82] = -19.0341, x_hat[82] = -18.9524 -21.162 -9.9501 \n", + "t = 2.8, y[83] = -19.7477, x_hat[83] = -19.6711 -21.5128 -9.95936 \n", + "t = 2.83333, y[84] = -20.4722, x_hat[84] = -20.4006 -21.8624 -9.96781 \n", + "t = 2.86667, y[85] = -21.2076, x_hat[85] = -21.1408 -22.2109 -9.97549 \n", + "t = 2.9, y[86] = -21.954, x_hat[86] = -21.8918 -22.5584 -9.98246 \n", + "t = 2.93333, y[87] = -22.7112, x_hat[87] = -22.6535 -22.9048 -9.98876 \n", + "t = 2.96667, y[88] = -23.4793, x_hat[88] = -23.4261 -23.2503 -9.99445 \n", + "t = 3, y[89] = -24.2583, x_hat[89] = -24.2094 -23.5948 -9.99955 \n", + "t = 3.03333, y[90] = -25.0483, x_hat[90] = -25.0034 -23.9384 -10.0041 \n", + "t = 3.06667, y[91] = -25.8491, x_hat[91] = -25.8082 -24.2812 -10.0082 \n", + "t = 3.1, y[92] = -26.6608, x_hat[92] = -26.6238 -24.6231 -10.0118 \n", + "t = 3.13333, y[93] = -27.4834, x_hat[93] = -27.4501 -24.9642 -10.015 \n", + "t = 3.16667, y[94] = -28.3169, x_hat[94] = -28.2872 -25.3046 -10.0177 \n", + "t = 3.2, y[95] = -29.1614, x_hat[95] = -29.135 -25.6443 -10.0201 \n", + "t = 3.23333, y[96] = -30.0167, x_hat[96] = -29.9937 -25.9833 -10.0222 \n", + "t = 3.26667, y[97] = -30.8829, x_hat[97] = -30.863 -26.3216 -10.0239 \n", + "t = 3.3, y[98] = -31.76, x_hat[98] = -31.7432 -26.6593 -10.0254 \n", + "t = 3.33333, y[99] = -32.6481, x_hat[99] = -32.6341 -26.9964 -10.0265 \n", + "t = 3.36667, y[100] = -33.547, x_hat[100] = -33.5358 -27.333 -10.0274 \n", + "t = 3.4, y[101] = -34.4568, x_hat[101] = -34.4483 -27.669 -10.0281 \n", + "t = 3.43333, y[102] = -35.3775, x_hat[102] = -35.3716 -28.0045 -10.0286 \n", + "t = 3.46667, y[103] = -36.3092, x_hat[103] = -36.3056 -28.3395 -10.0289 \n", + "t = 3.5, y[104] = -37.2517, x_hat[104] = -37.2505 -28.674 -10.029 \n", + "t = 3.53333, y[105] = -38.2051, x_hat[105] = -38.2061 -29.0081 -10.0289 \n", + "t = 3.56667, y[106] = -39.1694, x_hat[106] = -39.1726 -29.3418 -10.0287 \n", + "t = 3.6, y[107] = -40.1447, x_hat[107] = -40.1498 -29.6751 -10.0283 \n", + "t = 3.63333, y[108] = -41.1308, x_hat[108] = -41.1378 -30.008 -10.0278 \n", + "t = 3.66667, y[109] = -42.1278, x_hat[109] = -42.1367 -30.3405 -10.0272 \n", + "t = 3.7, y[110] = -43.1357, x_hat[110] = -43.1463 -30.6727 -10.0265 \n", + "t = 3.73333, y[111] = -44.1546, x_hat[111] = -44.1668 -31.0046 -10.0257 \n", + "t = 3.76667, y[112] = -45.1843, x_hat[112] = -45.1981 -31.3362 -10.0247 \n", + "t = 3.8, y[113] = -46.2249, x_hat[113] = -46.2402 -31.6674 -10.0238 \n", + "t = 3.83333, y[114] = -47.2764, x_hat[114] = -47.2932 -31.9984 -10.0227 \n", + "t = 3.86667, y[115] = -48.3389, x_hat[115] = -48.3569 -32.3292 -10.0216 \n", + "t = 3.9, y[116] = -49.4122, x_hat[116] = -49.4315 -32.6597 -10.0204 \n", + "t = 3.93333, y[117] = -50.4964, x_hat[117] = -50.517 -32.9899 -10.0191 \n", + "t = 3.96667, y[118] = -51.5915, x_hat[118] = -51.6132 -33.3199 -10.0179 \n", + "t = 4, y[119] = -52.6976, x_hat[119] = -52.7204 -33.6497 -10.0165 \n", + "t = 4.03333, y[120] = -53.8145, x_hat[120] = -53.8383 -33.9793 -10.0152 \n", + "t = 4.06667, y[121] = -54.9423, x_hat[121] = -54.9671 -34.3087 -10.0138 \n", + "t = 
4.1, y[122] = -56.081, x_hat[122] = -56.1067 -34.638 -10.0123 \n", + "t = 4.13333, y[123] = -57.2307, x_hat[123] = -57.2572 -34.967 -10.0109 \n", + "t = 4.16667, y[124] = -58.3912, x_hat[124] = -58.4185 -35.2959 -10.0094 \n", + "t = 4.2, y[125] = -59.5626, x_hat[125] = -59.5907 -35.6246 -10.0079 \n", + "t = 4.23333, y[126] = -60.7449, x_hat[126] = -60.7738 -35.9532 -10.0064 \n", + "t = 4.26667, y[127] = -61.9382, x_hat[127] = -61.9677 -36.2817 -10.0048 \n", + "t = 4.3, y[128] = -63.1423, x_hat[128] = -63.1724 -36.61 -10.0033 \n", + "t = 4.33333, y[129] = -64.3573, x_hat[129] = -64.388 -36.9381 -10.0018 \n", + "t = 4.36667, y[130] = -65.5832, x_hat[130] = -65.6145 -37.2662 -10.0002 \n", + "t = 4.4, y[131] = -66.8201, x_hat[131] = -66.8518 -37.5942 -9.99866 \n", + "t = 4.43333, y[132] = -68.0678, x_hat[132] = -68.1 -37.922 -9.9971 \n", + "t = 4.46667, y[133] = -69.3264, x_hat[133] = -69.3591 -38.2498 -9.99554 \n", + "t = 4.5, y[134] = -70.5959, x_hat[134] = -70.629 -38.5774 -9.99399 \n", + "t = 4.53333, y[135] = -71.8764, x_hat[135] = -71.9099 -38.905 -9.99244 \n", + "t = 4.56667, y[136] = -73.1677, x_hat[136] = -73.2015 -39.2324 -9.99089 \n", + "t = 4.6, y[137] = -74.4699, x_hat[137] = -74.5041 -39.5598 -9.98934 \n", + "t = 4.63333, y[138] = -75.783, x_hat[138] = -75.8175 -39.8871 -9.98781 \n", + "t = 4.66667, y[139] = -77.1071, x_hat[139] = -77.1418 -40.2144 -9.98628 \n", + "t = 4.7, y[140] = -78.442, x_hat[140] = -78.477 -40.5416 -9.98476 \n", + "t = 4.73333, y[141] = -79.7878, x_hat[141] = -79.823 -40.8687 -9.98324 \n", + "t = 4.76667, y[142] = -81.1445, x_hat[142] = -81.18 -41.1957 -9.98174 \n", + "t = 4.8, y[143] = -82.5122, x_hat[143] = -82.5478 -41.5227 -9.98025 \n", + "t = 4.83333, y[144] = -83.8907, x_hat[144] = -83.9265 -41.8497 -9.97877 \n", "\n", - "Exec Success, Final kf states:-13.3236 -16.4505 -8.96991 \n" + "Exec Success, Final kf states:-83.9265 -41.8497 -9.97877 \n" ] } ], "source": [ - "std::vector> g_res = pyrun_sim(1, true);" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "5e8f3f24-b003-4416-bb16-eabc480e437e", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector py_g_pred;\n", - "std::vector py_g_est;" + "std::vector> g_res = run_kf(true);" ] }, { - "cell_type": "code", - "execution_count": 22, - "id": "107f53c0-0f2d-47b9-a218-67c29e16b87b", + "cell_type": "markdown", + "id": "835df3bf-9017-4a24-ac29-8e9d821bb881", "metadata": {}, - "outputs": [], "source": [ - "int k = g_res.size();" + "### Defining a C++ function to resolve the 2D vector result *{Kalman Filter Estimates, Kalman Gain}*\n", + "\n", + "- The function returns the KF estimates if the provided axis is 0 and Kalman Gains if axis = 1" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "63f5ba73-393b-4a0a-9c83-6829b56fc526", + "execution_count": 26, + "id": "2a47ecf6-f4e9-44b6-b72c-a935ed02fdc1", "metadata": {}, "outputs": [], "source": [ @@ -659,129 +1099,154 @@ " }\n", " return ret;\n", "}\n", - " " + " " ] }, { - "cell_type": "code", - "execution_count": 24, - "id": "a6bc89d0-026f-4f52-b8c9-9a8fca6948f2", + "cell_type": "markdown", + "id": "1566d4bf-3952-4616-8b45-b0231c5be9f5", "metadata": {}, - "outputs": [], "source": [ - "py_g_pred = ret_1d_vector(g_res, 0);\n", - "py_g_est = ret_1d_vector(g_res, 1);" + "#### We obtain the final results as two std::vectors that can be accessed in Python:" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "1b22451b-0bd7-48d6-8590-ff59b56a8031", + "execution_count": 27, + "id": "4e3c7506-8fb8-42f0-b5af-e8950d088a3d", 
"metadata": {}, "outputs": [], "source": [ - "void printMatrix(const std::vector& vec) {\n", - " for (size_t i = 0; i < vec.size(); i++) {\n", - " std::cout << vec[i] << \" \";\n", - " }\n", - " std::cout << std::endl;\n", - " }\n" + "std::vector py_g_pred = ret_1d_vector(g_res, 0);\n", + "std::vector py_kf_gains = ret_1d_vector(g_res, 1);" ] }, { - "cell_type": "code", - "execution_count": 26, - "id": "47fd1710-59a8-4d80-96de-04ef86e233fd", + "cell_type": "markdown", + "id": "2f7b1159-8a34-41c2-99d0-56a7e5714cab", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.00110609 0.00356059 0.00712492 0.013793 0.0154301 0.031192 0.0374157 0.0316618 0.0173069 0.039685 0.0527055 0.00552738 -0.00849014 -0.0224249 -0.0729322 -0.139272 -0.288577 -0.396929 -0.484796 -0.747944 -1.04221 -1.43333 -1.83638 -2.13797 -2.44388 -2.69673 -3.07251 -3.6016 -4.20755 -4.67894 -5.27548 -5.81837 -6.3351 -6.94502 -7.17158 -7.78709 -8.28638 -8.75833 -9.22649 -9.60338 -9.95378 -10.4545 -10.8401 -11.2291 -11.5854 -11.9065 -12.1908 -12.4381 -12.6488 -12.8241 -12.9653 -13.0743 -13.1531 -13.2039 -13.2289 -13.2301 -13.2098 -13.17 -13.1126 -13.0395 -12.9525 -12.853 -12.7428 -12.623 -12.4951 -12.3601 -12.2191 -12.0731 -11.9229 -11.7694 -11.6132 -11.4551 -11.2954 -11.1349 -10.9739 -10.8128 -10.652 -10.4918 -10.3325 -10.1743 -10.0175 -9.86215 -9.70853 -9.55674 -9.4069 -9.25909 -9.1134 -8.96991 \n" - ] - } - ], "source": [ - "printMatrix(py_g_pred);" + "### Lets plot the trend of Kalman Gain across the time steps to verify its working\n", + "\n", + "If the algorithm converged properly, we should see an exponential decrease in the gain with time" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "d0152901-3f6d-49e5-9138-126d1b072912", + "execution_count": 28, + "id": "129a199c-70b0-454b-a70d-68373c4786f2", "metadata": {}, "outputs": [], "source": [ - "// %%python\n", + "%%python\n", "\n", - "// expand_factor = [1, 2, 3, 4, 5]\n", - "// benchmarks = []\n", + "import matplotlib.pyplot as plt\n", "\n", - "// for i in expand_factor:\n", - "// time_kf = cppyy.gbl.pyrun_sim(int(i), 0)\n", - "// benchmarks.append(time_kf)\n", - "// " + "kalman_gains = list(cppyy.gbl.py_kf_gains)\n", + "x = range(len(kalman_gains))" ] }, { "cell_type": "code", - "execution_count": 28, - "id": "2c0b5c86-7875-45d9-8113-00febd1be3d5", + "execution_count": 29, + "id": "f7bcd055-7470-42c9-aad8-238ff3d3566e", "metadata": {}, "outputs": [], "source": [ "%%python\n", "\n", - "import matplotlib.pyplot as plt" + "plt.plot(x, kalman_gains, color='blue', marker='v')\n", + " \n", + "plt.xlabel('Time Steps')\n", + "plt.ylabel('Kalman Gain')\n", + "plt.title('Kalman Gain Plot')\n", + "plt.savefig(\"kalman_gain_plot.jpg\")\n", + " \n", + "plt.yscale('symlog')\n", + " \n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "842b0e92-4722-4551-81a6-e3a00229da0f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "8ae33816-740c-455b-82d2-4cacf29206c1", + "metadata": {}, + "source": [ + "### Now we can plot the values of Gravitational Acceleration *(g)* obtained as Kalman Estimates :" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "e0d86e62-207f-4e14-80c1-2e04c8984fd6", + "execution_count": 30, + "id": "8801e6da-b52b-46b1-b895-e138bbaa39c0", "metadata": {}, "outputs": [], "source": [ "%%python\n", "\n", + "import matplotlib.pyplot as plt\n", + "\n", "true_val = 9.81\n", "g_pred = list(cppyy.gbl.py_g_pred)\n", "g_pred = list(-x for x in g_pred)\n", - "x 
 {
  "cell_type": "code",
  "execution_count": 31,
  "id": "09889e69-195e-44c0-87ee-a82616492101",
  "metadata": {},
  "outputs": [],
  "source": [
   "%%python\n",
   "\n",
-   "\n",
-   "# Plot the constant green line\n",
+   "plt.figure()\n",
   "plt.axhline(y=true_val, color='green', linestyle='-')\n",
-   "\n",
-   "# Plot g_pred in orange\n",
   "plt.plot(x, g_pred, color='orange', marker='o', label='KF Estimates')\n",
   "plt.annotate(f'{true_val}', xy=(-0.5, true_val), color='green',\n",
   "             verticalalignment='center', horizontalalignment = 'left')\n",
-   "# Add labels and title\n",
+   "    \n",
   "plt.xlabel('Index')\n",
   "plt.ylabel('Acceleration (m/s²)')\n",
   "plt.title('True Value vs. g_pred')\n",
   "plt.legend()\n",
-   "plt.savefig(\"1D_kf_plot.jpg\")\n",
+   "plt.savefig(\"1D_KF_plot.jpg\")\n",
   "    \n",
   "plt.yscale('symlog')\n",
-   "# Show the plot\n",
+   "    \n",
   "plt.show()\n"
  ]
 },
 {
  "cell_type": "markdown",
-  "id": "7f478ffc-eafa-4c98-9fcc-cd3d18833987",
+  "id": "96887ee7-3a13-4440-8f27-8e93b5bc96db",
  "metadata": {},
  "source": [
-   ""
+   "### Result: The Kalman Filter begins to converge on the value of 9.81"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "6596ee2f-a810-4e5c-a4c7-5634d71ec608",
+  "metadata": {},
+  "source": [
+   ""
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
-  "id": "145a5f37-15ae-4336-b126-fba8b6760de2",
+  "id": "9788e8c7-cd17-412c-9158-cbfa601e935d",
  "metadata": {},
  "outputs": [],
  "source": []
diff --git a/notebooks/kalman_CUDA_demo/run_kf_CUDA.ipynb b/notebooks/kalman_CUDA_demo/run_kf_CUDA.ipynb
deleted file mode 100644
index 39f3a2e9..00000000
--- a/notebooks/kalman_CUDA_demo/run_kf_CUDA.ipynb
+++ /dev/null
@@ -1,928 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "afdccef0-6aaa-43d6-ac29-7ea950356e19",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#include \"kalman.hpp\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "6feb6a99-9580-4bf7-aa48-0a7d175494a4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#include \n",
-    "#include \n",
-    "#include \n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "ebcc8af4-32b2-418b-bfbe-9ffb4ff9b0ff",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "__global__ void matAddKernel(double* a, double* b, double* c, int rows, int cols) {\n",
-    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
-    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
-    "\n",
-    "    if (col < cols && row < rows) {\n",
-    "        int idx = row * cols + col;\n",
-    "        c[idx] = a[idx] + b[idx];\n",
-    "    }\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "3f7f95d9-2f3d-4600-8896-0ed4dcca1ed6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "__global__ void matSubKernel(double* a, double* b, double* c, int rows, int cols) {\n",
-    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
-    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
-    "\n",
-    "    if (col < cols && row < rows) {\n",
-    "        int idx = row * cols + col;\n",
-    "        c[idx] = a[idx] - b[idx];\n",
-    "    }\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "00183ab5-0380-4773-a34c-053fef7782f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "__global__ void matTransposeKernel(double* a, double* c, int rows, int cols) {\n",
-    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
-    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
-    "\n",
-    "    if (col < cols && row < rows) {\n",
-    "        int idx_in = row * cols + 
col;\n", - " int idx_out = col * rows + row;\n", - " c[idx_out] = a[idx_in];\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d0eff8fe-34a5-4370-b6ff-118cd4d7cb22", - "metadata": {}, - "outputs": [], - "source": [ - "__global__ void matvecmulKernel(double* d_mat, double* d_vec, double* d_result, int rows, int cols) {\n", - " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", - "\n", - " if (tid < rows) {\n", - " double sum = 0.0;\n", - " for (int j = 0; j < cols; j++) {\n", - " sum += d_mat[tid * cols + j] * d_vec[j];\n", - " }\n", - " d_result[tid] = sum;\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "21718df0-5f56-4122-bc1a-a368b306fdaf", - "metadata": {}, - "outputs": [], - "source": [ - "__global__ void matmulKernel(double* d_a, double* d_b, double* d_result, int rowsA, int colsA, int colsB) {\n", - " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", - " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", - "\n", - " if (row < rowsA && col < colsB) {\n", - " double value = 0.0;\n", - " for (int k = 0; k < colsA; k++) {\n", - " value += d_a[row * colsA + k] * d_b[k * colsB + col];\n", - " }\n", - " d_result[row * colsB + col] = value;\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "18f843c9-9aee-4055-aae7-473c8f121037", - "metadata": {}, - "outputs": [], - "source": [ - "__global__ void vecsubKernel(const double* a, const double* b, double* result, int len) {\n", - " int idx = blockIdx.x * blockDim.x + threadIdx.x;\n", - " if (idx < len) {\n", - " result[idx] = a[idx] - b[idx];\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "6ff06da4-1430-4720-affb-b9c8b1dc7a26", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector> mataddCUDA(const std::vector>& a, const std::vector>& b) {\n", - " int rows = a.size();\n", - " int cols = a[0].size();\n", - " \n", - " double* h_a = new double[rows*cols];\n", - " double* h_b = new double[rows*cols];\n", - " double* h_c = new double[rows*cols];\n", - " \n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " h_a[i*cols + j] = a[i][j];\n", - " h_b[i*cols + j] = b[i][j];\n", - " }\n", - " }\n", - " \n", - " double* d_a, * d_b, * d_c;\n", - " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", - " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", - " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", - "\n", - " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", - " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " dim3 dimBlock(16, 16);\n", - " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", - " matAddKernel<<>>(d_a, d_b, d_c, rows, cols);\n", - "\n", - " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_a);\n", - " cudaFree(d_b);\n", - " cudaFree(d_c);\n", - "\n", - " std::vector> result(rows, std::vector(cols));\n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " result[i][j] = h_c[i*cols + j];\n", - " }\n", - " }\n", - "\n", - " delete[] h_a;\n", - " delete[] h_b;\n", - " delete[] h_c;\n", - "\n", - " return result;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b70f7f32-b7c2-4a1c-96a0-aea42b15f2aa", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector> matsubCUDA(const std::vector>& a, const 
std::vector>& b) {\n", - " int rows = a.size();\n", - " int cols = a[0].size();\n", - " \n", - " double* h_a = new double[rows*cols];\n", - " double* h_b = new double[rows*cols];\n", - " double* h_c = new double[rows*cols];\n", - " \n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " h_a[i*cols + j] = a[i][j];\n", - " h_b[i*cols + j] = b[i][j];\n", - " }\n", - " }\n", - "\n", - " double* d_a, * d_b, * d_c;\n", - " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", - " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", - " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", - "\n", - " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", - " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " dim3 dimBlock(16, 16); \n", - " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", - " matSubKernel<<>>(d_a, d_b, d_c, rows, cols);\n", - "\n", - " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_a);\n", - " cudaFree(d_b);\n", - " cudaFree(d_c);\n", - "\n", - " std::vector> result(rows, std::vector(cols));\n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " result[i][j] = h_c[i*cols + j];\n", - " }\n", - " }\n", - "\n", - " delete[] h_a;\n", - " delete[] h_b;\n", - " delete[] h_c;\n", - "\n", - " return result;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "86b4a452-f04c-45a4-b69b-d87bcb2e5954", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector> mattransposeCUDA(const std::vector>& a) {\n", - " int rows = a.size();\n", - " int cols = a[0].size();\n", - " \n", - " double* h_a = new double[rows*cols];\n", - " double* h_c = new double[cols*rows];\n", - " \n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " h_a[i*cols + j] = a[i][j];\n", - " }\n", - " }\n", - "\n", - " double* d_a, * d_c;\n", - " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", - " cudaMalloc((void**)&d_c, cols*rows*sizeof(double));\n", - "\n", - " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " dim3 dimBlock(16, 16);\n", - " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", - " matTransposeKernel<<>>(d_a, d_c, rows, cols);\n", - "\n", - " cudaMemcpy(h_c, d_c, cols*rows*sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_a);\n", - " cudaFree(d_c);\n", - "\n", - " std::vector> result(cols, std::vector(rows));\n", - " for (int i = 0; i < cols; i++) {\n", - " for (int j = 0; j < rows; j++) {\n", - " result[i][j] = h_c[i*rows + j];\n", - " }\n", - " }\n", - "\n", - " delete[] h_a;\n", - " delete[] h_c;\n", - "\n", - " return result;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "bf413652-f71a-4b91-9e6e-5520812dc3cb", - "metadata": {}, - "outputs": [], - "source": [ - "// helper function: matrix inversion\n", - "std::vector> matinverse(const std::vector>& a) {\n", - " size_t n = a.size();\n", - " \n", - " if (n != a[0].size()) {\n", - " std::cout<<\" Shape of a : \"<> result(2, std::vector(2));\n", - " result[0][0] = a[1][1] / determinant;\n", - " result[0][1] = -a[0][1] / determinant;\n", - " result[1][0] = -a[1][0] / determinant;\n", - " result[1][1] = a[0][0] / determinant;\n", - "\n", - " return result;\n", - " }\n", - "\n", - " if (n == 3) {\n", - " double determinant = 
a[0][0]*(a[1][1]*a[2][2]-a[2][1]*a[1][2]) \n", - " - a[0][1]*(a[1][0]*a[2][2]-a[1][2]*a[2][0]) \n", - " + a[0][2]*(a[1][0]*a[2][1]-a[1][1]*a[2][0]);\n", - " \n", - " if (determinant == 0) {\n", - " throw std::runtime_error(\"singular matrix\");\n", - " }\n", - "\n", - " std::vector> result(3, std::vector(3));\n", - "\n", - " result[0][0] = (a[1][1] * a[2][2] - a[2][1] * a[1][2]) / determinant;\n", - " result[0][1] = (a[0][2] * a[2][1] - a[0][1] * a[2][2]) / determinant;\n", - " result[0][2] = (a[0][1] * a[1][2] - a[0][2] * a[1][1]) / determinant;\n", - " result[1][0] = (a[1][2] * a[2][0] - a[1][0] * a[2][2]) / determinant;\n", - " result[1][1] = (a[0][0] * a[2][2] - a[0][2] * a[2][0]) / determinant;\n", - " result[1][2] = (a[1][0] * a[0][2] - a[0][0] * a[1][2]) / determinant;\n", - " result[2][0] = (a[1][0] * a[2][1] - a[2][0] * a[1][1]) / determinant;\n", - " result[2][1] = (a[2][0] * a[0][1] - a[0][0] * a[2][1]) / determinant;\n", - " result[2][2] = (a[0][0] * a[1][1] - a[1][0] * a[0][1]) / determinant;\n", - "\n", - " return result;\n", - " }\n", - "\n", - " throw std::runtime_error(\"Only 2x2 and 3x3 matrices supported for inversion\");\n", - "}\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "0d925dce-377c-4d23-96d6-9c4fead297cb", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector matvecmulCUDA(const std::vector>& a, const std::vector& b) {\n", - " int rows = a.size();\n", - " int cols = a[0].size();\n", - "\n", - " if (cols != b.size()) {\n", - " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", - " }\n", - "\n", - " std::vector h_mat(rows * cols);\n", - " for (int i = 0; i < rows; i++) {\n", - " for (int j = 0; j < cols; j++) {\n", - " h_mat[i * cols + j] = a[i][j];\n", - " }\n", - " }\n", - "\n", - " double *d_mat, *d_vec, *d_result;\n", - " cudaMalloc((void**)&d_mat, rows * cols * sizeof(double));\n", - " cudaMalloc((void**)&d_vec, cols * sizeof(double));\n", - " cudaMalloc((void**)&d_result, rows * sizeof(double));\n", - "\n", - " cudaMemcpy(d_mat, h_mat.data(), rows * cols * sizeof(double), cudaMemcpyHostToDevice);\n", - " cudaMemcpy(d_vec, b.data(), cols * sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " int blockSize = 256;\n", - " int gridSize = (rows + blockSize - 1) / blockSize;\n", - "\n", - " matvecmulKernel<<>>(d_mat, d_vec, d_result, rows, cols);\n", - "\n", - " std::vector result(rows);\n", - " cudaMemcpy(result.data(), d_result, rows * sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_mat);\n", - " cudaFree(d_vec);\n", - " cudaFree(d_result);\n", - "\n", - " return result;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "69714460-9a3e-4966-9de7-5e4111fc40af", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector> matmulCUDA(const std::vector>& a, const std::vector>& b) {\n", - " int rowsA = a.size();\n", - " int colsA = a[0].size();\n", - " int rowsB = b.size();\n", - " int colsB = b[0].size();\n", - "\n", - " if (colsA != rowsB) {\n", - " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", - " }\n", - "\n", - " std::vector h_a(rowsA * colsA);\n", - " std::vector h_b(rowsB * colsB);\n", - "\n", - " for (int i = 0; i < rowsA; i++) {\n", - " for (int j = 0; j < colsA; j++) {\n", - " h_a[i * colsA + j] = a[i][j];\n", - " }\n", - " }\n", - "\n", - " for (int i = 0; i < rowsB; i++) {\n", - " for (int j = 0; j < colsB; j++) {\n", - " h_b[i * colsB + j] = b[i][j];\n", - " }\n", - " }\n", - "\n", - " double 
*d_a, *d_b, *d_result;\n", - " \n", - " cudaMalloc((void**)&d_a, rowsA * colsA * sizeof(double));\n", - " cudaMalloc((void**)&d_b, rowsB * colsB * sizeof(double));\n", - " cudaMalloc((void**)&d_result, rowsA * colsB * sizeof(double));\n", - "\n", - " cudaMemcpy(d_a, h_a.data(), rowsA * colsA * sizeof(double), cudaMemcpyHostToDevice);\n", - " cudaMemcpy(d_b, h_b.data(), rowsB * colsB * sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " dim3 blockSize(16, 16);\n", - " dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);\n", - "\n", - " matmulKernel<<>>(d_a, d_b, d_result, rowsA, colsA, colsB);\n", - "\n", - " std::vector h_result(rowsA * colsB);\n", - " cudaMemcpy(h_result.data(), d_result, rowsA * colsB * sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_a);\n", - " cudaFree(d_b);\n", - " cudaFree(d_result);\n", - " \n", - " std::vector> result(rowsA, std::vector(colsB));\n", - " for (int i = 0; i < rowsA; i++) {\n", - " for (int j = 0; j < colsB; j++) {\n", - " result[i][j] = h_result[i * colsB + j];\n", - " }\n", - " }\n", - "\n", - " return result;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7295e929-dcfc-43be-9b3b-82e9f9fbf9db", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector vecsubCUDA(const std::vector& a, const std::vector& b) {\n", - " int len = a.size();\n", - "\n", - " double* d_a;\n", - " double* d_b;\n", - " double* d_result;\n", - "\n", - " cudaMalloc((void**)&d_a, len * sizeof(double));\n", - " cudaMalloc((void**)&d_b, len * sizeof(double));\n", - " cudaMalloc((void**)&d_result, len * sizeof(double));\n", - "\n", - " cudaMemcpy(d_a, a.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", - " cudaMemcpy(d_b, b.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", - "\n", - " int blockSize = 256; \n", - " int gridSize = (len + blockSize - 1) / blockSize;\n", - "\n", - " vecsubKernel<<>>(d_a, d_b, d_result, len);\n", - "\n", - " std::vector result(len);\n", - " cudaMemcpy(result.data(), d_result, len * sizeof(double), cudaMemcpyDeviceToHost);\n", - "\n", - " cudaFree(d_a);\n", - " cudaFree(d_b);\n", - " cudaFree(d_result);\n", - "\n", - " return result;\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "84f29c1e-d4c2-4680-a579-14d63a2bde08", - "metadata": {}, - "outputs": [], - "source": [ - "void printMatrix(const std::vector>& matrix) {\n", - " for (size_t i = 0; i < matrix.size(); i++) {\n", - " for (size_t j = 0; j < matrix[i].size(); j++) {\n", - " std::cout << matrix[i][j] << \" \";\n", - " }\n", - " std::cout << std::endl;\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "6d718c47-3bc6-4c7a-893c-4fedc399926d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result matrix:\n", - "11 14 17 20 \n", - "23 30 37 44 \n", - "35 46 57 68 \n" - ] - } - ], - "source": [ - "// test for matrix matrix multiplication using CUDA\n", - "std::vector> matrixA = {\n", - " {1.0, 2.0},\n", - " {3.0, 4.0},\n", - " {5.0, 6.0}\n", - " };\n", - "\n", - " std::vector> matrixB = {\n", - " {1.0, 2.0, 3.0, 4.0},\n", - " {5.0, 6.0, 7.0, 8.0}\n", - " };\n", - "\n", - " std::vector> result = matmulCUDA(matrixA, matrixB);\n", - "\n", - " // Print the result\n", - " std::cout << \"Result matrix:\" << std::endl;\n", - " printMatrix(result);" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f4c9238f-674a-4cf8-8128-840cc7f61e0f", - 
"metadata": {}, - "outputs": [], - "source": [ - "//set up class and constructor\n", - "KalmanFilter::KalmanFilter(\n", - " double dt,\n", - " const std::vector>& A,\n", - " const std::vector>& C,\n", - " const std::vector>& Q,\n", - " const std::vector>& R,\n", - " const std::vector>& P)\n", - " : A(A), C(C), Q(Q), R(R), P0(P),\n", - " m(C.size()), n(A.size()), dt(dt), initialized(false),\n", - " I(n, std::vector(n)), x_hat(n), x_hat_new(n)\n", - "{\n", - " for (int i = 0; i < n; i++) {\n", - " I[i][i] = 1.0;\n", - " }\n", - "}\n", - "\n", - "KalmanFilter::KalmanFilter() {}" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "bd953ec6-ceed-489c-8e33-8eb7ae6bb32f", - "metadata": {}, - "outputs": [], - "source": [ - "// init the kf states + measurements \n", - "void KalmanFilter::init(double t0, const std::vector& x0) {\n", - " x_hat = x0;\n", - " P = P0;\n", - " this->t0 = t0;\n", - " t = t0;\n", - " initialized = true;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "2840f0f7-52de-4f47-a9dc-e257235df002", - "metadata": {}, - "outputs": [], - "source": [ - "void KalmanFilter::init() {\n", - " std::fill(x_hat.begin(), x_hat.end(), 0.0);\n", - " P = P0;\n", - " t0 = 0;\n", - " t = t0;\n", - " initialized = true;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "ea057f20-9097-4dd3-946e-ab7a383afe69", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "void KalmanFilter::update(const std::vector& y) {\n", - " if (!initialized)\n", - " throw std::runtime_error(\"Filter is not initialized!\");\n", - "\n", - " x_hat_new = matvecmulCUDA(A, x_hat);\n", - "\n", - " P = mataddCUDA(matmulCUDA(matmulCUDA(A, P), mattransposeCUDA(A)), Q);\n", - "\n", - " std::vector> inv = matinverse(mataddCUDA(matmulCUDA(matmulCUDA(C, P), mattransposeCUDA(C)), R));\n", - "\n", - " K = matmulCUDA(matmulCUDA(P, mattransposeCUDA(C)), inv);\n", - " std::vector temp = matvecmulCUDA(C, x_hat_new);\n", - " std::vector difference = vecsubCUDA(y, temp);\n", - "\n", - " for (size_t i = 0; i < x_hat_new.size(); i++) {\n", - " x_hat_new[i] += matvecmulCUDA(K, difference)[i];\n", - " }\n", - "\n", - " P = matmulCUDA(matsubCUDA(I, matmulCUDA(K, C)), P);\n", - "\n", - " x_hat = x_hat_new;\n", - " t += dt;\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "08d28163-a086-46a9-bf41-044f48530929", - "metadata": {}, - "outputs": [], - "source": [ - "void KalmanFilter::update(const std::vector& y, double dt, const std::vector>& A) {\n", - " this->A = A;\n", - " this->dt = dt;\n", - " update(y);\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "79ed17e5-e20a-4155-849b-41b2b9279b9f", - "metadata": {}, - "outputs": [], - "source": [ - "std::vector generateExpandedData(const std::vector& baseData, int repetitions) {\n", - " std::vector expandedData;\n", - " for (int r = 0; r < repetitions; r++) {\n", - " for (auto val : baseData) {\n", - " // Add slight random perturbation to the data.\n", - " double perturbedValue = val + ((double)rand() / RAND_MAX - 0.5);\n", - " expandedData.push_back(perturbedValue);\n", - " }\n", - " }\n", - " return expandedData;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8938a288-fc2c-4104-82c6-added4466c8c", - "metadata": {}, - "outputs": [], - "source": [ - "int run_kf(int expansion_factor, bool verbose) {\n", - " \n", - " int n = 3; // Number of states\n", - " int m = 1; // Number of measurements\n", - "\n", - " double dt = 
1.0 / 30; // Time step\n", - " \n", - " std::vector> A(n, std::vector(n));\n", - " std::vector> C(m, std::vector(n));\n", - " std::vector> Q(n, std::vector(n));\n", - " std::vector> R(m, std::vector(m));\n", - " std::vector> P(n, std::vector(n));\n", - " \n", - " A = {{1, dt, 0}, {0, 1, dt}, {0, 0, 1}};\n", - " C = {{1, 0, 0}};\n", - " Q = {{.05, .05, .0}, {.05, .05, .0}, {.0, .0, .0}};\n", - " R = {{5}};\n", - " P = {{.1, .1, .1}, {.1, 10000, 10}, {.1, 10, 100}};\n", - " \n", - " KalmanFilter kf(dt, A, C, Q, R, P);\n", - " \n", - " std::vector measurements = {\n", - " 1.04202710058, 1.10726790452, 1.2913511148, 1.48485250951, 1.72825901034,\n", - " 1.74216489744, 2.11672039768, 2.14529225112, 2.16029641405, 2.21269371128,\n", - " 2.57709350237, 2.6682215744, 2.51641839428, 2.76034056782, 2.88131780617,\n", - " 2.88373786518, 2.9448468727, 2.82866600131, 3.0006601946, 3.12920591669,\n", - " 2.858361783, 2.83808170354, 2.68975330958, 2.66533185589, 2.81613499531,\n", - " 2.81003612051, 2.88321849354, 2.69789264832, 2.4342229249, 2.23464791825,\n", - " 2.30278776224, 2.02069770395, 1.94393985809, 1.82498398739, 1.52526230354,\n", - " 1.86967808173, 1.18073207847, 1.10729605087, 0.916168349913, 0.678547664519,\n", - " 0.562381751596, 0.355468474885, -0.155607486619, -0.287198661013, -0.602973173813\n", - " };\n", - " \n", - " std::vector expandedMeasurements = generateExpandedData(measurements, expansion_factor);\n", - " \n", - " std::vector x0 = {expandedMeasurements[0], 0, -9.81};\n", - " kf.init(0, x0);\n", - "\n", - " // Feed measurements into filter, output estimated states\n", - " std::vector y(m);\n", - " if(verbose) {\n", - " std::cout << \"t = \" << 0 << \", \" << \"x_hat[0]: \";\n", - " for (auto& val : kf.state()) std::cout << val << \" \";\n", - " std::cout << std::endl;\n", - " }\n", - " \n", - " int i;\n", - " for (i = 0; i < measurements.size(); i++) {\n", - " y[0] = measurements[i];\n", - " kf.update(y);\n", - " if(verbose) {\n", - " std::cout << \"t = \" << (i + 1) * dt << \", y[\" << i << \"] = \" << y[0] << \", x_hat[\" << i << \"] = \";\n", - " for (auto& val : kf.state()) std::cout << val << \" \";\n", - " std::cout << std::endl;\n", - " }\n", - " }\n", - " std::cout << std::endl;\n", - " std::cout<<\"Exec Success, Final kf states:\";\n", - " for (auto& val : kf.state()) std::cout << val << \" \";\n", - " std::cout << std::endl;\n", - " return 0;\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "7ea068c2-fc1f-4ad4-a2c9-aecd66cbbf71", - "metadata": {}, - "outputs": [], - "source": [ - "#include \n", - "auto start = std::chrono::high_resolution_clock::now();\n", - "auto stop = std::chrono::high_resolution_clock::now();\n", - "auto duration = std::chrono::duration_cast(stop - start);" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "1e1f74bf-1118-421f-a39f-00ce72db9923", - "metadata": {}, - "outputs": [], - "source": [ - "int pyrun_sim(int exp_factor, bool verbose) {\n", - " start = std::chrono::high_resolution_clock::now();\n", - " run_kf(exp_factor, verbose);\n", - " stop = std::chrono::high_resolution_clock::now();\n", - " duration = std::chrono::duration_cast(stop - start);\n", - " return duration.count();\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7a910405-c4ff-41fa-aec5-69aa24598f9c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "t = 0, x_hat[0]: 0.737686 0 -9.81 \n", - "t = 0.0333333, y[0] = 1.04203, x_hat[0] = 0.948486 5.91215 
-9.80189 \n", - "t = 0.0666667, y[1] = 1.10727, x_hat[1] = 1.11742 5.16313 -9.80246 \n", - "t = 0.1, y[2] = 1.29135, x_hat[2] = 1.29067 4.8479 -9.80243 \n", - "t = 0.133333, y[3] = 1.48485, x_hat[3] = 1.46956 4.65232 -9.8015 \n", - "t = 0.166667, y[4] = 1.72826, x_hat[4] = 1.67214 4.61508 -9.79665 \n", - "t = 0.2, y[5] = 1.74216, x_hat[5] = 1.79217 4.11586 -9.80259 \n", - "t = 0.233333, y[6] = 2.11672, x_hat[6] = 1.99697 4.08721 -9.78381 \n", - "t = 0.266667, y[7] = 2.14529, x_hat[7] = 2.13716 3.7765 -9.78219 \n", - "t = 0.3, y[8] = 2.1603, x_hat[8] = 2.23217 3.34189 -9.80002 \n", - "t = 0.333333, y[9] = 2.21269, x_hat[9] = 2.30708 2.89745 -9.82839 \n", - "t = 0.366667, y[10] = 2.57709, x_hat[10] = 2.44897 2.70633 -9.78267 \n", - "t = 0.4, y[11] = 2.66822, x_hat[11] = 2.57104 2.47129 -9.74221 \n", - "t = 0.433333, y[12] = 2.51642, x_hat[12] = 2.62117 2.05785 -9.79233 \n", - "t = 0.466667, y[13] = 2.76034, x_hat[13] = 2.70573 1.77429 -9.76272 \n", - "t = 0.5, y[14] = 2.88132, x_hat[14] = 2.79035 1.51648 -9.70754 \n", - "t = 0.533333, y[15] = 2.88374, x_hat[15] = 2.85003 1.21709 -9.68495 \n", - "t = 0.566667, y[16] = 2.94485, x_hat[16] = 2.90195 0.924505 -9.65353 \n", - "t = 0.6, y[17] = 2.82867, x_hat[17] = 2.91129 0.544786 -9.7189 \n", - "t = 0.633333, y[18] = 3.00066, x_hat[18] = 2.94401 0.260709 -9.67101 \n", - "t = 0.666667, y[19] = 3.12921, x_hat[19] = 2.98862 0.0383847 -9.54536 \n", - "t = 0.7, y[20] = 2.85836, x_hat[20] = 2.96317 -0.355458 -9.64337 \n", - "t = 0.733333, y[21] = 2.83808, x_hat[21] = 2.92828 -0.743058 -9.73076 \n", - "t = 0.766667, y[22] = 2.68975, x_hat[22] = 2.85989 -1.19415 -9.89987 \n", - "t = 0.8, y[23] = 2.66533, x_hat[23] = 2.7884 -1.61706 -10.0242 \n", - "t = 0.833333, y[24] = 2.81613, x_hat[24] = 2.75128 -1.9017 -9.95817 \n", - "t = 0.866667, y[25] = 2.81004, x_hat[25] = 2.71309 -2.1591 -9.85946 \n", - "t = 0.9, y[26] = 2.88322, x_hat[26] = 2.69122 -2.3396 -9.66543 \n", - "t = 0.933333, y[27] = 2.69789, x_hat[27] = 2.63079 -2.61001 -9.59857 \n", - "t = 0.966667, y[28] = 2.43422, x_hat[28] = 2.52104 -2.99666 -9.68334 \n", - "t = 1, y[29] = 2.23465, x_hat[29] = 2.38245 -3.43211 -9.82404 \n", - "t = 1.03333, y[30] = 2.30279, x_hat[30] = 2.27525 -3.73881 -9.7986 \n", - "t = 1.06667, y[31] = 2.0207, x_hat[31] = 2.12376 -4.14204 -9.89064 \n", - "t = 1.1, y[32] = 1.94394, x_hat[32] = 1.97709 -4.49596 -9.91916 \n", - "t = 1.13333, y[33] = 1.82498, x_hat[33] = 1.82676 -4.82788 -9.92063 \n", - "t = 1.16667, y[34] = 1.52526, x_hat[34] = 1.63716 -5.23716 -10.0093 \n", - "t = 1.2, y[35] = 1.86968, x_hat[35] = 1.54514 -5.34796 -9.76318 \n", - "t = 1.23333, y[36] = 1.18073, x_hat[36] = 1.32938 -5.77305 -9.8709 \n", - "t = 1.26667, y[37] = 1.1073, x_hat[37] = 1.13101 -6.11758 -9.88731 \n", - "t = 1.3, y[38] = 0.916168, x_hat[38] = 0.924926 -6.45274 -9.89309 \n", - "t = 1.33333, y[39] = 0.678548, x_hat[39] = 0.703675 -6.7981 -9.90889 \n", - "t = 1.36667, y[40] = 0.562382, x_hat[40] = 0.493728 -7.08693 -9.86774 \n", - "t = 1.4, y[41] = 0.355468, x_hat[41] = 0.276462 -7.36943 -9.82262 \n", - "t = 1.43333, y[42] = -0.155607, x_hat[42] = -0.00496317 -7.78298 -9.9046 \n", - "t = 1.46667, y[43] = -0.287199, x_hat[43] = -0.268734 -8.1234 -9.91418 \n", - "t = 1.5, y[44] = -0.602973, x_hat[44] = -0.551482 -8.48174 -9.93964 \n", - "\n", - "Exec Success, Final kf states:-0.551482 -8.48174 -9.93964 \n" - ] - } - ], - "source": [ - "pyrun_sim(10, true)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "76110101-38eb-4100-820e-8407ba943ce0", - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Exec Success, Final kf states:-0.485139 -7.98525 -9.02124 \n", - "\n", - "Exec Success, Final kf states:-0.436986 -7.62488 -8.35466 \n", - "\n", - "Exec Success, Final kf states:-0.516416 -8.21932 -9.45422 \n", - "\n", - "Exec Success, Final kf states:-0.476838 -7.92312 -8.90633 \n", - "\n", - "Exec Success, Final kf states:-0.547245 -8.45003 -9.88098 \n" - ] - } - ], - "source": [ - "%%python\n", - "\n", - "expand_factor = [500000, 1000000, 10000000, 20000000, 40000000]\n", - "benchmarks = []\n", - "\n", - "for i in expand_factor:\n", - " time_kf = cppyy.gbl.pyrun_sim(int(i), 0)\n", - " benchmarks.append(time_kf)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "2a47ecf6-f4e9-44b6-b72c-a935ed02fdc1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[880, 1584, 13726, 27178, 54014]\n" - ] - } - ], - "source": [ - "%%python\n", - "\n", - "print(benchmarks)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "CUDA (C++17)", - "language": "CUDA", - "name": "cuda-xcpp17" - }, - "language_info": { - "codemirror_mode": "text/x-c++src", - "file_extension": ".cpp", - "mimetype": "text/x-c++src", - "name": "c++", - "version": "17" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}