From 763f5a1daabd5c2c07cd7ef67c52e81ad1301ecd Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 30 Oct 2023 19:08:30 +0530 Subject: [PATCH] Added boilerplate for time series data --- HOWTO.md | 68 +-- notebooks/BOILERPLATE.ipynb | 2 +- notebooks/Time Series - BOILERPLATE.ipynb | 525 ++++++++++++++++++++++ 3 files changed, 560 insertions(+), 35 deletions(-) create mode 100644 notebooks/Time Series - BOILERPLATE.ipynb diff --git a/HOWTO.md b/HOWTO.md index 4235921..d4964e2 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -1,34 +1,34 @@ -

HOWTO

- -
- -The template provides a minimal approach for getting started with an AI/ML project, and has hardly any dependencies required. However, the [`notebooks/BOILERPLATE.ipynb`](notebooks/BOILERPLATE.ipynb) provides popular import and its configurations (like `pandas`, `numpy`, `scikit-learn` and `tensorflow`). A high level directory overview is as follows: - -``` -├───config : store all configuration files -│ -├───data : responsible for all data handling, or contains raw data -│ └───processed : contains processed data (like combined/normalized dataframes, tables, etc.) -│ -├───logs : repository to contain log files, can also be saved in `/path/to/directory` -│ -├───notebooks : contains boilerplate notebook for EDA and quick data understanding/explanations -│ -├───output : directory responsible for all output files -│ ├───images : save output images -│ └───savedmodels : save trained model files -│ -├───src : source directory -│ ├───agents : define agents for any rnn application -│ ├───engine : provides a suit of machine learning analytic functions -│ └───models : directory containing model definations -│ -├───static : other important/useful resources required in the project -│ ├───fonts : store additional fonts, maybe used in documentations -│ ├───images : store explanatory images -│ └───logo : setup a project logo -│ -└───utilities : utilities directory containing functions and/or submodules -``` - -
+

HOWTO

+ +
+<p align = "center">
+
+The template provides a minimal approach for getting started with an AI/ML project, and requires hardly any dependencies. In addition, [`notebooks/BOILERPLATE.ipynb`](notebooks/BOILERPLATE.ipynb) provides popular imports and their configurations (like `pandas`, `numpy`, `scikit-learn` and `tensorflow`). A high-level directory overview is as follows:
+
+```
+├───config : store all configuration files
+│
+├───data : responsible for all data handling, or contains raw data
+│   └───processed : contains processed data (like combined/normalized dataframes, tables, etc.)
+│
+├───logs : directory for log files; logs can also be saved in any `/path/to/directory`
+│
+├───notebooks : contains boilerplate notebooks for EDA and quick data understanding/explanations
+│
+├───output : directory responsible for all output files
+│   ├───images : save output images
+│   └───savedmodels : save trained model files
+│
+├───src : source directory
+│   ├───agents : define agents for any RL (reinforcement learning) application
+│   ├───engine : provides a suite of machine learning analytic functions
+│   └───models : directory containing model definitions
+│
+├───static : other important/useful resources required in the project
+│   ├───fonts : store additional fonts, which may be used in documentation
+│   ├───images : store explanatory images
+│   └───logo : set up a project logo
+│
+└───utilities : utilities directory containing functions and/or submodules
+```
+
+</p>
diff --git a/notebooks/BOILERPLATE.ipynb b/notebooks/BOILERPLATE.ipynb index 2632d4a..a895594 100644 --- a/notebooks/BOILERPLATE.ipynb +++ b/notebooks/BOILERPLATE.ipynb @@ -380,7 +380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.9" }, "latex_envs": { "LaTeX_envs_menu_present": true, diff --git a/notebooks/Time Series - BOILERPLATE.ipynb b/notebooks/Time Series - BOILERPLATE.ipynb new file mode 100644 index 0000000..818622e --- /dev/null +++ b/notebooks/Time Series - BOILERPLATE.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ea4e27fa", + "metadata": {}, + "source": [ + "

Boilerplate/Template Design

\n",
+    "\n",
+    "---\n",
+    "\n",
+    "**Objective:** This file provides a simple *boilerplate* for *time series modelling*, so that you can concentrate on what is necessary instead of repeating the same setup tasks. The boilerplate is also configured with certain [**nbextensions**](https://gitlab.com/ZenithClown/computer-configurations-and-setups) that I personally use. Install them if required, else ignore them; they do not participate in any kind of code optimization. For any new project, *edit* this file or use `File > Make a Copy` to get started. Some settings and configurations are already provided, as mentioned below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42afd2fc",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:17.992353Z",
+     "start_time": "2023-10-30T06:03:17.972967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# use the code release version for tracking and code modifications. use the\n",
+    "# CHANGELOG.md file to keep track of version features, and/or release notes.\n",
+    "# the version file is available at the project root directory; check the\n",
+    "# global configuration setting for root directory information.\n",
+    "# the file is read here and made available as `__version__`\n",
+    "__version__ = open(\"../VERSION\", \"rt\").read().strip() # bump codecov\n",
+    "print(f\"Current Code Version: {__version__}\") # TODO : author, contact"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66a070ef",
+   "metadata": {},
+   "source": [
+    "## Code Imports\n",
+    "\n",
+    "Code should always be written to be _production ready_. The guidelines under [**PEP8**](https://peps.python.org/pep-0008/#imports) define the conventional, syntactically useful ways of structuring imports. The necessary guidelines w.r.t. code imports are listed below (a commented sketch of this ordering follows the list); basic libraries and import settings are then defined.\n",
+    "\n",
+    " 1. Imports should be on separate lines,\n",
+    " 2. Import order should be:\n",
+    "   * standard library/modules,\n",
+    "   * related third party imports,\n",
+    "   * local application/user defined imports\n",
+    " 3. Wildcard imports (`*`) should be avoided, else specifically tagged with **`# noqa: F403`** as per `flake8` or **`# pylint: disable=unused-import`** as per `pylint`,\n",
+    " 4. Avoid using relative imports; use explicit imports instead."
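+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f2a1b0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a minimal sketch of the import order prescribed above; kept commented since\n",
+    "# the local module name (`utilities`) is only an assumption for illustration\n",
+    "# import os # 1. standard library/modules\n",
+    "# import numpy as np # 2. related third party imports\n",
+    "# import utilities # 3. local application/user defined imports"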
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d4e89b2",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:17.999550Z",
+     "start_time": "2023-10-30T06:03:17.994324Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os # miscellaneous os interfaces\n",
+    "import sys # configuring python runtime environment\n",
+    "import time # library for time manipulation, and logging\n",
+    "import pickle # load/save the `pmdarima` model as a pickle file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84111db6",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:18.012009Z",
+     "start_time": "2023-10-30T06:03:17.999550Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# use `datetime` to control and perceive the environment;\n",
+    "# in addition, `pandas` also provides datetime functionalities\n",
+    "import datetime as dt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98b2f278",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from copy import deepcopy # dataframe is mutable\n",
+    "# from tqdm import tqdm as TQ # progress bar for loops\n",
+    "# from uuid import uuid4 as UUID # unique identifier for objs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b728be33",
+   "metadata": {},
+   "source": [
+    "### Data Analysis and AI/ML Libraries\n",
+    "\n",
+    "Imports of the data analysis and AI/ML libraries required at different stages. Check the settings and configurations [here](https://gitlab.com/ZenithClown/computer-configurations-and-setups) and the code snippets [here](https://gitlab.com/ZenithClown/computer-configurations-and-setups/-/tree/master/template/snippets/vscode) to understand the settings used in this notebook. The notebook uses a custom `.mplstyle` file recognised by `matplotlib`, downloadable from [this link](https://gitlab.com/ZenithClown/computer-configurations-and-setups/-/tree/master/settings/python/matplotlib)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d0f9892",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:21.457741Z",
+     "start_time": "2023-10-30T06:03:18.061274Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "%precision 3\n",
+    "%matplotlib inline\n",
+    "sns.set_style('whitegrid')\n",
+    "plt.style.use('default-style')\n",
+    "\n",
+    "pd.set_option('display.max_rows', 50) # max. rows to show\n",
+    "pd.set_option('display.max_columns', 17) # max. cols to show\n",
+    "np.set_printoptions(precision = 3, threshold = 15) # set np options\n",
+    "pd.options.display.float_format = '{:,.3f}'.format # float precisions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c89c689f",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:21.992718Z",
+     "start_time": "2023-10-30T06:03:21.460254Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# sklearn metrics for analysis can be imported as below; considering a\n",
+    "# `regression` problem, RMSE and MAE are the imported metrics\n",
+    "# for rmse, use `squared = False` : https://stackoverflow.com/a/18623635/\n",
+    "from sklearn.metrics import (\n",
+    "    mean_squared_error as MSE,\n",
+    "    mean_absolute_error as MAE\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54b88729",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:22.140032Z",
+     "start_time": "2023-10-30T06:03:21.992718Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from statsmodels.tsa.seasonal import seasonal_decompose"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11b75632",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:23.379119Z",
+     "start_time": "2023-10-30T06:03:22.142392Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pmdarima as pm # let `auto_arima()` find the coefficients\n",
+    "import statsmodels.api as sm # statsmodels for statistical models and tests"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77c4d99d",
+   "metadata": {},
+   "source": [
+    "### User Defined Function(s)\n",
+    "\n",
+    "It is recommended that any UDFs are defined outside the scope of the *jupyter notebook*, so that development/editing of a function can be done more practically. As per *programming guidelines*, a [`src`](https://fileinfo.com/extension/src) file/directory is beneficial for code development and/or production release. However, a *jupyter notebook* requires a *kernel restart* if any imported code file is changed on disk; for this reason, frequently changing functions can be defined in this section.\n",
+    "\n",
+    "**Getting Started** with **`PYTHONPATH`**\n",
+    "\n",
+    "One must know what [Environment Variables](https://medium.com/chingu/an-introduction-to-environment-variables-and-how-to-use-them-f602f66d15fa) are and how to call/use them in your choice of programming language. Note that environment variables are *case sensitive* in all operating systems (except Windows, since DOS is not case sensitive). Generally, we can access environment variables from the terminal/shell/command prompt as:\n",
+    "\n",
+    "```shell\n",
+    "# macOS/*nix\n",
+    "echo $VARNAME\n",
+    "\n",
+    "# windows\n",
+    "echo %VARNAME%\n",
+    "```\n",
+    "\n",
+    "Once you've set up your system with [`PYTHONPATH`](https://bic-berkeley.github.io/psych-214-fall-2016/using_pythonpath.html), note that, as per the [*python documentation*](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH), it lists the directories that `import` statements search through, in their order of importance. If a source code/module is not available, check the necessary environment variables and/or ask the administrator for the source files. For testing purposes, this template makes use of the `src`, `utils` and `config` directories. However, these directories are available at the `ROOT` level, and are therefore added with `sys.path.append()` before importing, as sketched below."
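+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a0b1c2d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a minimal sketch (added for illustration): inspect `PYTHONPATH` from within\n",
+    "# python and extend the module search path at runtime; the `src` path below is\n",
+    "# an assumption based on the template layout described above\n",
+    "print(os.environ.get(\"PYTHONPATH\", \"<PYTHONPATH is not set>\"))\n",
+    "print(sys.path[-3:]) # the last few entries of the current search path\n",
+    "sys.path.append(os.path.join(\"..\", \"src\")) # modules under src/ are now importable"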
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a9b0c1d",
+   "metadata": {},
+   "source": [
+    "**Getting Started** with **`submodules`**\n",
+    "\n",
+    "A [`submodule`](https://git-scm.com/book/en/v2/Git-Tools-Submodules) provides the functionality to integrate a separate project into the current repository - this is typically useful to remove code duplication and to maintain a central repository that controls dependent modules. More information on initializing and using submodules is available [here](https://www.youtube.com/watch?v=gSlXo2iLBro). Check [Github-GISTS/ZenithClown](https://gist.github.com/ZenithClown) for more information."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a1a2ece",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:23.396289Z",
+     "start_time": "2023-10-30T06:03:23.379119Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# get the udf for the `date_range()` function\n",
+    "# always check that `datetime` is imported\n",
+    "import datetime_ as dt_ # https://gist.github.com/ZenithClown/d2dd294c5f528459e16b139c04c0b182"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c08976b5",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:23.423992Z",
+     "start_time": "2023-10-30T06:03:23.400439Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# uncomment to use `checkStationarity()` in the stationarity check below\n",
+    "# from stationarity import checkStationarity # https://gist.github.com/ZenithClown/f99d7e1e3f4b4304dd7d43603cef344d"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52cb748c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:52:02.161135Z",
+     "start_time": "2023-10-30T06:52:02.151469Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# append `src` sub-modules to import additional files; these directories are\n",
+    "# project specific, and are not to be added to the environment or $PATH variable\n",
+    "# sys.path.append(os.path.join(\"..\", \"src\", \"agents\")) # agents for reinforcement modelling\n",
+    "# sys.path.append(os.path.join(\"..\", \"src\", \"engine\")) # derivative engines for model control\n",
+    "# sys.path.append(os.path.join(\"..\", \"src\", \"models\")) # actual models for decision making tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "315b5ca2",
+   "metadata": {},
+   "source": [
+    "## Global Argument(s)\n",
+    "\n",
+    "The global arguments are *notebook* specific; however, they may also be extended to external libraries and functions on import. The *boilerplate* provides a basic ML directory structure with a directory for `data` and a separate directory for `output`. In addition, a separate directory (`data/processed`) is used to save processed datasets so that preprocessing does not have to be run again on every kernel restart. The sketch below (an addition, assuming the standard template layout) creates these directories if they are missing."
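+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c2d3e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a minimal sketch (added for illustration): create the template directories on\n",
+    "# a fresh clone so that the save/export cells below never fail; the paths here\n",
+    "# mirror the global arguments defined in the next cells\n",
+    "for _path in [\n",
+    "    os.path.join(\"..\", \"data\", \"processed\"),\n",
+    "    os.path.join(\"..\", \"output\", \"images\"),\n",
+    "    os.path.join(\"..\", \"output\", \"savedmodels\")\n",
+    "]:\n",
+    "    os.makedirs(_path, exist_ok = True) # no-op if the directory already exists"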
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0144fd21",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:24.998651Z",
+     "start_time": "2023-10-30T06:03:24.990553Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ROOT = \"..\" # the document root is one level up; it contains the full code structure\n",
+    "DATA = os.path.join(ROOT, \"data\") # the directory contains all data files; subdirectories (if any) can also be used/defined\n",
+    "\n",
+    "# a processed data directory can be used, such that preprocessing steps are\n",
+    "# not required to run again-and-again on each kernel restart\n",
+    "PROCESSED_DATA = os.path.join(DATA, \"processed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f0ed681",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:25.206753Z",
+     "start_time": "2023-10-30T06:03:25.174915Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "OUTPUT_DIR = os.path.join(ROOT, \"output\")\n",
+    "IMAGES_DIR = os.path.join(OUTPUT_DIR, \"images\")\n",
+    "MODELS_DIR = os.path.join(OUTPUT_DIR, \"savedmodels\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4dbbec59",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-30T06:03:26.146000Z",
+     "start_time": "2023-10-30T06:03:26.137057Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "N_LOOKBACK = 12 # no. of periods to look back into the history\n",
+    "N_FORECAST = 24 # no. of periods of forward forecast required"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7f63ecb4",
+   "metadata": {},
+   "source": [
+    "## Historic Price Data\n",
+    "\n",
+    "The historic data, i.e., any data in time series format, can be read using `pd.read_*().set_index(\"date\")` or with a custom function that reads and processes the file. The sketch below assumes a CSV file with a `date` column and a `values` column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0745711e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write the read statement as a dataframe; the file name is a placeholder\n",
+    "data = pd.read_csv(os.path.join(DATA, \"prices.csv\"), parse_dates = [\"date\"]).set_index(\"date\")\n",
+    "commodity = \"values\" # assumed column/series name, used in the plot titles below"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ece12873",
+   "metadata": {},
+   "source": [
+    "### Exponential Smoothing\n",
+    "\n",
+    "Exponential smoothing, or the exponential moving average (EMA), is a rule-of-thumb technique for smoothing time series data using the exponential window function. Unlike the \"Simple Moving Average\", the EMA gives higher weight to more recent prices. In addition, models like DEMA and TEMA are developed on top of EMA; a DEMA sketch follows the plot below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b9cb774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# exponential smoothing is always a good starting point; set a span first\n",
+    "span = 12 # assumed window span - tune to the data's seasonality/frequency\n",
+    "alpha = 2 / (span + 1) # thumb rule of alpha selection - https://help.sumologic.com/docs/metrics/metrics-operators/ewma/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb05a4dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.plot(data[\"values\"], label = \"Historic Price\", color = \"#5d8bd4\")\n",
+    "plt.plot(data[\"values\"].ewm(alpha = alpha, adjust = False).mean(), label = f\"EWMA (span = {span}, α = {alpha:.3f})\", color = \"#b09666\")\n",
+    "\n",
+    "plt.legend(loc = \"upper right\")\n",
+    "plt.title(commodity)\n",
+    "plt.show()"
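+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2d3e4f5",
+   "metadata": {},
+   "source": [
+    "The DEMA mentioned above can be sketched directly from two EMA passes: `DEMA = 2 * EMA(x) - EMA(EMA(x))`, which reduces the lag of a single EMA. The cell below is an illustrative addition, reusing the `span`/`alpha` assumptions from above - a sketch, not a tuned model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3e4f5a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# double exponential moving average (DEMA) from two chained EMA passes\n",
+    "ema = data[\"values\"].ewm(alpha = alpha, adjust = False).mean()\n",
+    "dema = 2 * ema - ema.ewm(alpha = alpha, adjust = False).mean()\n",
+    "\n",
+    "plt.plot(data[\"values\"], label = \"Historic Price\", color = \"#5d8bd4\")\n",
+    "plt.plot(dema, label = f\"DEMA (span = {span})\", color = \"#8bd45d\")\n",
+    "plt.legend(loc = \"upper right\")\n",
+    "plt.show()"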
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e512ca3",
+   "metadata": {},
+   "source": [
+    "## Model Development\n",
+    "\n",
+    "An exploratory data analysis on a time series dataset is typically different from that on a conventional/non-timeseries dataset. The [`template`](https://github.com/ZenithClown/ai-ml-project-template) provides some basic analysis techniques that are applicable to any type of dataset. The following methods are implemented:\n",
+    "\n",
+    " * [**Seasonal Decomposition**](https://machinelearningmastery.com/decompose-time-series-data-trend-seasonality/): The process of breaking a time series into three main components - (I) trend, (II) seasonality, and (III) residuals/error terms.\n",
+    " * [**Check Data Stationarity**](https://www.analyticsvidhya.com/blog/2021/06/statistical-tests-to-check-stationarity-in-time-series-part-1/): Basic statistical models like `ARIMA` work on the principle that the time series is stationary (an ADF-test sketch is added before the rolling check below).\n",
+    "\n",
+    "But first, we copy the actual dataset into a working variable (to preserve the original data) and prepare it before the methods are applied to the master series. In addition, we also set a date time index (with a frequency, if not already available), since many functions (like ETS decomposition) depend on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff18bf5b",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-29T14:32:30.519938Z",
+     "start_time": "2023-10-29T14:32:30.257113Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "frame_ = data[[\"values\"]].copy() # frame with missing dates, to be imputed with a frequency\n",
+    "all_dates = list(map(lambda x : pd.Timestamp(str(x)), list(dt_.date_range(frame_.index.min().date(), frame_.index.max().date()))[::7])) # weekly frequency data\n",
+    "missing_dates = [date for date in all_dates if date not in frame_.index.values] # dates expected in a weekly index but absent from the data\n",
+    "\n",
+    "# insert missing dates\n",
+    "frame_.reset_index(inplace = True)\n",
+    "for missing_date in missing_dates:\n",
+    "    frame_.loc[len(frame_)] = [missing_date, np.nan] # np.nan keeps the column numeric for `.interpolate()`\n",
+    "\n",
+    "# interpolate on missing dates\n",
+    "frame_ = frame_.sort_values(\"date\").set_index(\"date\") # don't sort descending\n",
+    "frame_[\"values\"] = frame_[\"values\"].interpolate() # fill missing values\n",
+    "frame_ = frame_.resample(\"W\").first() # assumed weekly (\"W\") to enforce a regular frequency - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b9b3f427",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-27T09:05:45.721592Z",
+     "start_time": "2023-10-27T09:05:43.830580Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# `seasonal_decompose(...).plot()` creates its own figure, so size it afterwards\n",
+    "fig = seasonal_decompose(frame_).plot()\n",
+    "fig.set_size_inches(25, 19)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1494feb",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-27T09:17:49.445929Z",
+     "start_time": "2023-10-27T09:17:48.785679Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# get yearly average sales\n",
+    "yearly_sales = data.resample(\"Y\")[commodity].mean().reset_index()\n",
+    "\n",
+    "# get monthly average sales, grouped by year\n",
+    "monthly_sales = data.resample(\"M\")[commodity].mean().reset_index()\n",
+    "monthly_sales[\"year\"] = monthly_sales[\"date\"].dt.year # can be used as a hue parameter to distinguish years\n",
+    "\n",
+    "plt.figure(figsize = (25, 5))\n",
+    "sns.lineplot(yearly_sales, x = \"date\", y = commodity, label = \"Yearly Average Sales\")\n",
+    "sns.lineplot(monthly_sales, x = \"date\", y = commodity, hue = \"year\", palette = \"viridis\", label = \"Monthly Average Sales\")\n",
+    "\n",
+    "# disable/set xy label\n",
+    "plt.xlabel(\"\")\n",
+    "plt.ylabel(\"Price Values\")\n",
+    "\n",
+    "plt.legend([]) # hide the legend; the yearly colour difference is self-explanatory\n",
+    "plt.title(\"Average Sales Historic Price Trend\")\n",
+    "\n",
+    "plt.show()"
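+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e4f5a6b7",
+   "metadata": {},
+   "source": [
+    "As a complement to the `checkStationarity()` helper used below, stationarity can also be sketched with the Augmented Dickey-Fuller (ADF) test from `statsmodels`. This cell is an illustrative addition; the usual reading is that a p-value below 0.05 rejects the unit-root hypothesis, i.e., the series may be treated as stationary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5a6b7c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# augmented dickey-fuller (ADF) test - a minimal stationarity sketch\n",
+    "from statsmodels.tsa.stattools import adfuller\n",
+    "\n",
+    "statistic, pvalue, *_ = adfuller(frame_[\"values\"].dropna())\n",
+    "print(f\"ADF statistic = {statistic:.3f}, p-value = {pvalue:.3f}\")\n",
+    "print(\"stationary\" if pvalue < 0.05 else \"non-stationary (consider differencing)\")"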
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2516b5c3",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-10-27T13:16:54.016943Z",
+     "start_time": "2023-10-27T13:16:53.703830Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# requires `checkStationarity()` - uncomment its import in the UDF section above\n",
+    "window = 12\n",
+    "*_, rolling = checkStationarity(data, feature = \"values\", window = window)\n",
+    "\n",
+    "plt.plot(rolling)\n",
+    "plt.suptitle(\"Rolling Mean & Standard Deviation\")\n",
+    "plt.title(f\"Rolling Window Size = {window}\")\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "AutoTS, PMD-ARIMA",
+   "language": "python",
+   "name": "timeseries"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}