From 19b0212cc766b4f6d9fd2e30084dcd5ffaa8bc16 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Wed, 18 Oct 2023 10:43:38 +0530 Subject: [PATCH 01/25] --- Time Series Utilities.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 Time Series Utilities.md diff --git a/Time Series Utilities.md b/Time Series Utilities.md new file mode 100644 index 0000000..b938fd8 --- /dev/null +++ b/Time Series Utilities.md @@ -0,0 +1,36 @@ +
+

Time Series Utilities

+

An object-oriented process for creating time series sequence features for AI/ML model development

+Colab Notebook +
+ +
+ +
+ +## Time Series Featuring + +Time series analysis is a special segment of AI/ML application development where a feature is dependent on time. The code here is designed to create a *sequence* of `x` and `y` data needed in a time series problem. The function is defined with two input parameters (I) **Lookback Period (T) `n_lookback`**, and (II) **Forecast Period (H) `n_forecast`** which can be visually presented below. + +
+ +![prediction-sequence](https://i.stack.imgur.com/YXwMJ.png) + +
+ +## Getting Started + +The code is publicly available at [**GitHub gists**](https://gist.github.com/ZenithClown) which is a simple platform for sharing *code snippets* with the community. To use the code, simply clone the code like: + +```shell +git clone https://gist.github.com/ZenithClown/.git ts_utils +export PYTHONPATH="${PYTHONPATH}:ts_utils" +``` + +Done, you can now easily import the function with *python* notebooks/code-files like: + +```python +from ts_featuring import CreateSequence +``` + +
\ No newline at end of file From 014d2a39f4180d06592b31272d07c2bca0a4001e Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Wed, 18 Oct 2023 10:47:26 +0530 Subject: [PATCH 02/25] From 9f487d7a01f6a4307a28d70f9eab40990dd6610d Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Tue, 11 Apr 2023 11:33:06 +0530 Subject: [PATCH 03/25] =?UTF-8?q?=F0=9F=92=A3=20rename=20featuring.py=20to?= =?UTF-8?q?=20ts=5Ffeaturing.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ts_featuring.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 ts_featuring.py diff --git a/ts_featuring.py b/ts_featuring.py new file mode 100644 index 0000000..941b448 --- /dev/null +++ b/ts_featuring.py @@ -0,0 +1,202 @@ +# -*- encoding: utf-8 -*- + +""" +A Set of Methodologies involved with Feature Engineering + +Feature engineering or feature extraction involves transforming the +data by manipulation (addition, deletion, combination) or mutation of +the data set in hand to improve the machine learning model. The +project mainly deals with, but not limited to, time series data that +requires special treatment - which are listed over here. + +Feature engineering time series data will incorporate the use case of +both univariate and multivariate data series with additional +parameters like lookback and forward tree. Check documentation of the +function(s) for more information. +""" + +import numpy as np +import pandas as pd + + +class DataObjectModel(object): + """ + Data Object Model (`DOM`) for AI-ML Application Development + + Data is the key to an artificial intelligence application + development, and often times real world data are gibrish and + incomprehensible. The DOM is developed to provide basic use case + like data formatting, seperating `x` and `y` variables etc. 
such + that a feature engineering function or a machine learning model + can easily get the required information w/o much needed code. + + # Example Use Cases + The following small use cases are possible with the use of the + DOM in feature engineering: + + 1. Formatting a Data to a NumPy ND-Array - an iterable/pandas + object can be converted into `np.ndarray` which is the base + data type of the DOM. + + ```python + np.random.seed(7) # set seed for duplication + data = pd.DataFrame( + data = np.random.random(size = (9, 26)), + columns = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ) + + dom = DataObjectModel(data) + print(type(dom.data)) + >> + ``` + + 2. Breaking an Array of `Xy` into Individual Component - for + instance a dataframe/tabular data has `X` features along side + `y` in column. The function considers the data and breaks it + into individual components. + + ```python + X, y = dom.create_xy(y_index = 1) + + # or if `y` is group of elements then: + X, y = dom.create_xy(y_index = (1, 4)) + ``` + """ + + def __init__(self, data: np.ndarray) -> None: + self.data = self.__to_numpy__(data) # also check integrity + + def __to_numpy__(self, data: object) -> np.ndarray: + """Convert Meaningful Data into a N-Dimensional Array""" + + if type(data) == np.ndarray: + pass # data is already in required type + elif type(data) in [list, tuple]: + data = np.array(data) + elif type(data) == pd.DataFrame: + # often times a full df can be passed, which is a ndarray + # thus, the df can be easily converted to an np ndarray: + data = data.values + else: + raise TypeError( + f"Data `type == {type(data)}` is not convertible.") + + return data + + + def create_xy(self, y_index : object = -1) -> tuple: + """ + Breaks the Data into Individual `X` and `y` Components + + From a tabular or ndimensional structure, the code considers + `y` along a given axis (`y_index`) and returns two `ndarray` + which can be treated as `X` and `y` individually. 
+ + The function uses `np.delete` command to create `X` feature + from the data. (https://stackoverflow.com/a/5034558/6623589). + + This function is meant for multivariate dataset, and is only + applicable when dealing with multivariate time series data. + The function can also be used for any machine learning model + consisting of multiple features (even if it is a time series + dataset). + + :type y_index: object + :param y_index: Index/axis of `y` variable. If the type is + is `int` then the code assumes one feature, + and `y_.shape == (-1, 1)` and if the type + of `y_index` is `tuple` then + `y_.shape == (-1, (end - start - 1))` since + end index is exclusive as in `numpy` module. + """ + + if type(y_index) in [list, tuple]: + x_ = self.data + y_ = self.data[:, y_index[0]:y_index[1]] + for idx in range(*y_index)[::-1]: + x_ = np.delete(x_, obj = idx, axis = 1) + elif type(y_index) == int: + y_ = self.data[:, y_index] + x_ = np.delete(self.data, obj = y_index, axis = 1) + else: + raise TypeError("`type(y_index)` not in [int, tuple].") + + return x_, y_ + + +class CreateSequence(DataObjectModel): + """ + Create a Data Sequence Typically to be used in LSTM Model + + LSTM Model, or rather any time series data, requires a specific + sequence of data consisting of `n_lookback` i.e. length of input + sequence (or lookup values) and `n_forecast` values, i.e., the + length of output sequence. The function tries to provide single + approach to break data into sequence of `x_train` and `y_train` + for training in neural network. + """ + + def __init__(self, data: np.ndarray) -> None: + super().__init__(data) + + + def create_series( + self, + n_lookback : int, + n_forecast : int, + univariate : bool = True, + **kwargs + ) -> tuple: + """ + Create a Sequence of `x_train` and `y_train` for training a + neural network model with time series data. 
The basic + approach in building the function is taken from: + https://stackoverflow.com/a/69912334/6623589 + + UPDATE [22-02-2023] : The function is now modified such that + it now can also return a sequence for multivariate time-series + analysis. The following changes has been added: + * 💣 refactor function name to generalise between univariate + and multivariate methods. + * 🔧 univariate feature can be called directly as this is the + default code behaviour. + * 🛠 to get multivariate functionality, use `univariate = False` + * 🛠 by default the last column (-1) of `data` is considered + as `y` feature by slicing `arr[s:e, -1]` but this can be + configured using `kwargs["y_feat_"]` + """ + + x_, y_ = [], [] + n_record = self.__check_univariate_get_len__(univariate) \ + - n_forecast + 1 + + y_feat_ = kwargs.get("y_feat_", -1) + + for idx in range(n_lookback, n_record): + x_.append(self.data[idx - n_lookback : idx]) + y_.append( + self.data[idx : idx + n_forecast] if univariate + else self.data[idx : idx + n_forecast, y_feat_] + ) + + x_, y_ = map(np.array, [x_, y_]) + + if univariate: + # the for loop is so designed it returns the data like: + # (, n_lookback, ) however, + # for univariate the `` dimension is "squeezed" + x_, y_ = map(lambda arr : np.squeeze(arr), [x_, y_]) + + return [x_, y_] + + + def __check_univariate_get_len__(self, univariate : bool) -> int: + """ + Check if the data is a univariate one, and if `True` then + return the length, i.e. `.shape[0]`, for further analysis. 
+ """ + + if (self.data.ndim != 1) and (univariate): + raise TypeError("Wrong dimension for univariate series.") + + return self.data.shape[0] From 094a6705eac03b7b358b9308528c5774d7dfcf65 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Tue, 17 Oct 2023 12:26:35 +0530 Subject: [PATCH 04/25] --- stationarity.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 stationarity.py diff --git a/stationarity.py b/stationarity.py new file mode 100644 index 0000000..477f7f8 --- /dev/null +++ b/stationarity.py @@ -0,0 +1,19 @@ +from statsmodels.tsa.stattools import adfuller # adfuller test + +def checkStationarity(series : list, verbose : bool = True, **kwargs) -> bool: + """Performs ADF Test to Determine Data Stationarity""" + + results = adfuller(series) # should be send like `frame.col.values` + stationary = True if (results[1] <= 0.05) & (results[4]["5%"] > results[0]) else False + + if verbose: + print("Observations of ADF Test") + print("========================") + print(f"ADF Statistics : {results[0]:,.3f}") + print(f"p-value : {results[1]:,.3f}") + + critical_values = {k : round(v, 3) for k, v in results[4].items()} + print(f"Critical Values : {critical_values}") + + print(f"Data is :", "\u001b[32mStationary\u001b[0m" if stationary else "\x1b[31mNon-stationary\x1b[0m") + return results, stationary From f6244875669829f27d6185a9894c255885297062 Mon Sep 17 00:00:00 2001 From: E33605 Date: Tue, 17 Oct 2023 12:50:53 +0530 Subject: [PATCH 05/25] added rolling window observation --- stationarity.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/stationarity.py b/stationarity.py index 477f7f8..4b9de50 100644 --- a/stationarity.py +++ b/stationarity.py @@ -1,19 +1,44 @@ +# -*- encoding: utf-8 -*- + +""" +Stationarity Checking for Time Series Data + +@author: Debmalya Pramanik +@version: v0.0.1 +""" + from statsmodels.tsa.stattools import adfuller # adfuller test -def 
checkStationarity(series : list, verbose : bool = True, **kwargs) -> bool: - """Performs ADF Test to Determine Data Stationarity""" +def checkStationarity(frame : list, feature: str, verbose : bool = True, **kwargs) -> bool: + """ + Performs ADF Test to Determine Data Stationarity - results = adfuller(series) # should be send like `frame.col.values` + Given an univariate series formatted as `frame.set_index("data")` + the series can be tested for stationarity using the Augmented + Dickey Fuller (ADF) test. The function also returns a `dataframe` + of rolling window for plotting the data using `frame.plot()`. + """ + + results = adfuller(frame[feature].values) # should be send like `frame.col.values` stationary = True if (results[1] <= 0.05) & (results[4]["5%"] > results[0]) else False if verbose: - print("Observations of ADF Test") - print("========================") + print(f"Observations of ADF Test ({feature})") + print("===========================" + "=" * len(feature)) print(f"ADF Statistics : {results[0]:,.3f}") print(f"p-value : {results[1]:,.3f}") critical_values = {k : round(v, 3) for k, v in results[4].items()} print(f"Critical Values : {critical_values}") + # rolling calculations for plotting + rolling = frame.copy() # enable deep copy + rolling = rolling[[feature]] # only keep single feature, works if multi-feature sent + rolling.rename(columns = {feature : "original"}, inplace = True) + + rolling_ = rolling.rolling(window = kwargs.get("window", 12)) + rolling["mean"] = rolling_.mean()["original"].values + rolling["std"] = rolling_.std()["original"].values + print(f"Data is :", "\u001b[32mStationary\u001b[0m" if stationary else "\x1b[31mNon-stationary\x1b[0m") - return results, stationary + return results, stationary, rolling From 77154b2e7de85bd9200d52ed7588fdd09a994572 Mon Sep 17 00:00:00 2001 From: E33605 Date: Tue, 17 Oct 2023 15:37:49 +0530 Subject: [PATCH 06/25] =?UTF-8?q?=F0=9F=9B=A0=20add=20method=20selection?= 
=?UTF-8?q?=20b/w=20adfuller=20and=20kpss?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- stationarity.py | 67 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/stationarity.py b/stationarity.py index 4b9de50..3fb5a6e 100644 --- a/stationarity.py +++ b/stationarity.py @@ -7,29 +7,71 @@ @version: v0.0.1 """ +from statsmodels.tsa.stattools import kpss # kpss test from statsmodels.tsa.stattools import adfuller # adfuller test -def checkStationarity(frame : list, feature: str, verbose : bool = True, **kwargs) -> bool: +def checkStationarity(frame : object, feature: str, method : str = "both", verbose : bool = True, **kwargs) -> bool: """ Performs ADF Test to Determine Data Stationarity Given an univariate series formatted as `frame.set_index("data")` the series can be tested for stationarity using the Augmented - Dickey Fuller (ADF) test. The function also returns a `dataframe` - of rolling window for plotting the data using `frame.plot()`. + Dickey Fuller (ADF) and Kwiatkowski-Phillips-Schmidt-Shin (KPSS) + test. The function also returns a `dataframe` of rolling window + for plotting the data using `frame.plot()`. + + :type frame: pd.DataFrame + :param frame: The dataframe that (ideally) contains a single + univariate feature (`feature`), else for a + dataframe containing multiple series only the + `feature` series is worked upon. + + :type feature: str + :param feature: Name of the feature, i.e. the column name + in the dataframe. The `rolling` dataframe returns + a slice of `frame[[feature]]` along with rolling + mean and standard deviation. + + :type method: str + :param method: Select any of the method ['ADF', 'KPSS', 'both'], + using the `method` parameter, name is case + insensitive. Defaults to `both`. 
""" - results = adfuller(frame[feature].values) # should be send like `frame.col.values` - stationary = True if (results[1] <= 0.05) & (results[4]["5%"] > results[0]) else False + results = dict() # key is `ADF` and/or `KPSS` + stationary = dict() + + if method.upper() in ["ADF", "BOTH"]: + results["ADF"] = adfuller(frame[feature].values) # should be send like `frame.col.values` + stationary["ADF"] = True if (results["ADF"][1] <= 0.05) & (results["ADF"][4]["5%"] > results["ADF"][0]) else False + + if verbose: + print(f"Observations of ADF Test ({feature})") + print("===========================" + "=" * len(feature)) + print(f"ADF Statistics : {results['ADF'][0]:,.3f}") + print(f"p-value : {results['ADF'][1]:,.3f}") + + critical_values = {k : round(v, 3) for k, v in results["ADF"][4].items()} + print(f"Critical Values : {critical_values}") + + # always print if data is stationary/not + print(f"[ADF] Data is :", "\u001b[32mStationary\u001b[0m" if stationary else "\x1b[31mNon-stationary\x1b[0m") + + if method.upper() in ["KPSS", "BOTH"]: + results["KPSS"] = kpss(frame[feature].values) # should be send like `frame.col.values` + stationary["KPSS"] = False if (results["KPSS"][1] <= 0.05) & (results["KPSS"][3]["5%"] > results["KPSS"][0]) else True + + if verbose: + print(f"Observations of KPSS Test ({feature})") + print("============================" + "=" * len(feature)) + print(f"KPSS Statistics : {results['KPSS'][0]:,.3f}") + print(f"p-value : {results['KPSS'][1]:,.3f}") - if verbose: - print(f"Observations of ADF Test ({feature})") - print("===========================" + "=" * len(feature)) - print(f"ADF Statistics : {results[0]:,.3f}") - print(f"p-value : {results[1]:,.3f}") + critical_values = {k : round(v, 3) for k, v in results["KPSS"][3].items()} + print(f"Critical Values : {critical_values}") - critical_values = {k : round(v, 3) for k, v in results[4].items()} - print(f"Critical Values : {critical_values}") + # always print if data is stationary/not + 
print(f"[KPSS] Data is :", "\x1b[31mNon-stationary\x1b[0m" if stationary else "\u001b[32mStationary\u001b[0m") # rolling calculations for plotting rolling = frame.copy() # enable deep copy @@ -40,5 +82,4 @@ def checkStationarity(frame : list, feature: str, verbose : bool = True, **kwarg rolling["mean"] = rolling_.mean()["original"].values rolling["std"] = rolling_.std()["original"].values - print(f"Data is :", "\u001b[32mStationary\u001b[0m" if stationary else "\x1b[31mNon-stationary\x1b[0m") return results, stationary, rolling From fd78e906da91636c6339fb7de160ed7a4eca0c24 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Mon, 16 Oct 2023 14:35:16 +0530 Subject: [PATCH 07/25] --- moving_average.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 moving_average.py diff --git a/moving_average.py b/moving_average.py new file mode 100644 index 0000000..a931a36 --- /dev/null +++ b/moving_average.py @@ -0,0 +1,51 @@ +class MovingAverage: + """ + A Base Method for Moving Average based Forecasting Model + + A moving average is the most simple timeseries model, which is + implemented using python. Note, the `.rolling` and `.cumsum` + methods of `pandas` and `numpy` respectively is used internally + where required to achieve the forecast. + """ + + def __init__(self, n_lookback : int, n_forecast : int, series : list) -> None: + self.n_lookback = n_lookback + self.n_forecast = n_forecast + + # the series is expected to have the same values as `looback` + # else, an warning is raised and only the last `n` loockback values are kept + self.series = self._check_series(series) # ? 
removes the values with warning + + + def simple_ma(self) -> np.ndarray: + series_ = self.series # make a copy of the original iterable + forecast = [] # append the forecasted values to the list to return + + for _ in range(self.n_forecast): + _iter_ma = series_.mean() # current iteration moving average + + # pop fifo, and add latest iter + series_ = np.insert(series_, len(series_), _iter_ma) + series_ = np.delete(series_, 0) + + forecast.append(_iter_ma) + + return np.array(forecast) + + + def _check_series(self, series : list) -> list: + """ + Data Sanity Check on the `series` and Return Cleaned Series + + Checks if the series length is expected as the `lookback` + period, else returns a truncated data series with a simple + warning. + """ + + if len(series) > self.n_lookback: + warnings.warn(f"Series Length = {len(series)}, while Lookback = {self.n_lookback} Periods.") + return series[-self.n_lookback :] + elif len(series) < self.n_lookback: + raise ValueError(f"Cannot compile, as {len(series)} < {self.n_lookback}. 
Check values.") + else: + return series \ No newline at end of file From 5b261099daffd961d0f4eb3b163fc1e15499b606 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Mon, 16 Oct 2023 16:07:21 +0530 Subject: [PATCH 08/25] --- moving_average.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/moving_average.py b/moving_average.py index a931a36..c3e6ad3 100644 --- a/moving_average.py +++ b/moving_average.py @@ -1,3 +1,6 @@ +import warnings +import numpy as np + class MovingAverage: """ A Base Method for Moving Average based Forecasting Model From e27794977fd78459fbdaa1349f447e3b660dbebe Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Mon, 16 Oct 2023 16:16:47 +0530 Subject: [PATCH 09/25] --- moving_average.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/moving_average.py b/moving_average.py index c3e6ad3..845dc15 100644 --- a/moving_average.py +++ b/moving_average.py @@ -20,7 +20,7 @@ def __init__(self, n_lookback : int, n_forecast : int, series : list) -> None: self.series = self._check_series(series) # ? 
removes the values with warning - def simple_ma(self) -> np.ndarray: + def simple(self) -> np.ndarray: series_ = self.series # make a copy of the original iterable forecast = [] # append the forecasted values to the list to return @@ -34,6 +34,24 @@ def simple_ma(self) -> np.ndarray: forecast.append(_iter_ma) return np.array(forecast) + + + def exponential(self, factor : float = 0.5) -> np.ndarray: + series_ = self.series # make a copy of the original iterable + forecast = [] # append the forecasted values to the list to return + + factors = [factor / i for i in range(1, self.n_forecast + 1)] + + for _ in range(self.n_forecast): + _iter_ma = (series_ * factors).sum() # current iteration moving average + + # pop fifo, and add latest iter + series_ = np.insert(series_, len(series_), _iter_ma) + series_ = np.delete(series_, 0) + + forecast.append(_iter_ma) + + return np.array(forecast) def _check_series(self, series : list) -> list: From 9da65eb190a9f4a20e270fb0433fa28af8b9c650 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Mon, 16 Oct 2023 17:20:27 +0530 Subject: [PATCH 10/25] --- moving_average.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moving_average.py b/moving_average.py index 845dc15..9096e9f 100644 --- a/moving_average.py +++ b/moving_average.py @@ -40,7 +40,7 @@ def exponential(self, factor : float = 0.5) -> np.ndarray: series_ = self.series # make a copy of the original iterable forecast = [] # append the forecasted values to the list to return - factors = [factor / i for i in range(1, self.n_forecast + 1)] + factors = [factor / (2 ** i) for i in range(self.n_forecast)] for _ in range(self.n_forecast): _iter_ma = (series_ * factors).sum() # current iteration moving average From 816bc935fd9b2cc59ac058d0247bb9f0e6679bf0 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Oct 2023 11:31:24 +0530 Subject: [PATCH 11/25] =?UTF-8?q?=F0=9F=9B=A0=F0=9F=9A=A7=20merge=20all=20?= =?UTF-8?q?the=20time=20series=20funcs=20in=20ts=5Futils?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Time Series Utilities.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Time Series Utilities.md b/Time Series Utilities.md index b938fd8..a48b964 100644 --- a/Time Series Utilities.md +++ b/Time Series Utilities.md @@ -7,6 +7,8 @@
+ +**WARNING:** Merging all the time series gists into a single module. ## Time Series Featuring From ad9a540f948122906e01b1b5fe9d75a8c0f4f351 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Oct 2023 11:56:25 +0530 Subject: [PATCH 12/25] =?UTF-8?q?=F0=9F=93=83=F0=9F=9A=A7=20add=20theory?= =?UTF-8?q?=20for=20stationarity=20in=20time=20series?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Stationary in Time Series.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 Stationary in Time Series.md diff --git a/Stationary in Time Series.md b/Stationary in Time Series.md new file mode 100644 index 0000000..7c076fe --- /dev/null +++ b/Stationary in Time Series.md @@ -0,0 +1,19 @@ +
+

Stationarity in Time Series Data

+

Theory: Understanding Stationarity and Non-Stationarity in Time Series Data

+
+ +
+ +
+ +Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/)**, this means: + * the data must have a constant mean (across all periods), + * the data should have a constant variance, and + * auto-covariance should not be dependent on time. + +Let's understand the concept using the following example, for more information check [this link](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/). + +![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) + +
From c61329ccd71a86871e0f4f6c2dcc8b0822e1e59f Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Oct 2023 15:20:28 +0530 Subject: [PATCH 13/25] Rename stationarity readme file --- Stationary in Time Series.md => Stationarity & Unit Roots.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Stationary in Time Series.md => Stationarity & Unit Roots.md (100%) diff --git a/Stationary in Time Series.md b/Stationarity & Unit Roots.md similarity index 100% rename from Stationary in Time Series.md rename to Stationarity & Unit Roots.md From 9e2e0190ac7a9aa54f5ed9e219dc51d413314f24 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Oct 2023 15:27:05 +0530 Subject: [PATCH 14/25] add stationary test conclusions --- Stationarity & Unit Roots.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Stationarity & Unit Roots.md b/Stationarity & Unit Roots.md index 7c076fe..29b1482 100644 --- a/Stationarity & Unit Roots.md +++ b/Stationarity & Unit Roots.md @@ -7,7 +7,7 @@
-Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/)**, this means: +Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/) and [_data has no unit roots_](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)**, this means: * the data must have a constant mean (across all periods), * the data should have a constant variance, and * auto-covariance should not be dependent on time. @@ -16,4 +16,11 @@ Let's understand the concept using the following example, for more information c ![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) +| ADF Test | KPSS Test | Series Type | Additional Steps | +| :---: | :---: | :---: | --- | +| ✅ | ✅ | _stationary_ | | +| ❌ | ❌ | _non-stationary_ | | +| ✅ | ❌ | _difference-stationary_ | Use differencing to make series stationary. | +| ❌ | ✅ | _trend-stationary_ | Remove trend to make the series _strict stationary_. | +
From fdce7546faba1a534f39379b857b279ececb78da Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Oct 2023 16:35:37 +0530 Subject: [PATCH 15/25] =?UTF-8?q?=F0=9F=93=83=F0=9F=9A=A7=20created=20docu?= =?UTF-8?q?mentation=20for=20ma=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- moving_average.py => ts_models.py | 42 +++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) rename moving_average.py => ts_models.py (64%) diff --git a/moving_average.py b/ts_models.py similarity index 64% rename from moving_average.py rename to ts_models.py index 9096e9f..15d885d 100644 --- a/moving_average.py +++ b/ts_models.py @@ -1,17 +1,49 @@ +# -*- encoding: utf-8 -*- + +""" +A Set of Simplistic Time Series Models + +A set of simplistic time series models developed on top of `pandas` +and `numpy` functionalities to provide quick analysis and develop a +base line for a univariate time series data. + +@author: Debmalya Pramanik +@version: v0.0.1 +""" + import warnings import numpy as np class MovingAverage: """ - A Base Method for Moving Average based Forecasting Model + A Set of Moving Average (MA) based Models for Time Series Methods A moving average is the most simple timeseries model, which is - implemented using python. Note, the `.rolling` and `.cumsum` - methods of `pandas` and `numpy` respectively is used internally - where required to achieve the forecast. + implemented using python. However, when used well the MA model is + able to provide much analysis and is one of the favorites for a + quick understanding in the stock market. + + Note, the `.rolling` and `.cumsum` methods of `pandas` and + `numpy` respectively is used internally where required to + achieve the forecast. + + :type n_lookback: int + :param n_lookback: Number of periods to lookback into the past. 
+ Typically, 'n-lags' is a good indicator of + price, as the price of `(N+1)` is always a + factor of `N, N-1, N-2, ..., N-n` where `n` + can be determined statistically. + + :type n_forecast: int + :param n_forecast: Number of periods to forecast into the future. + + :type series: iterable + :param series: Time series data, where each item of the iterable + is a value at interval `n, ..., N-2, N-1, N` where + `N` is the value at current date. """ - def __init__(self, n_lookback : int, n_forecast : int, series : list) -> None: + def __init__(self, n_lookback : int, n_forecast : int, series : np.ndarray) -> None: self.n_lookback = n_lookback self.n_forecast = n_forecast From 075002769fcc30e3284e4c852d6ed2652f28eaa3 Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 19 Oct 2023 12:39:53 +0530 Subject: [PATCH 16/25] =?UTF-8?q?=E2=8F=B3=F0=9F=93=83=20optimize,=20add?= =?UTF-8?q?=20document=20on=20moving=20average=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ts_models.py | 144 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 25 deletions(-) diff --git a/ts_models.py b/ts_models.py index 15d885d..e42ba93 100644 --- a/ts_models.py +++ b/ts_models.py @@ -14,6 +14,8 @@ import warnings import numpy as np +from _base import UnivariateSeries + class MovingAverage: """ A Set of Moving Average (MA) based Models for Time Series Methods @@ -27,6 +29,23 @@ class MovingAverage: `numpy` respectively is used internally where required to achieve the forecast. + The model is an extension for moving average, and can be used to + forecast into the future on a rolling basis. 
Example: + + ```python + N_LOOKBACK = 4 + N_FORECAST = 5 + + # given the series, the rolling forecast for `N_FORECAST` period: + simple_ma = MovingAverage( + n_lookback = N_LOOKBACK, + n_forecast = N_FORECAST, + series = np.array([12, 7, 27, 34]) + ).simple() + + >> np.array([20.00, 22.00, 25.75, 25.25, 23.00]) + ``` + :type n_lookback: int :param n_lookback: Number of periods to lookback into the past. Typically, 'n-lags' is a good indicator of @@ -50,55 +69,130 @@ def __init__(self, n_lookback : int, n_forecast : int, series : np.ndarray) -> N # the series is expected to have the same values as `looback` # else, an warning is raised and only the last `n` loockback values are kept self.series = self._check_series(series) # ? removes the values with warning - - + + def simple(self) -> np.ndarray: - series_ = self.series # make a copy of the original iterable - forecast = [] # append the forecasted values to the list to return - + """ + Simple Moving Average Forecast + + The most simple algorithm is the simple moving average + which gives equal weightage to all the time, and does not + consider level, trend, or seasonality. + + Simple moving average forecasting is not advisable, and is + only applicable for data with low variations, i.e. the data + is stationary. 
+ """ + + series_ = self.series.copy() # make a copy of the iterable + forecast = [] # append the forecasted values to the list + for _ in range(self.n_forecast): - _iter_ma = series_.mean() # current iteration moving average - + _iter_ma = series_.mean() + # pop fifo, and add latest iter series_ = np.insert(series_, len(series_), _iter_ma) series_ = np.delete(series_, 0) - + forecast.append(_iter_ma) - + return np.array(forecast) - - - def exponential(self, factor : float = 0.5) -> np.ndarray: - series_ = self.series # make a copy of the original iterable - forecast = [] # append the forecasted values to the list to return - - factors = [factor / (2 ** i) for i in range(self.n_forecast)] + + + def exponential(self, alpha : float = 0.5) -> np.ndarray: + """ + Exponential Moving Average Forecasting + + An exponential moving average is an extension of the + moving average algorithm that places an greater weightage to + the recent data points. The EMA is also referred to as the + exponentially weighted moving average. + + Side note: In financial market, like all moving average + metrices, the EMA is a technical indicator which is used to + produce buy and sell signals based on crossovers and + divergence on crossovers. + (https://www.investopedia.com/terms/e/ema.asp) + + In addition, traders often use different EMA lengths of + 10-, 50-, and 200-days moving average as an indicator. + + However, in time series forecasting (like price forecasting) + the order (`q`) can be determined from the ACF & PACF tests. + But, in case of exponential smoothening/forecasting the order + is referred as `alpha` which is the coefficient of level + smoothening. + + EMA(T+1) = sum( + alpha * EMA(T) + + (alpha / 2) * EMA(T-1) + + (alpha / 4) * EMA(T-2) + + ... + + (alpha / 2^n) * EMA(T-n) + ) + + where `n` is the lookback period, and `T` is the current day. + + :type alpha: float + :param alpha: The coefficient for level smoothening. 
+ alpha ∈ (0, 1), typically the best value is 0.5 + """ + + series_ = self.series.copy() # make a copy of the iterable + forecast = [] # append the forecasted values to the list + factors = alpha / (2 ** np.arange(1, stop = self.n_lookback + 1)) + for _ in range(self.n_forecast): - _iter_ma = (series_ * factors).sum() # current iteration moving average - + _iter_ma = (series_ * factors).sum() + # pop fifo, and add latest iter series_ = np.insert(series_, len(series_), _iter_ma) series_ = np.delete(series_, 0) - + forecast.append(_iter_ma) - + return np.array(forecast) - - + + def _check_series(self, series : list) -> list: """ Data Sanity Check on the `series` and Return Cleaned Series - + Checks if the series length is expected as the `lookback` period, else returns a truncated data series with a simple warning. """ - + if len(series) > self.n_lookback: warnings.warn(f"Series Length = {len(series)}, while Lookback = {self.n_lookback} Periods.") return series[-self.n_lookback :] elif len(series) < self.n_lookback: raise ValueError(f"Cannot compile, as {len(series)} < {self.n_lookback}. 
Check values.") else: - return series \ No newline at end of file + return series + + + +if __name__ == "__main__": + N_LOOKBACK = 4 + N_FORECAST = 5 + + series = np.array([12, 7, 27, 34]) + print(f"Given Series: {series}", end = "\n\n") + + model = MovingAverage( + n_lookback = N_LOOKBACK, + n_forecast = N_FORECAST, + series = series + ) + + # calculate the simple moving average + simple_ma = model.simple() + print("Simple Moving Average:", end = "\n ") + print(simple_ma) + + # calculate the exponential moving average + exponential_ma = model.exponential() + print("Exponential Moving Average:", end = "\n ") + print(exponential_ma) From f1a02a8dd3dd1ba9a7512b9d2fdc9b7362609764 Mon Sep 17 00:00:00 2001 From: E33605 Date: Thu, 19 Oct 2023 14:29:01 +0530 Subject: [PATCH 17/25] =?UTF-8?q?=F0=9F=A9=B9=20fix=20=5Fbase=20not=20foun?= =?UTF-8?q?d=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ts_models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ts_models.py b/ts_models.py index e42ba93..6c191d0 100644 --- a/ts_models.py +++ b/ts_models.py @@ -14,8 +14,6 @@ import warnings import numpy as np -from _base import UnivariateSeries - class MovingAverage: """ A Set of Moving Average (MA) based Models for Time Series Methods From 2a0062edd1fece05c8d76a7124f575a9f617454a Mon Sep 17 00:00:00 2001 From: E33605 Date: Fri, 20 Oct 2023 18:47:45 +0530 Subject: [PATCH 18/25] =?UTF-8?q?=F0=9F=93=83=F0=9F=9A=A7=20keep=20one=20r?= =?UTF-8?q?eadme=20file=20per=20gist=20-=20gists=20name=20cannot=20be=20ed?= =?UTF-8?q?ited,=20and=20only=20considers=20the=20file=20name=20-=20this?= =?UTF-8?q?=20may=20be=20difficult=20to=20understand=20the=20gists=20purpo?= =?UTF-8?q?se=20-=20keeping=20one=20readme=20file=20with=20different=20sec?= =?UTF-8?q?tions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Stationarity & Unit Roots.md | 26 -------------------------- Time Series Utilities.md | 
24 +++++++++++++++++++++++- 2 files changed, 23 insertions(+), 27 deletions(-) delete mode 100644 Stationarity & Unit Roots.md diff --git a/Stationarity & Unit Roots.md b/Stationarity & Unit Roots.md deleted file mode 100644 index 29b1482..0000000 --- a/Stationarity & Unit Roots.md +++ /dev/null @@ -1,26 +0,0 @@ -
-

Stationarity in Time Series Data

-

Theory:Understand Stationary and Non-Stationary in Time Series Data

-
- -
- -
- -Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/) and [_data has no unit roots_](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)**, this means: - * the data must have a constant mean (across all periods), - * the data should have a constant variance, and - * auto-covariance should not be dependent on time. - -Let's understand the concept using the following example, for more information check [this link](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/). - -![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) - -| ADF Test | KPSS Test | Series Type | Additional Steps | -| :---: | :---: | :---: | --- | -| ✅ | ✅ | _stationary_ | | -| ❌ | ❌ | _non-stationary_ | | -| ✅ | ❌ | _difference-stationary_ | Use differencing to make series stationary. | -| ❌ | ✅ | _trend-stationary_ | Remove trend to make the series _strict stationary. | - -
diff --git a/Time Series Utilities.md b/Time Series Utilities.md index a48b964..a15aeb7 100644 --- a/Time Series Utilities.md +++ b/Time Series Utilities.md @@ -9,7 +9,29 @@
**WARNING:** Merging all the time series gists into a single module. - + +## Stationarity & Unit Roots + +Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/) and [_data has no unit roots_](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)**, this means: + * the data must have a constant mean (across all periods), + * the data should have a constant variance, and + * auto-covariance should not be dependent on time. + +Let's understand the concept using the following example, for more information check [this link](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/). + +![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) + +
+ +| ADF Test | KPSS Test | Series Type | Additional Steps | +| :---: | :---: | :---: | --- | +| ✅ | ✅ | _stationary_ | | +| ❌ | ❌ | _non-stationary_ | | +| ✅ | ❌ | _difference-stationary_ | Use differencing to make series stationary. | +| ❌ | ✅ | _trend-stationary_ | Remove trend to make the series _strict stationary. | + +
+ ## Time Series Featuring Time series analysis is a special segment of AI/ML application development where a feature is dependent on time. The code here is desgined to create a *sequence* of `x` and `y` data needed in a time series problem. The function is defined with two input parameters (I) **Lootback Period (T) `n_lookback`**, and (II) **Forecast Period (H) `n_forecast`** which can be visually presented below. From 8c18dde7cb3044bc9b244b89a1d2f0c0e7af983f Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 14 Aug 2024 15:20:13 +0530 Subject: [PATCH 19/25] =?UTF-8?q?=F0=9F=93=9D=20function=20documentation?= =?UTF-8?q?=20for=20stationarity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- stationarity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stationarity.py b/stationarity.py index 3fb5a6e..6cc8990 100644 --- a/stationarity.py +++ b/stationarity.py @@ -3,8 +3,8 @@ """ Stationarity Checking for Time Series Data -@author: Debmalya Pramanik -@version: v0.0.1 +A functional approach to check stationarity using different models +and the function attrbutes are as defined below. """ from statsmodels.tsa.stattools import kpss # kpss test From e5e0a401400acaf5f0ffdf97da109acc696a023d Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 14 Aug 2024 15:22:45 +0530 Subject: [PATCH 20/25] =?UTF-8?q?=F0=9F=92=A3=20moving=20document=20code-a?= =?UTF-8?q?rchived/ds-gringotts#15?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Time Series Utilities.md | 60 ---------------------------------------- stationarity.py | 1 + 2 files changed, 1 insertion(+), 60 deletions(-) delete mode 100644 Time Series Utilities.md diff --git a/Time Series Utilities.md b/Time Series Utilities.md deleted file mode 100644 index a15aeb7..0000000 --- a/Time Series Utilities.md +++ /dev/null @@ -1,60 +0,0 @@ -
-

Time Series Utilities

-

object oriented process to create time series sequence features for AI/ML model development

-Colab Notebook -
- -
- -
- -**WARNING:** Merging all the time series gists into a single module. - -## Stationarity & Unit Roots - -Stationarity is one of the fundamental concepts in time series analysis. The **time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/) and [_data has no unit roots_](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)**, this means: - * the data must have a constant mean (across all periods), - * the data should have a constant variance, and - * auto-covariance should not be dependent on time. - -Let's understand the concept using the following example, for more information check [this link](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/). - -![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) - -
- -| ADF Test | KPSS Test | Series Type | Additional Steps | -| :---: | :---: | :---: | --- | -| ✅ | ✅ | _stationary_ | | -| ❌ | ❌ | _non-stationary_ | | -| ✅ | ❌ | _difference-stationary_ | Use differencing to make series stationary. | -| ❌ | ✅ | _trend-stationary_ | Remove trend to make the series _strict stationary. | - -
- -## Time Series Featuring - -Time series analysis is a special segment of AI/ML application development where a feature is dependent on time. The code here is desgined to create a *sequence* of `x` and `y` data needed in a time series problem. The function is defined with two input parameters (I) **Lootback Period (T) `n_lookback`**, and (II) **Forecast Period (H) `n_forecast`** which can be visually presented below. - -
- -![prediction-sequence](https://i.stack.imgur.com/YXwMJ.png) - -
- -## Getting Started - -The code is publically available at [**GitHub gists**](https://gist.github.com/ZenithClown) which is a simple platform for sharing *code snippets* with the community. To use the code, simply clone the code like: - -```shell -git clone https://gist.github.com/ZenithClown/.git ts_utils -export PYTHONPATH="${PYTHONPATH}:ts_utils" -``` - -Done, you can now easily import the function with *python* notebooks/code-files like: - -```python -from ts_featuring import CreateSequence -``` - -
\ No newline at end of file diff --git a/stationarity.py b/stationarity.py index 6cc8990..2ccc2d2 100644 --- a/stationarity.py +++ b/stationarity.py @@ -5,6 +5,7 @@ A functional approach to check stationarity using different models and the function attrbutes are as defined below. +(`More Information `_) """ from statsmodels.tsa.stattools import kpss # kpss test From 06b79891b22b4e769010ede46e983948804f1a0c Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 26 Aug 2024 10:52:26 +0530 Subject: [PATCH 21/25] =?UTF-8?q?=E2=9C=A8=20create=20pandaswizard/timeser?= =?UTF-8?q?ies=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - the submodule will hold timeseries utility functions #24 --- pandaswizard/timeseries/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandaswizard/timeseries/__init__.py diff --git a/pandaswizard/timeseries/__init__.py b/pandaswizard/timeseries/__init__.py new file mode 100644 index 0000000..e69de29 From 72e26564519b60425a511ee4f6d4eed931afde25 Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 26 Aug 2024 11:04:31 +0530 Subject: [PATCH 22/25] =?UTF-8?q?=F0=9F=9B=A0=20git=20move=20gist/master?= =?UTF-8?q?=20to=20pdw/timeseries?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- stationarity.py => pandaswizard/timeseries/stationarity.py | 0 ts_featuring.py => pandaswizard/timeseries/ts_featuring.py | 0 ts_models.py => pandaswizard/timeseries/ts_models.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename stationarity.py => pandaswizard/timeseries/stationarity.py (100%) rename ts_featuring.py => pandaswizard/timeseries/ts_featuring.py (100%) rename ts_models.py => pandaswizard/timeseries/ts_models.py (100%) diff --git a/stationarity.py b/pandaswizard/timeseries/stationarity.py similarity index 100% rename from stationarity.py rename to pandaswizard/timeseries/stationarity.py diff --git a/ts_featuring.py 
b/pandaswizard/timeseries/ts_featuring.py similarity index 100% rename from ts_featuring.py rename to pandaswizard/timeseries/ts_featuring.py diff --git a/ts_models.py b/pandaswizard/timeseries/ts_models.py similarity index 100% rename from ts_models.py rename to pandaswizard/timeseries/ts_models.py From 6b038b7cb2ec231defbec2f363f0b4b14cbf3af2 Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 26 Aug 2024 12:02:10 +0530 Subject: [PATCH 23/25] =?UTF-8?q?=F0=9F=92=A3=20add=20caution=20block=20fo?= =?UTF-8?q?r=20code=20migration=20preparations=20#29?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandaswizard/timeseries/ts_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandaswizard/timeseries/ts_models.py b/pandaswizard/timeseries/ts_models.py index 6c191d0..ffbc90c 100644 --- a/pandaswizard/timeseries/ts_models.py +++ b/pandaswizard/timeseries/ts_models.py @@ -7,8 +7,9 @@ and `numpy` functionalities to provide quick analysis and develop a base line for a univariate time series data. -@author: Debmalya Pramanik -@version: v0.0.1 +.. caution:: + This will be a part of pandaswizard/functions module instead + `GH/#29 `_. 
""" import warnings From 8e4ebb197bd50fb94c62fa9774272e6d3c238ed9 Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 26 Aug 2024 12:04:53 +0530 Subject: [PATCH 24/25] =?UTF-8?q?=E2=9C=A8=20initialize=20timeseries=20und?= =?UTF-8?q?er=20pandaswizard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandaswizard/__init__.py | 1 + pandaswizard/timeseries/__init__.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/pandaswizard/__init__.py b/pandaswizard/__init__.py index 23a933c..75f5d67 100644 --- a/pandaswizard/__init__.py +++ b/pandaswizard/__init__.py @@ -28,3 +28,4 @@ from pandaswizard import window from pandaswizard import wrappers from pandaswizard import functions +from pandaswizard import timeseries diff --git a/pandaswizard/timeseries/__init__.py b/pandaswizard/timeseries/__init__.py index e69de29..5f7a7bd 100644 --- a/pandaswizard/timeseries/__init__.py +++ b/pandaswizard/timeseries/__init__.py @@ -0,0 +1,29 @@ +# -*- encoding: utf-8 -*- + +""" +A Set of Utility Functions for a Time Series Data for ``pandas`` + +A time series analysis is a way of analyzing a sequence of data which +was collected over a period of time and the data is collected at a +specific intervals. The :mod:`pandas` provides various functions to +manipulate a time object which is a wrapper over the ``datetime`` +module and is a type of ``pd.Timestamp`` object. + +The module provides some utility functions for a time series data +and tests like "stationarity" which is a must in EDA! + +.. caution:: + The time series module consists of functions which was earlier + developed in `GH/sqlparser `_. + +.. note:: + More details on why the module was merged is + available `GH/#24 `_. + +.. note:: + Code migration details are mentioned + `GH/#27 `_. 
+""" + +from pandaswizard.timeseries.stationarity import * +from pandaswizard.timeseries.ts_featuring import * From 61976e0cbf59f9d85052727d73335f6f58537c02 Mon Sep 17 00:00:00 2001 From: E33605 Date: Mon, 26 Aug 2024 12:10:13 +0530 Subject: [PATCH 25/25] =?UTF-8?q?=F0=9F=93=9D=20create=20section=20for=20t?= =?UTF-8?q?ime=20series=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - related issue code-archived/ds-gringotts#21 --- docs/index.md | 1 + docs/timeseries.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 docs/timeseries.md diff --git a/docs/index.md b/docs/index.md index 40789a1..d460cdd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -61,6 +61,7 @@ The above function calculates the 50th percentile, i.e., the median of the featu aggregate.md wrappers.md functions.md +timeseries.md ``` --- diff --git a/docs/timeseries.md b/docs/timeseries.md new file mode 100644 index 0000000..9eb9019 --- /dev/null +++ b/docs/timeseries.md @@ -0,0 +1,59 @@ +# Time Series Utilities + +
+ +Feature engineering and time series stationarity checks are a few of the use-cases that are compiled in these gists. Check +individual module definitions and functionalities as follows. + +## Stationarity & Unit Roots + +Stationarity is one of the fundamental concepts in time series analysis. The +**time series data model works on the principle that the [_data is stationary_](https://www.analyticsvidhya.com/blog/2021/04/how-to-check-stationarity-of-data-in-python/) +and [_data has no unit roots_](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/)**, this means: + * the data must have a constant mean (across all periods), + * the data should have a constant variance, and + * auto-covariance should not be dependent on time. + +Let's understand the concept using the following example, for more information check [this link](https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/). + +![Non-Stationary Time Series](https://cdn.analyticsvidhya.com/wp-content/uploads/2018/09/ns5-e1536673990684.png) + +
+ +| ADF Test | KPSS Test | Series Type | Additional Steps | +| :---: | :---: | :---: | --- | +| ✅ | ✅ | _stationary_ | | +| ❌ | ❌ | _non-stationary_ | | +| ✅ | ❌ | _difference-stationary_ | Use differencing to make series stationary. | +| ❌ | ✅ | _trend-stationary_ | Remove trend to make the series _strict stationary_. | + +
+ +```{eval-rst} +.. automodule:: pandaswizard.timeseries.stationarity + :members: + :undoc-members: + :show-inheritance: +``` + +## Time Series Featuring + +Time series analysis is a special segment of AI/ML application development where a feature is dependent on time. The code here +is designed to create a *sequence* of `x` and `y` data needed in a time series problem. The function is defined with two input +parameters (I) **Lookback Period (T) `n_lookback`**, and (II) **Forecast Period (H) `n_forecast`** which can be visually +presented below. + +
+ +![prediction-sequence](https://i.stack.imgur.com/YXwMJ.png) + +
+ +```{eval-rst} +.. automodule:: pandaswizard.timeseries.ts_featuring + :members: + :undoc-members: + :show-inheritance: +``` + +