added auto support; updated docs

georgia-tech-db · Sep 29, 2023 · 8181ba8 · 8181ba8
1 parent fda2b40
commit 8181ba8
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 21 deletions.
diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst
@@ -47,20 +47,22 @@ EvaDB's default forecast framework is `statsforecast <https://nixtla.github.io/s
 .. list-table:: Available Parameters
    :widths: 25 75
 
-   * - PREDICT (**required**) 
+   * - PREDICT (required) 
      - The name of the column we wish to forecast.
-   * - TIME
-     - The name of the column that contains the datestamp, wihch should be of a format expected by Pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. Please visit the `pandas documentation <https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html>`_ for details. If not provided, an auto increasing ID column will be used.
-   * - ID
-     - The name of column that represents an identifier for the series. If not provided, the whole table is considered as one series of data.
-   * - LIBRARY
+   * - TIME (default: 'ds')
+     - The name of the column that contains the datestamp, which should be of a format expected by Pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. Please visit the `pandas documentation <https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html>`_ for details. If the relevant column is not found, an auto-incrementing ID column will be used.
+   * - ID (default: 'unique_id')
+     - The name of the column that represents an identifier for the series. If the relevant column is not found, the whole table is considered as one series of data.
+   * - LIBRARY (default: 'statsforecast')
      - We can select one of `statsforecast` (default) or `neuralforecast`. `statsforecast` provides access to statistical forecasting methods, while `neuralforecast` gives access to deep-learning based forecasting methods.
-   * - MODEL
+   * - MODEL (default: 'AutoARIMA')
      - If LIBRARY is `statsforecast`, we can select one of AutoARIMA, AutoCES, AutoETS, AutoTheta. The default is AutoARIMA. Check `Automatic Forecasting <https://nixtla.github.io/statsforecast/src/core/models_intro.html#automatic-forecasting>`_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs <https://nixtla.github.io/neuralforecast/models.nbeats.html>`_ for details.
-   * - EXOGENOUS
+   * - AUTO (default: 'F')
      - The names of columns to be treated as exogenous variables, separated by comma. These columns would be considered for forecasting by the backend only for LIBRARY `neuralforecast`.
-   * - Frequency
-     - A string indicating the frequency of the data. The common used ones are D, W, M, Y, which repestively represents day-, week-, month- and year- end frequency. The default value is M. Check `pandas available frequencies <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for all available frequencies.
+   * - Frequency (default: 'auto')
+     - A string indicating the frequency of the data. The commonly used ones are D, W, M, Y, which respectively represent day-, week-, month- and year-end frequency. Check `pandas available frequencies <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_ for all available frequencies. If it is not provided, or is set to 'auto', EvaDB attempts to infer the frequency automatically from the data.
+
+Note: If columns other than the required ones mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast` and AUTO is set to F. Otherwise, they are ignored.
 
 Below is an example query specifying the above parameters:
 

diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py
@@ -246,7 +246,7 @@ def handle_forecasting_function(self):
             Set or infer data frequency
         """
 
-        if "frequency" not in arg_map.keys():
+        if "frequency" not in arg_map.keys() or arg_map["frequency"] == "auto":
             arg_map["frequency"] = pd.infer_freq(data["ds"])
         frequency = arg_map["frequency"]
         if frequency is None:
@@ -290,6 +290,12 @@ def handle_forecasting_function(self):
             if "model" not in arg_map.keys():
                 arg_map["model"] = "NBEATS"
 
+            if (
+                arg_map["model"].lower()[0] == "t"
+                and "auto" not in arg_map["model"].lower()
+            ):
+                arg_map["model"] = "Auto" + arg_map["model"]
+
             try:
                 model_here = model_dict[arg_map["model"]]
             except Exception:
@@ -298,16 +304,17 @@ def handle_forecasting_function(self):
                 raise FunctionIODefinitionError(err_msg)
             model_args = {}
 
-            if len(data.columns) >= 4:
-                exogenous_columns = [
-                    x for x in list(data.columns) if x not in ["ds", "y", "unique_id"]
-                ]
-                model_args["hist_exog_list"] = exogenous_columns
-
             if "auto" not in arg_map["model"].lower():
                 model_args["input_size"] = 2 * horizon
+                if len(data.columns) >= 4:
+                    exogenous_columns = [
+                        x
+                        for x in list(data.columns)
+                        if x not in ["ds", "y", "unique_id"]
+                    ]
+                    model_args["hist_exog_list"] = exogenous_columns
 
-            model_args["early_stop_patience_steps"] = 20
+                model_args["early_stop_patience_steps"] = 20
 
             model_args["h"] = horizon
 
@@ -333,6 +340,12 @@ def handle_forecasting_function(self):
             if "model" not in arg_map.keys():
                 arg_map["model"] = "AutoARIMA"
 
+            if (
+                arg_map["model"].lower()[0] == "t"
+                and "auto" not in arg_map["model"].lower()
+            ):
+                arg_map["model"] = "Auto" + arg_map["model"]
+
             try:
                 model_here = model_dict[arg_map["model"]]
             except Exception:
@@ -348,7 +361,11 @@ def handle_forecasting_function(self):
         data["ds"] = pd.to_datetime(data["ds"])
 
         model_save_dir_name = library + "_" + arg_map["model"] + "_" + new_freq
-        if len(data.columns) >= 4:
+        if (
+            len(data.columns) >= 4
+            and "auto" not in arg_map["model"].lower()
+            and library == "neuralforecast"
+        ):
             model_save_dir_name += "_exogenous_" + str(sorted(exogenous_columns))
 
         model_dir = os.path.join(
@@ -373,7 +390,7 @@ def handle_forecasting_function(self):
             if int(x.split("horizon")[1].split(".pkl")[0]) >= horizon
         ]
         if len(existing_model_files) == 0:
-            print("Training")
+            print("Training, please wait...")
             if library == "neuralforecast":
                 model.fit(df=data, val_size=horizon)
             else:

diff --git a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py
@@ -104,7 +104,6 @@ def test_forecast(self):
             HORIZON 12
             PREDICT 'y'
             LIBRARY 'neuralforecast'
-            EXOGENOUS 'trend'
             FREQUENCY 'M';
         """
         execute_query_fetch_all(self.evadb, create_predict_udf)
@@ -128,6 +127,7 @@ def test_forecast_with_column_rename(self):
                 WHERE bedrooms = 2
             )
             TYPE Forecasting
+            HORIZON 12
             PREDICT 'ma'
             ID 'type'
             TIME 'saledate'