From d64a84e71bed06db6d1c29f77f0fedb98ee18d76 Mon Sep 17 00:00:00 2001
From: ckurze <christian.kurze@crate.io>
Date: Thu, 30 Nov 2023 17:06:42 +0100
Subject: [PATCH] ML/AutoML: Harmonize notebooks, and easier connection string
 handling

---
 .../automl_classification_with_pycaret.ipynb  | 87 +++++++++++++------
 ..._timeseries_forecasting_with_pycaret.ipynb | 10 ++-
 2 files changed, 69 insertions(+), 28 deletions(-)
diff --git a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb
index bb763f0f..9f0e3711 100644
--- a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb
+++ b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb
@@ -115,12 +115,27 @@
    "source": [
     "## Getting started\n",
     "\n",
-    "First, install the required dependencies. \n",
-    "\n",
-    "```bash\n",
-    "pip install -r requirements.txt\n",
-    "```\n",
+    "First, install the required dependencies. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install -r requirements.txt\n",
     "\n",
+    "# In an environment like Google Colab, please use the absolute path of requirements.txt\n",
+    "# Note: Some inconsistencies of dependencies might get reported. They can usually be ignored.\n",
+    "# Restart the runtime, if asked by Colab.\n",
+    "#!pip install -r https://raw.githubusercontent.com/crate/cratedb-examples/main/topic/machine-learning/automl/requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "**Note:** As of time of this writing, PyCaret requires Python 3.8, 3.9 or 3.10.\n",
     "\n",
     "Second, you will need a CrateDB instance to store and serve the data. The easiest\n",
@@ -131,31 +146,53 @@
     "create an `.env` file with the following content:\n",
     "\n",
     "```env\n",
-    "CRATE_HOST=<your-crate-host> # set this to localhost if you're running crate locally\n",
-    "CRATE_USER=<your-crate-user> # set this to crate if you're running crate locally\n",
-    "CRATE_PASSWORD=<your-crate-password> # set this to \"\" if you're running crate locally\n",
-    "CRATE_SSL=true # set this to false if you're running crate locally\n",
+    "# use this string for a connection to CrateDB Cloud\n",
+    "CRATEDB_CONNECTION_STRING=crate://username:password@hostname/?ssl=true\n",
+    "\n",
+    "# use this string for a local connection to CrateDB\n",
+    "# CRATEDB_CONNECTION_STRING=crate://crate@localhost/?ssl=false\n",
     "```\n",
     "\n",
     "You can find your CrateDB credentials in the [CrateDB Cloud Console].\n",
     "\n",
     "[CrateDB Cloud Console]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#cluster\n",
-    "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n",
-    "\n",
-    "### Creating demo data\n",
+    "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
     "\n",
-    "For convenience, this notebook comes with an accompanying CSV dataset which you\n",
-    "can quickly import into the database. Upload the CSV file to your CrateDB cloud\n",
-    "cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n",
-    "To follow this notebook, choose `pycaret_churn` for your table name.\n",
+    "# For CrateDB Cloud, use:\n",
+    "CONNECTION_STRING = os.environ.get(\n",
+    "    \"CRATEDB_CONNECTION_STRING\",\n",
+    "    \"crate://username:password@hostname/?ssl=true\",\n",
+    ")\n",
     "\n",
-    "This will automatically create a new database table and import the data."
+    "# For a self-deployed CrateDB, e.g. via Docker, please use:\n",
+    "# CONNECTION_STRING = os.environ.get(\n",
+    "#     \"CRATEDB_CONNECTION_STRING\",\n",
+    "#     \"crate://crate@localhost/?ssl=false\",\n",
+    "# )"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "### Creating demo data\n",
+    "\n",
+    "For convenience, this notebook comes with an accompanying CSV dataset which you\n",
+    "can quickly import into the database. Upload the CSV file to your CrateDB cloud\n",
+    "cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n",
+    "To follow this notebook, choose `pycaret_churn` for your table name.\n",
+    "\n",
+    "This will automatically create a new database table and import the data.\n",
+    "\n",
     "### Alternative data import using code\n",
     "\n",
     "If you prefer to use code to import your data, please execute the following lines which read the CSV\n",
@@ -175,8 +212,7 @@
     "if os.path.exists(\".env\"):\n",
     "    dotenv.load_dotenv(\".env\", override=True)\n",
     "\n",
-    "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
-    "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
+    "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n",
     "df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n",
     "\n",
     "with engine.connect() as conn:\n",
@@ -214,8 +250,7 @@
     "if os.path.exists(\".env\"):\n",
     "    dotenv.load_dotenv(\".env\", override=True)\n",
     "\n",
-    "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n",
-    "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n",
+    "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n",
     "\n",
     "with engine.connect() as conn:\n",
     "    with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n",
@@ -224,7 +259,7 @@
     "# We set the MLFLOW_TRACKING_URI to our CrateDB instance. We'll see later why\n",
     "os.environ[\n",
     "    \"MLFLOW_TRACKING_URI\"\n",
-    "] = f\"{dburi}&schema=mlflow\""
+    "] = f\"{CONNECTION_STRING}&schema=mlflow\""
    ]
   },
   {
@@ -966,8 +1001,10 @@
     "# - \"n_select\" defines how many models are selected.\n",
     "# - \"exclude\" defines which models are excluded from the comparison.\n",
     "\n",
+    "# Note: This is only relevant if we are executing automated tests\n",
     "if \"PYTEST_CURRENT_TEST\" in os.environ:\n",
     "    best_models = compare_models(sort=\"AUC\", include=[\"lr\", \"knn\"], n_select=3)\n",
+    "# If we are not in an automated test, compare the available models\n",
     "else:\n",
     "    # For production scenarios, it might be worth to include \"lightgbm\" again.\n",
     "    best_models = compare_models(sort=\"AUC\", exclude=[\"lightgbm\"], n_select=3)"
@@ -3406,7 +3443,7 @@
    "source": [
     "os.environ[\n",
     "    \"MLFLOW_TRACKING_URI\"\n",
-    "] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\""
+    "] = f\"{CONNECTION_STRING}&schema=mlflow\""
    ]
   },
   {
@@ -3484,7 +3521,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "crate",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -3498,7 +3535,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.0"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb
index 79217839..7a2f7b0e 100644
--- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb
+++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb
@@ -112,13 +112,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "#!pip install -r requirements.txt\n",
     "\n",
-    "# In an environment like Google Colab, please use the abolute path of requirements.txt:\n",
+    "# In an environment like Google Colab, please use the absolute path of requirements.txt\n",
+    "# Note: Some inconsistencies of dependencies might get reported. They can usually be ignored.\n",
+    "# Restart the runtime, if asked by Colab.\n",
     "#!pip install -r https://raw.githubusercontent.com/crate/cratedb-examples/main/topic/machine-learning/automl/requirements.txt"
    ]
   },
@@ -1080,10 +1082,12 @@
     "#    all available models are included by default)\n",
     "# - \"fold\" defines the number of folds to use for cross-validation.\n",
     "\n",
+    "# Note: This is only relevant if we are executing automated tests\n",
     "if \"PYTEST_CURRENT_TEST\" in os.environ:\n",
     "    best_models = compare_models(sort=\"MASE\",\n",
     "                                 include=[\"ets\", \"et_cds_dt\", \"naive\"],\n",
     "                                 n_select=3)\n",
+    "# If we are not in an automated test, compare all available models\n",
     "else:\n",
     "    best_models = compare_models(sort=\"MASE\", n_select=3)"
    ]
@@ -1831,7 +1835,7 @@
     "The missing step is to identify the best model from all the conducted experiments.\n",
     "This is done by simply looking the the Mean MASE outputs of all the model training\n",
     "and tuning steps above. Looking at all the outputs, the best performing model is\n",
-    "the blended model with a MASE of 0.7783."
+    "the blended model with a MASE of approx. 0.77."
    ]
   },
   {