From 89abd38970a6b3ffba345264ffb6fc785b16aad3 Mon Sep 17 00:00:00 2001 From: ckurze Date: Thu, 30 Nov 2023 16:10:48 +0100 Subject: [PATCH 1/5] ML/AutoML: Reworked connection string for easier usage --- ..._timeseries_forecasting_with_pycaret.ipynb | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb index ff6d6a47..8d71cf8a 100644 --- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb @@ -120,20 +120,48 @@ "sign up for a free account at https://console.cratedb.cloud and \n", "[deploy a cluster].\n", "\n", + "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "To populate your environment with corresponding database access credentials,\n", "create an `.env` file with the following content:\n", "\n", "```env\n", - "CRATE_HOST= # set this to localhost if you're running crate locally\n", - "CRATE_USER= # set this to crate if you're running crate locally\n", - "CRATE_PASSWORD= # set this to \"\" if you're running crate locally\n", - "CRATE_SSL=true # set this to false if you're running crate locally\n", + "# use this string for a connection to CrateDB Cloud\n", + "CONNECTION_STRING=crate://username:password@hostname/?ssl=true \n", + "\n", + "# use this string for a local connection to CrateDB\n", + "# CONNECTION_STRING=crate://crate@localhost/?ssl=false \n", "```\n", "\n", "You can find your CrateDB credentials in the [CrateDB Cloud Console].\n", "\n", - "[CrateDB Cloud Console]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#cluster\n", - "[deploy a cluster]: 
https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n" + "[CrateDB Cloud Console]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# For CrateDB Cloud, use:\n", + "CONNECTION_STRING = os.environ.get(\n", + " \"CRATEDB_CONNECTION_STRING\",\n", + " \"crate://username:password@hostname/?ssl=true\",\n", + ")\n", + "\n", + "# For an self-deployed CrateDB, e.g. via Docker, please use:\n", + "# CONNECTION_STRING = os.environ.get(\n", + "# \"CRATEDB_CONNECTION_STRING\",\n", + "# \"crate://crate@localhost/?ssl=false\",\n", + "# )" ] }, { @@ -203,8 +231,7 @@ "data[\"date\"] = pd.to_datetime(data[\"date\"])\n", "\n", "# Insert the data into CrateDB\n", - "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n", - "engine = sa.create_engine(dburi, echo=os.environ.get(\"DEBUG\"))\n", + "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get(\"DEBUG\"))\n", "\n", "with engine.connect() as conn:\n", " data.to_sql(\n", @@ -253,7 +280,7 @@ "data[\"month\"] = pd.to_datetime(data['month'], unit='ms')\n", "\n", "# We set the MLFLOW_TRACKING_URI to our CrateDB instance. 
We'll see later why\n", - "os.environ[\"MLFLOW_TRACKING_URI\"] = f\"{dburi}&schema=mlflow\"" + "os.environ[\"MLFLOW_TRACKING_URI\"] = f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -1277,7 +1304,6 @@ "output_type": "display_data" }, { - "name": "stdout", "metadata": { "nbreg": { "diff_ignore": [ @@ -1285,13 +1311,13 @@ ] } }, + "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 10 candidates, totalling 30 fits\n" ] }, { - "name": "stderr", "metadata": { "nbreg": { "diff_ignore": [ @@ -1299,6 +1325,7 @@ ] } }, + "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.\n", @@ -1433,7 +1460,6 @@ "output_type": "display_data" }, { - "name": "stdout", "metadata": { "nbreg": { "diff_ignore": [ @@ -1441,13 +1467,13 @@ ] } }, + "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 10 candidates, totalling 30 fits\n" ] }, { - "name": "stderr", "metadata": { "nbreg": { "diff_ignore": [ @@ -1455,6 +1481,7 @@ ] } }, + "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.\n", @@ -1583,8 +1610,6 @@ "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", "metadata": { "nbreg": { "diff_ignore": [ @@ -1592,12 +1617,13 @@ ] } }, + "name": "stdout", + "output_type": "stream", "text": [ "Fitting 3 folds for each of 10 candidates, totalling 30 fits\n" ] }, { - "name": "stderr", "metadata": { "nbreg": { "diff_ignore": [ @@ -1605,6 +1631,7 @@ ] } }, + "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.\n", @@ -2086,7 +2113,7 @@ "source": [ "os.environ[\n", " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\"" + "] = 
f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -2182,7 +2209,7 @@ ], "metadata": { "kernelspec": { - "display_name": "crate", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -2196,7 +2223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.11.4" } }, "nbformat": 4, From 637960d0b394160d442b7837d545a9d5e2d61e43 Mon Sep 17 00:00:00 2001 From: ckurze Date: Thu, 30 Nov 2023 16:13:42 +0100 Subject: [PATCH 2/5] ML/AutoML: Slight changes for requirements --- ..._timeseries_forecasting_with_pycaret.ipynb | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb index 8d71cf8a..79217839 100644 --- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb @@ -107,12 +107,25 @@ "source": [ "## Getting started\n", "\n", - "First, install the required dependencies. \n", - "\n", - "```bash\n", - "pip install -r requirements.txt\n", - "```\n", + "First, install the required dependencies. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt\n", "\n", + "# In an environment like Google Colab, please use the abolute path of requirements.txt:\n", + "#!pip install -r https://raw.githubusercontent.com/crate/cratedb-examples/main/topic/machine-learning/automl/requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "**Note:** As of time of this writing, PyCaret requires Python 3.8, 3.9 or 3.10.\n", "\n", "Second, you will need a CrateDB instance to store and serve the data. 
The \n", @@ -120,13 +133,6 @@ "sign up for a free account at https://console.cratedb.cloud and \n", "[deploy a cluster].\n", "\n", - "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "To populate your environment with corresponding database access credentials,\n", "create an `.env` file with the following content:\n", "\n", @@ -140,6 +146,7 @@ "\n", "You can find your CrateDB credentials in the [CrateDB Cloud Console].\n", "\n", + "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n", "[CrateDB Cloud Console]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#cluster" ] }, From f9658b3bb4a8f6563801533a663b0e1c7c9c0832 Mon Sep 17 00:00:00 2001 From: ckurze Date: Thu, 30 Nov 2023 17:06:42 +0100 Subject: [PATCH 3/5] ML/AutoML: Harmonize notebooks, and easier connection string handling --- .../automl_classification_with_pycaret.ipynb | 87 +++++++++++++------ ..._timeseries_forecasting_with_pycaret.ipynb | 10 ++- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb index bb763f0f..8a2ce80e 100644 --- a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb @@ -115,12 +115,27 @@ "source": [ "## Getting started\n", "\n", - "First, install the required dependencies. \n", - "\n", - "```bash\n", - "pip install -r requirements.txt\n", - "```\n", + "First, install the required dependencies. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt\n", "\n", + "# In an environment like Google Colab, please use the absolute URL to the requirements.txt file.\n", + "# Note: Some inconsistencies of dependencies might get reported. They can usually be ignored.\n", + "# Restart the runtime, if asked by Colab.\n", + "#!pip install -r https://raw.githubusercontent.com/crate/cratedb-examples/main/topic/machine-learning/automl/requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "**Note:** As of time of this writing, PyCaret requires Python 3.8, 3.9 or 3.10.\n", "\n", "Second, you will need a CrateDB instance to store and serve the data. The easiest\n", @@ -131,31 +146,53 @@ "create an `.env` file with the following content:\n", "\n", "```env\n", - "CRATE_HOST= # set this to localhost if you're running crate locally\n", - "CRATE_USER= # set this to crate if you're running crate locally\n", - "CRATE_PASSWORD= # set this to \"\" if you're running crate locally\n", - "CRATE_SSL=true # set this to false if you're running crate locally\n", + "# use this string for a connection to CrateDB Cloud\n", + "CONNECTION_STRING=crate://username:password@hostname/?ssl=true \n", + "\n", + "# use this string for a local connection to CrateDB\n", + "# CONNECTION_STRING=crate://crate@localhost/?ssl=false\n", "```\n", "\n", "You can find your CrateDB credentials in the [CrateDB Cloud Console].\n", "\n", "[CrateDB Cloud Console]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#cluster\n", - "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster\n", - "\n", - "### Creating demo data\n", + "[deploy a cluster]: https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/stripe.html#deploy-cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + 
"source": [ + "import os\n", "\n", - "For convenience, this notebook comes with an accompanying CSV dataset which you\n", - "can quickly import into the database. Upload the CSV file to your CrateDB cloud\n", - "cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n", - "To follow this notebook, choose `pycaret_churn` for your table name.\n", + "# For CrateDB Cloud, use:\n", + "CONNECTION_STRING = os.environ.get(\n", + " \"CRATEDB_CONNECTION_STRING\",\n", + " \"crate://username:password@hostname/?ssl=true\",\n", + ")\n", "\n", - "This will automatically create a new database table and import the data." + "# For an self-deployed CrateDB, e.g. via Docker, please use:\n", + "# CONNECTION_STRING = os.environ.get(\n", + "# \"CRATEDB_CONNECTION_STRING\",\n", + "# \"crate://crate@localhost/?ssl=false\",\n", + "# )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Creating demo data\n", + "\n", + "For convenience, this notebook comes with an accompanying CSV dataset which you\n", + "can quickly import into the database. 
Upload the CSV file to your CrateDB cloud\n", + "cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n", + "To follow this notebook, choose `pycaret_churn` for your table name.\n", + "\n", + "This will automatically create a new database table and import the data.\n", + "\n", "### Alternative data import using code\n", "\n", "If you prefer to use code to import your data, please execute the following lines which read the CSV\n", @@ -175,8 +212,7 @@ "if os.path.exists(\".env\"):\n", " dotenv.load_dotenv(\".env\", override=True)\n", "\n", - "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n", - "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n", + "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n", "df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n", "\n", "with engine.connect() as conn:\n", @@ -214,8 +250,7 @@ "if os.path.exists(\".env\"):\n", " dotenv.load_dotenv(\".env\", override=True)\n", "\n", - "dburi = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}\"\n", - "engine = sa.create_engine(dburi, echo=os.environ.get('DEBUG'))\n", + "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n", "\n", "with engine.connect() as conn:\n", " with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n", @@ -224,7 +259,7 @@ "# We set the MLFLOW_TRACKING_URI to our CrateDB instance. 
We'll see later why\n", "os.environ[\n", " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"{dburi}&schema=mlflow\"" + "] = f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -966,8 +1001,10 @@ "# - \"n_select\" defines how many models are selected.\n", "# - \"exclude\" defines which models are excluded from the comparison.\n", "\n", + "# Note: This is only relevant if we are executing automated tests\n", "if \"PYTEST_CURRENT_TEST\" in os.environ:\n", " best_models = compare_models(sort=\"AUC\", include=[\"lr\", \"knn\"], n_select=3)\n", + "# If we are not in an automated test, compare the available models\n", "else:\n", " # For production scenarios, it might be worth to include \"lightgbm\" again.\n", " best_models = compare_models(sort=\"AUC\", exclude=[\"lightgbm\"], n_select=3)" @@ -3406,7 +3443,7 @@ "source": [ "os.environ[\n", " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}&schema=mlflow\"" + "] = f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -3484,7 +3521,7 @@ ], "metadata": { "kernelspec": { - "display_name": "crate", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -3498,7 +3535,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb index 79217839..60d3b725 100644 --- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb @@ -112,13 +112,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#!pip install -r requirements.txt\n", "\n", - "# In an environment like 
Google Colab, please use the abolute path of requirements.txt:\n", + "# In an environment like Google Colab, please use the absolute URL to the requirements.txt file.\n", + "# Note: Some inconsistencies of dependencies might get reported. They can usually be ignored.\n", + "# Restart the runtime, if asked by Colab.\n", "#!pip install -r https://raw.githubusercontent.com/crate/cratedb-examples/main/topic/machine-learning/automl/requirements.txt" ] }, @@ -1080,10 +1082,12 @@ "# all available models are included by default)\n", "# - \"fold\" defines the number of folds to use for cross-validation.\n", "\n", + "# Note: This is only relevant if we are executing automated tests\n", "if \"PYTEST_CURRENT_TEST\" in os.environ:\n", " best_models = compare_models(sort=\"MASE\",\n", " include=[\"ets\", \"et_cds_dt\", \"naive\"],\n", " n_select=3)\n", + "# If we are not in an automated test, compare all available models\n", "else:\n", " best_models = compare_models(sort=\"MASE\", n_select=3)" ] @@ -1831,7 +1835,7 @@ "The missing step is to identify the best model from all the conducted experiments.\n", "This is done by simply looking the the Mean MASE outputs of all the model training\n", "and tuning steps above. Looking at all the outputs, the best performing model is\n", - "the blended model with a MASE of 0.7783." + "the blended model with a MASE of approx. 0.77." ] }, { From 86e77a7d39fcd97ad2580bdd9bbf71e8c1dfbe1d Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 2 Dec 2023 22:26:02 +0100 Subject: [PATCH 4/5] ML/AutoML: Increase notebook execution timeout from 240 to 300 seconds FAILED test.py::test_notebook[automl_timeseries_forecasting_with_pycaret.ipynb] - nbclient.exceptions.CellTimeoutError: A cell timed out while it was being executed, after 240 seconds. The message was: Cell execution timed out. 
Here is a preview of the cell contents: ------------------- s = setup(data, fh=15, target="total_sales", index="month", log_experiment=True) --- topic/machine-learning/automl/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topic/machine-learning/automl/pyproject.toml b/topic/machine-learning/automl/pyproject.toml index 44dda605..e50956bd 100644 --- a/topic/machine-learning/automl/pyproject.toml +++ b/topic/machine-learning/automl/pyproject.toml @@ -27,7 +27,7 @@ markers = [ nb_test_files = true nb_coverage = false # 120 seconds is too less on CI/GHA -nb_exec_timeout = 240 +nb_exec_timeout = 300 nb_diff_replace = [ # Compensate output of `crash`. '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"', From bb5e35a057db687a387dc025979b102629f0c006 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Mon, 4 Dec 2023 02:47:51 +0100 Subject: [PATCH 5/5] ML/AutoML: Fix testing after streamlining the connectivity configuration Concatenating the `schema` query parameter to the SQLAlchemy connection string correctly is crucial. In order to avoid anomalies or confusion, this patch makes it so that both types of connection strings (regular data vs. MLflow tracking) are configured side-by-side now, so it is easier to understand what is going on. 
--- topic/machine-learning/automl/README.md | 30 ++++++++++--- .../automl_classification_with_pycaret.ipynb | 37 ++++++++------- .../automl_classification_with_pycaret.py | 18 ++++++-- ..._timeseries_forecasting_with_pycaret.ipynb | 45 ++++++++++--------- ...oml_timeseries_forecasting_with_pycaret.py | 19 +++++--- topic/machine-learning/automl/backlog.md | 1 + topic/machine-learning/automl/pyproject.toml | 36 +++++---------- topic/machine-learning/automl/test.py | 6 ++- 8 files changed, 114 insertions(+), 78 deletions(-) diff --git a/topic/machine-learning/automl/README.md b/topic/machine-learning/automl/README.md index 53dd05cd..666c9d13 100644 --- a/topic/machine-learning/automl/README.md +++ b/topic/machine-learning/automl/README.md @@ -71,11 +71,31 @@ and [CrateDB]. performing model. The notebook also shows how to use CrateDB as storage for both the raw data and the expirement tracking and model registry data. -- Accompanied to the Jupyter Notebook files, there are also basic variants of - the above examples, - [automl_timeseries_forecasting_with_pycaret.py](automl_timeseries_forecasting_with_pycaret.py), - [automl_classification_with_pycaret.py](automl_classification_with_pycaret.py). +- Accompanying the Jupyter Notebook files, there are also basic standalone + program variants of the above examples. + - [automl_timeseries_forecasting_with_pycaret.py](automl_timeseries_forecasting_with_pycaret.py), + - [automl_classification_with_pycaret.py](automl_classification_with_pycaret.py). + + +## Software Tests + +The resources are validated by corresponding software tests on CI. You can +also use those on your workstation. For example, to invoke the test cases +validating the notebook about classification with PyCaret, run: + +```shell +pytest -k automl_classification_with_pycaret.ipynb +``` + +Alternatively, you can validate all resources in this folder by invoking a +test runner program on the top-level folder of this repository. 
This is the +same code path the CI jobs are taking. +```shell +pip install -r requirements.txt +ngr test topic/machine-learning/automl +``` + -[PyCaret]: https://github.com/pycaret/pycaret [CrateDB]: https://github.com/crate/crate [Introduction to hyperparameter tuning]: https://medium.com/analytics-vidhya/comparison-of-hyperparameter-tuning-algorithms-grid-search-random-search-bayesian-optimization-5326aaef1bd1 +[PyCaret]: https://github.com/pycaret/pycaret diff --git a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb index 8a2ce80e..62e42fa1 100644 --- a/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_classification_with_pycaret.ipynb @@ -167,17 +167,21 @@ "source": [ "import os\n", "\n", - "# For CrateDB Cloud, use:\n", + "# Define database connectivity when connecting to CrateDB Cloud.\n", "CONNECTION_STRING = os.environ.get(\n", " \"CRATEDB_CONNECTION_STRING\",\n", " \"crate://username:password@hostname/?ssl=true\",\n", ")\n", "\n", - "# For an self-deployed CrateDB, e.g. via Docker, please use:\n", + "# Define database connectivity when connecting to CrateDB on localhost.\n", "# CONNECTION_STRING = os.environ.get(\n", "# \"CRATEDB_CONNECTION_STRING\",\n", "# \"crate://crate@localhost/?ssl=false\",\n", - "# )" + "# )\n", + "\n", + "# Compute derived connection strings for SQLAlchemy use vs. MLflow use.\n", + "DBURI_DATA = f\"{CONNECTION_STRING}&schema=testdrive\"\n", + "DBURI_MLFLOW = f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -188,11 +192,13 @@ "\n", "For convenience, this notebook comes with an accompanying CSV dataset which you\n", "can quickly import into the database. 
Upload the CSV file to your CrateDB cloud\n", - "cluster, as described [here](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import).\n", + "cluster, as described at [CrateDB Cloud » Import].\n", "To follow this notebook, choose `pycaret_churn` for your table name.\n", "\n", "This will automatically create a new database table and import the data.\n", "\n", + "[CrateDB Cloud » Import]: https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import\n", + "\n", "### Alternative data import using code\n", "\n", "If you prefer to use code to import your data, please execute the following lines which read the CSV\n", @@ -212,12 +218,16 @@ "if os.path.exists(\".env\"):\n", " dotenv.load_dotenv(\".env\", override=True)\n", "\n", - "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n", + "# Connect to database.\n", + "engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get('DEBUG')))\n", + "\n", + "# Import data.\n", "df = pd.read_csv(\"https://github.com/crate/cratedb-datasets/raw/main/machine-learning/automl/churn-dataset.csv\")\n", + "df.to_sql(\"pycaret_churn\", engine, schema=\"testdrive\", index=False, chunksize=1000, if_exists=\"replace\")\n", "\n", + "# CrateDB is eventually consistent, so synchronize write operations.\n", "with engine.connect() as conn:\n", - " df.to_sql(\"pycaret_churn\", conn, index=False, chunksize=1000, if_exists=\"replace\")\n", - " conn.execute(sa.text(\"REFRESH TABLE pycaret_churn;\"))" + " conn.execute(sa.text(\"REFRESH TABLE pycaret_churn\"))" ] }, { @@ -250,16 +260,14 @@ "if os.path.exists(\".env\"):\n", " dotenv.load_dotenv(\".env\", override=True)\n", "\n", - "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get('DEBUG'))\n", + "engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get('DEBUG')))\n", "\n", "with engine.connect() as conn:\n", " with conn.execute(sa.text(\"SELECT * FROM pycaret_churn\")) as cursor:\n", " data = pd.DataFrame(cursor.fetchall(), 
columns=cursor.keys())\n", "\n", - "# We set the MLFLOW_TRACKING_URI to our CrateDB instance. We'll see later why\n", - "os.environ[\n", - " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"{CONNECTION_STRING}&schema=mlflow\"" + "# Configure MLflow to use CrateDB.\n", + "os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW" ] }, { @@ -3441,9 +3449,8 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ[\n", - " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"{CONNECTION_STRING}&schema=mlflow\"" + "# Configure MLflow to use CrateDB.\n", + "os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW" ] }, { diff --git a/topic/machine-learning/automl/automl_classification_with_pycaret.py b/topic/machine-learning/automl/automl_classification_with_pycaret.py index 1bcb2db4..4138e01c 100644 --- a/topic/machine-learning/automl/automl_classification_with_pycaret.py +++ b/topic/machine-learning/automl/automl_classification_with_pycaret.py @@ -17,16 +17,26 @@ dotenv.load_dotenv(".env", override=True) -# Configure database connection string. -dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}" -os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow" +# Configure to connect to CrateDB server on localhost. +CONNECTION_STRING = os.environ.get( + "CRATEDB_CONNECTION_STRING", + "crate://crate@localhost/?ssl=false", +) + +# Compute derived connection strings for SQLAlchemy use vs. MLflow use. +DBURI_DATA = f"{CONNECTION_STRING}&schema=testdrive" +DBURI_MLFLOW = f"{CONNECTION_STRING}&schema=mlflow" + +# Propagate database connectivity settings. +engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG"))) +os.environ["MLFLOW_TRACKING_URI"] = DBURI_MLFLOW def fetch_data(): """ Fetch data from CrateDB, using SQL and SQLAlchemy, and wrap result into pandas data frame. 
""" - engine = sa.create_engine(dburi, echo=True) + engine = sa.create_engine(DBURI_DATA, echo=True) with engine.connect() as conn: with conn.execute(sa.text("SELECT * FROM pycaret_churn")) as cursor: diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb index 60d3b725..e0f25ee6 100644 --- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb +++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.ipynb @@ -160,17 +160,21 @@ "source": [ "import os\n", "\n", - "# For CrateDB Cloud, use:\n", + "# Define database connectivity when connecting to CrateDB Cloud.\n", "CONNECTION_STRING = os.environ.get(\n", " \"CRATEDB_CONNECTION_STRING\",\n", " \"crate://username:password@hostname/?ssl=true\",\n", ")\n", "\n", - "# For an self-deployed CrateDB, e.g. via Docker, please use:\n", + "# Define database connectivity when connecting to CrateDB on localhost.\n", "# CONNECTION_STRING = os.environ.get(\n", "# \"CRATEDB_CONNECTION_STRING\",\n", "# \"crate://crate@localhost/?ssl=false\",\n", - "# )" + "# )\n", + "\n", + "# Compute derived connection strings for SQLAlchemy use vs. 
MLflow use.\n", + "DBURI_DATA = f\"{CONNECTION_STRING}&schema=testdrive\"\n", + "DBURI_MLFLOW = f\"{CONNECTION_STRING}&schema=mlflow\"" ] }, { @@ -239,21 +243,21 @@ "data[\"total_sales\"] = data[\"unit_price\"] * data[\"quantity\"]\n", "data[\"date\"] = pd.to_datetime(data[\"date\"])\n", "\n", - "# Insert the data into CrateDB\n", - "engine = sa.create_engine(CONNECTION_STRING, echo=os.environ.get(\"DEBUG\"))\n", + "# Connect to database.\n", + "engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get(\"DEBUG\")))\n", "\n", - "with engine.connect() as conn:\n", - " data.to_sql(\n", - " \"sales_data_for_forecast\",\n", - " conn,\n", - " index=False,\n", - " chunksize=1000,\n", - " if_exists=\"replace\",\n", - " )\n", + "# Import data.\n", + "data.to_sql(\n", + " \"sales_data_for_forecast\",\n", + " engine,\n", + " index=False,\n", + " chunksize=1000,\n", + " if_exists=\"replace\",\n", + ")\n", "\n", - " # Refresh table to make sure the data is available for querying - as CrateDB\n", - " # is eventually consistent\n", - " conn.execute(sa.text(\"REFRESH TABLE sales_data_for_forecast;\"))" + "# CrateDB is eventually consistent, so synchronize write operations.\n", + "with engine.connect() as conn:\n", + " conn.execute(sa.text(\"REFRESH TABLE sales_data_for_forecast\"))" ] }, { @@ -288,8 +292,8 @@ "\n", "data[\"month\"] = pd.to_datetime(data['month'], unit='ms')\n", "\n", - "# We set the MLFLOW_TRACKING_URI to our CrateDB instance. 
We'll see later why\n", - "os.environ[\"MLFLOW_TRACKING_URI\"] = f\"{CONNECTION_STRING}&schema=mlflow\"" + "# Configure MLflow to use CrateDB.\n", + "os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW" ] }, { @@ -2122,9 +2126,8 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ[\n", - " \"MLFLOW_TRACKING_URI\"\n", - "] = f\"{CONNECTION_STRING}&schema=mlflow\"" + "# Configure MLflow to use CrateDB.\n", + "os.environ[\"MLFLOW_TRACKING_URI\"] = DBURI_MLFLOW" ] }, { diff --git a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.py b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.py index 115169c0..d59aa457 100644 --- a/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.py +++ b/topic/machine-learning/automl/automl_timeseries_forecasting_with_pycaret.py @@ -17,10 +17,19 @@ if os.path.isfile(".env"): load_dotenv(".env", override=True) -# Configure database connection string. -dburi = f"crate://{os.environ['CRATE_USER']}:{os.environ['CRATE_PASSWORD']}@{os.environ['CRATE_HOST']}:4200?ssl={os.environ['CRATE_SSL']}" -engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG")) -os.environ["MLFLOW_TRACKING_URI"] = f"{dburi}&schema=mlflow" +# Configure to connect to CrateDB server on localhost. +CONNECTION_STRING = os.environ.get( + "CRATEDB_CONNECTION_STRING", + "crate://crate@localhost/?ssl=false", +) + +# Compute derived connection strings for SQLAlchemy use vs. MLflow use. +DBURI_DATA = f"{CONNECTION_STRING}&schema=testdrive" +DBURI_MLFLOW = f"{CONNECTION_STRING}&schema=mlflow" + +# Propagate database connectivity settings. 
+engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG"))) +os.environ["MLFLOW_TRACKING_URI"] = DBURI_MLFLOW def prepare_data(): @@ -37,7 +46,7 @@ def prepare_data(): data["date"] = pd.to_datetime(data["date"]) # Insert the data into CrateDB - engine = sa.create_engine(dburi, echo=os.environ.get("DEBUG")) + engine = sa.create_engine(DBURI_DATA, echo=bool(os.environ.get("DEBUG"))) with engine.connect() as conn: data.to_sql( diff --git a/topic/machine-learning/automl/backlog.md b/topic/machine-learning/automl/backlog.md index 4c86c490..e5ff690a 100644 --- a/topic/machine-learning/automl/backlog.md +++ b/topic/machine-learning/automl/backlog.md @@ -1,3 +1,4 @@ # Backlog - Describe / program how to import `churn-dataset.csv`. +- Format and lint notebooks using `black` and `ruff`. diff --git a/topic/machine-learning/automl/pyproject.toml b/topic/machine-learning/automl/pyproject.toml index e50956bd..97913613 100644 --- a/topic/machine-learning/automl/pyproject.toml +++ b/topic/machine-learning/automl/pyproject.toml @@ -1,15 +1,11 @@ [tool.pytest.ini_options] minversion = "2.0" addopts = """ - -rfEX -p pytester --strict-markers --verbosity=3 --capture=no + -rfEX -p pytester --strict-markers --verbosity=3 """ # --cov=. --cov-report=term-missing --cov-report=xml env = [ - "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?schema=testdrive", - "CRATE_USER=crate", - "CRATE_PASSWORD=", - "CRATE_HOST=localhost", - "CRATE_SSL=false", + "CRATEDB_CONNECTION_STRING=crate://crate@localhost/?ssl=false", "PYDEVD_DISABLE_FILE_VALIDATION=1", ] @@ -26,8 +22,8 @@ markers = [ # pytest-notebook settings nb_test_files = true nb_coverage = false -# 120 seconds is too less on CI/GHA -nb_exec_timeout = 300 +# Default cell timeout is 120 seconds. For heavy computing, it needs to be increased. +nb_exec_timeout = 240 nb_diff_replace = [ # Compensate output of `crash`. 
'"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"', @@ -47,24 +43,12 @@ nb_diff_ignore = [ "/cells/*/outputs/*/metadata/nbreg", # Ignore images. "/cells/*/outputs/*/data/image/png", - # FIXME: Those pacifiers should be revisited. - # Some are warnings, some are semantic ambiguities. - # Maybe they can be improved in one way or another, - # for improved QA. - "/cells/5/outputs", - "/cells/14/outputs", - "/cells/16/outputs", - "/cells/16/outputs", - "/cells/18/outputs", - "/cells/22/outputs", - "/cells/24/outputs", - "/cells/30/outputs/0/data/application/vnd.jupyter.widget-view+json", - "/cells/34/outputs", - "/cells/36/outputs", - "/cells/40/outputs", - # automl_timeseries_forecasting_with_pycaret.ipynb - "/cells/19/outputs", - "/cells/33/outputs", + # Ignore all cell output. It is too tedious to compare and maintain. + # The validation hereby extends exclusively to the _execution_ of notebook cells, + # able to catch syntax errors, module import flaws, and runtime errors. + # However, the validation will not catch any regressions on actual cell output, + # or whether any output is produced at all. + "/cells/*/outputs", ] [tool.coverage.run] diff --git a/topic/machine-learning/automl/test.py b/topic/machine-learning/automl/test.py index c9df264e..86879c9f 100644 --- a/topic/machine-learning/automl/test.py +++ b/topic/machine-learning/automl/test.py @@ -1,7 +1,7 @@ """ ## About -Test cases for classification model examples with CrateDB, PyCaret and MLflow. +Test cases for classification and forecasting examples with CrateDB, PyCaret, and MLflow. ## Synopsis @@ -17,6 +17,7 @@ pytest -k notebook ``` """ +import os from pathlib import Path import pytest @@ -32,7 +33,8 @@ def cratedb() -> DatabaseAdapter: """ Provide test cases with a connection to CrateDB, with additional tooling. 
""" - return DatabaseAdapter(dburi="crate://crate@localhost:4200") + dburi = os.environ.get("CRATEDB_CONNECTION_STRING") + return DatabaseAdapter(dburi=f"{dburi}&schema=testdrive") @pytest.fixture(scope="function", autouse=True)