diff --git a/README.md b/README.md index 01793905..be42aee0 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ In order to understand the tutorials you need to be familiar with general concep - [Credit Scores](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/credit_scores): Predict clients' repayment abilities. - [Electricity](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/electricity): Predict the electricity prices in several Swedish cities based on weather conditions, previous prices, and Swedish holidays. - [NYC Taxi Fares](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/nyc_taxi_fares): Predict the fare amount for a taxi ride in New York City given the pickup and dropoff locations. + - [Hospital Wait Time](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/hospital_wait_time): Predict the waiting time for a deceased donor kidney using a Prophet model. - [Recommender System](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/recommender-system): Build a recommender system for fashion items. - [TimeSeries](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/timeseries): Timeseries price prediction. - [LLM PDF](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/llm_pdfs): An AI assistant that utilizes a Retrieval-Augmented Generation (RAG) system to provide accurate answers to user questions by retrieving relevant context from PDF documents. 
diff --git a/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb new file mode 100644 index 00000000..ebf89dff --- /dev/null +++ b/advanced_tutorials/hospital_wait_time/1_feature_pipeline.ipynb @@ -0,0 +1,597 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5615c2ae", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd527fe", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Mute warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "97f5d51d", + "metadata": {}, + "source": [ + "## 💽 Data Loading\n", + "\n", + "In this case, you are predicting the waiting time for a deceased donor kidney transplant, which involves estimating how long a patient might need to wait from the time they are registered on the transplant list until a suitable donor kidney becomes available for transplantation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "819100e5", + "metadata": {}, + "outputs": [], + "source": [ + "patient_demographics_data = pd.read_csv(\n", + " 'https://repo.hops.works/dev/davit/hospital_wait_time/patient_demographics.csv', \n", + " parse_dates=['date'],\n", + ")\n", + "patient_demographics_data.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d827df9", + "metadata": {}, + "outputs": [], + "source": [ + "medical_background_data = pd.read_csv(\n", + " 'https://repo.hops.works/dev/davit/hospital_wait_time/medical_background.csv', \n", + " parse_dates=['date'],\n", + ")\n", + "medical_background_data.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f7973d2", + "metadata": {}, + "outputs": [], + "source": [ + "transplant_compatibility_data = pd.read_csv(\n", + " 'https://repo.hops.works/dev/davit/hospital_wait_time/transplant_compatibility.csv', \n", + " parse_dates=['date'],\n", + ")\n", + "transplant_compatibility_data.columns = transplant_compatibility_data.columns.str.lower()\n", + "transplant_compatibility_data.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "c7a4c7c6", + "metadata": {}, + "source": [ + "## 👨🏻‍🍳 Data Preparation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23284e32", + "metadata": {}, + "outputs": [], + "source": [ + "patient_demographics_data.isna().sum()[patient_demographics_data.isna().sum() > 0] / len(patient_demographics_data)*100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5532bdbf", + "metadata": {}, + "outputs": [], + "source": [ + "medical_background_data.isna().sum()[medical_background_data.isna().sum() > 0] / len(medical_background_data)*100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31876981", + "metadata": {}, + "outputs": [], + "source": [ + "transplant_compatibility_data.isna().sum()[transplant_compatibility_data.isna().sum() > 0] 
/ len(transplant_compatibility_data)*100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "058cf4f1", + "metadata": {}, + "outputs": [], + "source": [ + "medical_background_data['dialysis_duration'] = medical_background_data['dialysis_duration'].fillna(1).replace(0, 1)\n", + "medical_background_data['dialysis_duration'] = np.log(medical_background_data['dialysis_duration'] + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a1faef", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_outliers_iqr(dataframe, iqr_multiplier=1.5):\n", + " # Select numerical columns for outlier removal\n", + " numerical_columns = dataframe.select_dtypes(\n", + " include=['int64', 'float64']).columns\n", + "\n", + " # Loop through numerical columns to identify and remove outliers using IQR\n", + " for column in numerical_columns:\n", + " Q1 = dataframe[column].quantile(0.25)\n", + " Q3 = dataframe[column].quantile(0.75)\n", + " IQR = Q3 - Q1\n", + " lower_bound = Q1 - iqr_multiplier * IQR\n", + " upper_bound = Q3 + iqr_multiplier * IQR\n", + "\n", + " outliers = dataframe[(dataframe[column] < lower_bound) | (\n", + " dataframe[column] > upper_bound)]\n", + "\n", + " # Remove outliers\n", + " dataframe = dataframe[~dataframe.index.isin(outliers.index)]\n", + "\n", + " return dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cacfd97", + "metadata": {}, + "outputs": [], + "source": [ + "patient_demographics_data_filtered = remove_outliers_iqr(patient_demographics_data, iqr_multiplier=1.5)\n", + "print(f'⛳️ Original shape: {patient_demographics_data.shape}')\n", + "print(f'⛳️ Cleared shape: {patient_demographics_data_filtered.shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de3c3316", + "metadata": {}, + "outputs": [], + "source": [ + "medical_background_data_filtered = remove_outliers_iqr(medical_background_data, iqr_multiplier=1.5)\n", + "print(f'⛳️ Original 
shape: {medical_background_data.shape}')\n", + "print(f'⛳️ Cleared shape: {medical_background_data_filtered.shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45b9c45b", + "metadata": {}, + "outputs": [], + "source": [ + "transplant_compatibility_data_filtered = remove_outliers_iqr(transplant_compatibility_data, iqr_multiplier=1.5)\n", + "print(f'⛳️ Original shape: {transplant_compatibility_data.shape}')\n", + "print(f'⛳️ Cleared shape: {transplant_compatibility_data_filtered.shape}')" + ] + }, + { + "cell_type": "markdown", + "id": "3f7ac6f7", + "metadata": {}, + "source": [ + "## 👮🏻‍♂️ Great Expectations " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f99a5cff", + "metadata": {}, + "outputs": [], + "source": [ + "import great_expectations as ge\n", + "from great_expectations.core import ExpectationSuite, ExpectationConfiguration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee28579a", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert your DataFrame to a Great Expectations DataFrame\n", + "ge_df_patient_demographics = ge.from_pandas(patient_demographics_data_filtered)\n", + "\n", + "# Retrieve the expectation suite associated with the ge DataFrame\n", + "expectation_suite_patient_demographics = ge_df_patient_demographics.get_expectation_suite()\n", + "\n", + "# Set the expectation suite name\n", + "expectation_suite_patient_demographics.expectation_suite_name = \"patient_registration_suite\"\n", + "\n", + "# Expectation: 'id' should always be unique and not null\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_unique\",\n", + " kwargs={\"column\": \"id\"},\n", + " )\n", + ")\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_not_be_null\",\n", + " kwargs={\"column\": 
\"id\"},\n", + " )\n", + ")\n", + "\n", + "# Expectation: 'date' should be a valid date and not null\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_of_type\",\n", + " kwargs={\n", + " \"column\": \"date\",\n", + " \"type_\": \"datetime64[ns]\",\n", + " }\n", + " )\n", + ")\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_not_be_null\",\n", + " kwargs={\"column\": \"date\"},\n", + " )\n", + ")\n", + "\n", + "# Expectation: 'age_at_list_registration' to be non-negative\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"age_at_list_registration\",\n", + " \"min_value\": 0,\n", + " \"max_value\": None,\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Expectation: 'gender' to be within expected values\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": \"gender\",\n", + " \"value_set\": [\"M\", \"F\"],\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Expectation: 'age_cat' to contain expected categories\n", + "expectation_suite_patient_demographics.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": \"age_cat\",\n", + " \"value_set\": [\"Over60\", \"From18to60\", \"Below18\"],\n", + " }\n", + " )\n", + ")\n", + "\n", + "print(\"✅ Expectations defined and saved successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "640bbf2a", + "metadata": {}, + "outputs": [], + "source": [ + "ge_df_medical_background = ge.from_pandas(medical_background_data_filtered)\n", 
+ "\n", + "# Retrieve and set the expectation suite\n", + "expectation_suite_medical_background = ge_df_medical_background.get_expectation_suite()\n", + "expectation_suite_medical_background.expectation_suite_name = \"medical_background_suite\"\n", + "\n", + "# Expectations for 'id' and 'date'\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_unique\",\n", + " kwargs={\"column\": \"id\"},\n", + " )\n", + ")\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_not_be_null\",\n", + " kwargs={\"column\": \"id\"},\n", + " )\n", + ")\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_of_type\",\n", + " kwargs={\n", + " \"column\": \"date\",\n", + " \"type_\": \"datetime64[ns]\",\n", + " }\n", + " )\n", + ")\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_not_be_null\",\n", + " kwargs={\"column\": \"date\"},\n", + " )\n", + ")\n", + "\n", + "# Expectation for 'dialysis_duration'\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"dialysis_duration\",\n", + " \"min_value\": 0,\n", + " \"max_value\": None,\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Expectation for 'blood_gp'\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": \"blood_gp\",\n", + " \"value_set\": [\"A\", \"B\", \"AB\", \"O\"],\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Gestation and Prior Transplant Expectations\n", + "for column in 
[\"gestation\", \"prior_transplant\"]:\n", + " expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": column,\n", + " \"value_set\": [\"YES\", \"NO\"],\n", + " }\n", + " )\n", + " )\n", + "\n", + "# Expectation for 'number_prior_transplant' - check alignment with 'prior_transplant'\n", + "expectation_suite_medical_background.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"number_prior_transplant\",\n", + " \"min_value\": 0,\n", + " \"max_value\": None,\n", + " }\n", + " )\n", + ")\n", + "\n", + "print(\"✅ Expectations defined and saved successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ca9a557", + "metadata": {}, + "outputs": [], + "source": [ + "ge_df_transplant_compatibility = ge.from_pandas(transplant_compatibility_data_filtered)\n", + "\n", + "# Retrieve and set the expectation suite\n", + "expectation_suite_transplant_compatibility = ge_df_transplant_compatibility.get_expectation_suite()\n", + "expectation_suite_transplant_compatibility.expectation_suite_name = \"transplant_compatibility_and_outcome_suite\"\n", + "\n", + "# Expectations for 'id' and 'date'\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_unique\",\n", + " kwargs={\"column\": \"id\"},\n", + " )\n", + ")\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_not_be_null\",\n", + " kwargs={\"column\": \"id\"},\n", + " )\n", + ")\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_of_type\",\n", + " kwargs={\n", + " 
\"column\": \"date\",\n", + " \"type_\": \"datetime64[ns]\",\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Expectation for 'cPRA' to be between 0 and 100\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"cpra\",\n", + " \"min_value\": 0,\n", + " \"max_value\": 100,\n", + " }\n", + " )\n", + ")\n", + "\n", + "# HLA Marker Expectations (checking they are non-negative integers)\n", + "for hla_marker in [\"hla_a1\", \"hla_a2\", \"hla_b1\", \"hla_b2\", \"hla_dr1\", \"hla_dr2\"]:\n", + " expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_of_type\",\n", + " kwargs={\n", + " \"column\": hla_marker,\n", + " \"type_\": \"int\",\n", + " }\n", + " )\n", + " )\n", + "\n", + "# Expectation for 'if_transplanted'\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_in_set\",\n", + " kwargs={\n", + " \"column\": \"if_transplanted\",\n", + " \"value_set\": [\"YES\", \"NO\"],\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Expectation for 'duration' to be non-negative\n", + "expectation_suite_transplant_compatibility.add_expectation(\n", + " ExpectationConfiguration(\n", + " expectation_type=\"expect_column_values_to_be_between\",\n", + " kwargs={\n", + " \"column\": \"duration\",\n", + " \"min_value\": 0,\n", + " \"max_value\": None,\n", + " }\n", + " )\n", + ")\n", + "\n", + "print(\"✅ Expectations defined and saved successfully.\")" + ] + }, + { + "cell_type": "markdown", + "id": "ed8fa713", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82777378", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + 
"project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "ff82304a", + "metadata": {}, + "source": [ + "## 🪄 Creating Feature Groups \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d21bff59", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'patient_info' feature group\n", + "patient_info_fg = fs.get_or_create_feature_group(\n", + " name=\"patient_info\",\n", + " version=1,\n", + " description=\"Demographic Features\",\n", + " primary_key=[\"id\"],\n", + " event_time=\"date\",\n", + " expectation_suite=expectation_suite_patient_demographics,\n", + ")\n", + "\n", + "patient_info_fg.insert(patient_demographics_data_filtered)\n", + "print('✅ Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a524f361", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'medical_info' feature group\n", + "medical_info_fg = fs.get_or_create_feature_group(\n", + " name=\"medical_info\",\n", + " version=1,\n", + " description=\"Medical background features\",\n", + " primary_key=[\"id\"],\n", + " event_time=\"date\",\n", + " expectation_suite=expectation_suite_medical_background,\n", + ")\n", + "\n", + "medical_info_fg.insert(medical_background_data_filtered)\n", + "print('✅ Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f476f24b", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'transplant_compatibility' feature group\n", + "transplant_compatibility_fg = fs.get_or_create_feature_group(\n", + " name=\"transplant_compatibility\",\n", + " version=1,\n", + " description=\"Transplant compatibility features\",\n", + " primary_key=[\"id\"],\n", + " event_time=\"date\",\n", + " expectation_suite=expectation_suite_transplant_compatibility,\n", + ")\n", + "\n", + "transplant_compatibility_fg.insert(transplant_compatibility_data_filtered)\n", + "print('✅ Done')" + ] + }, + { + 
"cell_type": "markdown", + "id": "3e15c005", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.1.-1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb new file mode 100644 index 00000000..e8583a6a --- /dev/null +++ b/advanced_tutorials/hospital_wait_time/2_training_pipeline.ipynb @@ -0,0 +1,537 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b6699fe1", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a39a3d61", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import os\n", + "import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "from matplotlib import pyplot\n", + "\n", + "from sklearn.metrics import mean_absolute_error\n", + "from prophet import Prophet\n", + "from prophet.serialize import model_to_json\n", + "\n", + "# Mute warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "6f80e358", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f88a3faa", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "d65e85bf", + "metadata": {}, + "source": [ + "### 🔪 Feature Selection \n", + "\n", + "You will start by selecting all 
the features you want to include for model training/inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a11ec70f", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'patient_info' feature group\n", + "patient_info_fg = fs.get_feature_group(\n", + " name=\"patient_info\",\n", + " version=1,\n", + ")\n", + "\n", + "# Retrieve the 'medical_info' feature group\n", + "medical_info_fg = fs.get_feature_group(\n", + " name=\"medical_info\",\n", + " version=1,\n", + ")\n", + "\n", + "# Retrieve the 'transplant_compatibility' feature group\n", + "transplant_compatibility_fg = fs.get_feature_group(\n", + " name=\"transplant_compatibility\",\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17ba77ed", + "metadata": {}, + "outputs": [], + "source": [ + "# Select features for training data.\n", + "selected_features = patient_info_fg.select_all([\"id\", \"date\"])\\\n", + " .join(medical_info_fg.select_except([\"id\", \"date\"]))\\\n", + " .join(transplant_compatibility_fg.select_except([\"id\", \"date\"])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "495cbb5f", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment this if you would like to view your selected features\n", + "selected_features.show(5)" + ] + }, + { + "cell_type": "markdown", + "id": "d5902c9e", + "metadata": {}, + "source": [ + "## ⚙️ Transformation Functions \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4ae19c", + "metadata": {}, + "outputs": [], + "source": [ + "[f.name for f in fs.get_transformation_functions()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c32555b", + "metadata": {}, + "outputs": [], + "source": [ + "label_encoder = fs.get_transformation_function(name=\"label_encoder\")\n", + "\n", + "standard_scaler = fs.get_transformation_function(name=\"standard_scaler\")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "b4580c73", + "metadata": {}, + "outputs": [], + "source": [ + "features_category = ['gender', 'age_cat', 'blood_gp', 'underlying_disease', 'gestation', 'prior_transplant', 'if_transplanted']\n", + "\n", + "transformation_functions_category = {\n", + " feature_name: label_encoder\n", + " for feature_name\n", + " in features_category\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be5eb3c5", + "metadata": {}, + "outputs": [], + "source": [ + "features_numerical = [\n", + " 'age_at_list_registration', 'dialysis_duration', 'number_prior_transplant', 'cpra', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'hla_dr1', 'hla_dr2',\n", + "]\n", + "\n", + "transformation_functions_numerical = {\n", + " feature_name: standard_scaler\n", + " for feature_name\n", + " in features_numerical\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a7e76ee", + "metadata": {}, + "outputs": [], + "source": [ + "# Join transformation_functions_category and transformation_functions_numerical dictionaries into one\n", + "transformation_functions = transformation_functions_category | transformation_functions_numerical" + ] + }, + { + "cell_type": "markdown", + "id": "91636dc3", + "metadata": {}, + "source": [ + "## ⚙️ Feature View Creation \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "576617c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'medical_features' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name='medical_features',\n", + " version=1,\n", + " query=selected_features,\n", + " labels=[\"duration\"],\n", + " transformation_functions=transformation_functions,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a3bb3b8e", + "metadata": {}, + "source": [ + "## 🏋️ Training Dataset Creation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7851e335", + "metadata": {}, + 
"outputs": [], + "source": [ + "# Split date with percentage \n", + "df = patient_info_fg.read()\n", + "\n", + "def split_dfs(df): \n", + " df = df.sort_values(by='date') \n", + " trainvals = df[:int(len(df)*0.8)] \n", + " testvals = df[int(len(df)*0.8):] \n", + " return {\n", + " 'train_start': min(trainvals.date).date(), \n", + " 'train_end': max(trainvals.date).date(), \n", + " 'test_start': min(testvals.date).date(), \n", + " 'test_end': max(testvals.date).date(),\n", + " }\n", + "\n", + "split_dict = split_dfs(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7a8f6f1", + "metadata": {}, + "outputs": [], + "source": [ + "split_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69f4373c", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = feature_view.train_test_split(\n", + " train_start=split_dict['train_start'],\n", + " train_end=split_dict['train_end'],\n", + " test_start=split_dict['test_start'],\n", + " test_end=split_dict['test_end'], \n", + " event_time=True,\n", + ")\n", + "X_train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2facefa1", + "metadata": {}, + "outputs": [], + "source": [ + "y_train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d510db36", + "metadata": {}, + "outputs": [], + "source": [ + "# Sort the X_train DataFrame based on the \"datetime\" column in ascending order\n", + "X_train = X_train.sort_values(\"date\")\n", + "# Reindex the y_train Series to match the order of rows in the sorted X_train DataFrame\n", + "y_train = y_train.reindex(X_train.index)\n", + "\n", + "# Sort the X_test DataFrame based on the \"datetime\" column in ascending order\n", + "X_test = X_test.sort_values(\"date\")\n", + "# Reindex the y_test Series to match the order of rows in the sorted X_test DataFrame\n", + "y_test = y_test.reindex(X_test.index)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "a7e10eb2", + "metadata": {}, + "outputs": [], + "source": [ + "X_train['y'] = y_train\n", + "X_train['ds'] = X_train.date\n", + "X_train['ds'] = pd.to_datetime(X_train.ds)\n", + "X_train['ds'] = X_train.ds.map(lambda x: x.replace(tzinfo=None))\n", + "X_train.drop(columns=[\"date\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfbb7b31", + "metadata": {}, + "outputs": [], + "source": [ + "X_test['ds'] = X_test.date\n", + "X_test['ds'] = pd.to_datetime(X_test.ds)\n", + "X_test['ds'] = X_test.ds.map(lambda x: x.replace(tzinfo=None))\n", + "X_test.drop(columns=[\"date\"], axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3847431e", + "metadata": {}, + "source": [ + "## 🧬 Modeling\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d639b394", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the Prophet model with the appropriate seasonalities\n", + "model = Prophet(\n", + " daily_seasonality=False,\n", + " weekly_seasonality=True,\n", + " yearly_seasonality=True,\n", + ")\n", + "\n", + "# Add monthly seasonality with a period of 30.5 days (average length of a month)\n", + "model.add_seasonality(\n", + " name='monthly', \n", + " period=30.5, \n", + " fourier_order=5,\n", + " mode='additive',\n", + ")\n", + "\n", + "# Add the additional regressors\n", + "additional_regressors = [\n", + " 'age_at_list_registration','cpra', 'hla_a1', 'hla_a2', 'hla_b1', 'hla_b2', 'hla_dr1', 'hla_dr2',\n", + "]\n", + "\n", + "for regressor in additional_regressors:\n", + " model.add_regressor(regressor)\n", + "\n", + "# Fit the model\n", + "model.fit(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce527621", + "metadata": {}, + "outputs": [], + "source": [ + "forecast = model.predict(X_test)\n", + "\n", + "# Summarize the forecast\n", + "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())\n", + "\n", 
+ "# Plot the forecast\n", + "fig = model.plot(forecast)\n", + "\n", + "pyplot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4217c0", + "metadata": {}, + "outputs": [], + "source": [ + "model.plot_components(forecast)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8701339", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate MAE between expected and predicted values on the test set\n", + "y_pred = forecast['yhat']\n", + "mae = mean_absolute_error(y_test, y_pred)\n", + "print('MAE: %.3f' % mae)\n", + "# Collect the evaluation metrics\n", + "\n", + "metrics = {\n", + " \"mae\": round(mae,2)\n", + "}\n", + "metrics" + ] + }, + { + "cell_type": "markdown", + "id": "ea6cfd10", + "metadata": {}, + "source": [ + "### ⚙️ Model Schema\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bf4bb02", + "metadata": {}, + "outputs": [], + "source": [ + "from hsml.schema import Schema\n", + "from hsml.model_schema import ModelSchema\n", + "\n", + "# Define the input schema using the values of X_test\n", + "input_schema = Schema(X_test.values)\n", + "\n", + "# Define the output schema using y_train\n", + "output_schema = Schema(y_train)\n", + "\n", + "# Create a ModelSchema object specifying the input and output schemas\n", + "model_schema = ModelSchema(\n", + " input_schema=input_schema, \n", + " output_schema=output_schema,\n", + ")\n", + "\n", + "# Convert the model schema to a dictionary for further inspection or serialization\n", + "model_schema.to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "93a92ddd", + "metadata": {}, + "source": [ + "## 📝 Register model\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dbc8bae", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the directory where the model will be saved\n", + "model_dir = \"forecast_model\"\n", + "\n", + "# Check if the directory exists, and create it if it doesn't\n", + "if not 
os.path.isdir(model_dir):\n", + " os.mkdir(model_dir)\n", + "\n", + "# Serialize the trained Prophet model to JSON\n", + "with open(model_dir + '/serialized_model.json', 'w') as fout:\n", + " fout.write(model_to_json(model)) # Save model\n", + " \n", + "# Save the forecast plot as an image file in the model directory\n", + "fig.savefig(model_dir + \"/forecast.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "707022f2", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the model registry\n", + "mr = project.get_model_registry()\n", + "\n", + "# Create a new model in the model registry\n", + "forecast_model = mr.python.create_model(\n", + " name=\"waiting_time_forecast_model\", # Name for the model\n", + " metrics=metrics, # Metrics used for evaluation\n", + " model_schema=model_schema, # Schema defining the model's input and output\n", + " input_example=X_test.sample(), # Example input data for reference\n", + " description=\"Waiting time for a deceased donor kidney transplant forecasting model\", # Description of the model\n", + ")\n", + "\n", + "# Save the model to the specified directory\n", + "forecast_model.save(model_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "0db0bf98", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb b/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb new file mode 100644 index 00000000..af472ff3 --- /dev/null +++ b/advanced_tutorials/hospital_wait_time/3_inference_pipeline.ipynb @@ 
-0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "72f42eba", + "metadata": {}, + "source": [ + "## 📝 Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8158dcec", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "from prophet.serialize import model_from_json\n", + "from matplotlib import pyplot\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "faa87ecf", + "metadata": {}, + "source": [ + "## 📡 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4330a4a", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "dff5ebe0", + "metadata": {}, + "source": [ + "## ⚙️ Feature View Retrieval\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd3f8c1c", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'medical_features' feature view\n", + "feature_view = fs.get_feature_view(\n", + " name='medical_features',\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "21b6f351", + "metadata": {}, + "source": [ + "## 🗄 Model Registry\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecad81cc", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the model registry\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "markdown", + "id": "515fe05a", + "metadata": {}, + "source": [ + "## 🚀 Fetch and test the model\n", + "\n", + "Finally, you can start making predictions with your model!\n", + "\n", + "Retrieve your model from the Hopsworks model registry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb5cdc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the model from the model registry\n", + "retrieved_model = mr.get_model(\n", + " name=\"waiting_time_forecast_model\",\n", + " version=1,\n", + ")\n", + "\n", + "# Download the saved model files to a local directory\n", + "saved_model_dir = retrieved_model.download()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dfaba98", + "metadata": {}, + "outputs": [], + "source": [ + "with open(saved_model_dir + '/serialized_model.json', 'r') as fin:\n", + " model = model_from_json(fin.read()) # Load model" + ] + }, + { + "cell_type": "markdown", + "id": "2e5b40c5", + "metadata": {}, + "source": [ + "## 🔮 Batch Prediction \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5e49a6e", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize batch scoring\n", + "feature_view.init_batch_scoring(1)\n", + "\n", + "# Get the batch data\n", + "batch_data = feature_view.get_batch_data(\n", + " start_time=datetime(2015, 10, 19), \n", + " end_time=datetime(2017, 12, 29), \n", + " event_time=True,\n", + ")\n", + "\n", + "batch_data['ds'] = batch_data.date\n", + "batch_data['ds'] = pd.to_datetime(batch_data.ds)\n", + "batch_data['ds'] = batch_data.ds.map(lambda x: x.replace(tzinfo=None))\n", + "batch_data.drop(columns=[\"date\"], axis=1, inplace=True)\n", + "batch_data = batch_data.sort_values(\"ds\")\n", + "\n", + "# Display the first 3 rows\n", + "batch_data.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ff70533", + "metadata": {}, + "outputs": [], + "source": [ + "# use the model to make a forecast\n", + "forecast = model.predict(batch_data)\n", + "\n", + "# summarize the forecast\n", + "print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6821c72", + "metadata": 
{}, + "outputs": [], + "source": [ + "model.plot(forecast)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c6c4a04", + "metadata": {}, + "outputs": [], + "source": [ + "model.plot_components(forecast)" + ] + }, + { + "cell_type": "markdown", + "id": "583f95e2", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/hospital_wait_time/requirements.txt b/advanced_tutorials/hospital_wait_time/requirements.txt new file mode 100644 index 00000000..3f5db197 --- /dev/null +++ b/advanced_tutorials/hospital_wait_time/requirements.txt @@ -0,0 +1 @@ +prophet==1.1.5