Merge pull request #6 from crederauk/feat/data-pre-processing-function
Feat/data pre processing function
konradbcredera authored Mar 8, 2024
2 parents 73ff93e + c3fb46c commit e7bab62
Showing 21 changed files with 656 additions and 1,333 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # AWS-MLOps-module
 
+## [2.0.2] - 01/03/24
+* Added functionality for passing preprocessing script
+
+## [2.0.1] - 02/02/24
+* Updated retraining_schedule validation
+
 ## [2.0.0] - 21/12/23
 **BREAKING CHANGES**
 * Mandatory variable `resource_naming_prefix` has now been added.
3 changes: 2 additions & 1 deletion README.md
@@ -8,7 +8,6 @@ This repo contains a terraform module with corresponding AWS resources that enab


## Example Usage
-
```
module "MLOps" {
source = "github.com/crederauk/terraform-aws-mlops-module?ref=<MODULE_VERSION>"
@@ -22,6 +21,7 @@ module "MLOps" {
  algorithm_choice                          = "classification"
  sagemaker_training_notebook_instance_type = "ml.m4.xlarge"
  inference_instance_count                  = 1
+  preprocessing_script_path                 = "terraform/preprocess_data.py"
  tags = {
    my-tag-key = "my-tag-value"
  }
@@ -67,6 +67,7 @@ No resources.
| <a name="input_inference_instance_count"></a> [inference\_instance\_count](#input\_inference\_instance\_count) | The initial number of instances to serve the model endpoint | `number` | `1` | no |
| <a name="input_inference_instance_type"></a> [inference\_instance\_type](#input\_inference\_instance\_type) | The instance type to be created for serving the model. Must be a valid EC2 instance type | `string` | `"ml.t2.medium"` | no |
| <a name="input_model_target_variable"></a> [model\_target\_variable](#input\_model\_target\_variable) | The dependent variable (or 'label') that the model aims to predict. This should be a column name in the dataset. | `string` | n/a | yes |
| <a name="input_preprocessing_script_path"></a> [preprocessing\_script\_path](#input\_preprocessing\_script\_path) | The path the user provides if they want to include their own data cleaning logic | `string` | `null` | no |
| <a name="input_resource_naming_prefix"></a> [resource\_naming\_prefix](#input\_resource\_naming\_prefix) | Naming prefix to be applied to all resources created by this module | `string` | n/a | yes |
| <a name="input_retrain_model_bool"></a> [retrain\_model\_bool](#input\_retrain\_model\_bool) | Boolean to indicate if the retraining pipeline shoud be added | `bool` | `false` | no |
| <a name="input_retraining_schedule"></a> [retraining\_schedule](#input\_retraining\_schedule) | Cron expression for the model retraining frequency in the AWS format. See https://docs.aws.amazon.com/lambda/latest/dg/services-cloudwatchevents-expressions.html for details | `string` | `""` | no |
21 changes: 11 additions & 10 deletions main.tf
@@ -1,8 +1,9 @@
module "s3" {
  source = "./modules/s3"

-  resource_naming_prefix = var.resource_naming_prefix
-  tags                   = var.tags
+  resource_naming_prefix    = var.resource_naming_prefix
+  tags                      = var.tags
+  preprocessing_script_path = var.preprocessing_script_path
}

module "sagemaker" {
@@ -26,15 +27,15 @@ module "sagemaker" {
  ecr_repo_uri = "${module.ecr.repository.repository_url}:latest"

  # S3
-  config_s3_bucket      = module.s3.config_bucket.id
-  config_bucket_key_arn = module.s3.encryption_key.arn
-  data_s3_bucket        = var.data_s3_bucket
-  data_bucket_key_arn   = var.data_s3_bucket_encryption_key_arn
-  data_location_s3      = var.data_location_s3
-  model_s3_bucket       = module.s3.model_bucket.id
-  model_bucket_key_arn  = module.s3.encryption_key.arn
+  config_s3_bucket          = module.s3.config_bucket.id
+  config_bucket_key_arn     = module.s3.encryption_key.arn
+  data_s3_bucket            = var.data_s3_bucket
+  data_bucket_key_arn       = var.data_s3_bucket_encryption_key_arn
+  data_location_s3          = var.data_location_s3
+  model_s3_bucket           = module.s3.model_bucket.id
+  model_bucket_key_arn      = module.s3.encryption_key.arn
+  preprocessing_script_path = var.preprocessing_script_path
}

module "retraining_job" {
count = var.retrain_model_bool ? 1 : 0
source = "./modules/glue"
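Both the `s3` and `sagemaker` modules now receive `preprocessing_script_path`, which suggests the script is uploaded to S3 and later executed during training. How the module actually consumes it is not visible in this diff; purely as an illustration, a job could load a script fetched from S3 like this (the bucket, key, local path, and `preprocess` entry point are all invented for the sketch):

```python
# Speculative sketch only: the module's real mechanism is not shown in this
# diff. Bucket, key, local path and the preprocess() entry point are invented.
import importlib.util

import boto3
import pandas as pd

s3 = boto3.client("s3")
s3.download_file("my-config-bucket", "preprocess_data.py", "/tmp/preprocess_data.py")

# Load the downloaded file as a Python module and apply its entry point.
spec = importlib.util.spec_from_file_location("preprocess_data", "/tmp/preprocess_data.py")
preprocess_data = importlib.util.module_from_spec(spec)
spec.loader.exec_module(preprocess_data)

df = pd.read_csv("/tmp/raw.csv")
df = preprocess_data.preprocess(df)  # assumed entry point, matching the sketch above
```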
7 changes: 2 additions & 5 deletions mlops_ml_models/delete_sagemaker_endpoint.py
@@ -32,11 +32,8 @@ def delete_sagemaker_endpoint(endpoint_name: str) -> None:
        sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

        # Delete endpoint configuration
-        sagemaker_client.delete_endpoint_config(
-            EndpointConfigName=endpoint_name
-        )
+        sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)

-        print(f"Endpoint '{endpoint_name}' and its configuration have "
-              "been deleted.")
+        print(f"Endpoint '{endpoint_name}' and its configuration have " "been deleted.")
    else:
        print("Endpoint deletion cancelled.")
16 changes: 9 additions & 7 deletions mlops_ml_models/deploy_model_endpoint.py
@@ -2,10 +2,15 @@


def deploy_model(
-    model_name: str, model_type: str, model_s3_bucket: str, instance_type: str, endpoint_name,
-    role: str, inference_instance_count: int, image_uri: str
+    model_name: str,
+    model_type: str,
+    model_s3_bucket: str,
+    instance_type: str,
+    endpoint_name,
+    role: str,
+    inference_instance_count: int,
+    image_uri: str,
) -> None:
-
    """This script deploys the sagemaker endpoint using the tar.gz file
    saved in s3.
@@ -23,10 +28,7 @@ def deploy_model(
        image_uri=(image_uri),  # The ECR image you pushed
        model_data=model_file,  # Location of your serialized model
        role=role,
-        env={
-            "MODEL_NAME": model_name,
-            "MODEL_TYPE": model_type
-        }
+        env={"MODEL_NAME": model_name, "MODEL_TYPE": model_type},
    )
    model.deploy(
        initial_instance_count=inference_instance_count,
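The rest of the `model.deploy(...)` call is collapsed in the diff. A hypothetical invocation of the reformatted function is sketched below; every value is a placeholder, with the instance type and count mirroring the module defaults from the README inputs table:

```python
# Hypothetical call; all names, ARNs and URIs below are placeholders.
deploy_model(
    model_name="churn-model",
    model_type="classification",
    model_s3_bucket="my-model-bucket",
    instance_type="ml.t2.medium",  # module default per the README inputs
    endpoint_name="churn-endpoint",
    role="arn:aws:iam::123456789012:role/sagemaker-execution-role",
    inference_instance_count=1,  # module default per the README inputs
    image_uri="123456789012.dkr.ecr.eu-west-2.amazonaws.com/my-repo:latest",
)
```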
3 changes: 1 addition & 2 deletions mlops_ml_models/finalize_and_save_model.py
@@ -1,8 +1,7 @@
import importlib


-def finalize_and_save_model(algorithm_choice: str, bestModel: str,
-                            model_name: str):
+def finalize_and_save_model(algorithm_choice: str, bestModel: str, model_name: str):
    """
    Finalizes the best model obtained from PyCaret and saves it locally.
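The function body is collapsed in this diff. Below is a plausible reconstruction of how such a helper is commonly written, assuming PyCaret's `finalize_model`/`save_model` API and the `importlib` import visible above; the body shown is an assumption, not the repository's actual code:

```python
# Assumed body, reconstructed for illustration; not the repository's code.
import importlib


def finalize_and_save_model(algorithm_choice: str, bestModel: str, model_name: str):
    """Finalize the best PyCaret model and save it locally as <model_name>.pkl."""
    # algorithm_choice is e.g. "classification" or "regression", selecting
    # the matching PyCaret submodule (pycaret.classification, ...).
    pycaret_module = importlib.import_module(f"pycaret.{algorithm_choice}")
    # finalize_model retrains the chosen estimator on the full dataset.
    final_model = pycaret_module.finalize_model(bestModel)
    # save_model serialises the fitted pipeline to <model_name>.pkl.
    pycaret_module.save_model(final_model, model_name)
```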
2 changes: 1 addition & 1 deletion mlops_ml_models/load_data.py
@@ -18,7 +18,7 @@ def load_data(data_location: str) -> pd.DataFrame:
        df = pd.read_csv(data_location, low_memory=False)
        # Dropped unnamed columns. You should comment this portion out before
        # using the script if you don't have unnamed columns
-        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+        df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
        return df
    except Exception as e:
        print(f"Error loading data: {e}")