Merge pull request #6 from crederauk/feat/data-pre-processing-function
Feat/data pre processing function
konradbcredera authored Mar 8, 2024
2 parents 73ff93e + c3fb46c commit e7bab62
Showing 21 changed files with 656 additions and 1,333 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # AWS-MLOps-module
 
+## [2.0.2] - 01/03/24
+* Added functionality for passing preprocessing script
+
+## [2.0.1] - 02/02/24
+* Updated retraining_schedule validation
+
 ## [2.0.0] - 21/12/23
 **BREAKING CHANGES**
 * Mandatory variable `resource_naming_prefix` has now been added.
3 changes: 2 additions & 1 deletion README.md
@@ -8,7 +8,6 @@ This repo contains a terraform module with corresponding AWS resources that enab


## Example Usage
-
```
module "MLOps" {
source = "github.com/crederauk/terraform-aws-mlops-module?ref=<MODULE_VERSION>"
@@ -22,6 +21,7 @@ module "MLOps" {
  algorithm_choice                          = "classification"
  sagemaker_training_notebook_instance_type = "ml.m4.xlarge"
  inference_instance_count                  = 1
+  preprocessing_script_path                 = "terraform/preprocess_data.py"
  tags = {
    my-tag-key = "my-tag-value"
  }
@@ -67,6 +67,7 @@ No resources.
| <a name="input_inference_instance_count"></a> [inference\_instance\_count](#input\_inference\_instance\_count) | The initial number of instances to serve the model endpoint | `number` | `1` | no |
| <a name="input_inference_instance_type"></a> [inference\_instance\_type](#input\_inference\_instance\_type) | The instance type to be created for serving the model. Must be a valid EC2 instance type | `string` | `"ml.t2.medium"` | no |
| <a name="input_model_target_variable"></a> [model\_target\_variable](#input\_model\_target\_variable) | The dependent variable (or 'label') that the model aims to predict. This should be a column name in the dataset. | `string` | n/a | yes |
| <a name="input_preprocessing_script_path"></a> [preprocessing\_script\_path](#input\_preprocessing\_script\_path) | The path the user provides if they want to include their own data cleaning logic | `string` | `null` | no |
| <a name="input_resource_naming_prefix"></a> [resource\_naming\_prefix](#input\_resource\_naming\_prefix) | Naming prefix to be applied to all resources created by this module | `string` | n/a | yes |
| <a name="input_retrain_model_bool"></a> [retrain\_model\_bool](#input\_retrain\_model\_bool) | Boolean to indicate if the retraining pipeline shoud be added | `bool` | `false` | no |
| <a name="input_retraining_schedule"></a> [retraining\_schedule](#input\_retraining\_schedule) | Cron expression for the model retraining frequency in the AWS format. See https://docs.aws.amazon.com/lambda/latest/dg/services-cloudwatchevents-expressions.html for details | `string` | `""` | no |
21 changes: 11 additions & 10 deletions main.tf
@@ -1,8 +1,9 @@
module "s3" {
  source = "./modules/s3"

-  resource_naming_prefix = var.resource_naming_prefix
-  tags                   = var.tags
+  resource_naming_prefix    = var.resource_naming_prefix
+  tags                      = var.tags
+  preprocessing_script_path = var.preprocessing_script_path
}

module "sagemaker" {
@@ -26,15 +27,15 @@ module "sagemaker" {
  ecr_repo_uri = "${module.ecr.repository.repository_url}:latest"

  # S3
-  config_s3_bucket      = module.s3.config_bucket.id
-  config_bucket_key_arn = module.s3.encryption_key.arn
-  data_s3_bucket        = var.data_s3_bucket
-  data_bucket_key_arn   = var.data_s3_bucket_encryption_key_arn
-  data_location_s3      = var.data_location_s3
-  model_s3_bucket       = module.s3.model_bucket.id
-  model_bucket_key_arn  = module.s3.encryption_key.arn
+  config_s3_bucket          = module.s3.config_bucket.id
+  config_bucket_key_arn     = module.s3.encryption_key.arn
+  data_s3_bucket            = var.data_s3_bucket
+  data_bucket_key_arn       = var.data_s3_bucket_encryption_key_arn
+  data_location_s3          = var.data_location_s3
+  model_s3_bucket           = module.s3.model_bucket.id
+  model_bucket_key_arn      = module.s3.encryption_key.arn
+  preprocessing_script_path = var.preprocessing_script_path
}

module "retraining_job" {
count = var.retrain_model_bool ? 1 : 0
source = "./modules/glue"
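Both the `s3` and `sagemaker` modules now receive `preprocessing_script_path`, which suggests the script is uploaded to S3 and later executed during training. How the module actually consumes it is not visible in this diff; purely as an illustration, a job could load a script fetched from S3 like this (the bucket, key, local path, and `preprocess` entry point are all invented for the sketch):

```python
# Speculative sketch only: the module's real mechanism is not shown in this
# diff. Bucket, key, local path and the preprocess() entry point are invented.
import importlib.util

import boto3
import pandas as pd

s3 = boto3.client("s3")
s3.download_file("my-config-bucket", "preprocess_data.py", "/tmp/preprocess_data.py")

# Load the downloaded file as a Python module and apply its entry point.
spec = importlib.util.spec_from_file_location("preprocess_data", "/tmp/preprocess_data.py")
preprocess_data = importlib.util.module_from_spec(spec)
spec.loader.exec_module(preprocess_data)

df = pd.read_csv("/tmp/raw.csv")
df = preprocess_data.preprocess(df)  # assumed entry point, matching the sketch above
```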
7 changes: 2 additions & 5 deletions mlops_ml_models/delete_sagemaker_endpoint.py
@@ -32,11 +32,8 @@ def delete_sagemaker_endpoint(endpoint_name: str) -> None:
        sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

        # Delete endpoint configuration
-        sagemaker_client.delete_endpoint_config(
-            EndpointConfigName=endpoint_name
-        )
+        sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)

-        print(f"Endpoint '{endpoint_name}' and its configuration have "
-              "been deleted.")
+        print(f"Endpoint '{endpoint_name}' and its configuration have " "been deleted.")
    else:
        print("Endpoint deletion cancelled.")
16 changes: 9 additions & 7 deletions mlops_ml_models/deploy_model_endpoint.py
@@ -2,10 +2,15 @@


def deploy_model(
-    model_name: str, model_type: str, model_s3_bucket: str, instance_type: str, endpoint_name,
-    role: str, inference_instance_count: int, image_uri: str
+    model_name: str,
+    model_type: str,
+    model_s3_bucket: str,
+    instance_type: str,
+    endpoint_name,
+    role: str,
+    inference_instance_count: int,
+    image_uri: str,
) -> None:
-
    """This script deploys the sagemaker endpoint using the tar.gz file
    saved in s3.
@@ -23,10 +28,7 @@ def deploy_model(
        image_uri=(image_uri),  # The ECR image you pushed
        model_data=model_file,  # Location of your serialized model
        role=role,
-        env={
-            "MODEL_NAME": model_name,
-            "MODEL_TYPE": model_type
-        }
+        env={"MODEL_NAME": model_name, "MODEL_TYPE": model_type},
    )
    model.deploy(
        initial_instance_count=inference_instance_count,
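The rest of the `model.deploy(...)` call is collapsed in the diff. A hypothetical invocation of the reformatted function is sketched below; every value is a placeholder, with the instance type and count mirroring the module defaults from the README inputs table:

```python
# Hypothetical call; all names, ARNs and URIs below are placeholders.
deploy_model(
    model_name="churn-model",
    model_type="classification",
    model_s3_bucket="my-model-bucket",
    instance_type="ml.t2.medium",  # module default per the README inputs
    endpoint_name="churn-endpoint",
    role="arn:aws:iam::123456789012:role/sagemaker-execution-role",
    inference_instance_count=1,  # module default per the README inputs
    image_uri="123456789012.dkr.ecr.eu-west-2.amazonaws.com/my-repo:latest",
)
```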
3 changes: 1 addition & 2 deletions mlops_ml_models/finalize_and_save_model.py
@@ -1,8 +1,7 @@
import importlib


-def finalize_and_save_model(algorithm_choice: str, bestModel: str,
-                            model_name: str):
+def finalize_and_save_model(algorithm_choice: str, bestModel: str, model_name: str):
    """
    Finalizes the best model obtained from PyCaret and saves it locally.
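The function body is collapsed in this diff. Below is a plausible reconstruction of how such a helper is commonly written, assuming PyCaret's `finalize_model`/`save_model` API and the `importlib` import visible above; the body shown is an assumption, not the repository's actual code:

```python
# Assumed body, reconstructed for illustration; not the repository's code.
import importlib


def finalize_and_save_model(algorithm_choice: str, bestModel: str, model_name: str):
    """Finalize the best PyCaret model and save it locally as <model_name>.pkl."""
    # algorithm_choice is e.g. "classification" or "regression", selecting
    # the matching PyCaret submodule (pycaret.classification, ...).
    pycaret_module = importlib.import_module(f"pycaret.{algorithm_choice}")
    # finalize_model retrains the chosen estimator on the full dataset.
    final_model = pycaret_module.finalize_model(bestModel)
    # save_model serialises the fitted pipeline to <model_name>.pkl.
    pycaret_module.save_model(final_model, model_name)
```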
2 changes: 1 addition & 1 deletion mlops_ml_models/load_data.py
@@ -18,7 +18,7 @@ def load_data(data_location: str) -> pd.DataFrame:
        df = pd.read_csv(data_location, low_memory=False)
        # Dropped unnamed columns. You should comment this portion out before
        # using the script if you don't have unnamed columns
-        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+        df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
        return df
    except Exception as e:
        print(f"Error loading data: {e}")