
Retraining pipeline #9

Open
wants to merge 30 commits into base: main

30 commits
5f05725  moved stuff from the old repo to a new branch here (Lanre-B, Apr 12, 2024)
716f31f  moved stuff from the old repo to a new branch here (Lanre-B, Apr 12, 2024)
687f851  forgot to download the latest notebook (Lanre-B, May 17, 2024)
7cb9933  added create bucket policy (Lanre-B, May 17, 2024)
2954d8e  uploading the notebook from last week (Lanre-B, Jun 7, 2024)
3e1b577  added more kms commands (Lanre-B, Aug 2, 2024)
7fa78ff  seems we have a working notebook for retraining (Lanre-B, Aug 9, 2024)
80f5e0f  got one of the step functions to work (Lanre-B, Aug 9, 2024)
c942eaa  added the latest changes from the online version (Lanre-B, Aug 20, 2024)
6c883f0  latest changes from the local version (Lanre-B, Aug 20, 2024)
c02eba5  latest notebook (Lanre-B, Aug 20, 2024)
d2dbb33  forgot to add some updates (Lanre-B, Aug 23, 2024)
a13644a  fix: removed commented out code (konradbcredera, Aug 23, 2024)
8f7821e  fix: rebase (konradbcredera, Aug 23, 2024)
907c202  feat: added some comments and deleted unnecessary files (konradbcredera, Aug 23, 2024)
a95ff6c  removed unused imports (Lanre-B, Sep 9, 2024)
b6f36ce  fixed some of the comments (Lanre-B, Sep 9, 2024)
b413ef7  changed the print statements to logs (Lanre-B, Sep 9, 2024)
dfed9fc  fixed some of the comments (Lanre-B, Sep 9, 2024)
dd9fb21  fixed some of the comments (Lanre-B, Sep 9, 2024)
27ea30f  removed the train.py file from the pycaret images (Lanre-B, Sep 9, 2024)
4139987  finalising some of the comments (Lanre-B, Sep 9, 2024)
fbe8605  finalising some of the comments (Lanre-B, Sep 9, 2024)
7ad6d59  rearranged comments (Lanre-B, Sep 9, 2024)
d75f392  removed comment (Lanre-B, Sep 10, 2024)
230879f  fixed the latest notebook errors (Lanre-B, Oct 4, 2024)
683267e  updating the installations (Lanre-B, Oct 4, 2024)
542a700  fixed the algorithm error we noticed in the notebook (Lanre-B, Oct 4, 2024)
ccd4f4b  making use of a requirements.txt file for installations (Lanre-B, Oct 4, 2024)
4e11f04  making use of a requirements.txt file for installations (Lanre-B, Oct 9, 2024)
2 changes: 1 addition & 1 deletion .github/workflows/github-ci.yaml
@@ -49,7 +49,7 @@ jobs:
working-directory: .
run: |
tflint --init
tflint -f compact --recursive
tflint -f compact --recursive --disable-rule=terraform_typed_variables

tests:
runs-on: ubuntu-latest
4 changes: 1 addition & 3 deletions CODEOWNERS
@@ -1,5 +1,3 @@
@konradbcredera
@Lanre-B
@anishadas2022
@lentonwork
@samcred-ojogbede
@anishadas2022
100 changes: 31 additions & 69 deletions README.md
@@ -1,30 +1,26 @@
# AWS-MLOps-module
This repo contains a Terraform module and the corresponding AWS cloud infrastructure for training, deploying and re-training AWS-hosted machine learning models.

> **Warning**: This repo is a basic template for MLOps resources on AWS. Please apply appropriate security enhancements for your project in production.

## High-Level Solution Architecture
![image](https://github.com/konradbachusz/AWS-MLOps-module/assets/104912687/12c4f1a0-573b-44a0-98f2-1256be64d19a)
## Warning
This repo is a basic template for MLOps resources on AWS. Please apply appropriate security enhancements for your project in production.


## Example Usage

```
module "MLOps" {
source = "github.com/crederauk/terraform-aws-mlops-module?ref=<MODULE_VERSION>"
resource_naming_prefix = "your-app"
data_s3_bucket = "your-bucket-name"
data_location_s3 = "/your_s3_folder/your_data.csv"
model_target_variable = "y"
tuning_metric = "AUC"
retrain_model_bool = true
retraining_schedule = "cron(0 8 1 * ? *)"
algorithm_choice = "classification"
sagemaker_training_notebook_instance_type = "ml.m4.xlarge"
inference_instance_count = 1
preprocessing_script_path = "terraform/preprocess_data.py"
tags = {
my-tag-key = "my-tag-value"
}
module "mlops" {
source = "github.com/konradbachusz/AWS-MLOps-module?ref=<module_version>"
model_name = "test-model"
sagemaker_image_repository_name = "sagemaker-xgboost"
vpc_id = var.my_vpc
subnet_ids = var.my_subnets
endpoint_instance_type = "ml.t2.medium"
retrain_model_bool = true
retraining_schedule = "cron(0 8 1 * ? *)"
data_location_s3 = "test_bucket"
account_id = var.account_id
model_target_variable = "test_target_column"
region = var.region
}
```

@@ -37,7 +33,6 @@ module "MLOps" {
| <a name="requirement_archive"></a> [archive](#requirement\_archive) | 2.4.0 |
| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 4.0 |
| <a name="requirement_local"></a> [local](#requirement\_local) | >= 2.4 |
| <a name="requirement_random"></a> [random](#requirement\_random) | >= 3.6 |

## Providers

@@ -47,7 +42,7 @@ No providers.

| Name | Source | Version |
|------|--------|---------|
| <a name="module_ecr"></a> [ecr](#module\_ecr) | ./modules/ecr | n/a |
| <a name="module_iam"></a> [iam](#module\_iam) | ./modules/iam | n/a |
| <a name="module_retraining_job"></a> [retraining\_job](#module\_retraining\_job) | ./modules/glue | n/a |
| <a name="module_s3"></a> [s3](#module\_s3) | ./modules/s3 | n/a |
| <a name="module_sagemaker"></a> [sagemaker](#module\_sagemaker) | ./modules/sagemaker | n/a |
@@ -60,56 +55,23 @@ No resources.

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_algorithm_choice"></a> [algorithm\_choice](#input\_algorithm\_choice) | Machine learning problem type e.g classification, regression, clustering, anomaly, time\_series | `string` | n/a | yes |
| <a name="input_data_location_s3"></a> [data\_location\_s3](#input\_data\_location\_s3) | The path to a file in the data S3 bucket within which training data is located. Should be in the format /<path>/<filename>. If the file is in the root of the bucket, this should be set to /<filename> only. | `string` | n/a | yes |
| <a name="input_data_s3_bucket"></a> [data\_s3\_bucket](#input\_data\_s3\_bucket) | The name of an S3 bucket within which training data is located. | `string` | n/a | yes |
| <a name="input_data_s3_bucket_encryption_key_arn"></a> [data\_s3\_bucket\_encryption\_key\_arn](#input\_data\_s3\_bucket\_encryption\_key\_arn) | The ARN of the KMS key using which training data is encrypted in S3, if such a key exists. | `string` | `""` | no |
| <a name="input_inference_instance_count"></a> [inference\_instance\_count](#input\_inference\_instance\_count) | The initial number of instances to serve the model endpoint | `number` | `1` | no |
| <a name="input_inference_instance_type"></a> [inference\_instance\_type](#input\_inference\_instance\_type) | The instance type to be created for serving the model. Must be a valid EC2 instance type | `string` | `"ml.t2.medium"` | no |
| <a name="input_model_target_variable"></a> [model\_target\_variable](#input\_model\_target\_variable) | The dependent variable (or 'label') that the model aims to predict. This should be a column name in the dataset. | `string` | n/a | yes |
| <a name="input_preprocessing_script_path"></a> [preprocessing\_script\_path](#input\_preprocessing\_script\_path) | The path the user provides if they want to include their own data cleaning logic | `string` | `null` | no |
| <a name="input_resource_naming_prefix"></a> [resource\_naming\_prefix](#input\_resource\_naming\_prefix) | Naming prefix to be applied to all resources created by this module | `string` | n/a | yes |
| <a name="input_account_id"></a> [account\_id](#input\_account\_id) | AWS Account ID | `string` | n/a | yes |
| <a name="input_data_location_s3"></a> [data\_location\_s3](#input\_data\_location\_s3) | Location of the data in s3 bucket | `string` | n/a | yes |
| <a name="input_endpoint_instance_type"></a> [endpoint\_instance\_type](#input\_endpoint\_instance\_type) | Type of EC2 instance used for model endpoint | `string` | `""` | no |
| <a name="input_model_name"></a> [model\_name](#input\_model\_name) | Name of the Sagemaker model | `string` | `""` | no |
| <a name="input_model_target_variable"></a> [model\_target\_variable](#input\_model\_target\_variable) | The dependent variable (or 'label') that the regression model aims to predict. This should be a column name in the dataset. | `string` | n/a | yes |
| <a name="input_region"></a> [region](#input\_region) | AWS deployment region | `string` | n/a | yes |
| <a name="input_retrain_model_bool"></a> [retrain\_model\_bool](#input\_retrain\_model\_bool) | Boolean to indicate if the retraining pipeline shoud be added | `bool` | `false` | no |
| <a name="input_retraining_schedule"></a> [retraining\_schedule](#input\_retraining\_schedule) | Cron expression for the model retraining frequency in the AWS format. See https://docs.aws.amazon.com/lambda/latest/dg/services-cloudwatchevents-expressions.html for details | `string` | `""` | no |
| <a name="input_sagemaker_training_notebook_instance_type"></a> [sagemaker\_training\_notebook\_instance\_type](#input\_sagemaker\_training\_notebook\_instance\_type) | The Sagemaker notebook instance type to be created for training the model. Must be a valid EC2 instance type | `string` | `"ml.t2.medium"` | no |
| <a name="input_tags"></a> [tags](#input\_tags) | Tags applied to your resources | `map(string)` | `{}` | no |
| <a name="input_tuning_metric"></a> [tuning\_metric](#input\_tuning\_metric) | The metric user want to focus when tuning hyperparameter | `string` | n/a | yes |
| <a name="input_retraining_schedule"></a> [retraining\_schedule](#input\_retraining\_schedule) | Cron expression of the model retraing frequency | `string` | n/a | yes |
| <a name="input_sagemaker_image_repository_name"></a> [sagemaker\_image\_repository\_name](#input\_sagemaker\_image\_repository\_name) | Name of the repository, which is generally the algorithm or library. Values include blazingtext, factorization-machines, forecasting-deepar, image-classification, ipinsights, kmeans, knn, lda, linear-learner, mxnet-inference-eia, mxnet-inference, mxnet-training, ntm, object-detection, object2vec, pca, pytorch-inference-eia, pytorch-inference, pytorch-training, randomcutforest, sagemaker-scikit-learn, sagemaker-sparkml-serving, sagemaker-xgboost, semantic-segmentation, seq2seq, tensorflow-inference-eia, tensorflow-inference, tensorflow-training, huggingface-tensorflow-training, huggingface-tensorflow-inference, huggingface-pytorch-training, and huggingface-pytorch-inference. | `string` | `""` | no |
| <a name="input_subnet_ids"></a> [subnet\_ids](#input\_subnet\_ids) | The VPC subnets that Studio uses for communication. | `list(any)` | n/a | yes |
| <a name="input_tags"></a> [tags](#input\_tags) | Tags applied to your resources | `map` | `{}` | no |
| <a name="input_vpc_id"></a> [vpc\_id](#input\_vpc\_id) | The ID of the Amazon Virtual Private Cloud (VPC) that Studio uses for communication. | `string` | n/a | yes |

## Outputs

| Name | Description |
|------|-------------|
| <a name="output_config_bucket"></a> [config\_bucket](#output\_config\_bucket) | Config S3 Bucket Terraform object |
| <a name="output_ecr"></a> [ecr](#output\_ecr) | The ECR repository module outputs. Contains both 'repository' and 'encryption\_key' attributes, that are the ECR repository and KMS encryption key Terraform object respectively. |
| <a name="output_ecr_repository"></a> [ecr\_repository](#output\_ecr\_repository) | The ECR repository Terraform object. |
| <a name="output_glue"></a> [glue](#output\_glue) | The Glue module outputs. Contains both 'retraining\_job' and 'retraining\_role' attributes, that are the Glue retraining job and IAM role Terraform objects respectively. |
| <a name="output_glue_retraining_role"></a> [glue\_retraining\_role](#output\_glue\_retraining\_role) | The Glue retraining job IAM role Terraform object. |
| <a name="output_model_bucket"></a> [model\_bucket](#output\_model\_bucket) | Model S3 Bucket Terraform object |
| <a name="output_s3_encryption_key"></a> [s3\_encryption\_key](#output\_s3\_encryption\_key) | S3 encryption KMS key Terraform Object |
| <a name="output_sagemaker_endpoint_name"></a> [sagemaker\_endpoint\_name](#output\_sagemaker\_endpoint\_name) | Sagemaker model endpoint name |
| <a name="output_sagemaker_model_name"></a> [sagemaker\_model\_name](#output\_sagemaker\_model\_name) | Sagemaker model name |
| <a name="output_sagemaker_notebook_instance"></a> [sagemaker\_notebook\_instance](#output\_sagemaker\_notebook\_instance) | Sagemaker notebook instance Terraform object |

## Destroying Resources
After creating resources with this module, the following resources:
- Sagemaker model
- Sagemaker Endpoint
- Endpoint configuration

will not be tracked by your Terraform state file, so running "terraform destroy" will not delete them.

To destroy these resources, we recommend adding the following commands to your CI/CD pipeline:

```bash
aws sagemaker delete-model --model-name < demo-regression-model >
aws sagemaker delete-endpoint-config --endpoint-config-name < demo-regression-model-config >
aws sagemaker delete-endpoint --endpoint-name < demo-regression-model >
```

But first, you will need to add your AWS credentials to the environment if you have not already:
```bash
aws-access-key-id: < aws-access-key-id >
aws-secret-access-key: < aws-secret-access-key >
aws-region: < region >
```
<!-- END_TF_DOCS -->
| <a name="output_model"></a> [model](#output\_model) | Outputs the machine learning model resource |
| <a name="output_model_endpoint"></a> [model\_endpoint](#output\_model\_endpoint) | Outputs the machine learning model endpoint resource |
<!-- END_TF_DOCS -->
1 change: 1 addition & 0 deletions examples/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#TODO
84 changes: 35 additions & 49 deletions main.tf
@@ -1,64 +1,50 @@
module "s3" {
source = "./modules/s3"

resource_naming_prefix = var.resource_naming_prefix
tags = var.tags
preprocessing_script_path = var.preprocessing_script_path
source = "./modules/s3"
model_name = var.model_name
tags = var.tags
}

module "sagemaker" {
source = "./modules/sagemaker"

# Naming
resource_naming_prefix = var.resource_naming_prefix
tags = var.tags

# Training
algorithm_choice = var.algorithm_choice
tuning_metric = var.tuning_metric

# Notebook
training_notebook_instance_type = var.sagemaker_training_notebook_instance_type
module "sagemaker" {
source = "./modules/sagemaker"
tags = var.tags
sagemaker_execution_role_arn = module.iam.sagemaker_role_arn
model_target_variable = var.model_target_variable
algorithm_choice = var.algorithm_choice
tuning_metric = var.tuning_metric
config_bucket_id = module.s3.config_bucket_id
data_location_s3 = var.data_location_s3
endpoint_name = var.endpoint_name
model_name = var.model_name
sagemaker_instance_type = var.sagemaker_instance_type
model_instance_count = var.model_instance_count
ecr_repo_uri = module.ecr.ecr_repo_uri
}

# Model
inference_instance_type = var.inference_instance_type
model_target_variable = var.model_target_variable
inference_instance_count = var.inference_instance_count
ecr_repo_uri = "${module.ecr.repository.repository_url}:latest"

# S3
config_s3_bucket = module.s3.config_bucket.id
config_bucket_key_arn = module.s3.encryption_key.arn
data_s3_bucket = var.data_s3_bucket
data_bucket_key_arn = var.data_s3_bucket_encryption_key_arn
data_location_s3 = var.data_location_s3
model_s3_bucket = module.s3.model_bucket.id
model_bucket_key_arn = module.s3.encryption_key.arn
preprocessing_script_path = var.preprocessing_script_path
module "iam" {
source = "./modules/iam"
tags = var.tags
region = var.region
account_id = var.account_id
model_name = var.model_name
}

Check failure (Code scanning / defsec): IAM policy should avoid use of wildcards and instead apply the principle of least privilege (Error). The flagged IAM policy documents use:

- sensitive action 'kms:Decrypt' on wildcarded resource 'arn:aws:kms:::key/*'
- wildcarded action 'kms:Decrypt'
- wildcarded action 'sagemaker:*'
- sensitive action 'sagemaker:*' on wildcarded resource 'arn:aws:sagemaker:::*'
- sensitive action 'logs:CreateLogGroup' on wildcarded resource 'arn:aws:logs:*'
- sensitive action 'logs:CreateLogStream' on wildcarded resource 'arn:aws:logs:*'
- sensitive action 'logs:DescribeLogStreams' on wildcarded resource 'arn:aws:logs:::log-group:/aws/sagemaker/TrainingJobs:log-stream:*'
- sensitive action 'logs:GetLogEvents' on wildcarded resource 'arn:aws:logs:::log-group:/aws/sagemaker/TrainingJobs:log-stream:*'
- wildcarded action 'states:*'
- sensitive action 'states:*' on wildcarded resource '*'
- sensitive action 'iam:GetRole' on wildcarded resource 'arn:aws:iam:::role/*'
- wildcarded action 'lambda:*'
- sensitive action 'lambda:*' on wildcarded resource '*'
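The flagged policy statements themselves are not shown in this diff, so the exact permissions the role needs are unknown. Purely as an illustration of the least-privilege principle, a narrowed SageMaker statement could look like the sketch below; the action list, account ID, region and ARN patterns are all assumptions, not the module's actual requirements.

```python
# Illustrative only: a SageMaker statement scoped to named resources instead of
# 'sagemaker:*' on 'arn:aws:sagemaker:::*'. All values below are placeholders.
import json

account_id = "123456789012"  # placeholder
region = "eu-west-2"         # placeholder
model_name = "test-model"    # example value from the README usage block

scoped_statement = {
    "Effect": "Allow",
    "Action": [
        "sagemaker:CreateTrainingJob",
        "sagemaker:DescribeTrainingJob",
        "sagemaker:CreateModel",
        "sagemaker:CreateEndpointConfig",
        "sagemaker:CreateEndpoint",
    ],
    "Resource": [
        f"arn:aws:sagemaker:{region}:{account_id}:training-job/{model_name}*",
        f"arn:aws:sagemaker:{region}:{account_id}:model/{model_name}*",
        f"arn:aws:sagemaker:{region}:{account_id}:endpoint-config/{model_name}*",
        f"arn:aws:sagemaker:{region}:{account_id}:endpoint/{model_name}*",
    ],
}

print(json.dumps(scoped_statement, indent=2))
```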
module "retraining_job" {
count = var.retrain_model_bool ? 1 : 0
source = "./modules/glue"

# Naming
resource_naming_prefix = var.resource_naming_prefix
tags = var.tags

# S3
config_s3_bucket = module.s3.config_bucket.id
config_bucket_key_arn = module.s3.encryption_key.arn
data_s3_bucket = var.data_s3_bucket
data_bucket_key_arn = var.data_s3_bucket_encryption_key_arn
data_location_s3 = var.data_location_s3

# Glue
module "retraining_job" {
count = var.retrain_model_bool ? 1 : 0
source = "./modules/glue"
model_name = var.model_name
tags = var.tags
config_bucket_id = module.s3.config_bucket_id
data_location_s3 = var.data_location_s3
retraining_schedule = var.retraining_schedule
region = var.region
account_id = var.account_id
}


module "ecr" {
source = "./modules/ecr"

resource_naming_prefix = var.resource_naming_prefix
tags = var.tags
source = "./modules/ecr"
pycaret_ecr_name = var.pycaret_ecr_name
}
Binary file removed mlops_ml_models/architectural_diagram.png
Binary file not shown.
21 changes: 21 additions & 0 deletions mlops_ml_models/data_cleaning.env
@@ -0,0 +1,21 @@
data_location_s3 = "cleaned_ethan_data.csv"

target = "y"

algorithm_choice="classification"

endpoint_name="classification-proba-endpoint"

model_name = "banking-classification"


ecr_repo_uri = "mlops-classification-repo"

instance_type = "ml.m4.xlarge"


model_instance_count=1

tuning_metric = "AUC"

pycaret_ecr_name = "mlops-classification-repo"
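How the notebook consumes data_cleaning.env is not part of this diff; assuming it is read as an ordinary dotenv file, a minimal sketch with python-dotenv would be:

```python
# Minimal sketch, assuming data_cleaning.env is loaded as a standard dotenv file
# (the actual loading code is not included in this PR). Requires python-dotenv.
import os

from dotenv import load_dotenv

load_dotenv("mlops_ml_models/data_cleaning.env")  # read key/value pairs into the environment

algorithm_choice = os.getenv("algorithm_choice")               # "classification"
data_location_s3 = os.getenv("data_location_s3")               # "cleaned_ethan_data.csv"
model_instance_count = int(os.getenv("model_instance_count"))  # 1

print(algorithm_choice, data_location_s3, model_instance_count)
```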
7 changes: 5 additions & 2 deletions mlops_ml_models/delete_sagemaker_endpoint.py
@@ -32,8 +32,11 @@ def delete_sagemaker_endpoint(endpoint_name: str) -> None:
sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

# Delete endpoint configuration
sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
sagemaker_client.delete_endpoint_config(
EndpointConfigName=endpoint_name
)

print(f"Endpoint '{endpoint_name}' and its configuration have " "been deleted.")
print(f"Endpoint '{endpoint_name}' and its configuration have "
"been deleted.")
else:
print("Endpoint deletion cancelled.")
22 changes: 7 additions & 15 deletions mlops_ml_models/deploy_model_endpoint.py
@@ -2,36 +2,28 @@


def deploy_model(
model_name: str,
model_type: str,
model_s3_bucket: str,
instance_type: str,
endpoint_name,
role: str,
inference_instance_count: int,
image_uri: str,
model_name: str, instance_type: str, endpoint_name,
role: str, model_instance_count: int, image_uri: str
) -> None:

"""This script deploys the sagemaker endpoint using the tar.gz file
saved in s3.

Args:
model_name (str): The name of the model file in s3 (without file extension)
model_type (str): The type of model deployed e.g. regression
model_s3_bucket (str): The name of the bucket within which the model file resides
model_name (str): The name of the bucket and name of the file in s3
instance_type (str): The sagemaker instance type you want to deploy
endpoint_name (str): What you would like to call the endpoint.
role (str): Your execution role
inference_instance_count (int): initial instance number of model
model_instance_count (int): The initial number of instances serving the model
"""
model_file = f"s3://{model_s3_bucket}/{model_name}.tar.gz"
model_file = f"s3://{model_name}-model/{model_name}.tar.gz"
model = Model(
image_uri=(image_uri), # The ECR image you pushed
model_data=model_file, # Location of your serialized model
role=role,
env={"MODEL_NAME": model_name, "MODEL_TYPE": model_type},
)
model.deploy(
initial_instance_count=inference_instance_count,
initial_instance_count=model_instance_count,
instance_type=instance_type,
endpoint_name=endpoint_name,
)
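A minimal usage sketch with the new signature; the role ARN and image URI are placeholders, and the other values mirror data_cleaning.env.

```python
from mlops_ml_models.deploy_model_endpoint import deploy_model

deploy_model(
    model_name="banking-classification",  # reads s3://banking-classification-model/banking-classification.tar.gz
    instance_type="ml.m4.xlarge",
    endpoint_name="classification-proba-endpoint",
    role="arn:aws:iam::123456789012:role/sagemaker-execution-role",  # placeholder ARN
    model_instance_count=1,
    image_uri="123456789012.dkr.ecr.eu-west-2.amazonaws.com/mlops-classification-repo:latest",  # placeholder URI
)
```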
3 changes: 2 additions & 1 deletion mlops_ml_models/finalize_and_save_model.py
@@ -1,7 +1,8 @@
import importlib


def finalize_and_save_model(algorithm_choice: str, bestModel: str, model_name: str):
def finalize_and_save_model(algorithm_choice: str, bestModel: str,
model_name: str):
"""
Finalizes the best model obtained from PyCaret and saves it locally.

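A minimal usage sketch, assuming a typical PyCaret classification workflow; the data file, target column and metric are the example values from data_cleaning.env.

```python
import pandas as pd
from pycaret.classification import compare_models, setup

from mlops_ml_models.finalize_and_save_model import finalize_and_save_model

df = pd.read_csv("cleaned_ethan_data.csv")  # example file name from data_cleaning.env
setup(data=df, target="y")                  # initialise the PyCaret experiment
best_model = compare_models(sort="AUC")     # rank candidate models by the tuning metric
finalize_and_save_model(
    algorithm_choice="classification",
    bestModel=best_model,
    model_name="banking-classification",
)
```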
2 changes: 1 addition & 1 deletion mlops_ml_models/load_data.py
@@ -18,7 +18,7 @@ def load_data(data_location: str) -> pd.DataFrame:
df = pd.read_csv(data_location, low_memory=False)
# Drop unnamed columns. You should comment this portion out before
# using the script if you don't have unnamed columns
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
return df
except Exception as e:
print(f"Error loading data: {e}")