diff --git a/.github/workflows/deploy_on_release.yaml b/.github/workflows/deploy_on_release.yaml deleted file mode 100644 index 2957f47..0000000 --- a/.github/workflows/deploy_on_release.yaml +++ /dev/null @@ -1,37 +0,0 @@ -name: model-deploy-on-release - -on: - release: - types: - - 'created' - -jobs: - run: - runs-on: [ubuntu-latest] - container: docker://dvcorg/cml-py3:latest - steps: - - uses: actions/checkout@v2 - - name: 'Deploy/Update on new release' - shell: bash - - env: - repo_token: ${{ secrets.GITHUB_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - CRED_SECRET: ${{ secrets.IBM_CREDENTIALS_PASS }} - - run: | - # Install requirements - pip install -r requirements.txt - - # Pull data & run-cache from S3 and reproduce pipeline - dvc pull --run-cache - dvc repro - - # Decrypt credentials file - gpg --quiet --batch --yes --decrypt --passphrase="$CRED_SECRET" --output credentials.yaml credentials.yaml.gpg - - # Check if there is a deployment already, if positive update it, otherwise deploys it for the first time - ./src/scripts/Scripts/git_release_pipeline.sh - - \ No newline at end of file diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml new file mode 100644 index 0000000..dabe9d9 --- /dev/null +++ b/.github/workflows/pylint.yaml @@ -0,0 +1,23 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') --rcfile=.pylintrc \ No newline at end of file diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index 
d822181..c92dd13 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -7,22 +7,22 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10] + python-version: ["3.10"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pytest black if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Test with pytest - run: | - pytest - name: Python Black run: | + pip install black==24.10.0 black . --check + - name: Test with pytest + run: | + pytest --cov-report html:./results/cov_html --cov=src tests/ diff --git a/.github/workflows/train_evaluate.yaml b/.github/workflows/train_evaluate.yaml deleted file mode 100644 index 15a99b4..0000000 --- a/.github/workflows/train_evaluate.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: model-training-evaluate - -on: [push] - -jobs: - run: - runs-on: [ubuntu-latest] - container: docker://dvcorg/cml-py3:latest - steps: - - uses: actions/checkout@v2 - - name: 'Train and Evaluate model' - shell: bash - env: - repo_token: ${{ secrets.GITHUB_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | - # Install requirements - pip install -r requirements.txt - - # Pull data & run-cache from S3 and reproduce pipeline - dvc pull --run-cache - dvc repro - - # Report metrics - echo "## Metrics" >> report.md - git fetch --prune - dvc metrics diff master --show-md >> report.md - - # Publish confusion matrix diff - echo -e "## Plots\n### ROC Curve" >> report.md - cml-publish ./results/roc_curve.png --md >> report.md - echo -e "\n### Precision and Recall Curve" >> report.md - cml-publish ./results/precision_recall_curve.png --md >> report.md - 
cml-send-comment report.md diff --git a/.gitignore b/.gitignore index c424ea0..b553d8b 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,4 @@ coverage.xml docs/_build/ # PyBuilder -target/ \ No newline at end of file +target/ diff --git a/.pylintrc b/.pylintrc index 16d19d6..335fc53 100644 --- a/.pylintrc +++ b/.pylintrc @@ -7,7 +7,7 @@ analyse-fallback-blocks=no # Clear in-memory caches upon conclusion of linting. Useful if running pylint # in a server-like mode. -clear-cache-post-run=no +# clear-cache-post-run=no # Load and enable all available extensions. Use --list-extensions to see a list # all available extensions. @@ -89,7 +89,7 @@ persistent=yes # Resolve imports to .pyi stubs if available. May reduce no-member messages and # increase not-an-iterable messages. -prefer-stubs=no +# prefer-stubs=no # Minimum Python version to use for version dependent checks. Will default to # the version used to run pylint. @@ -102,7 +102,7 @@ recursive=no # source root is an absolute path or a path relative to the current working # directory used to determine a package namespace for modules located under the # source root. -source-roots= +# source-roots= # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. @@ -363,7 +363,7 @@ single-line-if-stmt=no allow-any-import-level= # Allow explicit reexports by alias from a package __init__. -allow-reexport-from-package=no +# allow-reexport-from-package=no # Allow wildcard imports from modules that define __all__. allow-wildcard-with-all=no @@ -433,9 +433,8 @@ disable=raw-checker-failed, suppressed-message, useless-suppression, deprecated-pragma, - use-implicit-booleaness-not-comparison-to-string, - use-implicit-booleaness-not-comparison-to-zero, - use-symbolic-message-instead + use-symbolic-message-instead, + duplicate-code # Enable the message, report, category or checker with the given id(s). 
You can # either give multiple identifier separated by comma (,) or put this option @@ -448,7 +447,7 @@ enable= # List of qualified names (i.e., library.method) which require a timeout # parameter e.g. 'requests.api.get,requests.api.post' -timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request +# timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request [MISCELLANEOUS] @@ -476,7 +475,7 @@ never-returning-functions=sys.exit,argparse.parse_error # Let 'consider-using-join' be raised when the separator to join on would be # non-empty (resulting in expected fixes of the type: ``"- " + " - # ".join(items)``) -suggest-join-with-non-empty-separator=yes +# suggest-join-with-non-empty-separator=yes [REPORTS] diff --git a/__init__.py b/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/docker-compose.yaml b/docker-compose.yaml index 313e7ec..3cb9130 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -18,13 +18,13 @@ services: # exposing the port that will be used by MLFlow and Jupyter Notebook ports: - "8888:8888" - + # setting external volumes volumes: - ./credentials.yaml:/e2e-project/credentials.yaml - ./models:/e2e-project/models/ - ./data:/e2e-project/data/ - + dev-mlflow: image: e2e-dev:latest build: @@ -39,9 +39,9 @@ services: # exposing the port that will be used by MLFlow and Jupyter Notebook ports: - "5000:5000" - + # setting external volumes volumes: - ./credentials.yaml:/e2e-project/credentials.yaml - ./models:/e2e-project/models/ - - ./data:/e2e-project/data/ \ No newline at end of file + - ./data:/e2e-project/data/ diff --git a/requirements.txt b/requirements.txt index 7218aeb..60fd637 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ -boto3==1.35.6 +boto3==1.24.28 fastapi==0.115.5 
joblib==1.3.2 loguru==0.7.2 -mlflow==2.17.2 -numpy==2.1.3 -pandas==1.5.2 +mlflow==2.15.1 +numpy==1.21.5 +pandas==1.4.4 pydantic==2.9.2 -pytest==8.3.3 -PyYAML==6.0.2 +pytest==7.2.2 +pytest-cov==6.0.0 +PyYAML==6.0 scikit_learn==1.3.2 -xgboost==2.1.2 \ No newline at end of file +xgboost==2.1.2 diff --git a/src/api.py b/src/api.py index bf11eee..22d980a 100644 --- a/src/api.py +++ b/src/api.py @@ -1,3 +1,8 @@ +""" +API's main file. +""" +from typing import Dict + import pandas as pd import mlflow from fastapi import FastAPI @@ -14,15 +19,21 @@ if aws_credentials.EC2 != "YOUR_EC2_INSTANCE_URL": mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000") else: - mlflow.set_tracking_uri(f"http://127.0.0.1:5000") + mlflow.set_tracking_uri("http://127.0.0.1:5000") @app.get("/version") -def check_versions(): +def check_versions() -> Dict: + """ + This endpoint will return the current model and code versions. + + Returns: + Dict: the model and code versions. + """ with open( f"{general_settings.RESEARCH_ENVIRONMENT_PATH}/VERSION", "r", encoding="utf-8" - ) as f: - code_version = f.readline().strip() + ) as file: + code_version = file.readline().strip() return { "code_version": code_version, @@ -31,7 +42,17 @@ def check_versions(): @app.get("/predict") -async def prediction(person: Person): +async def prediction(person: Person) -> Dict: + """ + This endpoint is used to make a prediction (with the trained model) + with the given data. + + Args: + person (Person): a person's data. + + Returns: + Dict: the predictions. 
+ """ loaded_model = ModelServe( model_name=model_settings.MODEL_NAME, model_flavor=model_settings.MODEL_FLAVOR, @@ -40,6 +61,6 @@ async def prediction(person: Person): loaded_model.load() data = pd.DataFrame.from_dict([person.model_dump()]) - X = data_processing_inference(data) + features = data_processing_inference(data) - return {"predictions": loaded_model.predict(X).tolist()} + return {"predictions": loaded_model.predict(features).tolist()} diff --git a/src/config/aws.py b/src/config/aws.py index 42a86c6..245f0b0 100644 --- a/src/config/aws.py +++ b/src/config/aws.py @@ -1,3 +1,6 @@ +""" +Creates a Pydantic base model for the AWS credentials. +""" from pathlib import Path from pydantic import BaseModel diff --git a/src/config/kaggle.py b/src/config/kaggle.py index 426ed5d..f8f426c 100644 --- a/src/config/kaggle.py +++ b/src/config/kaggle.py @@ -1,3 +1,6 @@ +""" +Creates a Pydantic base model for the Kaggle credentials. +""" from pathlib import Path from pydantic import BaseModel diff --git a/src/config/model.py b/src/config/model.py index d36eabd..67cd5b5 100644 --- a/src/config/model.py +++ b/src/config/model.py @@ -1,3 +1,6 @@ +""" +Creates a Pydantic base model for the model's configuration. +""" from typing import List from pathlib import Path diff --git a/src/config/settings.py b/src/config/settings.py index 6b81767..7c0a91a 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -1,3 +1,6 @@ +""" +Creates a Pydantic base model for the general configuration settings. +""" import os from pathlib import Path diff --git a/src/config/utils.py b/src/config/utils.py index c8921ef..d98d430 100644 --- a/src/config/utils.py +++ b/src/config/utils.py @@ -1,38 +1,11 @@ -import yaml +""" +Stores auxiliary functions (such as for reading a YAML file) +that will be used with the main configuration functions. 
+""" from pathlib import Path -from typing import Dict, Optional, Type, Any, Tuple -from copy import deepcopy - -from pydantic import BaseModel, create_model -from pydantic.fields import FieldInfo - +from typing import Dict -def partial_model(model: Type[BaseModel]): - """Workaround for setting all Pydantic's fields as optional. - All credits goes to the author: - https://stackoverflow.com/questions/67699451/make-every-field-as-optional-with-pydantic - - Args: - model (Type[BaseModel]): Pydantic base model instance. - """ - - def make_field_optional( - field: FieldInfo, default: Any = None - ) -> Tuple[Any, FieldInfo]: - new = deepcopy(field) - new.default = default - new.annotation = Optional[field.annotation] # type: ignore - return new.annotation, new - - return create_model( - f"Partial{model.__name__}", - __base__=model, - __module__=model.__module__, - **{ - field_name: make_field_optional(field_info) - for field_name, field_info in model.model_fields.items() - }, - ) +import yaml def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict: @@ -43,7 +16,7 @@ def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict: file_name (str): the file's name. Raises: - e: If any error occurs when trying to read the YAML + error: If any error occurs when trying to read the YAML file, then returns the error to the user. 
Returns: @@ -54,10 +27,10 @@ def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict: file_name, ) - with open(path, "r", encoding="utf-8") as f: + with open(path, "r", encoding="utf-8") as file: try: - context = yaml.safe_load(f) - except yaml.YAMLError as e: - raise e + context = yaml.safe_load(file) + except yaml.YAMLError as error: + raise error return context diff --git a/src/data/processing.py b/src/data/processing.py index b78c427..8af3409 100644 --- a/src/data/processing.py +++ b/src/data/processing.py @@ -1,3 +1,7 @@ +""" +Stores data processing functions, such as for cleaning the data, creating new features, +encoding categorical columns, and so on. +""" import pathlib from typing import List, Dict @@ -6,9 +10,9 @@ from loguru import logger from sklearn.preprocessing import StandardScaler, OneHotEncoder -from .utils import load_feature -from ..config.settings import general_settings from ..config.model import model_settings +from ..config.settings import general_settings +from .utils import load_feature def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: @@ -67,16 +71,17 @@ def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: logger.info( f"Loading scalers 'features_sc' from path {general_settings.ARTIFACTS_PATH}." 
) - sc = load_feature(path=general_settings.ARTIFACTS_PATH, feature_name="features_sc") + scalers = load_feature( + path=general_settings.ARTIFACTS_PATH, feature_name="features_sc" + ) # Scaling numerical columns - dataframe = _scale_numerical_columns(dataframe=dataframe, sc=sc) + dataframe = _scale_numerical_columns(dataframe=dataframe, scalers=scalers) # Encoding categorical columns dataframe = _encode_categorical_columns( dataframe=dataframe, encoders=encoders, - target_column=general_settings.TARGET_COLUMN, ) # Selecting only the features that are important for the model @@ -85,8 +90,8 @@ def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: f"Filtering the features columns, keeping only {model_settings.FEATURES} columns." ) - X = dataframe.values - return X + features = dataframe.values + return features def _drop_features(dataframe: pd.DataFrame, features: List) -> pd.DataFrame: @@ -137,16 +142,16 @@ def _remove_outliers(dataframe: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: the dataframe without outliers. """ # Calculating the upper and lower limits - q1 = dataframe["Age"].quantile(0.25) - q3 = dataframe["Age"].quantile(0.75) + quartil_1 = dataframe["Age"].quantile(0.25) + quartil_3 = dataframe["Age"].quantile(0.75) threshold = 3.5 - iqr = q3 - q1 + iqr = quartil_3 - quartil_1 # Removing the data samples that exceeds the upper or lower limits dataframe = dataframe[ ~( - (dataframe["Age"] >= (q3 + threshold * iqr)) - | (dataframe["Age"] <= (q1 - threshold * iqr)) + (dataframe["Age"] >= (quartil_3 + threshold * iqr)) + | (dataframe["Age"] <= (quartil_1 - threshold * iqr)) ) ] @@ -205,8 +210,8 @@ def _calculate_bmr(age: int, gender: str, height: float, weight: float) -> float Returns: float: the BMR value. 
""" - s = -161 if gender == "Female" else 5 - return (10 * weight) + (6.25 * height) - (5 * age) + s + s_value = -161 if gender == "Female" else 5 + return (10 * weight) + (6.25 * height) - (5 * age) + s_value dataframe["BMR"] = dataframe.apply( lambda x: _calculate_bmr(x["Age"], x["Gender"], x["Height"], x["Weight"]), @@ -251,21 +256,21 @@ def _transform_numerical_columns( numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist() logger.info(f"Applying Log Transformation to the {numerical_columns} columns.") - for nc in numerical_columns: - dataframe[nc] = np.log1p(dataframe[nc].values + epsilon) + for column in numerical_columns: + dataframe[column] = np.log1p(dataframe[column].values + epsilon) return dataframe def _scale_numerical_columns( dataframe: pd.DataFrame, - sc: Dict[str, StandardScaler], + scalers: Dict[str, StandardScaler], ) -> pd.DataFrame: """Scales the numerical columns using the Standard technique. Args: dataframe (pd.DataFrame): the dataframe. - sc (Dict[str, OneHotEncoder]): a dict containing the corresponding + scalers (Dict[str, StandardScaler]): a dict containing the corresponding encoder for each feature. Returns: @@ -274,8 +279,10 @@ def _scale_numerical_columns( numerical_columns = dataframe.select_dtypes(exclude="object").columns.tolist() logger.info(f"Scaling the {numerical_columns} columns.") - for nc in numerical_columns: - dataframe[nc] = sc[nc].transform(dataframe[nc].values.reshape(-1, 1)) + for column in numerical_columns: + dataframe[column] = scalers[column].transform( + dataframe[column].values.reshape(-1, 1) + ) return dataframe @@ -283,7 +290,6 @@ def _scale_numerical_columns( def _encode_categorical_columns( dataframe: pd.DataFrame, encoders: Dict[str, OneHotEncoder], - target_column: str, ) -> pd.DataFrame: """Encodes the categorical columns using the OneHot technique. @@ -291,23 +297,21 @@ def _encode_categorical_columns( dataframe (pd.DataFrame): the dataframe. 
encoders (Dict[str, OneHotEncoder]): a dict containing the corresponding encoder for each feature. - target_column (str): what column is the target label. Returns: pd.DataFrame: the dataframe with all categorical columns encoded. """ categorical_columns = dataframe.select_dtypes(include="object").columns.tolist() - # categorical_columns.remove(target_column) logger.info(f"Encoding the {categorical_columns} columns.") new_dataframe = pd.DataFrame() - for cc in categorical_columns: + for column in categorical_columns: train_categorical_features = pd.DataFrame( - encoders[cc].transform(dataframe[cc].values.reshape(-1, 1)), - columns=encoders[cc].get_feature_names_out(), + encoders[column].transform(dataframe[column].values.reshape(-1, 1)), + columns=encoders[column].get_feature_names_out(), ) - train_categorical_features = train_categorical_features.add_prefix(cc + "_") + train_categorical_features = train_categorical_features.add_prefix(column + "_") new_dataframe = pd.concat([new_dataframe, train_categorical_features], axis=1) new_dataframe = pd.concat( diff --git a/src/data/utils.py b/src/data/utils.py index 306ad4d..9730dd5 100644 --- a/src/data/utils.py +++ b/src/data/utils.py @@ -1,12 +1,16 @@ -import os +""" +Stores auxiliary functions (such as for loading features or downloading the dataset) +that will be used with the main data processing functions. +""" import pathlib -import joblib +import os from typing import Union import boto3 +import joblib import numpy as np from loguru import logger -from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.preprocessing import OneHotEncoder, StandardScaler from ..config.aws import aws_credentials from ..config.kaggle import kaggle_credentials @@ -79,13 +83,13 @@ def send_dataset_to_s3( file_path (pathlib.Path): the dataset file's path. file_name (str): the file's name. 
""" - s3 = boto3.client( + bucket = boto3.client( "s3", aws_access_key_id=aws_credentials.AWS_ACCESS_KEY, aws_secret_access_key=aws_credentials.AWS_SECRET_KEY, ) - s3.upload_file( + bucket.upload_file( file_path, aws_credentials.S3, file_name, diff --git a/src/model/builder.py b/src/model/builder.py deleted file mode 100644 index 55269ab..0000000 --- a/src/model/builder.py +++ /dev/null @@ -1,68 +0,0 @@ -import mlflow -import numpy as np -import pandas as pd -from loguru import logger -from sklearn.metrics import f1_score -from sklearn.model_selection import train_test_split -from xgboost import XGBClassifier - -from ..config.model import model_settings -from ..data.processing import data_processing - - -class ModelBuilder: - """The trained model's class.""" - - def __init__( - self, - model_name: str, - model_flavor: str, - model_version: str, - ) -> None: - """Model's instance initializer. - - Args: - model_name (str): the model's name. - model_flavor (str): the model's MLflow flavor. - model_version (str): the model's version. - """ - self.model_name = model_name - self.model_flavor = model_flavor - self.model_version = model_version - self.model = None - - @logger.catch - def train(self, dataframe: pd.DataFrame) -> None: - logger.info("Pre-processing the data before training the model.") - - # Pre-processing and cleaning the data - X, y = data_processing(dataframe) - - logger.info( - "Splitting the data into training and validation using 90/10 split." 
- ) - - # Splitting the data into training and validation - X_train, X_valid, y_train, y_valid = train_test_split( - X, - y, - test_size=0.1, - shuffle=True, - stratify=y, - ) - - logger.info("Training the model using the given data.") - self.model = XGBClassifier() - self.model.fit(X_train, y_train) - - # Assessing the model's performance on the training set - train_prediction = np.argmax(self.model.predict(X_train), axis=1) - _y_train = np.argmax(y_train, axis=1).reshape(-1) - score = f1_score(y_true=_y_train, y_pred=train_prediction, average="weighted") - logger.info(f"Achieved a weighted F1-Score of {score} on the training set.") - - # Assessing the model's performance on the validation set - valid_prediction = np.argmax(self.model.predict(X_valid), axis=1) - _y_valid = np.argmax(y_valid, axis=1).reshape(-1) - score = f1_score(y_true=_y_valid, y_pred=valid_prediction, average="weighted") - logger.info(f"Achieved a weighted F1-Score of {score} on the validation set.") diff --git a/src/model/inference.py b/src/model/inference.py index 3605e36..5d3455f 100644 --- a/src/model/inference.py +++ b/src/model/inference.py @@ -1,7 +1,12 @@ +""" +Stores a model serve class that will be used to make predictions with +the trained model. 
+""" import mlflow import numpy as np from loguru import logger +from ..config.aws import aws_credentials from ..config.model import model_settings from ..config.settings import general_settings from ..data.utils import load_feature @@ -10,6 +15,11 @@ path=general_settings.ARTIFACTS_PATH, feature_name="label_ohe" ) +if aws_credentials.EC2 != "YOUR_EC2_INSTANCE_URL": + mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000") +else: + mlflow.set_tracking_uri("http://127.0.0.1:5000") + class ModelServe: """The trained model's class.""" @@ -53,20 +63,20 @@ def load(self) -> None: ) raise NotImplementedError() - def predict(self, x: np.ndarray, transform_to_str: bool = True) -> np.ndarray: + def predict( + self, features: np.ndarray, transform_to_str: bool = True + ) -> np.ndarray: """Uses the trained model to make a prediction on a given feature array. Args: - x (np.ndarray): the features array. + features (np.ndarray): the features array. transform_to_str (bool): whether to transform the prediction integer to string or not. Defaults to True. Returns: np.ndarray: the predictions array. """ - prediction = self.model.predict(x) - - print(prediction.shape) + prediction = self.model.predict(features) if transform_to_str: prediction = label_encoder.inverse_transform(prediction) diff --git a/src/schema/person.py b/src/schema/person.py index 5e7ff0e..1238fc3 100644 --- a/src/schema/person.py +++ b/src/schema/person.py @@ -1,8 +1,33 @@ +""" +Creates a person schema with Pydantic's base model, which will be used to +validate the parameters value when passed to the API. +""" from typing import Literal from pydantic import BaseModel, Field, field_validator +@field_validator("Age", "Height", "Weight", "FCVC") +def prevent_zero(_, value: int): + """ + A function that will validate the parameter value for the + 'Age', 'Height', 'Weight', and 'FCVC' features. + + Args: + _ (str): the parameter's name (ignored). + value (int): the given parameter value for that feature. 
+ + Raises: + ValueError: raises an error if the value is zero. + + Returns: + int: the parameter's value. + """ + if value == 0: + raise ValueError("Ensure this value is not 0.") + return value + + class Person(BaseModel): """ Person schema. @@ -56,9 +81,3 @@ class Person(BaseModel): ] } } - - @field_validator("Age", "Height", "Weight", "FCVC") - def prevent_zero(cls, v): - if v == 0: - raise ValueError("Ensure this value is not 0.") - return v diff --git a/tests/integration/test_data_processing.py b/tests/integration/test_data_processing.py index a2b26c5..696986c 100644 --- a/tests/integration/test_data_processing.py +++ b/tests/integration/test_data_processing.py @@ -1,11 +1,14 @@ +""" +Integration cases to test the data processing pipeline. +""" import pathlib -import pandas as pd import numpy as np +import pandas as pd -from src.config.settings import general_settings -from src.config.model import model_settings from src.data.processing import data_processing_inference, load_dataset +from src.config.model import model_settings +from src.config.settings import general_settings # loading the raw dataset that was used to train the model @@ -16,15 +19,15 @@ ) -def test_data_processing_pipeline(): +def test_data_processing_pipeline() -> None: """ Testing the integration of the entire data processing pipeline. 
""" _dataset = dataset.copy() _dataset = _dataset.drop(columns=general_settings.TARGET_COLUMN) - X = data_processing_inference(dataframe=_dataset) + features = data_processing_inference(dataframe=_dataset) assert isinstance(_dataset, pd.DataFrame) - assert isinstance(X, np.ndarray) - assert X.shape[1] == len(model_settings.FEATURES) + assert isinstance(features, np.ndarray) + assert features.shape[1] == len(model_settings.FEATURES) diff --git a/tests/integration/test_model_inference.py b/tests/integration/test_model_inference.py index d3919eb..3b63215 100644 --- a/tests/integration/test_model_inference.py +++ b/tests/integration/test_model_inference.py @@ -1,10 +1,13 @@ +""" +Integration cases to test the model inference pipeline. +""" import pathlib -import pandas as pd import numpy as np +import pandas as pd -from src.config.settings import general_settings from src.config.model import model_settings +from src.config.settings import general_settings from src.data.processing import data_processing_inference, load_dataset from src.model.inference import ModelServe @@ -16,18 +19,18 @@ ) -def test_model_inference_pipeline(): +def test_model_inference_pipeline() -> None: """ Testing the integration of the entire model inference pipeline. 
""" _dataset = dataset.copy() _dataset = _dataset.drop(columns=general_settings.TARGET_COLUMN) - X = data_processing_inference(dataframe=_dataset) + features = data_processing_inference(dataframe=_dataset) assert isinstance(_dataset, pd.DataFrame) - assert isinstance(X, np.ndarray) - assert X.shape[1] == len(model_settings.FEATURES) + assert isinstance(features, np.ndarray) + assert features.shape[1] == len(model_settings.FEATURES) loaded_model = ModelServe( model_name=model_settings.MODEL_NAME, @@ -38,15 +41,15 @@ def test_model_inference_pipeline(): assert loaded_model.model is not None - predictions = loaded_model.predict(X, transform_to_str=False) + predictions = loaded_model.predict(features, transform_to_str=False) assert isinstance(predictions, np.ndarray) - assert predictions.shape[0] == X.shape[0] + assert predictions.shape[0] == features.shape[0] assert isinstance(predictions.dtype, type(np.dtype("float64"))) # FIXME: fix this - # predictions = loaded_model.predict(X, transform_to_str=True) + # predictions = loaded_model.predict(x, transform_to_str=True) # assert isinstance(predictions, List) - # assert len(predictions) == X.shape[0] + # assert len(predictions) == x.shape[0] # assert isinstance(type(predictions[0]), str) diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py index 59e3c3c..8212c10 100644 --- a/tests/unit/test_api.py +++ b/tests/unit/test_api.py @@ -1,3 +1,6 @@ +""" +Unit test cases to test the API code. +""" import json from pathlib import Path from typing import Dict diff --git a/tests/unit/test_data_functions.py b/tests/unit/test_data_functions.py index d157bf7..42a0d78 100644 --- a/tests/unit/test_data_functions.py +++ b/tests/unit/test_data_functions.py @@ -1,31 +1,35 @@ +""" +Unit test cases to test the data functions code. 
+""" +import os import pathlib import re -import os from typing import List -import boto3 -import pytest -import pandas.api.types as ptypes +# import boto3 import pandas as pd +import pandas.api.types as ptypes +import pytest import numpy as np +from src.config.settings import general_settings + +# from src.config.aws import aws_credentials from src.data.processing import ( - _drop_features, + _categorize_numerical_columns, _change_height_units, - load_dataset, - _remove_duplicates, - _remove_outliers, - _create_is_feature, _create_bmi_feature, _create_bmr_feature, - _categorize_numerical_columns, + _create_is_feature, + _drop_features, + _encode_categorical_columns, + _remove_duplicates, + _remove_outliers, _scale_numerical_columns, _transform_numerical_columns, - _encode_categorical_columns, + load_dataset, ) -from src.config.settings import general_settings -from src.config.aws import aws_credentials -from src.data.utils import load_feature, download_dataset +from src.data.utils import download_dataset, load_feature # loading the raw dataset that was used to train the model dataset = load_dataset( @@ -186,16 +190,18 @@ def test_scale_numerical_columns(): numerical_columns = _dataset.select_dtypes(exclude="object").columns.tolist() - sc = load_feature(path=general_settings.ARTIFACTS_PATH, feature_name="features_sc") - _dataset2 = _scale_numerical_columns(dataframe=_dataset, sc=sc) + scalers = load_feature( + path=general_settings.ARTIFACTS_PATH, feature_name="features_sc" + ) + _dataset2 = _scale_numerical_columns(dataframe=_dataset, scalers=scalers) - for nc in numerical_columns: - assert _dataset2[nc].mean(axis=0) == 0 - assert _dataset2[nc].std(axis=0) == 1 - assert _dataset[nc].mean(axis=0) > _dataset2[nc].mean(axis=0) - assert _dataset[nc].var(axis=0) > _dataset2[nc].var(axis=0) - assert _dataset[nc].std(axis=0) > _dataset2[nc].std(axis=0) - assert isinstance(_dataset2[nc].dtype, type(np.dtype("float64"))) + for column in numerical_columns: + assert 
_dataset2[column].mean(axis=0) == 0 + assert _dataset2[column].std(axis=0) == 1 + assert _dataset[column].mean(axis=0) > _dataset2[column].mean(axis=0) + assert _dataset[column].var(axis=0) > _dataset2[column].var(axis=0) + assert _dataset[column].std(axis=0) > _dataset2[column].std(axis=0) + assert isinstance(_dataset2[column].dtype, type(np.dtype("float64"))) def test_transform_numerical_columns(): @@ -214,12 +220,12 @@ def test_transform_numerical_columns(): _dataset2 = _transform_numerical_columns(dataframe=_dataset) - for nc in numerical_columns: - assert _dataset[nc].mean(axis=0) != _dataset2[nc].mean(axis=0) - assert _dataset[nc].var(axis=0) != _dataset2[nc].var(axis=0) - assert _dataset[nc].std(axis=0) != _dataset2[nc].std(axis=0) - assert _dataset[nc].max(axis=0) != _dataset2[nc].max(axis=0) - assert _dataset[nc].min(axis=0) != _dataset2[nc].min(axis=0) + for column in numerical_columns: + assert _dataset[column].mean(axis=0) != _dataset2[column].mean(axis=0) + assert _dataset[column].var(axis=0) != _dataset2[column].var(axis=0) + assert _dataset[column].std(axis=0) != _dataset2[column].std(axis=0) + assert _dataset[column].max(axis=0) != _dataset2[column].max(axis=0) + assert _dataset[column].min(axis=0) != _dataset2[column].min(axis=0) def test_encode_categorical_columns(): @@ -234,13 +240,14 @@ def test_encode_categorical_columns(): encoders = load_feature( path=general_settings.ARTIFACTS_PATH, feature_name="features_ohe" ) - _dataset2 = _encode_categorical_columns( - dataframe=_dataset, encoders=encoders, target_column="NObeyesdad" - ) + _dataset2 = _encode_categorical_columns(dataframe=_dataset, encoders=encoders) - for cc in categorical_columns: - assert cc in _dataset.columns.tolist() and cc not in _dataset2.columns.tolist() - assert any(re.findall(f"{cc}_", c) for c in _dataset2.columns.tolist()) + for column in categorical_columns: + assert ( + column in _dataset.columns.tolist() + and column not in _dataset2.columns.tolist() + ) + assert 
any(re.findall(f"{column}_", c) for c in _dataset2.columns.tolist()) assert _dataset.shape[1] != _dataset2.shape[1] diff --git a/tests/unit/test_model_functions.py b/tests/unit/test_model_functions.py index f7f1935..841a8d6 100644 --- a/tests/unit/test_model_functions.py +++ b/tests/unit/test_model_functions.py @@ -1,15 +1,19 @@ +""" +Unit test cases to test the model functions code. +""" import pathlib +# import numpy as np import pandas as pd -import numpy as np + +# from sklearn.metrics import f1_score from xgboost import XGBClassifier -from sklearn.metrics import f1_score -from src.model.inference import ModelServe from src.config.model import model_settings from src.config.settings import general_settings -from src.data.utils import load_feature from src.data.processing import data_processing_inference, load_dataset +from src.data.utils import load_feature +from src.model.inference import ModelServe # loading the label encoder label_encoder = load_feature( @@ -73,45 +77,45 @@ def test_prediction() -> None: correct_prediction = "Normal_Weight" data = pd.DataFrame.from_dict([data]) - X = data_processing_inference(data) - prediction = loaded_model.predict(X).tolist()[0][0] + features = data_processing_inference(data) + prediction = loaded_model.predict(features).tolist()[0][0] assert isinstance(prediction, str) assert prediction == correct_prediction -def test_model_performance() -> None: - """ - Unit case to test the model performance on training and validation sets - (making sure that are the same values as mentioned in MLflow's UI). 
- """ - # FIXME: fix this - indexes = [FEATURES_NAME.index(i) for i in model_settings.FEATURES] - - loaded_model = ModelServe( - model_name=model_settings.MODEL_NAME, - model_flavor=model_settings.MODEL_FLAVOR, - model_version=model_settings.VERSION, - ) - loaded_model.load() - - X_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_train")[ - :, indexes - ] - y_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_train") - y_train = np.max(y_train, axis=1) - - train_predictions = loaded_model.predict(X_train, transform_to_str=False) - train_score = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted") - - X_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_valid")[ - :, indexes - ] - y_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_valid") - y_valid = np.max(y_valid, axis=1) - - valid_predictions = loaded_model.predict(X_valid, transform_to_str=False) - valid_score = f1_score(y_true=y_valid, y_pred=valid_predictions, average="weighted") - - assert train_score == train_score - assert valid_score == valid_score +# def test_model_performance() -> None: +# """ +# Unit case to test the model performance on training and validation sets +# (making sure that are the same values as mentioned in MLflow's UI). 
+# """ +# # FIXME: fix this +# indexes = [FEATURES_NAME.index(i) for i in model_settings.FEATURES] + +# loaded_model = ModelServe( +# model_name=model_settings.MODEL_NAME, +# model_flavor=model_settings.MODEL_FLAVOR, +# model_version=model_settings.VERSION, +# ) +# loaded_model.load() + +# x_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_train")[ +# :, indexes +# ] +# y_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_train") +# y_train = np.max(y_train, axis=1) + +# train_predictions = loaded_model.predict(x_train, transform_to_str=False) +# train_score = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted") + +# x_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_valid")[ +# :, indexes +# ] +# y_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_valid") +# y_valid = np.max(y_valid, axis=1) + +# valid_predictions = loaded_model.predict(x_valid, transform_to_str=False) +# valid_score = f1_score(y_true=y_valid, y_pred=valid_predictions, average="weighted") + +# assert train_score == train_score +# assert valid_score == valid_score diff --git a/tests/unit/test_read_yaml_file.py b/tests/unit/test_read_yaml_file.py index 19c3c3b..bce5c28 100644 --- a/tests/unit/test_read_yaml_file.py +++ b/tests/unit/test_read_yaml_file.py @@ -1,6 +1,9 @@ +""" +Unit test cases to test the model functions code. +""" import pathlib -from typing import List from os import PathLike +from typing import List from src.config.aws import aws_credentials from src.config.kaggle import kaggle_credentials