diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d91dc93..f16daf8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,20 +1,31 @@ ---- repos: -- - repo: https://github.com/ambv/black - rev: 20.8b1 - hooks: - - - id: black - language_version: python3 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-added-large-files + - id: debug-statements + language_version: python3 -- repo: local - hooks: - - id: python-tests - name: pytests - entry: pytest src/tests - language: python - additional_dependencies: [pre-commit, pytest, pandas, sklearn, matplotlib] - always_run: true - pass_filenames: false + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + args: [--safe] + + - repo: local + hooks: + - id: pylint + name: pylint + files: . + entry: pylint + language: system + types: [python] + args: [ + "-rn", # Only display messages + "-sn", # Don't display the score + "--rcfile=.pylintrc", # Link to your config file + ] diff --git a/data/README.md b/data/README.md index eb7bb4b..a241fb4 100644 --- a/data/README.md +++ b/data/README.md @@ -12,4 +12,4 @@ Finally, you can download the dataset using the following command: bash download_data.sh ``` -The dataset will be temporarily saved locally (inside the `data` folder) and transferred to your AWS S3 bucket. After that, the dataset will be deleted. If you choose to not use an AWS S3 Bucket, then the dataset will be stored into the `data` folder. \ No newline at end of file +The dataset will be temporarily saved locally (inside the `data` folder) and transferred to your AWS S3 bucket. After that, the dataset will be deleted. If you choose not to use an AWS S3 Bucket, then the dataset will be stored in the `data` folder. diff --git a/data/download_data.sh b/data/download_data.sh index d4d2734..1e68a04 100644 --- a/data/download_data.sh +++ b/data/download_data.sh @@ -39,4 +39,3 @@ if [[ "$CONFIG_S3" != "YOUR_S3_BUCKET_URL" ]]; then # deleting the create folder rm Original_ObesityDataSet.csv - \ No newline at end of file diff --git a/notebooks/README.md b/notebooks/README.md index 8e705ac..35d0422 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -4,7 +4,7 @@ Here go the notebooks used for research and development. The main idea is to try ## Setup Credentials -If you haven't your credentials yet, please check the `docs` folder first before following along. +If you don't have your credentials yet, please check the `docs` folder first before following along. 1. Set your `AWS Credentials` and `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file. @@ -44,4 +44,4 @@ sudo docker log - Run the `EDA` notebook. - Run the `Data Processing` notebook. - Run the `Experimentations` notebook (will test different Machine Learning models, different hyperparameters for each model, and do some feature engineering and selection). -- Register the best models to the MLflow model registry using the `Experimentations` notebook (last cell) or the MLflow's user interface. \ No newline at end of file +- Register the best models to the MLflow model registry using the `Experimentations` notebook (last cell) or MLflow's user interface. 
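For reference, the model-registration step described in `notebooks/README.md` above can also be done programmatically against the same MLflow tracking server. A minimal sketch, assuming a reachable tracking URI and using placeholder values for the run ID, artifact path, and registry name (none of these come from the diff itself):

```python
# Minimal sketch: registering the best run's model in the MLflow model registry.
# The tracking URI, run ID, artifact path, and registry name are placeholders.
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")  # or http://<EC2_INSTANCE_URL>:5000

run_id = "YOUR_RUN_ID"      # placeholder: run ID of the best experiment
artifact_path = "model"     # placeholder: artifact path used when the model was logged

result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/{artifact_path}",
    name="obesity-prediction-model",  # placeholder registry name
)
print(result.name, result.version)
```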
diff --git a/notebooks/VERSION b/notebooks/VERSION index 1cc5f65..f0bb29e 100644 --- a/notebooks/VERSION +++ b/notebooks/VERSION @@ -1 +1 @@ -1.1.0 \ No newline at end of file +1.3.0 diff --git a/notebooks/dev_Dockerfile b/notebooks/dev_Dockerfile index 3a72d6d..9d61b46 100644 --- a/notebooks/dev_Dockerfile +++ b/notebooks/dev_Dockerfile @@ -17,4 +17,4 @@ WORKDIR /e2e-project RUN pip install --no-cache-dir -U pip # installing requirements -RUN pip install -r notebooks/requirements_dev.txt \ No newline at end of file +RUN pip install -r notebooks/requirements_dev.txt diff --git a/notebooks/docs/SETUP_AWS.md b/notebooks/docs/SETUP_AWS.md index fd6abe3..10ebef5 100644 --- a/notebooks/docs/SETUP_AWS.md +++ b/notebooks/docs/SETUP_AWS.md @@ -196,7 +196,7 @@ aws ec2 authorize-security-group-ingress \ --group-id "sg-0613261580cd87115" \ --protocol tcp \ --port 5000 \ - --cidr "0.0.0.0/0" + --cidr "0.0.0.0/0" ``` The output should look like this: @@ -224,7 +224,7 @@ aws ec2 authorize-security-group-ingress \ --group-id "sg-0613261580cd87115" \ --protocol tcp \ --port 22 \ - --cidr "18.206.107.24/29" + --cidr "18.206.107.24/29" ``` The output should look like this: @@ -579,4 +579,4 @@ pipenv install mlflow boto3 psycopg2-binary awscli pipenv shell aws configure -``` \ No newline at end of file +``` diff --git a/notebooks/docs/SETUP_KAGGLE.md b/notebooks/docs/SETUP_KAGGLE.md index 2f42875..2ee2f49 100644 --- a/notebooks/docs/SETUP_KAGGLE.md +++ b/notebooks/docs/SETUP_KAGGLE.md @@ -1,3 +1,3 @@ # Setting up Kaggle's Account -To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com//account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Set your `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file. \ No newline at end of file +To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com//account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Set your `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file. 
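To illustrate how the `credentials.yaml` entries from `SETUP_KAGGLE.md` are consumed, here is a rough sketch of the download flow: the `KAGGLE_USERNAME`/`KAGGLE_KEY` fields (the same ones validated in `src/config/kaggle.py`) are exported as environment variables, which the Kaggle CLI accepts as an alternative to `~/.kaggle/kaggle.json`, and the dataset is then pulled the same way `src/data/utils.py` does. The credentials path and dataset slug are placeholders:

```python
# Rough sketch: authenticating the Kaggle CLI from credentials.yaml and downloading a dataset.
# The credentials path and dataset slug below are placeholders.
import os
import subprocess

import yaml

with open("credentials.yaml", "r", encoding="utf-8") as f:
    creds = yaml.safe_load(f)

# The Kaggle CLI reads these variables instead of ~/.kaggle/kaggle.json when they are set.
os.environ["KAGGLE_USERNAME"] = creds["KAGGLE_USERNAME"]
os.environ["KAGGLE_KEY"] = creds["KAGGLE_KEY"]

subprocess.run(
    ["kaggle", "datasets", "download", "-d", "<owner>/<dataset-slug>", "--unzip"],
    check=True,
)
```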
diff --git a/notebooks/requirements_dev.txt b/notebooks/requirements_dev.txt index fa884c6..265826d 100644 --- a/notebooks/requirements_dev.txt +++ b/notebooks/requirements_dev.txt @@ -12,4 +12,4 @@ optuna==3.6.1 pandas==1.5.2 scikit_learn==1.3.2 seaborn==0.13.2 -xgboost==2.1.1 \ No newline at end of file +xgboost==2.1.1 diff --git a/requirements.txt b/requirements.txt index bd33d1b..7218aeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -scikit-learn>=0.23 -pandas -seaborn -matplotlib -joblib -numpy -ibm_watson_machine_learning -pyyaml -pytest -pytest-dependency -pre-commit \ No newline at end of file +boto3==1.35.6 +fastapi==0.115.5 +joblib==1.3.2 +loguru==0.7.2 +mlflow==2.17.2 +numpy==2.1.3 +pandas==1.5.2 +pydantic==2.9.2 +pytest==8.3.3 +PyYAML==6.0.2 +scikit_learn==1.3.2 +xgboost==2.1.2 \ No newline at end of file diff --git a/src/README.md b/src/README.md index 12a59fc..0a3ec9e 100644 --- a/src/README.md +++ b/src/README.md @@ -1,3 +1,3 @@ # Scripts -Here goes Scripts and Pipelines \ No newline at end of file +Here goes Scripts and Pipelines diff --git a/src/api.py b/src/api.py index 9d80be9..bf11eee 100644 --- a/src/api.py +++ b/src/api.py @@ -12,13 +12,16 @@ app = FastAPI() if aws_credentials.EC2 != "YOUR_EC2_INSTANCE_URL": - mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000") + mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000") else: mlflow.set_tracking_uri(f"http://127.0.0.1:5000") + @app.get("/version") def check_versions(): - with open(f"{general_settings.RESEARCH_ENVIRONMENT_PATH}/VERSION", "r", encoding="utf-8") as f: + with open( + f"{general_settings.RESEARCH_ENVIRONMENT_PATH}/VERSION", "r", encoding="utf-8" + ) as f: code_version = f.readline().strip() return { @@ -26,6 +29,7 @@ def check_versions(): "model_version": model_settings.VERSION, } + @app.get("/predict") async def prediction(person: Person): loaded_model = ModelServe( @@ -38,6 +42,4 @@ async def prediction(person: Person): data = pd.DataFrame.from_dict([person.model_dump()]) X = data_processing_inference(data) - return { - "predictions": loaded_model.predict(X).tolist() - } + return {"predictions": loaded_model.predict(X).tolist()} diff --git a/src/config/aws.py b/src/config/aws.py index e6ecaac..42a86c6 100644 --- a/src/config/aws.py +++ b/src/config/aws.py @@ -11,12 +11,14 @@ class AWSCredentials(BaseModel): Args: BaseModel (pydantic.BaseModel): Pydantic base model instance. """ + EC2: str S3: str POSTGRESQL: str AWS_ACCESS_KEY: str AWS_SECRET_KEY: str + aws_credentials = AWSCredentials( **read_yaml_credentials_file( file_path=Path.joinpath( diff --git a/src/config/kaggle.py b/src/config/kaggle.py index c093c28..426ed5d 100644 --- a/src/config/kaggle.py +++ b/src/config/kaggle.py @@ -10,6 +10,7 @@ class KaggleCredentials(BaseModel): Args: BaseModel (pydantic.BaseModel): Pydantic base model instance. """ + KAGGLE_USERNAME: str KAGGLE_KEY: str diff --git a/src/config/model.py b/src/config/model.py index bc2ce9c..d36eabd 100644 --- a/src/config/model.py +++ b/src/config/model.py @@ -12,6 +12,7 @@ class ModelSettings(BaseModel): Args: BaseModel (pydantic.BaseModel): Pydantic base model instance. 
""" + MODEL_NAME: str VERSION: str MODEL_FLAVOR: str @@ -19,6 +20,7 @@ class ModelSettings(BaseModel): RUN_ID: str FEATURES: List[str] + model_settings = ModelSettings( **read_yaml_credentials_file( file_path=Path.joinpath( diff --git a/src/config/settings.py b/src/config/settings.py index 900eb81..6b81767 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -13,6 +13,7 @@ class GeneralSettings(BaseModel): Args: BaseModel (pydantic.BaseModel): Pydantic base model instance. """ + DATA_PATH: DirectoryPath RAW_FILE_NAME: str ARTIFACTS_PATH: DirectoryPath @@ -22,6 +23,7 @@ class GeneralSettings(BaseModel): LOG_PATH: DirectoryPath RESEARCH_ENVIRONMENT_PATH: DirectoryPath + general_settings = GeneralSettings( **read_yaml_credentials_file( file_path=Path.joinpath( @@ -38,5 +40,5 @@ class GeneralSettings(BaseModel): Path.joinpath(general_settings.LOG_PATH, "logs", "app.log"), rotation="1 day", retention="7 days", - compression="zip" + compression="zip", ) diff --git a/src/config/utils.py b/src/config/utils.py index 2530272..c8921ef 100644 --- a/src/config/utils.py +++ b/src/config/utils.py @@ -6,6 +6,7 @@ from pydantic import BaseModel, create_model from pydantic.fields import FieldInfo + def partial_model(model: Type[BaseModel]): """Workaround for setting all Pydantic's fields as optional. All credits goes to the author: @@ -14,9 +15,9 @@ def partial_model(model: Type[BaseModel]): Args: model (Type[BaseModel]): Pydantic base model instance. """ + def make_field_optional( - field: FieldInfo, - default: Any = None + field: FieldInfo, default: Any = None ) -> Tuple[Any, FieldInfo]: new = deepcopy(field) new.default = default @@ -24,20 +25,17 @@ def make_field_optional( return new.annotation, new return create_model( - f'Partial{model.__name__}', + f"Partial{model.__name__}", __base__=model, __module__=model.__module__, **{ field_name: make_field_optional(field_info) for field_name, field_info in model.model_fields.items() - } + }, ) -def read_yaml_credentials_file( - file_path: Path, - file_name: str -) -> Dict: +def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict: """Reads a YAML file. Args: @@ -56,7 +54,7 @@ def read_yaml_credentials_file( file_name, ) - with open(path, 'r', encoding='utf-8') as f: + with open(path, "r", encoding="utf-8") as f: try: context = yaml.safe_load(f) except yaml.YAMLError as e: diff --git a/src/data/processing.py b/src/data/processing.py index 2e0e88e..b78c427 100644 --- a/src/data/processing.py +++ b/src/data/processing.py @@ -49,8 +49,7 @@ def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: # Transforming the AGE and IS columns into a categorical columns logger.info("Categorizing the numerical columns ('Age' and 'IS').") age_bins = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='qcut_bins' + path=general_settings.ARTIFACTS_PATH, feature_name="qcut_bins" ) dataframe = _categorize_numerical_columns(dataframe, age_bins) @@ -58,17 +57,17 @@ def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: dataframe = _transform_numerical_columns(dataframe) # Loading the encoders and scalers - logger.info(f"Loading encoders 'features_ohe' from path {general_settings.ARTIFACTS_PATH}.") + logger.info( + f"Loading encoders 'features_ohe' from path {general_settings.ARTIFACTS_PATH}." 
+ ) encoders = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='features_ohe' + path=general_settings.ARTIFACTS_PATH, feature_name="features_ohe" ) - logger.info(f"Loading scalers 'features_sc' from path {general_settings.ARTIFACTS_PATH}.") - sc = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='features_sc' + logger.info( + f"Loading scalers 'features_sc' from path {general_settings.ARTIFACTS_PATH}." ) + sc = load_feature(path=general_settings.ARTIFACTS_PATH, feature_name="features_sc") # Scaling numerical columns dataframe = _scale_numerical_columns(dataframe=dataframe, sc=sc) @@ -77,20 +76,20 @@ def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray: dataframe = _encode_categorical_columns( dataframe=dataframe, encoders=encoders, - target_column=general_settings.TARGET_COLUMN + target_column=general_settings.TARGET_COLUMN, ) # Selecting only the features that are important for the model dataframe = dataframe[model_settings.FEATURES] - logger.info(f"Filtering the features columns, keeping only {model_settings.FEATURES} columns.") + logger.info( + f"Filtering the features columns, keeping only {model_settings.FEATURES} columns." + ) X = dataframe.values return X -def _drop_features( - dataframe: pd.DataFrame, - features: List -) -> pd.DataFrame: + +def _drop_features(dataframe: pd.DataFrame, features: List) -> pd.DataFrame: """Excludes features from the given dataframe. Args: @@ -101,9 +100,8 @@ def _drop_features( """ return dataframe.drop(columns=features).reset_index(drop=True) -def _remove_duplicates( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _remove_duplicates(dataframe: pd.DataFrame) -> pd.DataFrame: """Removes duplicates. Args: @@ -114,9 +112,8 @@ def _remove_duplicates( """ return dataframe.drop_duplicates(keep="first").reset_index(drop=True) -def _change_height_units( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _change_height_units(dataframe: pd.DataFrame) -> pd.DataFrame: """Changes the Height unit to centimeters, so will be easier to calculate other features from it. @@ -129,9 +126,8 @@ def _change_height_units( dataframe["Height"] *= 100 return dataframe -def _remove_outliers( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _remove_outliers(dataframe: pd.DataFrame) -> pd.DataFrame: """Removes outliers based on the age. Args: @@ -149,16 +145,15 @@ def _remove_outliers( # Removing the data samples that exceeds the upper or lower limits dataframe = dataframe[ ~( - (dataframe["Age"] >= (q3 + threshold * iqr)) | - (dataframe["Age"] <= (q1 - threshold * iqr)) + (dataframe["Age"] >= (q3 + threshold * iqr)) + | (dataframe["Age"] <= (q1 - threshold * iqr)) ) ] return dataframe -def _create_is_feature( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _create_is_feature(dataframe: pd.DataFrame) -> pd.DataFrame: """Calculates the Is Sedentary? (IS) feature. Args: @@ -172,9 +167,8 @@ def _create_is_feature( dataframe["IS"] = dataframe["IS"].astype(int) return dataframe -def _create_bmi_feature( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _create_bmi_feature(dataframe: pd.DataFrame) -> pd.DataFrame: """Calculates the Body Mass Index (BMI) feature. Args: @@ -187,9 +181,8 @@ def _create_bmi_feature( dataframe["BMI"] = dataframe["Weight"] / (dataframe["Height"] ** 2) return dataframe -def _create_bmr_feature( - dataframe: pd.DataFrame -) -> pd.DataFrame: + +def _create_bmr_feature(dataframe: pd.DataFrame) -> pd.DataFrame: """Calculates the Basal Metabolic Rate (BMR) feature. 
Args: @@ -199,12 +192,8 @@ def _create_bmr_feature( pd.DataFrame: the dataframe with a new column corresponding to the value of BMR for each data. """ - def _calculate_bmr( - age: int, - gender: str, - height: float, - weight: float - ) -> float: + + def _calculate_bmr(age: int, gender: str, height: float, weight: float) -> float: """Auxiliary function used to calculate the BMR value. Args: @@ -221,10 +210,11 @@ def _calculate_bmr( dataframe["BMR"] = dataframe.apply( lambda x: _calculate_bmr(x["Age"], x["Gender"], x["Height"], x["Weight"]), - axis=1 + axis=1, ) return dataframe + def _categorize_numerical_columns( dataframe: pd.DataFrame, bins: pd.DataFrame, @@ -237,14 +227,16 @@ def _categorize_numerical_columns( Returns: pd.DataFrame: the dataframe with all numerical columns categorized. """ - dataframe["Age"] = pd.cut(x=dataframe["Age"], bins=bins, labels=["q1", "q2", "q3", "q4"]) + dataframe["Age"] = pd.cut( + x=dataframe["Age"], bins=bins, labels=["q1", "q2", "q3", "q4"] + ) dataframe["Age"] = dataframe["Age"].astype("object") dataframe["IS"] = dataframe["IS"].astype("object") return dataframe + def _transform_numerical_columns( - dataframe: pd.DataFrame, - epsilon: float = 1e-10 + dataframe: pd.DataFrame, epsilon: float = 1e-10 ) -> pd.DataFrame: """Transforms the numerical columns using the Log Transformation technique. @@ -264,6 +256,7 @@ def _transform_numerical_columns( return dataframe + def _scale_numerical_columns( dataframe: pd.DataFrame, sc: Dict[str, StandardScaler], @@ -286,6 +279,7 @@ def _scale_numerical_columns( return dataframe + def _encode_categorical_columns( dataframe: pd.DataFrame, encoders: Dict[str, OneHotEncoder], @@ -316,9 +310,12 @@ def _encode_categorical_columns( train_categorical_features = train_categorical_features.add_prefix(cc + "_") new_dataframe = pd.concat([new_dataframe, train_categorical_features], axis=1) - new_dataframe = pd.concat([new_dataframe, dataframe.drop(columns=categorical_columns)], axis=1) + new_dataframe = pd.concat( + [new_dataframe, dataframe.drop(columns=categorical_columns)], axis=1 + ) return new_dataframe + def _encode_labels_array( array: np.ndarray, encoder: OneHotEncoder, @@ -334,6 +331,7 @@ def _encode_labels_array( """ return encoder.transform(array.reshape(-1, 1)) + def load_dataset(path: pathlib.Path) -> pd.DataFrame: """Loads a dataset from a specific path. diff --git a/src/data/utils.py b/src/data/utils.py index ec36f1f..306ad4d 100644 --- a/src/data/utils.py +++ b/src/data/utils.py @@ -11,6 +11,7 @@ from ..config.aws import aws_credentials from ..config.kaggle import kaggle_credentials + def load_feature( path: pathlib.Path, feature_name: str, @@ -27,6 +28,7 @@ def load_feature( logger.info(f"Loading feature/encoder/scaler from file {path}.") return joblib.load(pathlib.PosixPath.joinpath(path, f"{feature_name}.pkl")) + @logger.catch def download_dataset( name: str, @@ -49,10 +51,8 @@ def download_dataset( # Downloading data using the Kaggle API through the terminal # os.system(f'export KAGGLE_USERNAME={kaggle_user}; export KAGGLE_KEY={kaggle_key};') - os.system(f'kaggle datasets download -d {name} --unzip') - os.system( - f'mv ObesityDataSet.csv {pathlib.Path.joinpath(path, new_name)}' - ) + os.system(f"kaggle datasets download -d {name} --unzip") + os.system(f"mv ObesityDataSet.csv {pathlib.Path.joinpath(path, new_name)}") # Sending the dataset to the AWS S3 bucket if send_to_aws: @@ -63,10 +63,11 @@ def download_dataset( ) else: logger.warning( - "The S3 Bucket url was not specified in the 'credentials.yaml' file. 
" + - "Therefore, the dataset will not be send to S3 and it will be kept saved locally." + "The S3 Bucket url was not specified in the 'credentials.yaml' file. " + + "Therefore, the dataset will not be send to S3 and it will be kept saved locally." ) + @logger.catch def send_dataset_to_s3( file_path: pathlib.Path, diff --git a/src/model/builder.py b/src/model/builder.py index b394a23..55269ab 100644 --- a/src/model/builder.py +++ b/src/model/builder.py @@ -11,8 +11,8 @@ class ModelBuilder: - """The trained model's class. - """ + """The trained model's class.""" + def __init__( self, model_name: str, @@ -38,7 +38,9 @@ def train(self, dataframe: pd.DataFrame) -> None: # Pre-processing and cleaning the data X, y = data_processing(dataframe) - logger.info("Splitting the data into training and validation using 90/10 split.") + logger.info( + "Splitting the data into training and validation using 90/10 split." + ) # Splitting the data into training and validation X_train, X_valid, y_train, y_valid = train_test_split( diff --git a/src/model/inference.py b/src/model/inference.py index 4409955..3605e36 100644 --- a/src/model/inference.py +++ b/src/model/inference.py @@ -7,13 +7,13 @@ from ..data.utils import load_feature label_encoder = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='label_ohe' + path=general_settings.ARTIFACTS_PATH, feature_name="label_ohe" ) + class ModelServe: - """The trained model's class. - """ + """The trained model's class.""" + def __init__( self, model_name: str, @@ -40,13 +40,17 @@ def load(self) -> None: NotImplementedError: raises NotImplementedError if the model's flavor value is not 'xgboost'. """ - logger.info(f"Loading the model {model_settings.MODEL_NAME} from run ID {model_settings.RUN_ID}.") + logger.info( + f"Loading the model {model_settings.MODEL_NAME} from run ID {model_settings.RUN_ID}." + ) if self.model_flavor == "xgboost": model_uri = f"runs:/{model_settings.RUN_ID}/{model_settings.MODEL_NAME}" self.model = mlflow.xgboost.load_model(model_uri) else: - logger.critical(f"Couldn't load the model using the flavor {model_settings.MODEL_FLAVOR}.") + logger.critical( + f"Couldn't load the model using the flavor {model_settings.MODEL_FLAVOR}." + ) raise NotImplementedError() def predict(self, x: np.ndarray, transform_to_str: bool = True) -> np.ndarray: diff --git a/src/schema/person.py b/src/schema/person.py index d43de0e..5e7ff0e 100644 --- a/src/schema/person.py +++ b/src/schema/person.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, Field, field_validator + class Person(BaseModel): """ Person schema. @@ -19,6 +20,7 @@ class Person(BaseModel): FAF - Physical activity frequency (FAF). TUE - Time using technology devices (TUE). 
""" + Age: int = Field(ge=0, le=100) Height: float = Field(ge=0.0, le=2.5) Weight: float = Field(ge=0, le=400) @@ -27,7 +29,9 @@ class Person(BaseModel): SMOKE: str SCC: str CALC: str = Literal["Frequently", "Sometimes", "Always", "no"] - MTRANS: str = Literal["Public_Transportation", "Automobile", "Walking", "Motorbike", "Bike"] + MTRANS: str = Literal[ + "Public_Transportation", "Automobile", "Walking", "Motorbike", "Bike" + ] FCVC: int = Field(ge=0, le=5) FAF: int = Field(ge=0, le=5) TUE: int = Field(ge=0, le=2) diff --git a/tests/integration/test_data_processing.py b/tests/integration/test_data_processing.py index da3b5a2..a2b26c5 100644 --- a/tests/integration/test_data_processing.py +++ b/tests/integration/test_data_processing.py @@ -11,11 +11,11 @@ # loading the raw dataset that was used to train the model dataset = load_dataset( path=pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ) + def test_data_processing_pipeline(): """ Testing the integration of the entire data processing pipeline. diff --git a/tests/integration/test_model_inference.py b/tests/integration/test_model_inference.py index 5a550dd..d3919eb 100644 --- a/tests/integration/test_model_inference.py +++ b/tests/integration/test_model_inference.py @@ -11,11 +11,11 @@ # loading the raw dataset that was used to train the model dataset = load_dataset( path=pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ) + def test_model_inference_pipeline(): """ Testing the integration of the entire model inference pipeline. diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py index d944d2d..59e3c3c 100644 --- a/tests/unit/test_api.py +++ b/tests/unit/test_api.py @@ -10,10 +10,11 @@ with open( f"{Path.joinpath(general_settings.RESEARCH_ENVIRONMENT_PATH, 'VERSION')}", "r", - encoding="utf-8" + encoding="utf-8", ) as f: CODE_VERSION = f.readline().strip() + def test_version_endpoint() -> None: """ Unit case to test the API's version endpoint. @@ -29,6 +30,7 @@ def test_version_endpoint() -> None: assert model_settings.VERSION == content[desired_keys[0]] assert CODE_VERSION == content[desired_keys[1]] + def test_inference_endpoint() -> None: """ Unit case to test the API's inference endpoint. 
@@ -48,7 +50,7 @@ def test_inference_endpoint() -> None: "SCC": "no", "SMOKE": "False", "TUE": 1, - "Weight": 64 + "Weight": 64, } response = requests.get("http://127.0.0.1:8000/predict", json=data, timeout=100) diff --git a/tests/unit/test_data_functions.py b/tests/unit/test_data_functions.py index 1ce1f3a..d157bf7 100644 --- a/tests/unit/test_data_functions.py +++ b/tests/unit/test_data_functions.py @@ -30,11 +30,11 @@ # loading the raw dataset that was used to train the model dataset = load_dataset( path=pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ) + def test_change_height_units() -> None: """ Unit case to test the function that changes the unit measure of the @@ -52,14 +52,15 @@ def test_change_height_units() -> None: assert max_height < 300 and min_height > 100 + @pytest.mark.parametrize( "features", [ ["Height", "Weight", "Gender", "Age"], ["family_history_with_overweight", "FAVC", "FCVC", "NCP"], ["CAEC", "SMOKE", "CH2O", "SCC"], - ["FAF", "TUE", "CALC", "MTRANS", "NObeyesdad"] - ] + ["FAF", "TUE", "CALC", "MTRANS", "NObeyesdad"], + ], ) def test_drop_features(features: List[str]) -> None: """ @@ -74,6 +75,7 @@ def test_drop_features(features: List[str]) -> None: assert all(f not in _dataset.columns.tolist() for f in features) + def test_remove_duplicates(): """ Unit case to test the function that drops duplicated rows from the dataset. @@ -86,6 +88,7 @@ def test_remove_duplicates(): assert shape_after < shape_before + def test_remove_outliers(): """ Unit case to test the function that removes outliers from the dataset @@ -99,6 +102,7 @@ def test_remove_outliers(): assert shape_after < shape_before + def test_create_is_feature(): """ Unit case to test the function that creates the Is Sedentary (IS) feature. @@ -114,6 +118,7 @@ def test_create_is_feature(): assert isinstance(_dataset["IS"].dtype, type(np.dtype("int64"))) assert max(_dataset["IS"].values.tolist()) <= 1 + def test_create_bmi_feature(): """ Unit case to test the function that creates the Body Mass Index (BMI) feature. @@ -128,6 +133,7 @@ def test_create_bmi_feature(): assert ptypes.is_numeric_dtype(_dataset["BMI"]) assert isinstance(_dataset["BMI"].dtype, type(np.dtype("float64"))) + def test_create_bmr_feature(): """ Unit case to test the function that creates the Body Mass Ratio (BMR) feature. @@ -142,14 +148,14 @@ def test_create_bmr_feature(): assert ptypes.is_numeric_dtype(_dataset["BMR"]) assert isinstance(_dataset["BMR"].dtype, type(np.dtype("float64"))) + def test_categorize_numerical_columns(): """ Unit case to test the function that categorizes (transform numeric to object) the numerical columns. """ age_bins = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='qcut_bins' + path=general_settings.ARTIFACTS_PATH, feature_name="qcut_bins" ) _dataset = dataset.copy() _dataset = _create_is_feature(dataframe=_dataset) @@ -165,20 +171,22 @@ def test_categorize_numerical_columns(): assert isinstance(_dataset["Age"].dtype, type(np.dtype("object"))) assert isinstance(_dataset["IS"].dtype, type(np.dtype("object"))) + def test_scale_numerical_columns(): """ Unit case to test the function that scales the numerical features. 
""" _dataset = dataset.copy() - _dataset = _dataset.drop(columns=["Age"]) # this column will be transformed to object - _dataset = _dataset.drop(columns=["Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]) # these columns are not being used + _dataset = _dataset.drop( + columns=["Age"] + ) # this column will be transformed to object + _dataset = _dataset.drop( + columns=["Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"] + ) # these columns are not being used numerical_columns = _dataset.select_dtypes(exclude="object").columns.tolist() - sc = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name="features_sc" - ) + sc = load_feature(path=general_settings.ARTIFACTS_PATH, feature_name="features_sc") _dataset2 = _scale_numerical_columns(dataframe=_dataset, sc=sc) for nc in numerical_columns: @@ -189,14 +197,19 @@ def test_scale_numerical_columns(): assert _dataset[nc].std(axis=0) > _dataset2[nc].std(axis=0) assert isinstance(_dataset2[nc].dtype, type(np.dtype("float64"))) + def test_transform_numerical_columns(): """ Unit case to test the function that applies log transformatio to the numerical columns. """ _dataset = dataset.copy() - _dataset = _dataset.drop(columns=["Age"]) # this column will be transformed to object - _dataset = _dataset.drop(columns=["Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]) # these columns are not being used + _dataset = _dataset.drop( + columns=["Age"] + ) # this column will be transformed to object + _dataset = _dataset.drop( + columns=["Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"] + ) # these columns are not being used numerical_columns = _dataset.select_dtypes(exclude="object").columns.tolist() _dataset2 = _transform_numerical_columns(dataframe=_dataset) @@ -208,26 +221,29 @@ def test_transform_numerical_columns(): assert _dataset[nc].max(axis=0) != _dataset2[nc].max(axis=0) assert _dataset[nc].min(axis=0) != _dataset2[nc].min(axis=0) + def test_encode_categorical_columns(): """ - Unit case to test the function that encodes (applies the one hot + Unit case to test the function that encodes (applies the one hot encode technique) the categorical features. """ _dataset = dataset.copy() - _dataset = _dataset.drop(columns=["NObeyesdad"]) # removing the target column + _dataset = _dataset.drop(columns=["NObeyesdad"]) # removing the target column categorical_columns = _dataset.select_dtypes(include="object").columns.tolist() encoders = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name="features_ohe" + path=general_settings.ARTIFACTS_PATH, feature_name="features_ohe" + ) + _dataset2 = _encode_categorical_columns( + dataframe=_dataset, encoders=encoders, target_column="NObeyesdad" ) - _dataset2 = _encode_categorical_columns(dataframe=_dataset, encoders=encoders, target_column="NObeyesdad") for cc in categorical_columns: assert cc in _dataset.columns.tolist() and cc not in _dataset2.columns.tolist() - assert any(re.findall(f'{cc}_', c) for c in _dataset2.columns.tolist()) + assert any(re.findall(f"{cc}_", c) for c in _dataset2.columns.tolist()) assert _dataset.shape[1] != _dataset2.shape[1] + def test_load_dataset(): """ Unit case to test the function that loads the original, raw dataset. @@ -257,20 +273,19 @@ def test_load_dataset(): assert dataset.shape[1] == len(columns) assert dataset.shape[0] == 2111 + def test_download_dataset(): """ Unit case to test the function that downloads the original, raw dataset. 
""" if pathlib.Path.exists( pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ): os.remove( pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ) @@ -283,11 +298,11 @@ def test_download_dataset(): assert pathlib.Path.exists( pathlib.Path.joinpath( - general_settings.DATA_PATH, - general_settings.RAW_FILE_NAME + general_settings.DATA_PATH, general_settings.RAW_FILE_NAME ) ) + # def test_send_dataset_aws(): # """ # Unit case to test the function that sends the original, raw dataset to diff --git a/tests/unit/test_model_functions.py b/tests/unit/test_model_functions.py index 2fed69a..f7f1935 100644 --- a/tests/unit/test_model_functions.py +++ b/tests/unit/test_model_functions.py @@ -13,20 +13,19 @@ # loading the label encoder label_encoder = load_feature( - path=general_settings.ARTIFACTS_PATH, - feature_name='label_ohe' + path=general_settings.ARTIFACTS_PATH, feature_name="label_ohe" ) # loading the processed dataset that will be used to get # the index of the used columns dataset = load_dataset( path=pathlib.Path.joinpath( - general_settings.DATA_PATH, - "Preprocessed_ObesityDataSet.csv" + general_settings.DATA_PATH, "Preprocessed_ObesityDataSet.csv" ) ) FEATURES_NAME = dataset.columns.tolist() + def test_load_model() -> None: """ Unit case to test loading a trained model from MLflow. @@ -45,6 +44,7 @@ def test_load_model() -> None: assert isinstance(loaded_model, ModelServe) + def test_prediction() -> None: """ Unit case to test making a prediction with the loaded model. @@ -68,7 +68,7 @@ def test_prediction() -> None: "SCC": "no", "SMOKE": "False", "TUE": 1, - "Weight": 64 + "Weight": 64, } correct_prediction = "Normal_Weight" @@ -79,6 +79,7 @@ def test_prediction() -> None: assert isinstance(prediction, str) assert prediction == correct_prediction + def test_model_performance() -> None: """ Unit case to test the model performance on training and validation sets @@ -94,27 +95,19 @@ def test_model_performance() -> None: ) loaded_model.load() - X_train = load_feature( - path=general_settings.FEATURES_PATH, - feature_name='X_train' - )[:, indexes] - y_train = load_feature( - path=general_settings.FEATURES_PATH, - feature_name='y_train' - ) + X_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_train")[ + :, indexes + ] + y_train = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_train") y_train = np.max(y_train, axis=1) train_predictions = loaded_model.predict(X_train, transform_to_str=False) train_score = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted") - X_valid = load_feature( - path=general_settings.FEATURES_PATH, - feature_name='X_valid' - )[:, indexes] - y_valid = load_feature( - path=general_settings.FEATURES_PATH, - feature_name='y_valid' - ) + X_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="X_valid")[ + :, indexes + ] + y_valid = load_feature(path=general_settings.FEATURES_PATH, feature_name="y_valid") y_valid = np.max(y_valid, axis=1) valid_predictions = loaded_model.predict(X_valid, transform_to_str=False) diff --git a/tests/unit/test_read_yaml_file.py b/tests/unit/test_read_yaml_file.py index ab1d3aa..19c3c3b 100644 --- a/tests/unit/test_read_yaml_file.py +++ b/tests/unit/test_read_yaml_file.py @@ -28,6 +28,7 @@ def test_kaggle_file() -> None: assert isinstance(kaggle_credentials.KAGGLE_USERNAME, str) 
assert isinstance(kaggle_credentials.KAGGLE_KEY, str) + def test_model_file() -> None: """ Unit case to test the function responsible for reading an YAML @@ -40,6 +41,7 @@ def test_model_file() -> None: assert isinstance(model_settings.RUN_ID, str) assert isinstance(model_settings.FEATURES, List) + def test_settings_file() -> None: """ Unit case to test the function responsible for reading an YAML
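The tests above all exercise the same configuration pattern used throughout `src/config`: a YAML file is read with `yaml.safe_load` and its keys are unpacked into a Pydantic model. A condensed sketch of that pattern, with an illustrative directory and file name (the real project resolves paths via `read_yaml_credentials_file` in `src/config/utils.py`):

```python
# Condensed sketch of the settings pattern these tests exercise:
# read a YAML file and unpack it into a Pydantic model (see src/config/utils.py).
# The directory and file name below are illustrative placeholders.
from pathlib import Path
from typing import Dict

import yaml
from pydantic import BaseModel


class KaggleCredentials(BaseModel):
    KAGGLE_USERNAME: str
    KAGGLE_KEY: str


def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict:
    """Read a YAML file and return its contents as a dictionary."""
    with open(Path(file_path, file_name), "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


kaggle_credentials = KaggleCredentials(
    **read_yaml_credentials_file(Path("src/config"), "credentials.yaml")
)
assert isinstance(kaggle_credentials.KAGGLE_USERNAME, str)
```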