Commit d8b82b1

updating requirements, adding pre-commit, and formatting code with black

rafaelgreca committed Nov 13, 2024
1 parent ad24076 commit d8b82b1

Showing 28 changed files with 209 additions and 170 deletions.
45 changes: 28 additions & 17 deletions .pre-commit-config.yaml
@@ -1,20 +1,31 @@
 ---
 repos:
 
--
-  repo: https://github.com/ambv/black
-  rev: 20.8b1
-  hooks:
-  -
-    id: black
-    language_version: python3
-- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v2.3.0
-  hooks:
-  - id: check-yaml
-  - id: end-of-file-fixer
-  - id: trailing-whitespace
-  - id: check-added-large-files
-  - id: debug-statements
-    language_version: python3
-
+- repo: local
+  hooks:
+  - id: python-tests
+    name: pytests
+    entry: pytest src/tests
+    language: python
+    additional_dependencies: [pre-commit, pytest, pandas, sklearn, matplotlib]
+    always_run: true
+    pass_filenames: false
+- repo: https://github.com/psf/black
+  rev: 22.10.0
+  hooks:
+  - id: black
+    args: [--safe]
+
+- repo: local
+  hooks:
+  - id: pylint
+    name: pylint
+    files: .
+    entry: pylint
+    language: system
+    types: [python3]
+    args: [
+      "-rn", # Only display messages
+      "-sn", # Don't display the score
+      "--rcfile=.pylintrc", # Link to your config file
+    ]
2 changes: 1 addition & 1 deletion data/README.md
@@ -12,4 +12,4 @@ Finally, you can download the dataset using the following command:
 bash download_data.sh
 ```
 
-The dataset will be temporarily saved locally (inside the `data` folder) and transferred to your AWS S3 bucket. After that, the dataset will be deleted. If you choose to not use an AWS S3 Bucket, then the dataset will be stored into the `data` folder.
+The dataset will be temporarily saved locally (inside the `data` folder) and transferred to your AWS S3 bucket. After that, the dataset will be deleted. If you choose to not use an AWS S3 Bucket, then the dataset will be stored into the `data` folder.
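
The paragraph above describes an upload-then-delete flow that `download_data.sh` performs in shell; purely as an illustration, an equivalent Python sketch with boto3 (the bucket name is a placeholder, not a value from the repository):

```python
import os

import boto3

S3_BUCKET = "your-s3-bucket-name"                # placeholder
LOCAL_FILE = "data/Original_ObesityDataSet.csv"  # file fetched by download_data.sh

s3 = boto3.client("s3")

# Upload the locally downloaded dataset to the bucket, then delete the local copy,
# mirroring the behaviour described above when an S3 bucket is configured.
s3.upload_file(LOCAL_FILE, S3_BUCKET, os.path.basename(LOCAL_FILE))
os.remove(LOCAL_FILE)
```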
1 change: 0 additions & 1 deletion data/download_data.sh
@@ -39,4 +39,3 @@ if [[ "$CONFIG_S3" != "YOUR_S3_BUCKET_URL" ]]; then
 
 # deleting the create folder
 rm Original_ObesityDataSet.csv
-
4 changes: 2 additions & 2 deletions notebooks/README.md
@@ -4,7 +4,7 @@ Here go the notebooks used for research and development. The main idea is to try
 
 ## Setup Credentials
 
-If you haven't your credentials yet, please check the `docs` folder first before following along.
+If you haven't your credentials yet, please check the `docs` folder first before following along.
 
 1. Set your `AWS Credentials` and `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file.
 
@@ -44,4 +44,4 @@ sudo docker log <CONTAINER_ID>
 - Run the `EDA` notebook.
 - Run the `Data Processing` notebook.
 - Run the `Experimentations` notebook (will test different Machine Learning models, different hyperparameters for each model, and do some feature engineering and selection).
-- Register the best models to the MLflow model registry using the `Experimentations` notebook (last cell) or the MLflow's user interface.
+- Register the best models to the MLflow model registry using the `Experimentations` notebook (last cell) or the MLflow's user interface.
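
The last step above registers the best models in the MLflow model registry; a minimal sketch of the same operation outside the notebook (the run id, artifact path, and model name are placeholders):

```python
import mlflow

# Point at the tracking server (local here; use the EC2 URL if one is configured).
mlflow.set_tracking_uri("http://127.0.0.1:5000")

RUN_ID = "your-run-id"             # placeholder: copy it from the MLflow UI
MODEL_NAME = "obesity-classifier"  # placeholder registry name

# Assumes the model artifact was logged under the artifact path "model".
mlflow.register_model(model_uri=f"runs:/{RUN_ID}/model", name=MODEL_NAME)
```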
2 changes: 1 addition & 1 deletion notebooks/VERSION
@@ -1 +1 @@
-1.1.0
+1.3.0
2 changes: 1 addition & 1 deletion notebooks/dev_Dockerfile
@@ -17,4 +17,4 @@ WORKDIR /e2e-project
 RUN pip install --no-cache-dir -U pip
 
 # installing requirements
-RUN pip install -r notebooks/requirements_dev.txt
+RUN pip install -r notebooks/requirements_dev.txt
6 changes: 3 additions & 3 deletions notebooks/docs/SETUP_AWS.md
@@ -196,7 +196,7 @@ aws ec2 authorize-security-group-ingress \
     --group-id "sg-0613261580cd87115" \
     --protocol tcp \
     --port 5000 \
-    --cidr "0.0.0.0/0"
+    --cidr "0.0.0.0/0"
 ```
 
 The output should look like this:
@@ -224,7 +224,7 @@ aws ec2 authorize-security-group-ingress \
     --group-id "sg-0613261580cd87115" \
     --protocol tcp \
     --port 22 \
-    --cidr "18.206.107.24/29"
+    --cidr "18.206.107.24/29"
 ```
 
 The output should look like this:
@@ -579,4 +579,4 @@ pipenv install mlflow boto3 psycopg2-binary awscli
 pipenv shell
 
 aws configure
-```
+```
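
The setup doc above uses the AWS CLI; purely as an illustration, the same port-5000 ingress rule expressed with boto3 (the security-group id is the doc's example value, not something to reuse):

```python
import boto3

ec2 = boto3.client("ec2")

# Equivalent of the `aws ec2 authorize-security-group-ingress` call above:
# open TCP port 5000 (the MLflow tracking server) to 0.0.0.0/0.
ec2.authorize_security_group_ingress(
    GroupId="sg-0613261580cd87115",  # example id from the doc, replace with yours
    IpPermissions=[
        {
            "IpProtocol": "tcp",
            "FromPort": 5000,
            "ToPort": 5000,
            "IpRanges": [{"CidrIp": "0.0.0.0/0"}],
        }
    ],
)
```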
2 changes: 1 addition & 1 deletion notebooks/docs/SETUP_KAGGLE.md
@@ -1,3 +1,3 @@
 # Setting up Kaggle's Account
 
-To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com/<username>/account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Set your `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file.
+To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com/<username>/account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Set your `Kaggle API Credentials` (used to download the dataset) in the `credentials.yaml` file.
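
As an illustration of how those credentials end up being consumed (this mirrors the `KAGGLE_USERNAME`/`KAGGLE_KEY` fields declared in `src/config/kaggle.py`; the exact layout of `credentials.yaml` is an assumption):

```python
import os

import yaml

# Assumed layout: credentials.yaml holds KAGGLE_USERNAME and KAGGLE_KEY entries,
# matching the KaggleCredentials model in src/config/kaggle.py.
with open("credentials.yaml", "r", encoding="utf-8") as f:
    credentials = yaml.safe_load(f)

# The official Kaggle client authenticates through these environment variables.
os.environ["KAGGLE_USERNAME"] = credentials["KAGGLE_USERNAME"]
os.environ["KAGGLE_KEY"] = credentials["KAGGLE_KEY"]
```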
2 changes: 1 addition & 1 deletion notebooks/requirements_dev.txt
@@ -12,4 +12,4 @@ optuna==3.6.1
 pandas==1.5.2
 scikit_learn==1.3.2
 seaborn==0.13.2
-xgboost==2.1.1
+xgboost==2.1.1
23 changes: 12 additions & 11 deletions requirements.txt
@@ -1,11 +1,12 @@
-scikit-learn>=0.23
-pandas
-seaborn
-matplotlib
-joblib
-numpy
-ibm_watson_machine_learning
-pyyaml
-pytest
-pytest-dependency
-pre-commit
+boto3==1.35.6
+fastapi==0.115.5
+joblib==1.3.2
+loguru==0.7.2
+mlflow==2.17.2
+numpy==2.1.3
+pandas==1.5.2
+pydantic==2.9.2
+pytest==8.3.3
+PyYAML==6.0.2
+scikit_learn==1.3.2
+xgboost==2.1.2
2 changes: 1 addition & 1 deletion src/README.md
@@ -1,3 +1,3 @@
 # Scripts
 
-Here goes Scripts and Pipelines
+Here goes Scripts and Pipelines
12 changes: 7 additions & 5 deletions src/api.py
@@ -12,20 +12,24 @@
 app = FastAPI()
 
 if aws_credentials.EC2 != "YOUR_EC2_INSTANCE_URL":
-    mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000")
+    mlflow.set_tracking_uri(f"http://{aws_credentials.EC2}:5000")
 else:
     mlflow.set_tracking_uri(f"http://127.0.0.1:5000")
 
+
 @app.get("/version")
 def check_versions():
-    with open(f"{general_settings.RESEARCH_ENVIRONMENT_PATH}/VERSION", "r", encoding="utf-8") as f:
+    with open(
+        f"{general_settings.RESEARCH_ENVIRONMENT_PATH}/VERSION", "r", encoding="utf-8"
+    ) as f:
         code_version = f.readline().strip()
 
     return {
         "code_version": code_version,
         "model_version": model_settings.VERSION,
     }
 
+
 @app.get("/predict")
 async def prediction(person: Person):
     loaded_model = ModelServe(
@@ -38,6 +42,4 @@ async def prediction(person: Person):
     data = pd.DataFrame.from_dict([person.model_dump()])
     X = data_processing_inference(data)
 
-    return {
-        "predictions": loaded_model.predict(X).tolist()
-    }
+    return {"predictions": loaded_model.predict(X).tolist()}
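
For illustration, here is how the two endpoints in this file can be called once the API is running; this is a sketch with an assumed local address and placeholder `Person` fields (the real schema is defined elsewhere in the project and is not part of this diff):

```python
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local FastAPI address

# /version returns the research-code version and the registered model version.
print(requests.get(f"{BASE_URL}/version", timeout=10).json())

# /predict takes a Person payload; FastAPI reads the Pydantic parameter from the
# JSON body, so it is sent via `json=`. The field names below are placeholders.
payload = {"Age": 23, "Height": 1.72, "Weight": 68.0}
print(requests.get(f"{BASE_URL}/predict", json=payload, timeout=10).json())
```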
2 changes: 2 additions & 0 deletions src/config/aws.py
@@ -11,12 +11,14 @@ class AWSCredentials(BaseModel):
     Args:
         BaseModel (pydantic.BaseModel): Pydantic base model instance.
     """
+
     EC2: str
     S3: str
     POSTGRESQL: str
     AWS_ACCESS_KEY: str
     AWS_SECRET_KEY: str
 
+
 aws_credentials = AWSCredentials(
     **read_yaml_credentials_file(
         file_path=Path.joinpath(
1 change: 1 addition & 0 deletions src/config/kaggle.py
@@ -10,6 +10,7 @@ class KaggleCredentials(BaseModel):
     Args:
         BaseModel (pydantic.BaseModel): Pydantic base model instance.
     """
+
     KAGGLE_USERNAME: str
     KAGGLE_KEY: str
 
2 changes: 2 additions & 0 deletions src/config/model.py
@@ -12,13 +12,15 @@ class ModelSettings(BaseModel):
     Args:
         BaseModel (pydantic.BaseModel): Pydantic base model instance.
     """
+
     MODEL_NAME: str
     VERSION: str
     MODEL_FLAVOR: str
     EXPERIMENT_ID: str
     RUN_ID: str
     FEATURES: List[str]
 
+
 model_settings = ModelSettings(
     **read_yaml_credentials_file(
         file_path=Path.joinpath(
4 changes: 3 additions & 1 deletion src/config/settings.py
@@ -13,6 +13,7 @@ class GeneralSettings(BaseModel):
     Args:
         BaseModel (pydantic.BaseModel): Pydantic base model instance.
     """
+
     DATA_PATH: DirectoryPath
     RAW_FILE_NAME: str
     ARTIFACTS_PATH: DirectoryPath
@@ -22,6 +23,7 @@
     LOG_PATH: DirectoryPath
     RESEARCH_ENVIRONMENT_PATH: DirectoryPath
 
+
 general_settings = GeneralSettings(
     **read_yaml_credentials_file(
         file_path=Path.joinpath(
@@ -38,5 +40,5 @@ class GeneralSettings(BaseModel):
     Path.joinpath(general_settings.LOG_PATH, "logs", "app.log"),
     rotation="1 day",
     retention="7 days",
-    compression="zip"
+    compression="zip",
 )
16 changes: 7 additions & 9 deletions src/config/utils.py
@@ -6,6 +6,7 @@
 from pydantic import BaseModel, create_model
 from pydantic.fields import FieldInfo
 
+
 def partial_model(model: Type[BaseModel]):
     """Workaround for setting all Pydantic's fields as optional.
     All credits goes to the author:
@@ -14,30 +15,27 @@ def partial_model(model: Type[BaseModel]):
     Args:
         model (Type[BaseModel]): Pydantic base model instance.
     """
 
     def make_field_optional(
-        field: FieldInfo,
-        default: Any = None
+        field: FieldInfo, default: Any = None
     ) -> Tuple[Any, FieldInfo]:
         new = deepcopy(field)
         new.default = default
         new.annotation = Optional[field.annotation]  # type: ignore
         return new.annotation, new
 
     return create_model(
-        f'Partial{model.__name__}',
+        f"Partial{model.__name__}",
         __base__=model,
         __module__=model.__module__,
         **{
             field_name: make_field_optional(field_info)
             for field_name, field_info in model.model_fields.items()
-        }
+        },
     )
 
 
-def read_yaml_credentials_file(
-    file_path: Path,
-    file_name: str
-) -> Dict:
+def read_yaml_credentials_file(file_path: Path, file_name: str) -> Dict:
     """Reads a YAML file.
     Args:
@@ -56,7 +54,7 @@ def read_yaml_credentials_file(
         file_name,
     )
 
-    with open(path, 'r', encoding='utf-8') as f:
+    with open(path, "r", encoding="utf-8") as f:
         try:
             context = yaml.safe_load(f)
         except yaml.YAMLError as e:
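
The `partial_model` helper shown above turns every field of a Pydantic model into an optional one; a small usage sketch follows (the `Person` model and the import path are illustrative assumptions, not code from this commit):

```python
from pydantic import BaseModel

from src.config.utils import partial_model  # import path assumed from the repo layout


class Person(BaseModel):
    """Illustrative model -- not the project's real Person schema."""

    age: int
    height: float


# Build a variant of Person in which every field is optional and defaults to None.
PartialPerson = partial_model(Person)

print(PartialPerson())        # age=None height=None
print(PartialPerson(age=30))  # age=30 height=None
```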