add config files

victorkaillo · Feb 22, 2022 · 4777865 · 4777865
1 parent 0696fa6
commit 4777865
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 0 deletions.
diff --git a/MLproject b/MLproject
@@ -0,0 +1,12 @@
+name: main_env
+conda_env: conda.yml
+
+entry_points:
+  main:
+    parameters:
+      hydra_options:
+        description: Hydra parameters to override
+        type: str
+        default: ''
+    command: >-
+      python main.py $(echo {hydra_options})
diff --git a/conda.yml b/conda.yml
@@ -0,0 +1,13 @@
+name: main_env
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - requests=2.24.0
+  - pip=21.3.1
+  - hydra-core=1.1.1
+  - pip:
+      - wandb==0.12.9
+      - mlflow==1.14.1
+      - hydra-joblib-launcher==1.1.5
+      - opendatasets==0.1.20
diff --git a/config.yaml b/config.yaml
@@ -0,0 +1,33 @@
+main:
+  project_name: mlops-creditcard_fraud_predictive
+  experiment_name: dev
+  execute_steps:
+    - download
+    - preprocess
+    - check_data
+    - segregate
+    - decision_tree
+    - evaluate
+  # This seed will be used to seed the random number generator
+  # to ensure repeatability of the data splits and other
+  # pseudo-random operations
+  random_seed: 42
+data:
+  train_data: "mlops-creditcard_fraud_predictive/train_data.csv:latest"
+  file_url: "https://www.kaggle.com/mlg-ulb/creditcardfraud.csv"
+  reference_dataset: "mlops-creditcard_fraud_predictive/ccfraud_preprocessed.csv:latest"
+  # Threshold for Kolmogorov-Smirnov test
+  ks_alpha: 0.05
+  test_size: 0.3
+  val_size: 0.3
+  # Stratify according to the target when splitting the data
+  # in train/test or in train/val
+  stratify: Class
+decision_tree_pipeline:
+  decision_tree:
+    criterion: "entropy"
+    splitter: "best"
+    max_depth: 13
+  numerical_pipe:
+    model: 0
+  export_artifact: "model_export"
diff --git a/main.py b/main.py
@@ -0,0 +1,109 @@
+import mlflow
+import os
+import hydra
+from omegaconf import DictConfig, OmegaConf
+
+# This automatically reads in the configuration
+@hydra.main(config_name='config')
+def process_args(config: DictConfig):
+    # Entry point: launches each step named in config["main"]["execute_steps"] via mlflow.run.
+    # Export the wandb project and run-group names through environment variables
+    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
+    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]
+
+    # Original working directory as reported by Hydra; every step folder is joined onto it
+    root_path = hydra.utils.get_original_cwd()
+
+    # Work out which steps to execute: the setting may be a string or a list-like value
+    if isinstance(config["main"]["execute_steps"], str):
+        # A string is split on commas (e.g. when the steps were overridden on the command line)
+        steps_to_execute = config["main"]["execute_steps"].split(",")
+    else:
+        steps_to_execute = list(config["main"]["execute_steps"])
+
+    # Download step: hands the configured file URL and the raw_data.csv artifact name to the "download" project
+    if "download" in steps_to_execute:
+
+        _ = mlflow.run(
+            os.path.join(root_path, "download"),
+            "main",
+            parameters={
+                "file_url": config["data"]["file_url"],
+                "artifact_name": "raw_data.csv",
+                "artifact_type": "raw_data",
+                "artifact_description": "Data as downloaded"
+            }
+        )
+
+    if "preprocess" in steps_to_execute:  # input is raw_data.csv:latest
+        _ = mlflow.run(
+            os.path.join(root_path, "preprocess"),
+            "main",
+            parameters={
+                "input_artifact": "raw_data.csv:latest",
+                "artifact_name": "preprocessed_data.csv",
+                "artifact_type": "preprocessed_data",
+                "artifact_description": "Data with preprocessing applied"
+            }
+        )
+
+    if "check_data" in steps_to_execute:  # receives the reference dataset, the preprocessed sample and ks_alpha
+        _ = mlflow.run(
+            os.path.join(root_path, "check_data"),
+            "main",
+            parameters={
+                "reference_artifact": config["data"]["reference_dataset"],
+                "sample_artifact": "preprocessed_data.csv:latest",
+                "ks_alpha": config["data"]["ks_alpha"]
+            }
+        )
+
+    if "segregate" in steps_to_execute:
+        # test_size and stratify come from config["data"]; random_state reuses config["main"]["random_seed"]
+        _ = mlflow.run(
+            os.path.join(root_path, "segregate"),
+            "main",
+            parameters={
+                "input_artifact": "preprocessed_data.csv:latest",
+                "artifact_root": "data",
+                "artifact_type": "segregated_data",
+                "test_size": config["data"]["test_size"],
+                "stratify": config["data"]["stratify"],
+                "random_state": config["main"]["random_seed"]
+            }
+        )
+
+    if "decision_tree" in steps_to_execute:
+        # Dump the decision_tree_pipeline config section to a YAML file (path resolved against the current working directory)
+        model_config = os.path.abspath("decision_tree_config.yml")
+
+        with open(model_config, "w+") as fp:
+            fp.write(OmegaConf.to_yaml(config["decision_tree_pipeline"]))
+
+        _ = mlflow.run(
+            os.path.join(root_path, "decision_tree"),
+            "main",
+            parameters={
+                "train_data": config["data"]["train_data"],
+                "model_config": model_config,
+                "export_artifact": config["decision_tree_pipeline"]["export_artifact"],
+                "random_seed": config["main"]["random_seed"],
+                "val_size": config["data"]["val_size"],
+                "stratify": config["data"]["stratify"]
+            }
+        )
+
+    if "evaluate" in steps_to_execute:
+        # The model is referenced by the configured export_artifact name with ":latest" appended
+        _ = mlflow.run(
+            os.path.join(root_path, "evaluate"),
+            "main",
+            parameters={
+                "model_export": f"{config['decision_tree_pipeline']['export_artifact']}:latest",
+                "test_data": "test_data.csv:latest"
+            }
+        )
+
+
+if __name__ == "__main__":
+    process_args()  # called without arguments; config is supplied through the @hydra.main decorator