diff --git a/MLproject b/MLproject
new file mode 100644
index 0000000..6562190
--- /dev/null
+++ b/MLproject
@@ -0,0 +1,12 @@
+name: main_env
+conda_env: conda.yml
+
+entry_points:
+  main:
+    parameters:
+      hydra_options:
+        description: Hydra parameters to override
+        type: str
+        default: ''
+    command: >-
+      python main.py $(echo {hydra_options})
diff --git a/conda.yml b/conda.yml
new file mode 100644
index 0000000..fe6f090
--- /dev/null
+++ b/conda.yml
@@ -0,0 +1,13 @@
+name: main_env
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - requests=2.24.0
+  - pip=21.3.1
+  - hydra-core=1.1.1
+  - pip:
+      - wandb==0.12.9
+      - mlflow==1.14.1
+      - hydra-joblib-launcher==1.1.5
+      - opendatasets==0.1.20
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..36ebaa5
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,33 @@
+main:
+  project_name: mlops-creditcard_fraud_predictive
+  experiment_name: dev
+  execute_steps:
+    - download
+    - preprocess
+    - check_data
+    - segregate
+    - decision_tree
+    - evaluate
+  # This seed will be used to seed the random number generator
+  # to ensure repeatability of the data splits and other
+  # pseudo-random operations
+  random_seed: 42
+data:
+  train_data: "mlops-creditcard_fraud_predictive/train_data.csv:latest"
+  file_url: "https://www.kaggle.com/mlg-ulb/creditcardfraud.csv"
+  reference_dataset: "mlops-creditcard_fraud_predictive/ccfraud_preprocessed.csv:latest"
+  # Threshold for the Kolmogorov-Smirnov test
+  ks_alpha: 0.05
+  test_size: 0.3
+  val_size: 0.3
+  # Stratify according to the target when splitting the data
+  # in train/test or in train/val
+  stratify: Class
+decision_tree_pipeline:
+  decision_tree:
+    criterion: "entropy"
+    splitter: "best"
+    max_depth: 13
+  numerical_pipe:
+    model: 0
+  export_artifact: "model_export"
\ No newline at end of file
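Note on usage: any value in config.yaml can be overridden at launch time through the MLproject's hydra_options parameter, which is expanded into the "python main.py $(echo {hydra_options})" command above. A minimal sketch of the invocation (the -P flag is standard "mlflow run" syntax; the particular overrides are illustrative, not part of this patch):

    # Run the full pipeline with the defaults from config.yaml
    mlflow run .

    # Override Hydra-managed values, e.g. run only two steps; main.py
    # splits a quoted comma-separated string into the list of steps
    mlflow run . -P hydra_options="main.execute_steps='download,preprocess'"
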
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..59a6a81
--- /dev/null
+++ b/main.py
@@ -0,0 +1,109 @@
+import mlflow
+import os
+import hydra
+from omegaconf import DictConfig, OmegaConf
+
+# This automatically reads in the configuration
+@hydra.main(config_name='config')
+def process_args(config: DictConfig):
+
+    # Set up the wandb experiment. All runs will be grouped under this name
+    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
+    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]
+
+    # You can get the path at the root of the MLflow project with this:
+    root_path = hydra.utils.get_original_cwd()
+
+    # Check which steps we need to execute
+    if isinstance(config["main"]["execute_steps"], str):
+        # This was passed on the command line as a comma-separated list of steps
+        steps_to_execute = config["main"]["execute_steps"].split(",")
+    else:
+        steps_to_execute = list(config["main"]["execute_steps"])
+
+    # Download step
+    if "download" in steps_to_execute:
+
+        _ = mlflow.run(
+            os.path.join(root_path, "download"),
+            "main",
+            parameters={
+                "file_url": config["data"]["file_url"],
+                "artifact_name": "raw_data.csv",
+                "artifact_type": "raw_data",
+                "artifact_description": "Data as downloaded"
+            }
+        )
+
+    if "preprocess" in steps_to_execute:
+        _ = mlflow.run(
+            os.path.join(root_path, "preprocess"),
+            "main",
+            parameters={
+                "input_artifact": "raw_data.csv:latest",
+                "artifact_name": "preprocessed_data.csv",
+                "artifact_type": "preprocessed_data",
+                "artifact_description": "Data with preprocessing applied"
+            }
+        )
+
+    if "check_data" in steps_to_execute:
+        _ = mlflow.run(
+            os.path.join(root_path, "check_data"),
+            "main",
+            parameters={
+                "reference_artifact": config["data"]["reference_dataset"],
+                "sample_artifact": "preprocessed_data.csv:latest",
+                "ks_alpha": config["data"]["ks_alpha"]
+            }
+        )
+
+    if "segregate" in steps_to_execute:
+
+        _ = mlflow.run(
+            os.path.join(root_path, "segregate"),
+            "main",
+            parameters={
+                "input_artifact": "preprocessed_data.csv:latest",
+                "artifact_root": "data",
+                "artifact_type": "segregated_data",
+                "test_size": config["data"]["test_size"],
+                "stratify": config["data"]["stratify"],
+                "random_state": config["main"]["random_seed"]
+            }
+        )
+
+    if "decision_tree" in steps_to_execute:
+        # Serialize decision tree configuration
+        model_config = os.path.abspath("decision_tree_config.yml")
+
+        with open(model_config, "w+") as fp:
+            fp.write(OmegaConf.to_yaml(config["decision_tree_pipeline"]))
+
+        _ = mlflow.run(
+            os.path.join(root_path, "decision_tree"),
+            "main",
+            parameters={
+                "train_data": config["data"]["train_data"],
+                "model_config": model_config,
+                "export_artifact": config["decision_tree_pipeline"]["export_artifact"],
+                "random_seed": config["main"]["random_seed"],
+                "val_size": config["data"]["val_size"],
+                "stratify": config["data"]["stratify"]
+            }
+        )
+
+    if "evaluate" in steps_to_execute:
+
+        _ = mlflow.run(
+            os.path.join(root_path, "evaluate"),
+            "main",
+            parameters={
+                "model_export": f"{config['decision_tree_pipeline']['export_artifact']}:latest",
+                "test_data": "test_data.csv:latest"
+            }
+        )
+
+
+if __name__ == "__main__":
+    process_args()
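Note on the decision_tree step: OmegaConf.to_yaml(config["decision_tree_pipeline"]) serializes that section of config.yaml into decision_tree_config.yml, which is handed to the downstream component through its model_config parameter. With the defaults above, the serialized file should look roughly like this (a sketch of OmegaConf's output; the decision_tree component is assumed to parse it as YAML):

    decision_tree:
      criterion: entropy
      splitter: best
      max_depth: 13
    numerical_pipe:
      model: 0
    export_artifact: model_export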