add config files
Victor Kaillo committed Feb 22, 2022
1 parent 0696fa6 commit 4777865
Showing 4 changed files with 167 additions and 0 deletions.
12 changes: 12 additions & 0 deletions MLproject
@@ -0,0 +1,12 @@
name: main_env
conda_env: conda.yml

entry_points:
  main:
    parameters:
      hydra_options:
        description: Hydra parameters to override
        type: str
        default: ''
    command: >-
      python main.py $(echo {hydra_options})
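The command template passes whatever is supplied as hydra_options straight through to Hydra as command-line overrides. As a hedged illustration (not part of this commit), the entry point could be launched programmatically like this, assuming the project root as the working directory and an illustrative experiment name:

    import mlflow

    # Hypothetical invocation: MLflow substitutes the parameter into the
    # command template above, which the shell reduces to
    #   python main.py main.experiment_name=prod
    mlflow.run(
        ".",  # path to the directory containing this MLproject file
        "main",
        parameters={"hydra_options": "main.experiment_name=prod"},
    )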
13 changes: 13 additions & 0 deletions conda.yml
@@ -0,0 +1,13 @@
name: main_env
channels:
  - conda-forge
  - defaults
dependencies:
  - requests=2.24.0
  - pip=21.3.1
  - hydra-core=1.1.1
  - pip:
    - wandb==0.12.9
    - mlflow==1.14.1
    - hydra-joblib-launcher==1.1.5
    - opendatasets==0.1.20
33 changes: 33 additions & 0 deletions config.yaml
@@ -0,0 +1,33 @@
main:
  project_name: mlops-creditcard_fraud_predictive
  experiment_name: dev
  execute_steps:
    - download
    - preprocess
    - check_data
    - segregate
    - decision_tree
    - evaluate
  # This seed will be used to seed the random number generator
  # to ensure repeatability of the data splits and other
  # pseudo-random operations
  random_seed: 42
data:
  train_data: "mlops-creditcard_fraud_predictive/train_data.csv:latest"
  file_url: "https://www.kaggle.com/mlg-ulb/creditcardfraud.csv"
  reference_dataset: "mlops-creditcard_fraud_predictive/ccfraud_preprocessed.csv:latest"
  # Threshold for the Kolmogorov-Smirnov test
  ks_alpha: 0.05
  test_size: 0.3
  val_size: 0.3
  # Stratify according to the target when splitting the data
  # in train/test or in train/val
  stratify: Class
decision_tree_pipeline:
  decision_tree:
    criterion: "entropy"
    splitter: "best"
    max_depth: 13
  numerical_pipe:
    model: 0
  export_artifact: "model_export"
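Because main.py loads this file through Hydra, any of these keys can be overridden at run time with dot notation via the hydra_options parameter defined in MLproject. A hedged sketch (the override values below are for illustration only, not from this commit):

    import mlflow

    # Hypothetical overrides: a shallower tree and a smaller test split
    mlflow.run(
        ".",
        "main",
        parameters={
            "hydra_options": "decision_tree_pipeline.decision_tree.max_depth=10 data.test_size=0.2"
        },
    )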
109 changes: 109 additions & 0 deletions main.py
@@ -0,0 +1,109 @@
import mlflow
import os
import hydra
from omegaconf import DictConfig, OmegaConf

# This automatically reads in the configuration
@hydra.main(config_name='config')
def process_args(config: DictConfig):

    # Set up the wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # You can get the path at the root of the MLflow project with this:
    root_path = hydra.utils.get_original_cwd()

    # Check which steps we need to execute
    if isinstance(config["main"]["execute_steps"], str):
        # This was passed on the command line as a comma-separated list of steps
        steps_to_execute = config["main"]["execute_steps"].split(",")
    else:
        steps_to_execute = list(config["main"]["execute_steps"])

    # Download step
    if "download" in steps_to_execute:

        _ = mlflow.run(
            os.path.join(root_path, "download"),
            "main",
            parameters={
                "file_url": config["data"]["file_url"],
                "artifact_name": "raw_data.csv",
                "artifact_type": "raw_data",
                "artifact_description": "Data as downloaded"
            }
        )

    if "preprocess" in steps_to_execute:
        _ = mlflow.run(
            os.path.join(root_path, "preprocess"),
            "main",
            parameters={
                "input_artifact": "raw_data.csv:latest",
                "artifact_name": "preprocessed_data.csv",
                "artifact_type": "preprocessed_data",
                "artifact_description": "Data with preprocessing applied"
            }
        )

    if "check_data" in steps_to_execute:
        _ = mlflow.run(
            os.path.join(root_path, "check_data"),
            "main",
            parameters={
                "reference_artifact": config["data"]["reference_dataset"],
                "sample_artifact": "preprocessed_data.csv:latest",
                "ks_alpha": config["data"]["ks_alpha"]
            }
        )

    if "segregate" in steps_to_execute:

        _ = mlflow.run(
            os.path.join(root_path, "segregate"),
            "main",
            parameters={
                "input_artifact": "preprocessed_data.csv:latest",
                "artifact_root": "data",
                "artifact_type": "segregated_data",
                "test_size": config["data"]["test_size"],
                "stratify": config["data"]["stratify"],
                "random_state": config["main"]["random_seed"]
            }
        )

    if "decision_tree" in steps_to_execute:
        # Serialize decision tree configuration
        model_config = os.path.abspath("decision_tree_config.yml")

        with open(model_config, "w+") as fp:
            fp.write(OmegaConf.to_yaml(config["decision_tree_pipeline"]))

        _ = mlflow.run(
            os.path.join(root_path, "decision_tree"),
            "main",
            parameters={
                "train_data": config["data"]["train_data"],
                "model_config": model_config,
                "export_artifact": config["decision_tree_pipeline"]["export_artifact"],
                "random_seed": config["main"]["random_seed"],
                "val_size": config["data"]["val_size"],
                "stratify": config["data"]["stratify"]
            }
        )

    if "evaluate" in steps_to_execute:

        _ = mlflow.run(
            os.path.join(root_path, "evaluate"),
            "main",
            parameters={
                "model_export": f"{config['decision_tree_pipeline']['export_artifact']}:latest",
                "test_data": "test_data.csv:latest"
            }
        )


if __name__ == "__main__":
    process_args()
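main.py accepts execute_steps either as the list defined in config.yaml or as a comma-separated string passed on the command line, which it splits before dispatching the steps. A hedged sketch of running only the first two steps (not part of this commit; the inner quotes are intended to keep Hydra from interpreting the comma as a value sweep):

    import mlflow

    # Hypothetical partial run: only download and preprocess are executed
    mlflow.run(
        ".",
        "main",
        parameters={"hydra_options": "main.execute_steps='download,preprocess'"},
    )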
