Skip to content

Commit

Permalink
Applied feedback from Gert, Lisa and Adrienne
Browse files Browse the repository at this point in the history
  • Loading branch information
vloothuis committed Mar 24, 2024
1 parent adbd497 commit a26bc6a
Show file tree
Hide file tree
Showing 10 changed files with 185 additions and 116 deletions.
10 changes: 9 additions & 1 deletion .github/workflows/checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,13 @@ jobs:
- name: Run prediction
run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest predict /data/fake_data.csv --out=/data/predictions.csv

- name: Build Docker scoring image
uses: docker/build-push-action@v4
with:
context: .
file: python.Dockerfile
tags: eyra-rank:scoring
load: true

- name: Run scoring
run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest score /data/predictions.csv /data/fake_data_ground_truth.csv
# --entrypoint accepts only the executable itself; the arguments for that
# executable must follow the image name, otherwise Docker looks for a binary
# literally named "conda run -n eyra-rank python /app/score.py".
run: docker run --rm -v "$(pwd)/data:/data" --entrypoint conda eyra-rank:scoring run -n eyra-rank python /app/score.py /data/predictions.csv /data/fake_data_ground_truth.csv
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
.DS_Store
.AppleDouble
.LSOverride
__pycache__/
.tool-versions
9 changes: 5 additions & 4 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: eyra-rank
channels:
- defaults
- conda-forge
dependencies:
- pandas=1.5
- scikit-learn=1.2
- joblib=1.1
- matplotlib=3.7
- pandas=2.2.1
- scikit-learn=1.4.1.post1
- joblib=1.3.2
- matplotlib=3.8.3
Binary file modified model.rds
Binary file not shown.
9 changes: 5 additions & 4 deletions python.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ COPY environment.yml /
RUN conda env create -f /environment.yml

RUN mkdir /app
WORKDIR /app

COPY data /data
COPY *.py /
COPY *.joblib /
COPY *.csv /app
COPY *.py /app
COPY *.joblib /app

ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"]
ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/app/run.py"]
CMD ["predict", "/data/fake_data.csv"]
67 changes: 16 additions & 51 deletions run.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,19 @@ source("submission.R")

print_usage <- function() {
cat("Usage:\n")
cat(" Rscript script.R predict INPUT_FILE [--output OUTPUT_FILE]\n")
cat(" Rscript script.R score --prediction PREDICTION_FILE --ground_truth GROUND_TRUTH_FILE [--output OUTPUT_FILE]\n")
cat(" Rscript script.R DATA_FILE BACKGROUND_DATA_FILE [--output OUTPUT_FILE]\n")
}

parse_arguments <- function() {
args <- list()
command_args <- commandArgs(trailingOnly = TRUE)
if (length(command_args) > 0) {
args$command <- command_args[1]
if (length(command_args) < 2) {
return(args)
}

if (is.null(args$command)) {
stop("Error: No command provided.")
}

if (args$command == "predict") {
args$input <- commandArgs(trailingOnly = TRUE)[2]
args$output <- get_argument("--output")
} else if (args$command == "score") {
args$prediction <- get_argument("--prediction")
args$ground_truth <- get_argument("--ground_truth")
args$output <- get_argument("--output")
}
} else {
stop("Error: No command provided. Run the script with predict or score.")
}

args$data <- commandArgs(trailingOnly = TRUE)[1]
args$background_data <- commandArgs(trailingOnly = TRUE)[2]
args$output <- get_argument("--output")
return(args)
}

Expand All @@ -56,41 +43,25 @@ get_argument <- function(arg_name) {
}

parse_and_run_predict <- function(args) {
if (is.null(args$input)) {
stop("Error: Please provide --input argument for prediction.")
if (is.null(args$data)||is.null(args$background_data)) {
stop("Error: Please provide data and background_data argument for prediction.")
}

cat("Processing input data for prediction from:", args$input, "\n")
cat("Processing input data for prediction from:", args$data, " ", args$background_data, "\n")
if (!is.null(args$output)) {
cat("Output will be saved to:", args$output, "\n")
}
run_predict(args$input, args$output)
}

run_score <- function(args) {
if (is.null(args$prediction) || is.null(args$ground_truth)) {
stop("Error: Please provide --prediction and --ground_truth arguments for scoring.")
}

cat("Scoring predictions from:", args$prediction, "\n")
cat("Ground truth data from:", args$ground_truth, "\n")
if (!is.null(args$output)) {
cat("Evaluation score will be saved to:", args$output, "\n")
}
# Call your submission function for scoring here
run_predict(args$data, args$background_data, args$output)
}

run_predict <- function(input_path, output=NULL) {
run_predict <- function(data_path, background_data_path, output=NULL) {
if (is.null(output)) {
output <- stdout()
}
df <- read.csv(data_path, encoding="latin1")
background_df <- read.csv(background_data_path, encoding="latin1")


# Read data from input file
df <- read.csv(input_path, encoding="latin1")

# Make predictions
predictions <- predict_outcomes(df) # Assuming predict_outcomes is a function in the submission package
predictions <- predict_outcomes(df, background_df)

# Check if predictions have the required format
stopifnot(ncol(predictions) == 2,
Expand All @@ -105,13 +76,7 @@ run_predict <- function(input_path, output=NULL) {
main <- function() {
args <- parse_arguments()

if (args$command == "predict") {
parse_and_run_predict(args)
} else if (args$command == "score") {
run_score(args)
} else {
stop("Error: Invalid command. Use 'predict' or 'score'.")
}
parse_and_run_predict(args)
}

# Call main function
Expand Down
46 changes: 18 additions & 28 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,31 +20,21 @@
import pandas as pd
import submission

parser = argparse.ArgumentParser(description="Process and score data.")
subparsers = parser.add_subparsers(dest="command")
parser = argparse.ArgumentParser(description="Process data.")

# Process subcommand
process_parser = subparsers.add_parser(
"predict", help="Process input data for prediction."
parser.add_argument("data_path", help="Path to data data CSV file.")
parser.add_argument(
"background_data_path", help="Path to background data data CSV file."
)
process_parser.add_argument("input_path", help="Path to input data CSV file.")
process_parser.add_argument("--output", help="Path to prediction output CSV file.")

# Score subcommand
score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.")
score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
score_parser.add_argument(
"ground_truth_path", help="Path to ground truth outcome CSV file."
)
score_parser.add_argument("--output", help="Path to evaluation score output CSV file.")
parser.add_argument("--output", help="Path to prediction output CSV file.")

args = parser.parse_args()


def predict(input_path, output):
def predict(data_path, background_data_path, output):
"""Predict Score (evaluate) the predictions and write the metrics.
This function takes the path to an input CSV file containing the input data.
This function takes the path to a CSV file containing the data and the path
to a CSV file containing the background data.
It calls submission.py clean_df and predict_outcomes writes the predictions
to a new output CSV file.
Expand All @@ -53,10 +43,17 @@ def predict(input_path, output):

if output is None:
output = sys.stdout
df = pd.read_csv(
input_path, encoding="latin-1", encoding_errors="replace", low_memory=False
data_df = pd.read_csv(
data_path, encoding="latin-1", encoding_errors="replace", low_memory=False
)
background_data_path = pd.read_csv(
background_data_path,
encoding="latin-1",
encoding_errors="replace",
low_memory=False,
)
predictions = submission.predict_outcomes(df)

predictions = submission.predict_outcomes(data_df, background_data_path)
assert (
predictions.shape[1] == 2
), "Predictions must have two columns: nomem_encr and prediction"
Expand Down Expand Up @@ -131,11 +128,4 @@ def score(prediction_path, ground_truth_path, output):

if __name__ == "__main__":
args = parser.parse_args()
if args.command == "predict":
predict(args.input_path, args.output)
elif args.command == "score":
score(args.prediction_path, args.ground_truth_path, args.output)
else:
parser.print_help()
predict(args.input_path, args.output)
sys.exit(1)
predict(args.data_path, args.background_data_path, args.output)
97 changes: 97 additions & 0 deletions score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""
This script scores predictions against the ground truth outcomes. The
predictions themselves are produced by run.py, which calls submission.py.
To score your predictions use the following command:
python score.py PREDICTION_FILE GROUND_TRUTH_FILE [--output OUTPUT_FILE]
For example:
python score.py data/predictions.csv data/fake_data_ground_truth.csv
The accuracy, precision, recall, and F1 score are written as CSV to the
output file, or to standard output when --output is not given.
"""

import sys
import argparse
import pandas as pd
import submission

parser = argparse.ArgumentParser(description="Score data.")
# Positional argument: CSV file holding the predicted outcomes.
parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.")
# Positional argument: CSV file holding the ground truth outcomes.
parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.")
# Optional argument: where to write the metrics; score() falls back to stdout when omitted.
parser.add_argument("--output", help="Path to evaluation score output CSV file.")

# NOTE(review): this parses the command line at import time, and the
# __main__ guard at the bottom of the file parses it again.
args = parser.parse_args()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    if denominator == 0:
        return 0
    return numerator / denominator


def score(prediction_path, ground_truth_path, output):
    """Score (evaluate) the predictions and write the metrics.

    This function takes the path to a CSV file containing predicted outcomes and the
    path to a CSV file containing the ground truth outcomes. It calculates the overall
    prediction accuracy, and precision, recall, and F1 score for having a child
    and writes these scores to a new output CSV file.

    This function should not be modified.

    Args:
        prediction_path: Path to a CSV file with the columns "nomem_encr" and
            "prediction".
        ground_truth_path: Path to a CSV file with the columns "nomem_encr" and
            "new_child".
        output: Path or file-like object the metrics CSV is written to. When
            None, the metrics are written to standard output.
    """

    if output is None:
        output = sys.stdout
    # Load predictions and ground truth into dataframes
    predictions_df = pd.read_csv(prediction_path)
    ground_truth_df = pd.read_csv(ground_truth_path)

    # Merge predictions and ground truth on the 'nomem_encr' column. The right
    # join keeps every ground truth row, so a row without a prediction gets
    # NaN, which matches neither 0 nor 1 below.
    merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right")

    predicted = merged_df["prediction"]
    actual = merged_df["new_child"]

    # Count with plain Python ints so that the ratios below are ordinary
    # Python divisions.
    correct = int((predicted == actual).sum())
    true_positives = int(((predicted == 1) & (actual == 1)).sum())
    false_positives = int(((predicted == 1) & (actual == 0)).sum())
    false_negatives = int(((predicted == 0) & (actual == 1)).sum())

    # Every metric falls back to 0 when its denominator is zero. Accuracy was
    # previously unguarded and raised ZeroDivisionError on an empty ground truth.
    accuracy = _safe_ratio(correct, len(merged_df))
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

    # Write metric output to a new CSV file
    metrics_df = pd.DataFrame(
        {
            "accuracy": [accuracy],
            "precision": [precision],
            "recall": [recall],
            "f1_score": [f1_score],
        }
    )
    metrics_df.to_csv(output, index=False)


if __name__ == "__main__":
args = parser.parse_args()
score(args.prediction_path, args.ground_truth_path, args.output)
13 changes: 7 additions & 6 deletions submission.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# List your packages here. Don't forget to update packages.R!
library(dplyr) # as an example, not used here

clean_df <- function(df, background = NULL){
clean_df <- function(df, background_df){
# Preprocess the input dataframe to feed the model.
### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command

Expand Down Expand Up @@ -45,7 +45,7 @@ clean_df <- function(df, background = NULL){
return(df)
}

predict_outcomes <- function(df, model_path = "./model.rds"){
predict_outcomes <- function(df, background_df, model_path = "./model.rds"){
# Generate predictions using the saved model and the input dataframe.

# The predict_outcomes function accepts a dataframe as an argument
Expand All @@ -58,7 +58,8 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
# they did.

# Parameters:
# df (dataframe): The input dataframe for which predictions are to be made.
# df (dataframe): The data dataframe for which predictions are to be made.
# background_df (dataframe): The background data dataframe passed on to clean_df.
# model_path (str): The path to the saved model file (which is the output of training.R).

# Returns:
Expand All @@ -73,7 +74,7 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
model <- readRDS(model_path)

# Preprocess the fake / holdout data
df <- clean_df(df)
df <- clean_df(df, background_df)

# IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards
# get list of variables *without* the outcome:
Expand All @@ -87,9 +88,9 @@ predict_outcomes <- function(df, model_path = "./model.rds"){
predictions <- ifelse(predictions > 0.5, 1, 0)

# Output file should be data.frame with two columns, nomem_encr and prediction
df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions)
df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions)
# Force columnnames (overrides names that may be given by `predict`)
names(df_predict) <- c("nomem_encr", "predictions")
names(df_predict) <- c("nomem_encr", "prediction")

# Return only dataset with predictions and identifier
return( df_predict )
Expand Down
Loading

0 comments on commit a26bc6a

Please sign in to comment.