From ebbeba15288ad64e9615d74c614175711541751c Mon Sep 17 00:00:00 2001 From: JunhaoQiu <56094690+qchiujunhao@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:26:03 -0500 Subject: [PATCH] updated docker image version back to 3.3.2 (#32) * updated back to 3.3.2 latest version includes pdf related packages and we don't need them now * delated unnecessary dependencies * lint * removed extra_file in tests * added help info --- Dockerfile | 12 -- tools/generate_md.py | 127 ------------------ tools/pycaret_macros.xml | 2 +- tools/pycaret_predict.py | 17 --- tools/pycaret_predict.xml | 8 ++ tools/pycaret_train.xml | 39 ++++-- ...cted_comparison_result_classification.html | 72 +++++----- ...ison_result_classification_customized.html | 52 +++---- ...expected_comparison_result_regression.html | 90 ++++++------- 9 files changed, 141 insertions(+), 278 deletions(-) delete mode 100644 tools/generate_md.py diff --git a/Dockerfile b/Dockerfile index 4631f65..2f1acb2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,22 +7,10 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends git unzip libgomp1 && \ rm -rf /var/lib/apt/lists/* -RUN apt-get update && apt-get install -y \ - libgdk-pixbuf2.0-0 \ - libpangocairo-1.0-0 \ - libcairo2 \ - libpango-1.0-0 \ - libglib2.0-0 \ - libfontconfig1 \ - libfreetype6 \ - && rm -rf /var/lib/apt/lists/* - # Install Python packages RUN pip install -U pip && \ pip install --no-cache-dir --no-compile joblib && \ pip install --no-cache-dir --no-compile h5py && \ - pip install --no-cache-dir --no-compile weasyprint && \ - pip install --no-cache-dir --no-compile markdown2 && \ pip install --no-cache-dir --no-compile pycaret[analysis,models]==${VERSION} && \ pip install --no-cache-dir --no-compile explainerdashboard diff --git a/tools/generate_md.py b/tools/generate_md.py deleted file mode 100644 index a9fd7b4..0000000 --- a/tools/generate_md.py +++ /dev/null @@ -1,127 +0,0 @@ -import base64 -import logging -import os - -from markdown2 import markdown - -import pandas as pd - -from weasyprint import HTML - -LOG = logging.getLogger(__name__) - - -def dataframe_to_markdown(df): - # Generate header - header = "| " + " | ".join(df.columns) + " |" - separator = "| " + " | ".join(["---"] * len(df.columns)) + " |" - rows = "\n".join( - ["| " + " | ".join(map(str, row)) + " |" for row in df.values]) - - return f"{header}\n{separator}\n{rows}" - - -def generate_report_from_path(base_path, output_path, format="md"): - """ - Generate a report (Markdown or PDF) based on the directory structure. - - Parameters: - base_path (str): Path to the base directory containing subfolders. - output_path (str): Path to save the generated report. - format (str): Output format ("md" for Markdown, "pdf" for PDF). - - Returns: - str: Path to the saved report. - """ - LOG.info(f"Generating report from path: {base_path} as {format.upper()}") - - # Initialize the report content - markdown_content = "# Report\n\n---\n" - - # Iterate through each subfolder in the base path - for folder_name in sorted(os.listdir(base_path)): - folder_path = os.path.join(base_path, folder_name) - if not os.path.isdir(folder_path): - continue # Skip files, only process directories - - markdown_content += f"## {folder_name.capitalize()}\n\n" - - # Process CSV files in the subfolder - for file_name in sorted(os.listdir(folder_path)): - file_path = os.path.join(folder_path, file_name) - if file_name.lower().endswith(".csv"): - try: - markdown_content += f"### Table: {file_name}\n\n" - table = pd.read_csv(file_path) - markdown_content += dataframe_to_markdown(table) - markdown_content += "\n\n" - except Exception as e: - LOG.error(f"Failed to process CSV {file_path}: {e}") - markdown_content += ( - f"*Error reading table: {file_name}*\n\n" - ) - - # Process image files in the subfolder - for file_name in sorted(os.listdir(folder_path)): - file_path = os.path.join(folder_path, file_name) - if file_name.lower().endswith((".png", ".jpg", ".jpeg")): - try: - encoded_image = encode_image_to_base64(file_path) - image_name = os.path.splitext(file_name)[0] - markdown_content += f"### Plot: {image_name}\n\n" - if format == "md": - markdown_content += "![" - markdown_content += image_name - markdown_content += "](data:image/png;base64," - markdown_content += encoded_image - markdown_content += ")\n\n" - else: - # Include image as file path for PDF rendering - img_src = f"data:image/png;base64,{encoded_image}" - alt_text = file_name - style = "width:600px; max-width:100%; height:auto;" - - markdown_content += ( - f""" - {alt_text}\n\n - """ - ) - except Exception as e: - LOG.error(f"Failed to process image {file_path}: {e}") - markdown_content += ( - f"*Error displaying plot: {file_name}*\n\n" - ) - - markdown_content += "---\n" - - # Save to Markdown or PDF based on the format - if format == "md": - os.makedirs(os.path.dirname(output_path), exist_ok=True) - with open(output_path, "w") as file: - file.write(markdown_content) - LOG.info(f"Markdown report saved at: {output_path}") - elif format == "pdf": - html_content = markdown(markdown_content, extras=["tables"]) - HTML(string=html_content).write_pdf(output_path) - LOG.error(f"PDF report saved at: {output_path}") - return output_path - - return output_path - - -def encode_image_to_base64(img_path): - """ - Encode an image to Base64 format. - - Parameters: - img_path (str): Path to the image file. - - Returns: - str: Base64-encoded string of the image. - """ - with open(img_path, "rb") as img_file: - return base64.b64encode(img_file.read()).decode("utf-8") diff --git a/tools/pycaret_macros.xml b/tools/pycaret_macros.xml index 4abb6d9..abb6211 100644 --- a/tools/pycaret_macros.xml +++ b/tools/pycaret_macros.xml @@ -5,7 +5,7 @@ 21.05 - quay.io/goeckslab/galaxy-pycaret:latest + quay.io/goeckslab/galaxy-pycaret:3.3.2 diff --git a/tools/pycaret_predict.py b/tools/pycaret_predict.py index d788f03..af00f60 100644 --- a/tools/pycaret_predict.py +++ b/tools/pycaret_predict.py @@ -1,11 +1,7 @@ import argparse import logging -import os -import shutil import tempfile -from generate_md import generate_report_from_path - import h5py import joblib @@ -131,19 +127,6 @@ def evaluate(self, data_path): return predictions, metrics, plot_paths -def generate_md(plots, metrics): - LOG.error(plots) - if not os.path.exists("markdown"): - os.mkdir("markdown") - if not os.path.exists("markdown/Evaluation"): - os.mkdir("markdown/Evaluation") - for plot, path in plots.items(): - shutil.copy(path, "markdown/Evaluation/") - LOG.error(type(metrics)) - metrics.to_csv("markdown/Evaluation/metrics.csv", index=False) - generate_report_from_path("markdown", "evaluation.pdf", format="pdf") - - def generate_html_report(plots, metrics): """Generate an HTML evaluation report.""" plots_html = "" diff --git a/tools/pycaret_predict.xml b/tools/pycaret_predict.xml index a35a221..fa91622 100644 --- a/tools/pycaret_predict.xml +++ b/tools/pycaret_predict.xml @@ -48,6 +48,14 @@ This tool uses PyCaret to evaluate a machine learning model or do prediction. + + **Outputs**: + + - **prediction**: The prediction results on the dataset in a csv format. + + - **report**: The evaluation report is generated in HTML format. + if you upload a dataset with a target column and select the target column in the target_feature input field. + \ No newline at end of file diff --git a/tools/pycaret_train.xml b/tools/pycaret_train.xml index 915fe73..b6969f7 100644 --- a/tools/pycaret_train.xml +++ b/tools/pycaret_train.xml @@ -49,14 +49,16 @@ #if $test_file --test_file $test_file #end if - --model_type $model_type && - mkdir -p $comparison_result.extra_files_path && - cp -r best_model.csv $comparison_result.extra_files_path + --model_type $model_type ]]> - - + + @@ -150,6 +152,7 @@ + @@ -166,9 +169,8 @@ - - - + + @@ -176,9 +178,8 @@ - - - + + @@ -186,13 +187,23 @@ - - - + + This tool uses PyCaret to train and evaluate machine learning models. + It compares different models on a dataset and provides the best model based on the performance metrics. + + **Outputs** + + - **Model**: The best model trained on the dataset in h5 format. + + + - **Comparison Result**: The comparison result of different models in html format. + It contains the performance metrics of different models, plots of the best model + on the testing set (or part of the training set if a separate test set is not uploaded), and feature analysis plots. + \ No newline at end of file diff --git a/tools/test-data/expected_comparison_result_classification.html b/tools/test-data/expected_comparison_result_classification.html index ef6b7d9..4d6f02d 100644 --- a/tools/test-data/expected_comparison_result_classification.html +++ b/tools/test-data/expected_comparison_result_classification.html @@ -231,7 +231,7 @@

Comparison Results on the Cross-Validation Set

0.4380 0.4748 0.6822 - 0.077 + 0.228 Logistic Regression @@ -243,7 +243,7 @@

Comparison Results on the Cross-Validation Set

0.3478 0.3742 0.7144 - 0.041 + 0.331 Ridge Classifier @@ -255,7 +255,7 @@

Comparison Results on the Cross-Validation Set

0.3478 0.3742 0.0000 - 0.046 + 0.180 Naive Bayes @@ -267,7 +267,7 @@

Comparison Results on the Cross-Validation Set

0.2969 0.3112 0.6978 - 0.041 + 2.694 Quadratic Discriminant Analysis @@ -279,7 +279,7 @@

Comparison Results on the Cross-Validation Set

0.2256 0.2488 0.7033 - 0.050 + 0.158 Linear Discriminant Analysis @@ -291,7 +291,7 @@

Comparison Results on the Cross-Validation Set

0.2372 0.2577 0.6594 - 0.048 + 0.110 CatBoost Classifier @@ -303,7 +303,7 @@

Comparison Results on the Cross-Validation Set

0.2165 0.2207 0.6861 - 3.940 + 12.075 Extra Trees Classifier @@ -315,7 +315,7 @@

Comparison Results on the Cross-Validation Set

0.2103 0.2167 0.6811 - 0.227 + 0.775 SVM - Linear Kernel @@ -327,7 +327,7 @@

Comparison Results on the Cross-Validation Set

0.1429 0.1690 0.0000 - 0.039 + 0.217 K Neighbors Classifier @@ -339,7 +339,7 @@

Comparison Results on the Cross-Validation Set

0.1413 0.1469 0.6717 - 0.044 + 0.685 Random Forest Classifier @@ -351,7 +351,7 @@

Comparison Results on the Cross-Validation Set

0.1524 0.1540 0.6211 - 0.262 + 0.847 Dummy Classifier @@ -363,7 +363,7 @@

Comparison Results on the Cross-Validation Set

0.0000 0.0000 0.4600 - 0.038 + 0.165 Ada Boost Classifier @@ -375,7 +375,7 @@

Comparison Results on the Cross-Validation Set

0.0656 0.0275 0.5819 - 0.201 + 0.645 Decision Tree Classifier @@ -387,7 +387,7 @@

Comparison Results on the Cross-Validation Set

0.0049 0.0040 0.5483 - 0.039 + 0.329 Gradient Boosting Classifier @@ -399,7 +399,7 @@

Comparison Results on the Cross-Validation Set

-0.0033 -0.0239 0.5800 - 0.231 + 0.643 Extreme Gradient Boosting @@ -411,7 +411,7 @@

Comparison Results on the Cross-Validation Set

-0.0489 -0.0537 0.6281 - 0.138 + 0.422 @@ -513,7 +513,7 @@

Dimension


Manifold

- manifold

@@ -558,25 +558,25 @@

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

diff --git a/tools/test-data/expected_comparison_result_classification_customized.html b/tools/test-data/expected_comparison_result_classification_customized.html index 20289ec..e047ed3 100644 --- a/tools/test-data/expected_comparison_result_classification_customized.html +++ b/tools/test-data/expected_comparison_result_classification_customized.html @@ -255,7 +255,7 @@

Comparison Results on the Cross-Validation Set

0.4056 0.4224 0.5918 - 0.294 + 0.322 Naive Bayes @@ -267,7 +267,7 @@

Comparison Results on the Cross-Validation Set

0.3163 0.3232 0.6930 - 0.328 + 1.240 K Neighbors Classifier @@ -279,7 +279,7 @@

Comparison Results on the Cross-Validation Set

0.2603 0.2660 0.6001 - 0.364 + 0.864 Ridge Classifier @@ -291,7 +291,7 @@

Comparison Results on the Cross-Validation Set

0.2700 0.2835 0.0000 - 0.312 + 0.898 Random Forest Classifier @@ -303,7 +303,7 @@

Comparison Results on the Cross-Validation Set

0.2688 0.2834 0.6539 - 0.528 + 0.906 Logistic Regression @@ -315,7 +315,7 @@

Comparison Results on the Cross-Validation Set

0.2700 0.2835 0.6697 - 0.380 + 0.798 Quadratic Discriminant Analysis @@ -327,7 +327,7 @@

Comparison Results on the Cross-Validation Set

0.2815 0.2899 0.7075 - 0.336 + 0.418 Linear Discriminant Analysis @@ -339,7 +339,7 @@

Comparison Results on the Cross-Validation Set

0.2700 0.2835 0.6751 - 0.300 + 0.364 Gradient Boosting Classifier @@ -351,7 +351,7 @@

Comparison Results on the Cross-Validation Set

0.2328 0.2389 0.6403 - 0.444 + 0.522 Ada Boost Classifier @@ -363,7 +363,7 @@

Comparison Results on the Cross-Validation Set

0.2340 0.2415 0.6517 - 0.450 + 0.560 Extra Trees Classifier @@ -375,7 +375,7 @@

Comparison Results on the Cross-Validation Set

0.2266 0.2347 0.6413 - 0.472 + 0.468 Decision Tree Classifier @@ -387,7 +387,7 @@

Comparison Results on the Cross-Validation Set

0.1950 0.2060 0.5215 - 0.324 + 1.532 CatBoost Classifier @@ -399,7 +399,7 @@

Comparison Results on the Cross-Validation Set

0.1454 0.1414 0.6991 - 3.244 + 3.426 SVM - Linear Kernel @@ -411,7 +411,7 @@

Comparison Results on the Cross-Validation Set

0.0684 0.0685 0.0000 - 0.394 + 1.666 Dummy Classifier @@ -423,7 +423,7 @@

Comparison Results on the Cross-Validation Set

0.0000 0.0000 0.4545 - 0.338 + 0.456 Extreme Gradient Boosting @@ -435,7 +435,7 @@

Comparison Results on the Cross-Validation Set

0.0550 0.0564 0.5943 - 0.338 + 0.336 @@ -537,7 +537,7 @@

Dimension


Manifold

- manifold

@@ -582,15 +582,15 @@

-

-

-

-

-

-

-

-

-

+

+

+

+

+

+

+

+

+

diff --git a/tools/test-data/expected_comparison_result_regression.html b/tools/test-data/expected_comparison_result_regression.html index be8f20c..9a949d4 100644 --- a/tools/test-data/expected_comparison_result_regression.html +++ b/tools/test-data/expected_comparison_result_regression.html @@ -235,7 +235,7 @@

Comparison Results on the Cross-Validation Set

0.8383 0.1197 0.0980 - 0.304 + 0.681 Extra Trees Regressor @@ -245,7 +245,7 @@

Comparison Results on the Cross-Validation Set

0.8323 0.1220 0.0949 - 0.459 + 2.212 Light Gradient Boosting Machine @@ -255,7 +255,7 @@

Comparison Results on the Cross-Validation Set

0.8282 0.1252 0.1011 - 0.153 + 0.263 CatBoost Regressor @@ -265,7 +265,7 @@

Comparison Results on the Cross-Validation Set

0.8270 0.1256 0.1011 - 3.342 + 8.883 Random Forest Regressor @@ -275,7 +275,7 @@

Comparison Results on the Cross-Validation Set

0.8210 0.1252 0.0990 - 0.486 + 1.916 Extreme Gradient Boosting @@ -285,7 +285,7 @@

Comparison Results on the Cross-Validation Set

0.8045 0.1336 0.1057 - 0.247 + 0.497 Elastic Net @@ -295,7 +295,7 @@

Comparison Results on the Cross-Validation Set

0.8029 0.1426 0.1168 - 0.091 + 0.116 Lasso Regression @@ -305,7 +305,7 @@

Comparison Results on the Cross-Validation Set

0.8011 0.1438 0.1172 - 0.095 + 0.134 Lasso Least Angle Regression @@ -315,7 +315,7 @@

Comparison Results on the Cross-Validation Set

0.8011 0.1438 0.1172 - 0.088 + 0.157 AdaBoost Regressor @@ -325,7 +325,7 @@

Comparison Results on the Cross-Validation Set

0.7939 0.1378 0.1153 - 0.276 + 2.469 Bayesian Ridge @@ -335,7 +335,7 @@

Comparison Results on the Cross-Validation Set

0.7920 0.1433 0.1194 - 0.113 + 0.268 Ridge Regression @@ -345,7 +345,7 @@

Comparison Results on the Cross-Validation Set

0.7872 0.1448 0.1212 - 0.113 + 0.108 Linear Regression @@ -355,7 +355,7 @@

Comparison Results on the Cross-Validation Set

0.7866 0.1450 0.1214 - 0.086 + 0.122 Least Angle Regression @@ -365,7 +365,7 @@

Comparison Results on the Cross-Validation Set

0.7759 0.1489 0.1249 - 0.092 + 0.165 Huber Regressor @@ -375,7 +375,7 @@

Comparison Results on the Cross-Validation Set

0.7699 0.1404 0.1138 - 0.124 + 1.508 Decision Tree Regressor @@ -385,7 +385,7 @@

Comparison Results on the Cross-Validation Set

0.7507 0.1470 0.1108 - 0.080 + 0.253 Orthogonal Matching Pursuit @@ -395,7 +395,7 @@

Comparison Results on the Cross-Validation Set

0.6709 0.1767 0.1475 - 0.099 + 0.418 K Neighbors Regressor @@ -405,7 +405,7 @@

Comparison Results on the Cross-Validation Set

0.6546 0.1692 0.1448 - 0.093 + 0.858 Dummy Regressor @@ -415,7 +415,7 @@

Comparison Results on the Cross-Validation Set

-0.0391 0.3303 0.3219 - 0.083 + 0.129 Passive Aggressive Regressor @@ -425,7 +425,7 @@

Comparison Results on the Cross-Validation Set

-0.4762 0.4067 0.3652 - 0.081 + 0.420 @@ -493,7 +493,7 @@

Vc


Manifold

- manifold

@@ -538,30 +538,30 @@

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+

+