updated docker image version back to 3.3.2 (#32)
* updated back to 3.3.2

The latest version includes PDF-related packages, which we don't need now.

* deleted unnecessary dependencies

* lint

* removed extra_file in tests

* added help info
qchiujunhao authored Dec 11, 2024
1 parent 66fd012 commit ebbeba1
Showing 9 changed files with 141 additions and 278 deletions.
12 changes: 0 additions & 12 deletions Dockerfile
@@ -7,22 +7,10 @@ RUN apt-get update && \
apt-get install -y --no-install-recommends git unzip libgomp1 && \
rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install -y \
libgdk-pixbuf2.0-0 \
libpangocairo-1.0-0 \
libcairo2 \
libpango-1.0-0 \
libglib2.0-0 \
libfontconfig1 \
libfreetype6 \
&& rm -rf /var/lib/apt/lists/*

# Install Python packages
RUN pip install -U pip && \
pip install --no-cache-dir --no-compile joblib && \
pip install --no-cache-dir --no-compile h5py && \
pip install --no-cache-dir --no-compile weasyprint && \
pip install --no-cache-dir --no-compile markdown2 && \
pip install --no-cache-dir --no-compile pycaret[analysis,models]==${VERSION} && \
pip install --no-cache-dir --no-compile explainerdashboard

127 changes: 0 additions & 127 deletions tools/generate_md.py

This file was deleted.

2 changes: 1 addition & 1 deletion tools/pycaret_macros.xml
@@ -5,7 +5,7 @@
<token name="@PROFILE@">21.05</token>
<xml name="python_requirements">
<requirements>
<container type="docker">quay.io/goeckslab/galaxy-pycaret:latest</container>
<container type="docker">quay.io/goeckslab/galaxy-pycaret:3.3.2</container>
</requirements>
</xml>
<xml name="macro_citations">
17 changes: 0 additions & 17 deletions tools/pycaret_predict.py
@@ -1,11 +1,7 @@
import argparse
import logging
import os
import shutil
import tempfile

from generate_md import generate_report_from_path

import h5py

import joblib
@@ -131,19 +127,6 @@ def evaluate(self, data_path):
return predictions, metrics, plot_paths


def generate_md(plots, metrics):
LOG.error(plots)
if not os.path.exists("markdown"):
os.mkdir("markdown")
if not os.path.exists("markdown/Evaluation"):
os.mkdir("markdown/Evaluation")
for plot, path in plots.items():
shutil.copy(path, "markdown/Evaluation/")
LOG.error(type(metrics))
metrics.to_csv("markdown/Evaluation/metrics.csv", index=False)
generate_report_from_path("markdown", "evaluation.pdf", format="pdf")


def generate_html_report(plots, metrics):
"""Generate an HTML evaluation report."""
plots_html = ""
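With the PDF path removed, the remaining `generate_html_report` helper builds the evaluation report as HTML. The diff above is truncated, so as a rough, non-authoritative sketch of that kind of helper — assuming, as in the deleted `generate_md`, that `plots` maps plot names to image paths and `metrics` is a pandas DataFrame (the function name and structure here are illustrative, not the actual implementation):

```python
import base64


def generate_html_report_sketch(plots, metrics, out_path="evaluation_report.html"):
    """Assemble a single self-contained HTML report from plot images and a metrics DataFrame."""
    parts = ["<html><body>", "<h1>Model Evaluation</h1>", "<h2>Metrics</h2>"]

    # A pandas DataFrame renders directly to an HTML table.
    parts.append(metrics.to_html(index=False))

    parts.append("<h2>Plots</h2>")
    for name, path in plots.items():
        # Embed each plot as a base64 data URI so the report needs no side files.
        with open(path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode("utf-8")
        parts.append(f"<h3>{name}</h3>")
        parts.append(f'<img src="data:image/png;base64,{encoded}" alt="{name}">')

    parts.append("</body></html>")
    with open(out_path, "w") as fh:
        fh.write("\n".join(parts))
    return out_path
```

Embedding the images as data URIs keeps the report a single self-contained file, which suits a Galaxy HTML output dataset.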
8 changes: 8 additions & 0 deletions tools/pycaret_predict.xml
@@ -48,6 +48,14 @@
</tests>
<help>
This tool uses PyCaret to evaluate a machine learning model or make predictions.

**Outputs**:

- **prediction**: The prediction results on the dataset, in CSV format.

- **report**: The evaluation report, in HTML format. It is generated only
if you upload a dataset with a target column and select that column in the target_feature input field.

</help>
<expand macro="macro_citations" />
</tool>
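To make the help text above concrete, the flow it describes — load a trained model, predict on the uploaded dataset, and write a CSV — might be sketched as below. File names are placeholders and the actual tool code lives in pycaret_predict.py; `predict_model` is PyCaret's helper for attaching prediction columns.

```python
import joblib
import pandas as pd
from pycaret.classification import predict_model

# Placeholder paths; the Galaxy wrapper supplies these from its XML parameters.
model = joblib.load("pycaret_model.h5")
data = pd.read_csv("input.csv")

# predict_model appends prediction columns; when the target column is present
# in `data`, PyCaret also scores the predictions, which feeds the HTML report.
predictions = predict_model(model, data=data)
predictions.to_csv("predictions.csv", index=False)
```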
39 changes: 25 additions & 14 deletions tools/pycaret_train.xml
@@ -49,14 +49,16 @@
#if $test_file
--test_file $test_file
#end if
--model_type $model_type &&
mkdir -p $comparison_result.extra_files_path &&
cp -r best_model.csv $comparison_result.extra_files_path
--model_type $model_type
]]>
</command>
<inputs>
<param name="input_file" type="data" format="csv,tabular" label="Input Dataset (CSV or TSV)" />
<param name="test_file" type="data" format="csv,tabular" optional="true" label="Test Dataset (CSV or TSV)" />
<param name="input_file" type="data" format="csv,tabular" label="Train Dataset (CSV or TSV)" />
<param name="test_file" type="data" format="csv,tabular" optional="true" label="Test Dataset (CSV or TSV)"
help="If a test set is not provided,
the selected training set will be split into training, validation, and test sets.
If a test set is provided, the training set will only be split into training and validation sets.
Note that cross-validation is always applied by default." />
<param name="target_feature" multiple="false" type="data_column" use_header_names="true" data_ref="input_file" label="Select the target column:" />
<conditional name="model_selection">
<param name="model_type" type="select" label="Task">
@@ -150,6 +152,7 @@
<outputs>
<data name="model" format="h5" from_work_dir="pycaret_model.h5" label="${tool.name} best model on ${on_string}" />
<data name="comparison_result" format="html" from_work_dir="comparison_result.html" label="${tool.name} Comparison result on ${on_string}"/>
<data name="best_model_csv" format="csv" from_work_dir="best_model.csv" label="${tool.name} The prams of the best model on ${on_string}" hidden="true" />
</outputs>
<tests>
<test>
@@ -166,33 +169,41 @@
<param name="remove_outliers" value="true"/>
<param name="remove_multicollinearity" value="true"/>
<output name="model" file="expected_model_classification_customized.h5" compare="sim_size"/>
<output name="comparison_result" file="expected_comparison_result_classification_customized.html" compare="sim_size">
<extra_files type="file" name="best_model.csv" value="expected_best_model_classification_customized.csv" />
</output>
<output name="comparison_result" file="expected_comparison_result_classification_customized.html" compare="sim_size" />
<output name="best_model_csv" value="expected_best_model_classification_customized.csv" />
</test>
<test>
<param name="input_file" value="pcr.tsv"/>
<param name="target_feature" value="11"/>
<param name="model_type" value="classification"/>
<param name="random_seed" value="42"/>
<output name="model" file="expected_model_classification.h5" compare="sim_size"/>
<output name="comparison_result" file="expected_comparison_result_classification.html" compare="sim_size">
<extra_files type="file" name="best_model.csv" value="expected_best_model_classification.csv" />
</output>
<output name="comparison_result" file="expected_comparison_result_classification.html" compare="sim_size" />
<output name="best_model_csv" value="expected_best_model_classification.csv" />
</test>
<test>
<param name="input_file" value="auto-mpg.tsv"/>
<param name="target_feature" value="1"/>
<param name="model_type" value="regression"/>
<param name="random_seed" value="42"/>
<output name="model" file="expected_model_regression.h5" compare="sim_size" />
<output name="comparison_result" file="expected_comparison_result_regression.html" compare="sim_size">
<extra_files type="file" name="best_model.csv" value="expected_best_model_regression.csv" />
</output>
<output name="comparison_result" file="expected_comparison_result_regression.html" compare="sim_size" />
<output name="best_model_csv" value="expected_best_model_regression.csv" />
</test>
</tests>
<help>
This tool uses PyCaret to train and evaluate machine learning models.
It compares different models on a dataset and provides the best model based on the performance metrics.

**Outputs**

- **Model**: The best model trained on the dataset, in h5 format.

- **Comparison Result**: The comparison of the evaluated models, in HTML format.
It contains the performance metrics of each model, plots of the best model
on the test set (or on a held-out part of the training set if a separate test set is not uploaded), and feature analysis plots.

</help>
<expand macro="macro_citations" />
</tool>
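The splitting behaviour described in the test_file help above maps onto PyCaret's `setup` call. A minimal illustrative sketch follows; the column and file names are placeholders, not taken from this repository.

```python
import pandas as pd
from pycaret.classification import compare_models, save_model, setup

train_df = pd.read_csv("train.csv")  # placeholder file names
test_df = pd.read_csv("test.csv")

# Without test_data, PyCaret holds out part of train_df as its own test split;
# with test_data, all of train_df goes to training/validation and test_df is
# reserved for final evaluation. Cross-validation runs during compare_models
# in both cases.
setup(data=train_df, target="label", test_data=test_df, session_id=42)

best = compare_models()            # ranks candidate models by CV metrics
save_model(best, "pycaret_model")  # persists the best pipeline
```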