From f60382e4914671c8e5efffbfffed6e8cb8f66385 Mon Sep 17 00:00:00 2001 From: Lon Blauvelt Date: Tue, 5 Dec 2023 10:03:47 -0800 Subject: [PATCH] Remove the WDL compiler. (#4679) * Remove the WDL compiler. * Linting. * Update WDL stand-alone. * Weird linting error? * Cut compiler docs * Stop trying to run removed WDL compiler tests --------- Co-authored-by: Adam Novak --- .gitlab-ci.yml | 7 +- contrib/admin/mypy-with-ignore.py | 9 - docs/running/wdl.rst | 137 +--- setup.py | 1 - src/toil/server/cli/wes_cwl_runner.py | 2 +- src/toil/test/utils/toilDebugTest.py | 49 +- src/toil/test/wdl/builtinTest.py | 506 ------------- src/toil/test/wdl/conftest.py | 23 - src/toil/test/wdl/toilwdlTest.py | 520 ------------- src/toil/test/wdl/wdltoil_test.py | 47 +- src/toil/wdl/toilwdl.py | 142 ---- src/toil/wdl/utils.py | 124 +-- src/toil/wdl/versions/__init__.py | 0 src/toil/wdl/versions/dev.py | 107 --- src/toil/wdl/versions/draft2.py | 980 ------------------------ src/toil/wdl/versions/v1.py | 794 ------------------- src/toil/wdl/wdl_analysis.py | 116 --- src/toil/wdl/wdl_functions.py | 1007 ------------------------ src/toil/wdl/wdl_synthesis.py | 1011 ------------------------- src/toil/wdl/wdl_types.py | 243 ------ src/toil/wdl/wdltoil.py | 1 + 21 files changed, 49 insertions(+), 5777 deletions(-) delete mode 100644 src/toil/test/wdl/builtinTest.py delete mode 100644 src/toil/test/wdl/conftest.py delete mode 100644 src/toil/test/wdl/toilwdlTest.py delete mode 100644 src/toil/wdl/toilwdl.py delete mode 100644 src/toil/wdl/versions/__init__.py delete mode 100644 src/toil/wdl/versions/dev.py delete mode 100644 src/toil/wdl/versions/draft2.py delete mode 100644 src/toil/wdl/versions/v1.py delete mode 100644 src/toil/wdl/wdl_analysis.py delete mode 100644 src/toil/wdl/wdl_functions.py delete mode 100644 src/toil/wdl/wdl_synthesis.py delete mode 100644 src/toil/wdl/wdl_types.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2d4e5019a7..d6ee0ce499 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -61,7 +61,6 @@ stages: - main_tests - integration - lint: rules: - if: $CI_PIPELINE_SOURCE != "schedule" @@ -75,7 +74,6 @@ lint: - make docs # - make diff_pydocstyle_report - cwl_dependency_is_stand_alone: rules: - if: $CI_PIPELINE_SOURCE != "schedule" @@ -85,7 +83,6 @@ cwl_dependency_is_stand_alone: - ${MAIN_PYTHON_PKG} -m virtualenv venv && . venv/bin/activate && make prepare && make develop extras=[cwl] - make test threads="${TEST_THREADS}" marker="${MARKER}" tests=src/toil/test/docs/scriptsTest.py::ToilDocumentationTest::testCwlexample - wdl_dependency_is_stand_alone: rules: - if: $CI_PIPELINE_SOURCE != "schedule" @@ -93,7 +90,7 @@ wdl_dependency_is_stand_alone: script: - pwd - ${MAIN_PYTHON_PKG} -m virtualenv venv && . venv/bin/activate && make prepare && make develop extras=[wdl] - - make test threads="${TEST_THREADS}" marker="${MARKER}" tests=src/toil/test/wdl/toilwdlTest.py::ToilWdlTest::testMD5sum + - make test threads="${TEST_THREADS}" marker="${MARKER}" tests=src/toil/test/wdl/wdltoil_test.py::WDLTests::test_MD5sum quick_test_offline: rules: @@ -279,8 +276,6 @@ wdl: - apt update && apt install -y default-jre - ${MAIN_PYTHON_PKG} -m virtualenv venv && . venv/bin/activate && pip install -U pip wheel && make prepare && make develop extras=[all] - make test threads="${TEST_THREADS}" marker="${MARKER}" tests=src/toil/test/wdl/wdltoil_test.py - - which java &> /dev/null || { echo >&2 "Java is not installed. Install java to run these tests."; exit 1; } - - make test threads="${TEST_THREADS}" marker="${MARKER}" tests="src/toil/test/wdl/toilwdlTest.py src/toil/test/wdl/builtinTest.py" # needs java (default-jre) to run "GATK.jar" jobstore: rules: diff --git a/contrib/admin/mypy-with-ignore.py b/contrib/admin/mypy-with-ignore.py index 01f57150e0..bf19e459e7 100755 --- a/contrib/admin/mypy-with-ignore.py +++ b/contrib/admin/mypy-with-ignore.py @@ -33,14 +33,6 @@ def main(): 'src/toil/__init__.py', 'src/toil/deferred.py', 'src/toil/version.py', - 'src/toil/wdl/utils.py', - 'src/toil/wdl/wdl_synthesis.py', - 'src/toil/wdl/wdl_analysis.py', - 'src/toil/wdl/wdl_functions.py', - 'src/toil/wdl/toilwdl.py', - 'src/toil/wdl/versions/draft2.py', - 'src/toil/wdl/versions/v1.py', - 'src/toil/wdl/versions/dev.py', 'src/toil/provisioners/abstractProvisioner.py', 'src/toil/provisioners/gceProvisioner.py', 'src/toil/provisioners/__init__.py', @@ -103,7 +95,6 @@ def ignore(file_path): if file_path.startswith(prefix): return True return False - filtered_files_to_check = [] for file_path in all_files_to_check: diff --git a/docs/running/wdl.rst b/docs/running/wdl.rst index 8e3630e8ac..168c1d9af8 100644 --- a/docs/running/wdl.rst +++ b/docs/running/wdl.rst @@ -14,6 +14,9 @@ You can run WDL workflows with ``toil-wdl-runner``. Currently, workflow, and has support for workflows in WDL 1.0 or later (which are required to declare a ``version``, and which use ``inputs`` and ``outputs`` sections). +.. tip:: + The last release of Toil that supported unversioned, ``draft-2`` WDL workflows was `5.12.0`_. + Toil is, for compatible workflows, a drop-in replacement for the `Cromwell`_ WDL runner. Instead of running a workflow with Cromwell:: @@ -39,6 +42,7 @@ workflow, you can do:: toil-wdl-runner https://raw.githubusercontent.com/DataBiosphere/toil/36b54c45e8554ded5093bcdd03edb2f6b0d93887/src/toil/test/wdl/miniwdl_self_test/self_test.wdl https://raw.githubusercontent.com/DataBiosphere/toil/36b54c45e8554ded5093bcdd03edb2f6b0d93887/src/toil/test/wdl/miniwdl_self_test/inputs.json +.. _`5.12.0`: https://github.com/DataBiosphere/toil/releases/tag/releases%2F5.12.0 .. _`Cromwell`: https://github.com/broadinstitute/cromwell#readme Writing WDL with Toil @@ -126,137 +130,4 @@ Toil is not yet fully conformant with the WDL specification, but it inherits mos .. _`MiniWDL`: https://github.com/chanzuckerberg/miniwdl/#miniwdl -Using the Old WDL Compiler --------------------------- - -Up through Toil 5.9.2, ``toil-wdl-runner`` worked by compiling the WDL code to -a Toil Python workflow, and executing that. The old compiler is -still available as ``toil-wdl-runner-old``. - -The compiler implements: - * Scatter - * Many Built-In Functions - * Docker Calls - * Handles Priority, and Output File Wrangling - * Currently Handles Primitives and Arrays - -The compiler DOES NOT implement: - * Robust cloud autoscaling - * WDL files that ``import`` other WDL files (including URI handling for 'http://' and 'https://') - -Recommended best practice when running wdl files with ``toil-wdl-runner-old`` is to first use the Broad's wdltool for syntax validation and generating -the needed json input file. Full documentation can be found in the repository_, and a precompiled jar binary can be -downloaded here: wdltool_ (this requires java7_). - -.. _repository: https://github.com/broadinstitute/wdltool -.. _wdltool: https://github.com/broadinstitute/wdltool/releases -.. _java7: http://www.oracle.com/technetwork/java/javase/downloads/java-archive-downloads-javase7-521261.html - -That means two steps. First, make sure your wdl file is valid and devoid of syntax errors by running:: - - java -jar wdltool.jar validate example_wdlfile.wdl - -Second, generate a complementary json file if your wdl file needs one. This json will contain keys for every necessary -input that your wdl file needs to run:: - - java -jar wdltool.jar inputs example_wdlfile.wdl - -When this json template is generated, open the file, and fill in values as necessary by hand. WDL files all require -json files to accompany them. If no variable inputs are needed, a json file containing only '{}' may be required. - -Once a wdl file is validated and has an appropriate json file, workflows can be compiled and run using:: - - toil-wdl-runner-old example_wdlfile.wdl example_jsonfile.json - -Toil WDL Compiler Options -~~~~~~~~~~~~~~~~~~~~~~~~~ -``-o`` or ``--outdir``: Specifies the output folder, and defaults to the current working directory if -not specified by the user. - -``--dev_mode``: Creates "AST.out", which holds a printed AST of the wdl file and "mappings.out", which holds the -printed task, workflow, csv, and tsv dictionaries generated by the parser. Also saves the compiled toil python workflow -file for debugging. - -Any number of arbitrary options may also be specified. These options will not be parsed immediately, but passed down -as toil options once the wdl/json files are processed. For valid toil options, see the documentation: -http://toil.readthedocs.io/en/latest/running/cliOptions.html - -Compiler Example: ENCODE Example from ENCODE-DCC -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For this example, we will run a WDL draft-2 workflow. This version is too old -to be supported by ``toil-wdl-runner``, so we will need to use -``toil-wdl-runner-old``. - -To follow this example, you will need docker installed. The original workflow can be found here: -https://github.com/ENCODE-DCC/pipeline-container - -We've included the wdl file and data files in the toil repository needed to run this example. First, download -the example code_ and unzip. The file needed is "testENCODE/encode_mapping_workflow.wdl". - -Next, use wdltool_ (this requires java7_) to validate this file:: - - java -jar wdltool.jar validate encode_mapping_workflow.wdl - -Next, use wdltool to generate a json file for this wdl file:: - - java -jar wdltool.jar inputs encode_mapping_workflow.wdl - -This json file once opened should look like this:: - - { - "encode_mapping_workflow.fastqs": "Array[File]", - "encode_mapping_workflow.trimming_parameter": "String", - "encode_mapping_workflow.reference": "File" - } - -You will need to edit this file to replace the types (like ``Array[File]``) with values of those types. - -The trimming_parameter should be set to 'native'. - -For the file parameters, download the example data_ and unzip. Inside are two data files required for the run:: - - ENCODE_data/reference/GRCh38_chr21_bwa.tar.gz - ENCODE_data/ENCFF000VOL_chr21.fq.gz - -Editing the json to include these as inputs, the json should now look something like this:: - - { - "encode_mapping_workflow.fastqs": ["/path/to/unzipped/ENCODE_data/ENCFF000VOL_chr21.fq.gz"], - "encode_mapping_workflow.trimming_parameter": "native", - "encode_mapping_workflow.reference": "/path/to/unzipped/ENCODE_data/reference/GRCh38_chr21_bwa.tar.gz" - } - -The wdl and json files can now be run using the command:: - - toil-wdl-runner-old encode_mapping_workflow.wdl encode_mapping_workflow.json - -This should deposit the output files in the user's current working directory (to change this, specify a new directory -with the ``-o`` option). - -.. _code: https://toil-datasets.s3.amazonaws.com/wdl_templates.zip -.. _data: https://toil-datasets.s3.amazonaws.com/ENCODE_data.zip - -Compiler Example: GATK Examples from the Broad -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Terra hosts some example documentation for using early, pre-1.0 versions of WDL, originally authored by the Broad: -https://support.terra.bio/hc/en-us/sections/360007347652?name=wdl-tutorials - -One can follow along with these tutorials, write their own old-style WDL files following the directions and run them using either -Cromwell or Toil's old WDL compiler. For example, in tutorial 1, if you've followed along and named your wdl file 'helloHaplotypeCall.wdl', -then once you've validated your wdl file using wdltool_ (this requires java7_) using:: - - java -jar wdltool.jar validate helloHaplotypeCaller.wdl - -and generated a ``json`` file (and subsequently typed in appropriate file paths and variables) using:: - - java -jar wdltool.jar inputs helloHaplotypeCaller.wdl - -.. note:: - Absolute filepath inputs are recommended for local testing with the Toil WDL compiler. - -then the WDL script can be compiled and run using:: - - toil-wdl-runner-old helloHaplotypeCaller.wdl helloHaplotypeCaller_inputs.json - diff --git a/setup.py b/setup.py index 1278326a93..7882452406 100755 --- a/setup.py +++ b/setup.py @@ -116,7 +116,6 @@ def run_setup(): 'cwltoil = toil.cwl.cwltoil:cwltoil_was_removed [cwl]', 'toil-cwl-runner = toil.cwl.cwltoil:main [cwl]', 'toil-wdl-runner = toil.wdl.wdltoil:main [wdl]', - 'toil-wdl-runner-old = toil.wdl.toilwdl:main [wdl]', 'toil-wes-cwl-runner = toil.server.cli.wes_cwl_runner:main [server]', '_toil_mesos_executor = toil.batchSystems.mesos.executor:main [mesos]', '_toil_contained_executor = toil.batchSystems.contained_executor:executor']}) diff --git a/src/toil/server/cli/wes_cwl_runner.py b/src/toil/server/cli/wes_cwl_runner.py index cb33ac8caa..77b1f30c24 100644 --- a/src/toil/server/cli/wes_cwl_runner.py +++ b/src/toil/server/cli/wes_cwl_runner.py @@ -147,7 +147,7 @@ def parse_params(self, workflow_params_file: str) -> Dict[str, Any]: :param workflow_params_file: The URL or path to the CWL input file. """ - loader = schema_salad.ref_resolver.Loader( + loader = schema_salad.ref_resolver.Loader( # type:ignore {"location": {"@type": "@id"}, "path": {"@type": "@id"}} ) diff --git a/src/toil/test/utils/toilDebugTest.py b/src/toil/test/utils/toilDebugTest.py index 481bc8fab0..d37394315f 100644 --- a/src/toil/test/utils/toilDebugTest.py +++ b/src/toil/test/utils/toilDebugTest.py @@ -1,4 +1,3 @@ -"""A set of test cases for toilwdl.py""" # Copyright (C) 2015-2021 Regents of the University of California # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,7 +14,7 @@ import logging import os import subprocess -from pathlib import Path +import tempfile import pytest @@ -26,21 +25,20 @@ logger = logging.getLogger(__name__) -@pytest.fixture -def workflow_debug_jobstore(tmp_path: Path) -> str: - jobStorePath = str(tmp_path / "toilWorkflowRun") +def workflow_debug_jobstore() -> str: + job_store_path = os.path.join(tempfile.mkdtemp(), "toilWorkflowRun") subprocess.check_call( [ python, os.path.abspath("src/toil/test/utils/ABCWorkflowDebug/debugWorkflow.py"), - jobStorePath, + job_store_path, ] ) - return jobStorePath + return job_store_path @slow -def testJobStoreContents(workflow_debug_jobstore: str): +def testJobStoreContents(): """ Test toilDebugFile.printContentsOfJobStore(). @@ -48,14 +46,13 @@ def testJobStoreContents(workflow_debug_jobstore: str): jobStore. 'A.txt', 'C.txt', 'ABC.txt' are then created. This checks to make sure these contents are found in the jobStore and printed. """ - jobStoreDir = workflow_debug_jobstore contents = ["A.txt", "B.txt", "C.txt", "ABC.txt", "mkFile.py"] subprocess.check_call( [ python, os.path.abspath("src/toil/utils/toilDebugFile.py"), - jobStoreDir, + workflow_debug_jobstore(), "--logDebug", "--listFilesInJobStore=True", ] @@ -78,7 +75,7 @@ def testJobStoreContents(workflow_debug_jobstore: str): os.remove(jobstoreFileContents) -def fetchFiles(symLink, jobStoreDir: str, outputDir): +def fetchFiles(symLink: bool, jobStoreDir: str, outputDir: str): """ Fn for testFetchJobStoreFiles() and testFetchJobStoreFilesWSymlinks(). @@ -99,8 +96,8 @@ def fetchFiles(symLink, jobStoreDir: str, outputDir): "*C.txt", "*ABC.txt", "*mkFile.py", - "--localFilePath=" + outputDir, - "--useSymlinks=" + str(symLink), + f"--localFilePath={outputDir}", + f"--useSymlinks={symLink}", ] print(cmd) subprocess.check_call(cmd) @@ -114,22 +111,10 @@ def fetchFiles(symLink, jobStoreDir: str, outputDir): # expected run time = 4s -def testFetchJobStoreFiles(tmp_path: Path, workflow_debug_jobstore: str) -> None: - """Test toilDebugFile.fetchJobStoreFiles() without using symlinks.""" - outputDir = tmp_path / "testoutput" - outputDir.mkdir() - fetchFiles( - symLink=False, jobStoreDir=workflow_debug_jobstore, outputDir=str(outputDir) - ) - - -# expected run time = 4s -def testFetchJobStoreFilesWSymlinks( - tmp_path: Path, workflow_debug_jobstore: str -) -> None: - """Test toilDebugFile.fetchJobStoreFiles() using symlinks.""" - outputDir = tmp_path / "testoutput" - outputDir.mkdir() - fetchFiles( - symLink=True, jobStoreDir=workflow_debug_jobstore, outputDir=str(outputDir) - ) +def testFetchJobStoreFiles() -> None: + """Test toilDebugFile.fetchJobStoreFiles() symlinks.""" + job_store_dir = workflow_debug_jobstore() + output_dir = os.path.join(os.path.dirname(job_store_dir), "testoutput") + os.makedirs(output_dir, exist_ok=True) + for symlink in (True, False): + fetchFiles(symLink=symlink, jobStoreDir=job_store_dir, outputDir=output_dir) diff --git a/src/toil/test/wdl/builtinTest.py b/src/toil/test/wdl/builtinTest.py deleted file mode 100644 index c13fd30273..0000000000 --- a/src/toil/test/wdl/builtinTest.py +++ /dev/null @@ -1,506 +0,0 @@ -import json -import os -import shutil -import subprocess -import unittest -import uuid -from typing import List, Optional - -from toil.test import ToilTest -from toil.version import exactPython -from toil.wdl.wdl_functions import (WDLPair, - WDLRuntimeError, - ceil, - cross, - floor, - length, - read_boolean, - read_float, - read_int, - read_json, - read_lines, - read_map, - read_string, - read_tsv, - sub, - transpose, - wdl_zip, - write_json, - write_lines, - write_map, - write_tsv) - - -class WdlStandardLibraryFunctionsTest(ToilTest): - """ A set of test cases for toil's wdl functions.""" - - def setUp(self): - """Runs anew before each test to create farm fresh temp dirs.""" - self.output_dir = os.path.join('/tmp/', 'toil-wdl-test-' + str(uuid.uuid4())) - os.makedirs(self.output_dir) - os.makedirs(os.path.join(self.output_dir, 'execution')) - - @classmethod - def setUpClass(cls): - pass - - def tearDown(self): - """Clean up outputs.""" - if os.path.exists(self.output_dir): - shutil.rmtree(self.output_dir) - - def _check_output(self, path, expected_result, strip=True): - """ Compare expected_result to content from file.""" - with open(path) as f: - result = f.read() - if strip: - result = result.strip() - self.assertEqual(expected_result, result) - - def _write_temp_file(self, function_name, content): - """ Write content to a temp file.""" - path = os.path.join(self.output_dir, f'{function_name}_{uuid.uuid4()}.tmp') - with open(path, 'w') as f: - f.write(content + '\n') - return path - - def testFn_Sub(self): - """Test the wdl built-in functional equivalent of 'sub()'.""" - # example from the WDL spec. - chocolike = "I like chocolate when it's late" - self.assertEqual("I love chocolate when it's late", sub(chocolike, 'like', 'love')) - self.assertEqual("I like chocoearly when it's early", sub(chocolike, 'late', 'early')) - self.assertEqual("I like chocolate when it's early", sub(chocolike, 'late$', 'early')) - - def testFn_Ceil(self): - """Test the wdl built-in functional equivalent of 'ceil()', which converts - a Float value into an Int by rounding up to the next higher integer""" - assert ceil(1.999) == 2 - assert ceil(-1.5) == -1 - - def testFn_Floor(self): - """Test the wdl built-in functional equivalent of 'floor()', which converts - a Float value into an Int by rounding down to the next lower integer""" - assert floor(1.999) == 1 - assert floor(-1.5) == -2 - - def testFn_ReadLines(self): - """Test the wdl built-in functional equivalent of 'read_lines()'.""" - # trailing newlines are stripped; spaces are kept - lines = 'line 1\nline 2\t\t\n \n\n' - path = self._write_temp_file('read_lines', lines) - self.assertEqual(['line 1', 'line 2\t\t', ' '], read_lines(path)) - - # preceding newlines are kept - lines = '\n\n\nline 1\nline 2\t\t\n ' - path = self._write_temp_file('read_lines', lines) - self.assertEqual(['', '', '', 'line 1', 'line 2\t\t', ' '], read_lines(path)) - - def testFn_ReadTsv(self): - """Test the wdl built-in functional equivalent of 'read_tsv()'.""" - tsv = [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']] - tsv_str = '1\t2\t3\n4\t5\t6\n7\t8\t9' - - path = self._write_temp_file('read_tsv', tsv_str) - self.assertEqual(tsv, read_tsv(path)) - - def testFn_ReadJson(self): - """Test the wdl built-in functional equivalent of 'read_json()'.""" - json_obj = {'str': 'some string', 'num': 3.14, 'bool': True, 'null': None, 'arr': ['test']} - json_arr = ['1', '2'] - json_num = 3.14 - - path = self._write_temp_file('read_json', json.dumps(json_obj)) - self.assertEqual(json_obj, read_json(path)) - - path = self._write_temp_file('read_json', json.dumps(json_arr)) - self.assertEqual(json_arr, read_json(path)) - - path = self._write_temp_file('read_json', json.dumps(json_num)) - self.assertEqual(json_num, read_json(path)) - - def testFn_ReadMap(self): - """Test the wdl built-in functional equivalent of 'read_map()'.""" - map_str = 'key1\tvalue1\nkey2\tvalue2' - path = self._write_temp_file('read_map', map_str) - self.assertEqual({'key1': 'value1', 'key2': 'value2'}, read_map(path)) - - # extra lines and spaces are stripped, except spaces in keys are kept. - map_str = '\n\n\nkey1 \tvalue1\nkey2\tvalue2 \n \n \t \n' - path = self._write_temp_file('read_map', map_str) - self.assertEqual({'key1 ': 'value1', 'key2': 'value2'}, read_map(path)) - - def testFn_ReadInt(self): - """Test the wdl built-in functional equivalent of 'read_int()'.""" - num = 10 - path = self._write_temp_file('read_int', content=str(num)) - self.assertEqual(num, read_int(path)) - - num = 10.0 - path = self._write_temp_file('read_int', content=str(num)) - self.assertRaises(ValueError, read_int, path) - - num = 10.5 - path = self._write_temp_file('read_int', content=str(num)) - self.assertRaises(ValueError, read_int, path) - - def testFn_ReadString(self): - """Test the wdl built-in functional equivalent of 'read_string()'.""" - some_str = 'some string' - path = self._write_temp_file('read_string', content=some_str) - self.assertEqual(some_str, read_string(path)) - - # with preceding newlines. Cromwell strips from the front and the end. - path = self._write_temp_file('read_string', content='\n\n\n' + some_str) - self.assertEqual(some_str, read_string(path)) - - # with trailing newlines - path = self._write_temp_file('read_string', content=some_str + '\n\n') - self.assertEqual(some_str, read_string(path)) - - def testFn_ReadFloat(self): - """Test the wdl built-in functional equivalent of 'read_float()'.""" - num = 2.718281828459045 - path = self._write_temp_file('read_float', content=str(num)) - self.assertEqual(num, read_float(path)) - - def testFn_ReadBoolean(self): - """Test the wdl built-in functional equivalent of 'read_boolean()'.""" - for val in (True, False): - path = self._write_temp_file('read_boolean', content=str(val)) - self.assertEqual(val, read_boolean(path)) - - # upper - path = self._write_temp_file('read_boolean', content=str(val).upper()) - self.assertEqual(val, read_boolean(path)) - - # lower - path = self._write_temp_file('read_boolean', content=str(val).lower()) - self.assertEqual(val, read_boolean(path)) - - def testFn_WriteLines(self): - """Test the wdl built-in functional equivalent of 'write_lines()'.""" - # 'line 1' \n - # 'line 2\t\t' \n - # ' ' \n - # '\n' \n - path = write_lines(['line 1', 'line 2\t\t', ' ', '\n'], temp_dir=self.output_dir) - self._check_output(path, 'line 1\nline 2\t\t\n \n\n\n', strip=False) - - def testFn_WriteTsv(self): - """Test the wdl built-in functional equivalent of 'write_tsv()'.""" - path = write_tsv([['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']], temp_dir=self.output_dir) - self._check_output(path, '1\t2\t3\n4\t5\t6\n7\t8\t9') - - def testFn_WriteJson(self): - """Test the wdl built-in functional equivalent of 'write_json()'.""" - json_obj = {'str': 'some string', 'num': 3.14, 'bool': True, 'null': None, 'arr': ['test']} - json_arr = ['1', '2'] - json_num = 3.14 - json_str = 'test string' - json_bool = False - json_null = None - - # Pair[Int, Pair[Int, Pair[Int, Pair[Int, Int]]]] - json_pairs = WDLPair(1, WDLPair(2, WDLPair(3, WDLPair(4, 5)))) - - path = write_json(json_obj, temp_dir=self.output_dir) - self._check_output(path, '{"str":"some string","num":3.14,"bool":true,"null":null,"arr":["test"]}') - - path = write_json(json_arr, temp_dir=self.output_dir) - self._check_output(path, '["1","2"]') - - path = write_json(json_num, temp_dir=self.output_dir) - self._check_output(path, '3.14') - - path = write_json(json_str, temp_dir=self.output_dir) - self._check_output(path, '"test string"') - - path = write_json(json_bool, temp_dir=self.output_dir) - self._check_output(path, 'false') - - path = write_json(json_null, temp_dir=self.output_dir) - self._check_output(path, 'null') - - path = write_json(json_pairs, temp_dir=self.output_dir) - self._check_output(path, '{"left":1,"right":{"left":2,"right":{"left":3,"right":{"left":4,"right":5}}}}') - - def testFn_WriteMap(self): - """Test the wdl built-in functional equivalent of 'write_map()'.""" - path = write_map({'key1': 'value1', 'key2': 'value2'}, temp_dir=self.output_dir) - self._check_output(path, 'key1\tvalue1\nkey2\tvalue2') - - def testFn_Transpose(self): - """Test the wdl built-in functional equivalent of 'transpose()'.""" - self.assertEqual([[0, 3], [1, 4], [2, 5]], transpose([[0, 1, 2], [3, 4, 5]])) - self.assertEqual([[0, 1, 2], [3, 4, 5]], transpose([[0, 3], [1, 4], [2, 5]])) - - self.assertEqual([], transpose([])) - self.assertEqual([], transpose([[]])) # same as Cromwell - self.assertEqual([[0]], transpose([[0]])) - self.assertRaises(RuntimeError, transpose, [[0, 1, 2], [3, 4, 5, 6]]) - - def testFn_Length(self): - """Test the WDL 'length()' built-in.""" - self.assertEqual(3, length([1, 2, 3])) - self.assertEqual(3, length(['a', 'b', 'c'])) - self.assertEqual(0, length([])) - - def testFn_Zip(self): - """Test the wdl built-in functional equivalent of 'zip()'.""" - left_array = [1, 2, 3] - right_array = ['a', 'b', 'c'] - zipped = wdl_zip(left_array, right_array) - expected_results = [WDLPair(1, 'a'), WDLPair(2, 'b'), WDLPair(3, 'c')] - - self.assertEqual(zipped, expected_results) - - # input with different size should fail. - self.assertRaises(WDLRuntimeError, wdl_zip, [1, 2, 3], ['a', 'b']) - - def testFn_Cross(self): - """Test the wdl built-in functional equivalent of 'cross()'.""" - left_array = [1, 2, 3] - right_array = ['a', 'b'] - crossed = cross(left_array, right_array) - expected_results = [WDLPair(1, 'a'), WDLPair(1, 'b'), - WDLPair(2, 'a'), WDLPair(2, 'b'), - WDLPair(3, 'a'), WDLPair(3, 'b')] - - self.assertEqual(crossed, expected_results) - - -class WdlWorkflowsTest(ToilTest): - """ - A set of test cases for toil's conformance with WDL. - - All tests should include a simple wdl and json file for toil to run that checks the output. - """ - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.program = os.path.abspath("src/toil/wdl/toilwdl.py") - cls.test_path = os.path.abspath("src/toil/test/wdl") - - def check_function(self, - function_name: str, - cases: List[str], - json_file_name: Optional[str] = None, - expected_result: Optional[str] = None, - expected_exception: Optional[str] = None): - """ - Run the given WDL workflow and check its output. The WDL workflow - should store its output inside a 'output.txt' file that can be - compared to `expected_result`. - - If `expected_exception` is set, this test passes only when both the - workflow fails and that the given `expected_exception` string is - present in standard error. - """ - wdl_files = [os.path.abspath(f'{self.test_path}/{function_name}_{case}.wdl') - for case in cases] - json_file = os.path.abspath(f'{self.test_path}/{json_file_name or function_name}.json') - for wdl_file in wdl_files: - with self.subTest(f'Testing: {wdl_file} {json_file}'): - output_dir = f'/tmp/toil-wdl-test-{uuid.uuid4()}' - os.makedirs(output_dir) - - if expected_exception is not None: - with self.assertRaises(subprocess.CalledProcessError) as context: - # use check_output() here so that the output is read before return. - subprocess.check_output([exactPython, self.program, wdl_file, json_file, '-o', output_dir], - stderr=subprocess.PIPE) - - stderr = context.exception.stderr - self.assertIsInstance(stderr, bytes) - self.assertIn(expected_exception, stderr.decode('utf-8')) - - elif expected_result is not None: - subprocess.check_call([exactPython, self.program, wdl_file, json_file, '-o', output_dir]) - output = os.path.join(output_dir, 'output.txt') - with open(output) as f: - result = f.read().strip() - self.assertEqual(result, expected_result) - - else: - self.fail("Invalid test. Either `expected_result` or `expected_exception` must be set.") - - shutil.rmtree(output_dir) - - -class WdlLanguageSpecWorkflowsTest(WdlWorkflowsTest): - """ - A set of test cases for toil's conformance with the WDL language specification: - - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#language-specification - """ - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.test_path = os.path.abspath("src/toil/test/wdl/wdl_specification") - - def test_type_pair(self): - # NOTE: these tests depend on read_lines(), write_json(), and select_first(). - - expected_result = '[23,"twenty-three","a.bai",{"left":23,"right":"twenty-three"}]' - self.check_function('type_pair', cases=['basic'], expected_result=expected_result) - - # tests if files from the pair type are correctly imported. - # the array of three arrays consists content from: - # 1. src/toil/test/wdl/testfiles/test_string.txt -> 'A Whale of a Tale.' - # 2. src/toil/test/wdl/testfiles/test_boolean.txt -> 'true' - # 3. src/toil/test/wdl/testfiles/test_int.txt -> '11' - expected_result = '[["A Whale of a Tale."],["true"],["11"]]' - self.check_function('type_pair', cases=['with_files'], expected_result=expected_result) - - def test_v1_declaration(self): - """ - Basic declaration example modified from the WDL 1.0 spec: - - https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#declarations - """ - expected_result = 'Hello, x!; Hello, y!' - self.check_function('v1_spec', cases=['declaration'], expected_result=expected_result) - - -class WdlStandardLibraryWorkflowsTest(WdlWorkflowsTest): - """ - A set of test cases for toil's conformance with the WDL built-in standard library: - - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#standard-library - """ - - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.test_path = os.path.abspath("src/toil/test/wdl/standard_library") - - def test_sub(self): - # this workflow swaps the extension of a TSV file to CSV, with String and File inputs. - self.check_function('sub', cases=['as_input'], expected_result='src/toil/test/wdl/test.csv') - - # NOTE: the result differs from Cromwell since we copy the file to the file store without - # preserving the path. Cromwell would return 'src/toil/test/wdl/test.csv' instead. - self.check_function('sub', cases=['as_input_with_file'], expected_result='test.csv') - - def test_size(self): - self.check_function('size', cases=['as_command'], expected_result='19.0') - - # this workflow outputs the size of a 22-byte file in 'B', 'K', and 'Ki' separated with spaces. - - # NOTE: Cromwell treats the decimal and binary units (e.g.: 'K' and 'Ki') the same, which differs from - # the spec (https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#float-sizefile-string). - # The correct output should be '22.0 0.022 0.021484375' not '22.0 0.021484375 0.021484375' - self.check_function('size', cases=['as_output'], expected_result='22.0 0.022 0.021484375') - - def test_ceil(self): - self.check_function('ceil', cases=['as_input', 'as_command'], expected_result='12') - - def test_floor(self): - self.check_function('floor', cases=['as_input', 'as_command'], expected_result='11') - - def test_round(self): - self.check_function('round', cases=['as_input', 'as_command'], expected_result='11') - - def test_stdout(self): - self.check_function('stdout', cases=['as_output'], expected_result='A Whale of a Tale.') - self.check_function('stderr', cases=['as_output'], expected_result='a journey straight to stderr') - - def test_read(self): - """ Test the set of WDL read functions.""" - # NOTE: these tests depend on stdout() and the write_*() functions. - - self.check_function('read_lines', cases=['as_output'], - expected_result='line 1\n\t\tline 2 with tabs\n line 3\n\nline 5') - - self.check_function('read_tsv', cases=['as_output'], - expected_result='1\t2\t3\n4\t5\t6\n7\t8\t9') - - self.check_function('read_json', cases=['as_output'], - expected_result='{"key1":"value1","key2":"value2"}') - - self.check_function('read_map', cases=['as_output'], - expected_result='key1\tvalue1\nkey2\tvalue2') - - # primitives - self.check_function('read_int', cases=['as_command'], expected_result='11') - self.check_function('read_string', cases=['as_command'], expected_result='A Whale of a Tale.') - self.check_function('read_float', cases=['as_command'], expected_result='11.2345') - self.check_function('read_boolean', cases=['as_command'], expected_result='True') - - def test_write(self): - """ Test the set of WDL write functions.""" - self.check_function('write_lines', cases=['as_command'], - expected_result='first\nsecond\nthird') - - self.check_function('write_tsv', cases=['as_command'], - expected_result='one\ttwo\tthree\nun\tdeux\ttrois') - - self.check_function('write_json', cases=['as_command'], - expected_result='{"key1":"value1","key2":"value2"}') - - self.check_function('write_map', cases=['as_command'], - expected_result='key1\tvalue1\nkey2\tvalue2') - - def test_range(self): - # NOTE: this test depends on write_lines(). - self.check_function('range', cases=['as_input'], - expected_result='0\n1\n2\n3\n4\n5\n6\n7') - - self.check_function('range', cases=['as_input'], - json_file_name='range_0', - expected_result='') - - self.check_function('range', cases=['as_input'], - json_file_name='range_invalid', - expected_exception='WDLRuntimeError') - - def test_transpose(self): - # NOTE: this test depends on write_tsv(). - - # this workflow writes a transposed 2-dimensional array as a TSV file. - self.check_function('transpose', cases=['as_input'], expected_result='0\t3\n1\t4\n2\t5') - - def test_length(self): - self.check_function('length', cases=['as_input'], expected_result='3') - - self.check_function('length', cases=['as_input'], - json_file_name='length_invalid', - expected_exception='WDLRuntimeError') - - # length() should not work with Map[X, Y]. - self.check_function('length', cases=['as_input_with_map'], - expected_exception='WDLRuntimeError') - - def test_zip(self): - self.check_function('zip', cases=['as_input'], - expected_result='[{"left":1,"right":"a"},{"left":2,"right":"b"},{"left":3,"right":"c"}]') - - def test_cross(self): - self.check_function('cross', cases=['as_input'], - expected_result='[{"left":1,"right":"a"},{"left":1,"right":"b"},' - '{"left":2,"right":"a"},{"left":2,"right":"b"},' - '{"left":3,"right":"a"},{"left":3,"right":"b"}]') - - def test_as_pairs(self): - self.check_function('as_pairs', cases=['as_input'], - expected_result='[{"left":"a","right":1},{"left":"b","right":2},{"left":"c","right":3}]') - - def test_as_map(self): - self.check_function('as_map', cases=['as_input'], expected_result='{"a":1,"b":2,"c":3}') - - def test_keys(self): - self.check_function('keys', cases=['as_input'], expected_result='["a","b","c"]') - - def test_collect_by_key(self): - # NOTE: this result is expected according to the spec but differs from Cromwell. - self.check_function('collect_by_key', cases=['as_input'], expected_result='{"a":[1,3],"b":[2]}') - - def test_flatten(self): - self.check_function('flatten', cases=['as_input'], expected_result='[1,2,3,1,21,22]') - - -if __name__ == "__main__": - unittest.main() diff --git a/src/toil/test/wdl/conftest.py b/src/toil/test/wdl/conftest.py deleted file mode 100644 index 45fa5bf258..0000000000 --- a/src/toil/test/wdl/conftest.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2015-2021 Regents of the University of California -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# https://pytest.org/latest/example/pythoncollection.html - -collect_ignore = [] - -try: - import wdlparse - print(wdlparse.__file__) # keep this import from being removed -except ImportError: - collect_ignore.append("toilwdl.py") diff --git a/src/toil/test/wdl/toilwdlTest.py b/src/toil/test/wdl/toilwdlTest.py deleted file mode 100644 index 13576d05a4..0000000000 --- a/src/toil/test/wdl/toilwdlTest.py +++ /dev/null @@ -1,520 +0,0 @@ -import os -import shutil -import subprocess -import unittest -import uuid -import zipfile -from urllib.request import urlretrieve - -from toil.test import ToilTest, needs_docker, needs_java, slow -from toil.version import exactPython -from toil.wdl.utils import get_analyzer -from toil.wdl.wdl_functions import (basename, - glob, - parse_cores, - parse_disk, - parse_memory, - process_infile, - read_csv, - read_tsv, - select_first, - size) - - -class BaseToilWdlTest(ToilTest): - """Base test class for WDL tests""" - - def setUp(self) -> None: - """Runs anew before each test to create farm fresh temp dirs.""" - self.output_dir = os.path.join('/tmp/', 'toil-wdl-test-' + str(uuid.uuid4())) - os.makedirs(self.output_dir) - - def tearDown(self) -> None: - if os.path.exists(self.output_dir): - shutil.rmtree(self.output_dir) - - @classmethod - def setUpClass(cls) -> None: - """Runs once for all tests.""" - super(BaseToilWdlTest, cls).setUpClass() - cls.base_command = [exactPython, os.path.abspath("src/toil/wdl/toilwdl.py")] - -class ToilWdlTest(BaseToilWdlTest): - """ - General tests for Toil WDL - """ - - @needs_docker - def testMD5sum(self): - """Test if toilwdl produces the same outputs as known good outputs for WDL's - GATK tutorial #1.""" - wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.wdl') - inputfile = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.input') - json = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.json') - - subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir, '--logDebug']) - md5sum_output = os.path.join(self.output_dir, 'md5sum.txt') - assert os.path.exists(md5sum_output) - os.unlink(md5sum_output) - -class ToilWDLLibraryTest(BaseToilWdlTest): - """ - Test class for WDL standard functions. - """ - - # estimated run time <1 sec - def testFn_SelectFirst(self): - """Test the wdl built-in functional equivalent of 'select_first()', - which returns the first value in a list that is not None.""" - assert select_first(['somestring', 'anotherstring', None, '', 1]) == 'somestring' - assert select_first([None, '', 1, 'somestring']) == 1 - assert select_first([2, 1, '', 'somestring', None, '']) == 2 - assert select_first(['', 2, 1, 'somestring', None, '']) == 2 - - # estimated run time <1 sec - def testFn_Size(self) -> None: - """Test the wdl built-in functional equivalent of 'size()', - which returns a file's size based on the path.""" - from toil.common import Toil - from toil.job import Job - from toil.wdl.wdl_types import WDLFile - options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.clean = 'always' - with Toil(options) as toil: - small = process_infile(WDLFile(file_path=os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl')), toil) - small_file = size(small) - assert small_file >= 1800, small_file - - # estimated run time <1 sec - def testFn_Basename(self): - assert basename('/home/quokka/git/delete/toil/src/toil/wdl/toilwdl.py', '.py') == 'toilwdl' - assert basename('/home/quokka/git/delete/toil/src/toil/wdl/toilwdl.py') == 'toilwdl.py' - assert basename('toilwdl.py', '.py') == 'toilwdl' - assert basename('toilwdl.py') == 'toilwdl.py' - - # estimated run time <1 sec - def testFn_Glob(self): - """Test the wdl built-in functional equivalent of 'glob()', - which finds all files with a pattern in a directory.""" - vocab_location = glob('vocab.wdl', os.path.abspath('src/toil')) - assert vocab_location == [os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl')], str(vocab_location) - wdl_locations = glob('wdl_*.py', os.path.abspath('src/toil')) - wdl_that_should_exist = [os.path.abspath('src/toil/wdl/wdl_analysis.py'), - os.path.abspath('src/toil/wdl/wdl_synthesis.py'), - os.path.abspath('src/toil/wdl/wdl_types.py'), - os.path.abspath('src/toil/wdl/wdl_functions.py')] - # make sure the files match the expected files - for location in wdl_that_should_exist: - assert location in wdl_locations, f'{str(location)} not in {str(wdl_locations)}!' - # make sure the same number of files were found as expected - assert len(wdl_that_should_exist) == len(wdl_locations), f'{str(len(wdl_locations))} != {str(len(wdl_that_should_exist))}' - - # estimated run time <1 sec - def testFn_ParseMemory(self): - """Test the wdl built-in functional equivalent of 'parse_memory()', - which parses a specified memory input to an int output. - - The input can be a string or an int or a float and may include units - such as 'Gb' or 'mib' as a separate argument.""" - assert parse_memory(2147483648) == 2147483648, str(parse_memory(2147483648)) - assert parse_memory('2147483648') == 2147483648, str(parse_memory(2147483648)) - assert parse_memory('2GB') == 2000000000, str(parse_memory('2GB')) - assert parse_memory('2GiB') == 2147483648, str(parse_memory('2GiB')) - assert parse_memory('1 GB') == 1000000000, str(parse_memory('1 GB')) - assert parse_memory('1 GiB') == 1073741824, str(parse_memory('1 GiB')) - - # estimated run time <1 sec - def testFn_ParseCores(self): - """Test the wdl built-in functional equivalent of 'parse_cores()', - which parses a specified disk input to an int output. - - The input can be a string or an int.""" - assert parse_cores(1) == 1 - assert parse_cores('1') == 1 - - # estimated run time <1 sec - def testFn_ParseDisk(self): - """Test the wdl built-in functional equivalent of 'parse_disk()', - which parses a specified disk input to an int output. - - The input can be a string or an int or a float and may include units - such as 'Gb' or 'mib' as a separate argument. - - The minimum returned value is 2147483648 bytes.""" - # check minimum returned value - assert parse_disk('1') == 2147483648, str(parse_disk('1')) - assert parse_disk(1) == 2147483648, str(parse_disk(1)) - - assert parse_disk(2200000001) == 2200000001, str(parse_disk(2200000001)) - assert parse_disk('2200000001') == 2200000001, str(parse_disk('2200000001')) - assert parse_disk('/mnt/my_mnt 3 SSD, /mnt/my_mnt2 500 HDD') == 503000000000, str(parse_disk('/mnt/my_mnt 3 SSD, /mnt/my_mnt2 500 HDD')) - assert parse_disk('local-disk 10 SSD') == 10000000000, str(parse_disk('local-disk 10 SSD')) - assert parse_disk('/mnt/ 10 HDD') == 10000000000, str(parse_disk('/mnt/ 10 HDD')) - assert parse_disk('/mnt/ 1000 HDD') == 1000000000000, str(parse_disk('/mnt/ 1000 HDD')) - - # estimated run time <1 sec - def testPrimitives(self): - """Test if toilwdl correctly interprets some basic declarations.""" - wdl = os.path.abspath('src/toil/test/wdl/testfiles/vocab.wdl') - - # TODO: test for all version. - aWDL = get_analyzer(wdl) - aWDL.analyze() - - no_declaration = ['bool1', 'int1', 'float1', 'file1', 'string1'] - collection_counter = [] - for key, declaration in aWDL.workflows_dictionary['vocabulary'].items(): - if not key.startswith('declaration'): - continue - - name, var_type, var_expr = declaration - - if name in no_declaration: - collection_counter.append(name) - assert not var_expr - - if name == 'bool2': - collection_counter.append(name) - assert var_expr == 'True', var_expr - assert var_type == 'Boolean', var_type - if name == 'int2': - collection_counter.append(name) - assert var_expr == '1', var_expr - assert var_type == 'Int', var_type - if name == 'float2': - collection_counter.append(name) - assert var_expr == '1.1', var_expr - assert var_type == 'Float', var_type - if name == 'file2': - collection_counter.append(name) - assert var_expr == "'src/toil/test/wdl/test.tsv'", var_expr - assert var_type == 'File', var_type - if name == 'string2': - collection_counter.append(name) - assert var_expr == "'x'", var_expr - assert var_type == 'String', var_type - assert collection_counter == ['bool1', 'int1', 'float1', 'file1', 'string1', - 'bool2', 'int2', 'float2', 'file2', 'string2'] - - # estimated run time <1 sec - def testCSV(self): - default_csv_output = [['1', '2', '3'], - ['4', '5', '6'], - ['7', '8', '9']] - csv_array = read_csv(os.path.abspath('src/toil/test/wdl/test.csv')) - assert csv_array == default_csv_output - - # estimated run time <1 sec - def testTSV(self): - default_tsv_output = [['1', '2', '3'], - ['4', '5', '6'], - ['7', '8', '9']] - tsv_array = read_tsv(os.path.abspath('src/toil/test/wdl/test.tsv')) - assert tsv_array == default_tsv_output - -class ToilWdlIntegrationTest(BaseToilWdlTest): - """Test class for WDL tests that need extra workflows and data downloaded""" - - gatk_data: str - gatk_data_dir: str - encode_data: str - encode_data_dir: str - wdl_data: str - wdl_data_dir: str - - @classmethod - def setUpClass(cls) -> None: - """Runs once for all tests.""" - super(ToilWdlIntegrationTest, cls).setUpClass() - - cls.test_directory = os.path.abspath("src/toil/test/wdl/") - - cls.encode_data = os.path.join(cls.test_directory, "ENCODE_data.zip") - cls.encode_data_dir = os.path.join(cls.test_directory, "ENCODE_data") - - cls.wdl_data = os.path.join(cls.test_directory, "wdl_templates.zip") - cls.wdl_data_dir = os.path.join(cls.test_directory, "wdl_templates") - - cls.gatk_data = os.path.join(cls.test_directory, "GATK_data.zip") - cls.gatk_data_dir = os.path.join(cls.test_directory, "GATK_data") - - cls.fetch_and_unzip_from_s3(filename='ENCODE_data.zip', - data=cls.encode_data, - data_dir=cls.encode_data_dir) - - cls.fetch_and_unzip_from_s3(filename='wdl_templates.zip', - data=cls.wdl_data, - data_dir=cls.wdl_data_dir) - - cls.fetch_and_unzip_from_s3(filename='GATK_data.zip', - data=cls.gatk_data, - data_dir=cls.gatk_data_dir) - - @classmethod - def tearDownClass(cls) -> None: - """We generate a lot of cruft.""" - data_dirs = [cls.gatk_data_dir, cls.wdl_data_dir, cls.encode_data_dir] - data_zips = [cls.gatk_data, cls.wdl_data, cls.encode_data] - encode_outputs = ['ENCFF000VOL_chr21.fq.gz', - 'ENCFF000VOL_chr21.raw.srt.bam', - 'ENCFF000VOL_chr21.raw.srt.bam.flagstat.qc', - 'ENCFF000VOL_chr21.raw.srt.dup.qc', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.bam', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.bam.bai', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz.cc.plot.pdf', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.filt.nodup.sample.15.SE.tagAlign.gz.cc.qc', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.flagstat.qc', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.pbc.qc', - 'ENCFF000VOL_chr21.raw.srt.filt.nodup.srt.final.SE.tagAlign.gz', - 'ENCFF000VOL_chr21.sai', - 'test.txt', - 'filter_qc.json', - 'filter_qc.log', - 'GRCh38_chr21_bwa.tar.gz', - 'mapping.json', - 'mapping.log', - 'post_mapping.json', - 'post_mapping.log', - 'wdl-stats.log', - 'xcor.json', - 'xcor.log', - 'toilwdl_compiled.pyc', - 'toilwdl_compiled.py', - 'post_processing.log', - 'md5.log'] - for cleanup in data_dirs + data_zips + encode_outputs: - if os.path.isdir(cleanup): - shutil.rmtree(cleanup) - elif os.path.exists(cleanup): - os.remove(cleanup) - super(ToilWdlIntegrationTest, cls).tearDownClass() - - # estimated run time 27 sec - @slow - @needs_java - def testTut01(self): - """Test if toilwdl produces the same outputs as known good outputs for WDL's - GATK tutorial #1.""" - wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller.wdl") - json = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller_inputs.json") - ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t01/output/") - - subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time 28 sec - @slow - @needs_java - def testTut02(self): - """Test if toilwdl produces the same outputs as known good outputs for WDL's - GATK tutorial #2.""" - wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/simpleVariantSelection.wdl") - json = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/simpleVariantSelection_inputs.json") - ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t02/output/") - - subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time 60 sec - @slow - @needs_java - def testTut03(self): - """Test if toilwdl produces the same outputs as known good outputs for WDL's - GATK tutorial #3.""" - wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/simpleVariantDiscovery.wdl") - json = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/simpleVariantDiscovery_inputs.json") - ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t03/output/") - - subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time 175 sec - @slow - @needs_java - @unittest.skip('broken; see: https://github.com/DataBiosphere/toil/issues/3339') - def testTut04(self): - """Test if toilwdl produces the same outputs as known good outputs for WDL's - GATK tutorial #4.""" - wdl = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/jointCallingGenotypes.wdl") - json = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/jointCallingGenotypes_inputs.json") - ref_dir = os.path.abspath("src/toil/test/wdl/wdl_templates/t04/output/") - - subprocess.check_call(self.base_command + [wdl, json, '-o', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time 80 sec - @slow - @needs_docker - def testENCODE(self): - """Test if toilwdl produces the same outputs as known good outputs for - a short ENCODE run.""" - wdl = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testENCODE/encode_mapping_workflow.wdl") - json = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testENCODE/encode_mapping_workflow.wdl.json") - ref_dir = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testENCODE/output/") - - subprocess.check_call( - self.base_command + [wdl, json, '--docker_user=None', '--out_dir', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time 2 sec - def testPipe(self): - """Test basic bash input functionality with a pipe.""" - wdl = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testPipe/call.wdl") - json = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testPipe/call.json") - ref_dir = os.path.abspath( - "src/toil/test/wdl/wdl_templates/testPipe/output/") - - subprocess.check_call( - self.base_command + [wdl, json, '--out_dir', self.output_dir]) - - compare_runs(self.output_dir, ref_dir) - - # estimated run time <1 sec - def testJSON(self): - default_json_dict_output = { - 'helloHaplotypeCaller.haplotypeCaller.RefIndex': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.fasta.fai"', - 'helloHaplotypeCaller.haplotypeCaller.sampleName': '"WDL_tut1_output"', - 'helloHaplotypeCaller.haplotypeCaller.inputBAM': '"src/toil/test/wdl/GATK_data/inputs/NA12878_wgs_20.bam"', - 'helloHaplotypeCaller.haplotypeCaller.bamIndex': '"src/toil/test/wdl/GATK_data/inputs/NA12878_wgs_20.bai"', - 'helloHaplotypeCaller.haplotypeCaller.GATK': '"src/toil/test/wdl/GATK_data/gatk-package-4.1.9.0-local.jar"', - 'helloHaplotypeCaller.haplotypeCaller.RefDict': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.dict"', - 'helloHaplotypeCaller.haplotypeCaller.RefFasta': '"src/toil/test/wdl/GATK_data/ref/human_g1k_b37_20.fasta"'} - - from toil.wdl.utils import dict_from_JSON - json_dict = dict_from_JSON("src/toil/test/wdl/wdl_templates/t01/helloHaplotypeCaller_inputs.json") - assert json_dict == default_json_dict_output, ( - str(json_dict) + '\nAssertionError: ' + str(default_json_dict_output)) - - # estimated run time <1 sec - def test_size_large(self) -> None: - """Test the wdl built-in functional equivalent of 'size()', - which returns a file's size based on the path, on a large file.""" - from toil.common import Toil - from toil.job import Job - from toil.wdl.wdl_types import WDLFile - options = Job.Runner.getDefaultOptions(self._getTestJobStorePath()) - options.clean = 'always' - with Toil(options) as toil: - large = process_infile(WDLFile(file_path=self.encode_data), toil) - larger_file = size(large) - larger_file_in_mb = size(large, 'mb') - assert larger_file >= 70000000, larger_file - assert larger_file_in_mb >= 70, larger_file_in_mb - - @classmethod - def fetch_and_unzip_from_s3(cls, filename, data, data_dir): - if not os.path.exists(data): - s3_loc = os.path.join('http://toil-datasets.s3.amazonaws.com/', filename) - urlretrieve(s3_loc, data) - # extract the compressed data if not already extracted - if not os.path.exists(data_dir): - with zipfile.ZipFile(data, 'r') as zip_ref: - zip_ref.extractall(cls.test_directory) - - -def compare_runs(output_dir, ref_dir): - """ - Takes two directories and compares all of the files between those two - directories, asserting that they match. - - - Ignores outputs.txt, which contains a list of the outputs in the folder. - - Compares line by line, unless the file is a .vcf file. - - Ignores potentially date-stamped comments (lines starting with '#'). - - Ignores quality scores in .vcf files and only checks that they found - the same variants. This is due to assumed small observed rounding - differences between systems. - - :param ref_dir: The first directory to compare (with output_dir). - :param output_dir: The second directory to compare (with ref_dir). - """ - reference_output_files = os.listdir(ref_dir) - for file in reference_output_files: - if file not in ('outputs.txt', '__pycache__'): - test_output_files = os.listdir(output_dir) - filepath = os.path.join(ref_dir, file) - with open(filepath) as default_file: - good_data = [] - for line in default_file: - if not line.startswith('#'): - good_data.append(line) - for test_file in test_output_files: - if file == test_file: - test_filepath = os.path.join(output_dir, file) - if file.endswith(".vcf"): - compare_vcf_files(filepath1=filepath, - filepath2=test_filepath) - else: - with open(test_filepath) as test_file: - test_data = [] - for line in test_file: - if not line.startswith('#'): - test_data.append(line) - assert good_data == test_data, "File does not match: %r" % file - - -def compare_vcf_files(filepath1, filepath2): - """ - Asserts that two .vcf files contain the same variant findings. - - - Ignores potentially date-stamped comments (lines starting with '#'). - - Ignores quality scores in .vcf files and only checks that they found - the same variants. This is due to assumed small observed rounding - differences between systems. - - VCF File Column Contents: - 1: #CHROM - 2: POS - 3: ID - 4: REF - 5: ALT - 6: QUAL - 7: FILTER - 8: INFO - - :param filepath1: First .vcf file to compare. - :param filepath2: Second .vcf file to compare. - """ - with open(filepath1) as default_file: - good_data = [] - for line in default_file: - line = line.strip() - if not line.startswith('#'): - good_data.append(line.split('\t')) - - with open(filepath2) as test_file: - test_data = [] - for line in test_file: - line = line.strip() - if not line.startswith('#'): - test_data.append(line.split('\t')) - - for i in range(len(test_data)): - if test_data[i] != good_data[i]: - for j in range(len(test_data[i])): - # Only compare chromosome, position, ID, reference, and alts. - # Quality score may vary (<1%) between systems because of - # (assumed) rounding differences. Same for the "info" sect. - if j < 5: - if j == 4: - if test_data[i][j].startswith('*,'): - test_data[i][j] = test_data[i][j][2:] - if good_data[i][j].startswith('*,'): - good_data[i][j] = good_data[i][j][2:] - assert test_data[i][j] == good_data[i][j], f"\nInconsistent VCFs: {filepath1} != {filepath2}\n" \ - f" - {test_data[i][j]} != {good_data[i][j]}\n" \ - f" - Line: {i} Column: {j}" - - -if __name__ == "__main__": - unittest.main() # run all tests diff --git a/src/toil/test/wdl/wdltoil_test.py b/src/toil/test/wdl/wdltoil_test.py index 2430fac701..7e2225e20b 100644 --- a/src/toil/test/wdl/wdltoil_test.py +++ b/src/toil/test/wdl/wdltoil_test.py @@ -3,12 +3,13 @@ import shutil import subprocess import unittest -from typing import Any, Dict, Set +import uuid + from unittest.mock import patch +from typing import Any, Dict, List, Set -# Don't import the test case directly or pytest will test it again. -import toil.test.wdl.toilwdlTest -from toil.test import (needs_docker_cuda, +from toil.test import (ToilTest, + needs_docker_cuda, needs_google_storage, needs_singularity_or_docker, slow) @@ -16,9 +17,21 @@ from toil.wdl.wdltoil import WDLSectionJob, WDLWorkflowGraph -class ToilConformanceTests(toil.test.wdl.toilwdlTest.BaseToilWdlTest): +class BaseWDLTest(ToilTest): + """Base test class for WDL tests.""" + def setUp(self) -> None: + """Runs anew before each test to create farm fresh temp dirs.""" + self.output_dir = os.path.join('/tmp/', 'toil-wdl-test-' + str(uuid.uuid4())) + os.makedirs(self.output_dir) + + def tearDown(self) -> None: + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class WDLConformanceTests(BaseWDLTest): """ - New WDL conformance tests for Toil + WDL conformance tests for Toil. """ wdl_dir = "wdl-conformance-tests" @classmethod @@ -70,11 +83,8 @@ def tearDownClass(cls) -> None: shutil.rmtree("wdl-conformance-tests") -class WdlToilTest(toil.test.wdl.toilwdlTest.ToilWdlTest): - """ - Version of the old Toil WDL tests that tests the new MiniWDL-based implementation. - """ - +class WDLTests(BaseWDLTest): + """Tests for Toil's MiniWDL-based implementation.""" @classmethod def setUpClass(cls) -> None: """Runs once for all tests.""" @@ -83,7 +93,7 @@ def setUpClass(cls) -> None: # We inherit a testMD5sum but it is going to need Singularity or Docker # now. And also needs to have a WDL 1.0+ WDL file. So we replace it. @needs_singularity_or_docker - def testMD5sum(self): + def test_MD5sum(self): """Test if Toil produces the same outputs as known good outputs for WDL's GATK tutorial #1.""" wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl') @@ -225,18 +235,6 @@ def test_gs_uri(self): assert os.path.exists(result['ga4ghMd5.value']) assert os.path.basename(result['ga4ghMd5.value']) == 'md5sum.txt' - def test_empty_file_path(self): - """Test if empty File type inputs are protected against""" - wdl = os.path.abspath('src/toil/test/wdl/md5sum/md5sum.1.0.wdl') - json_file = os.path.abspath('src/toil/test/wdl/md5sum/empty_file.json') - - p = subprocess.Popen(self.base_command + [wdl, json_file, '-o', self.output_dir, '--logDebug'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - retval = p.wait() - - assert retval != 0 - assert b'Could not find' in stderr - def test_coalesce(self): """ Test if WDLSectionJob can coalesce WDL decls. @@ -343,5 +341,6 @@ def mock_get_transitive_dependencies(self: Any, node_id: str) -> Set[str]: assert "decl2" in result[0] assert "successor" in result[1] + if __name__ == "__main__": unittest.main() # run all tests diff --git a/src/toil/wdl/toilwdl.py b/src/toil/wdl/toilwdl.py deleted file mode 100644 index 416c3194db..0000000000 --- a/src/toil/wdl/toilwdl.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (C) 2018-2021 UCSC Computational Genomics Lab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -import subprocess -import sys - -from configargparse import ArgumentParser - -from toil.wdl.utils import dict_from_JSON, get_analyzer, write_mappings -from toil.wdl.wdl_synthesis import SynthesizeWDL - -logger = logging.getLogger(__name__) - - -def main(): - """ - A program to run WDL input files using native Toil scripts. - - Calls two files, described below, wdl_analysis.py and wdl_synthesis.py: - - wdl_analysis reads the wdl and restructures them into 2 intermediate data - structures before writing (python dictionaries): - "wf_dictionary": containing the parsed workflow information. - "tasks_dictionary": containing the parsed task information. - - wdl_synthesis takes the "wf_dictionary", "tasks_dictionary", and the JSON file - and uses them to write a native python script for use with Toil. - - Requires a WDL file, and a JSON file. The WDL file contains ordered commands, - and the JSON file contains input values for those commands. To run in Toil, - these two files must be parsed, restructured into python dictionaries, and - then compiled into a Toil formatted python script. This compiled Toil script - is deleted unless the user specifies: "--dev_mode" as an option. - - The WDL parser was auto-generated from the Broad's current WDL grammar file: - https://github.com/openwdl/wdl/blob/master/parsers/grammar.hgr - using Scott Frazer's Hermes: https://github.com/scottfrazer/hermes - Thank you Scott Frazer! - - Currently in alpha testing, and known to work with the Broad's GATK tutorial - set for WDL on their main wdl site: - software.broadinstitute.org/wdl/documentation/topic?name=wdl-tutorials - - And ENCODE's WDL workflow: - github.com/ENCODE-DCC/pipeline-container/blob/master/local-workflows/encode_mapping_workflow.wdl - - Additional support to be broadened to include more features soon. - """ - parser = ArgumentParser(description='Runs WDL files with toil.') - parser.add_argument('wdl_file', help='A WDL workflow file.') - parser.add_argument('secondary_file', help='A secondary data file (json).') - parser.add_argument("--jobStore", type=str, required=False, default=None) - parser.add_argument('-o', - '--outdir', - required=False, - default=os.getcwd(), - help='Optionally specify the directory that outputs ' - 'are written to. Default is the current working dir.') - parser.add_argument('--dev_mode', required=False, default=False, - help='1. Creates "AST.out", which holds the printed AST and ' - '"mappings.out", which holds the parsed task, workflow ' - 'dictionaries that were generated. ' - '2. Saves the compiled toil script generated from the ' - 'wdl/json files from deletion. ' - '3. Skips autorunning the compiled python file.') - parser.add_argument('--docker_user', required=False, default='root', - help='The user permissions that the docker containers will be run ' - 'with (and the permissions set on any output files produced). ' - 'Default is "root". Setting this to None will set this to ' - 'the current user.') - parser.add_argument("--destBucket", type=str, required=False, default=False, - help="Specify a cloud bucket endpoint for output files.") - - # wdl_run_args is an array containing all of the unknown arguments not - # specified by the parser in this main. All of these will be passed down in - # check_call later to run the compiled toil file. - args, wdl_run_args = parser.parse_known_args() - - wdl_file = os.path.abspath(args.wdl_file) - args.secondary_file = os.path.abspath(args.secondary_file) - args.outdir = os.path.abspath(args.outdir) - - aWDL = get_analyzer(wdl_file=wdl_file) - - if args.dev_mode: - aWDL.write_AST(out_dir=args.outdir) - - # read secondary file; create dictionary to hold variables - if args.secondary_file.endswith('.json'): - json_dict = dict_from_JSON(args.secondary_file) - else: - raise RuntimeError('Unsupported Secondary File Type. Use json.') - - aWDL.analyze() - - sWDL = SynthesizeWDL(aWDL.version, - aWDL.tasks_dictionary, - aWDL.workflows_dictionary, - args.outdir, - json_dict, - args.docker_user, - args.jobStore, - args.destBucket) - - # use the AST dictionaries to write 4 strings - # these are the future 4 sections of the compiled toil python file - module_section = sWDL.write_modules() - fn_section = sWDL.write_functions() - main_section = sWDL.write_main() - - # write 3 strings to a python output file - sWDL.write_python_file(module_section, - fn_section, - main_section, - sWDL.output_file) - - if args.dev_mode: - logger.debug('WDL file compiled to toil script.') - write_mappings(aWDL) - else: - logger.debug('WDL file compiled to toil script. Running now.') - exe = sys.executable if sys.executable else 'python' - cmd = [exe, sWDL.output_file] - cmd.extend(wdl_run_args) - subprocess.check_call(cmd) - os.remove(sWDL.output_file) - - -if __name__ == '__main__': - main() diff --git a/src/toil/wdl/utils.py b/src/toil/wdl/utils.py index 44d669bffc..0b5d56184e 100644 --- a/src/toil/wdl/utils.py +++ b/src/toil/wdl/utils.py @@ -11,12 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json +from typing import Iterable -from toil.wdl.wdl_analysis import AnalyzeWDL - -def get_version(iterable) -> str: +def get_version(iterable: Iterable[str]) -> str: """ Get the version of the WDL document. @@ -35,121 +33,3 @@ def get_version(iterable) -> str: break # only draft-2 doesn't contain the version declaration return 'draft-2' - - -def get_analyzer(wdl_file: str) -> AnalyzeWDL: - """ - Creates an instance of an AnalyzeWDL implementation based on the version. - - :param wdl_file: The path to the WDL file. - """ - with open(wdl_file) as f: - version = get_version(f) - - if version == 'draft-2': - from toil.wdl.versions.draft2 import AnalyzeDraft2WDL - return AnalyzeDraft2WDL(wdl_file) - elif version == '1.0': - from toil.wdl.versions.v1 import AnalyzeV1WDL - return AnalyzeV1WDL(wdl_file) - elif version == 'development': - from toil.wdl.versions.dev import AnalyzeDevelopmentWDL - return AnalyzeDevelopmentWDL(wdl_file) - else: - raise RuntimeError(f"Unsupported WDL version: '{version}'.") - - -def dict_from_JSON(JSON_file: str) -> dict: - """ - Takes a WDL-mapped json file and creates a dict containing the bindings. - - :param JSON_file: A required JSON file containing WDL variable bindings. - """ - json_dict = {} - - # TODO: Add context support for variables within multiple wdl files - - with open(JSON_file) as data_file: - data = json.load(data_file) - for d in data: - if isinstance(data[d], str): - json_dict[d] = f'"{data[d]}"' - else: - json_dict[d] = data[d] - return json_dict - - -def write_mappings(parser: AnalyzeWDL, filename: str = 'mappings.out') -> None: - """ - Takes an AnalyzeWDL instance and writes the final task dict and workflow - dict to the given file. - - :param parser: An AnalyzeWDL instance. - :param filename: The name of a file to write to. - """ - from collections import OrderedDict - - class Formatter: - def __init__(self): - self.types = {} - self.htchar = '\t' - self.lfchar = '\n' - self.indent = 0 - self.set_formater(object, self.__class__.format_object) - self.set_formater(dict, self.__class__.format_dict) - self.set_formater(list, self.__class__.format_list) - self.set_formater(tuple, self.__class__.format_tuple) - - def set_formater(self, obj, callback): - self.types[obj] = callback - - def __call__(self, value, **args): - for key in args: - setattr(self, key, args[key]) - formater = self.types[type(value) if type(value) in self.types else object] - return formater(self, value, self.indent) - - def format_object(self, value, indent): - return repr(value) - - def format_dict(self, value, indent): - items = [ - self.lfchar + self.htchar * (indent + 1) + repr(key) + ': ' + - (self.types[type(value[key]) if type(value[key]) in self.types else object])(self, value[key], - indent + 1) - for key in value] - return '{%s}' % (','.join(items) + self.lfchar + self.htchar * indent) - - def format_list(self, value, indent): - items = [ - self.lfchar + self.htchar * (indent + 1) + ( - self.types[type(item) if type(item) in self.types else object])(self, item, indent + 1) - for item in value] - return '[%s]' % (','.join(items) + self.lfchar + self.htchar * indent) - - def format_tuple(self, value, indent): - items = [ - self.lfchar + self.htchar * (indent + 1) + ( - self.types[type(item) if type(item) in self.types else object])(self, item, indent + 1) - for item in value] - return '(%s)' % (','.join(items) + self.lfchar + self.htchar * indent) - - pretty = Formatter() - - def format_ordereddict(self, value, indent): - items = [ - self.lfchar + self.htchar * (indent + 1) + - "(" + repr(key) + ', ' + (self.types[ - type(value[key]) if type(value[key]) in self.types else object - ])(self, value[key], indent + 1) + ")" - for key in value - ] - return 'OrderedDict([%s])' % (','.join(items) + - self.lfchar + self.htchar * indent) - - pretty.set_formater(OrderedDict, format_ordereddict) - - with open(filename, 'w') as f: - f.write(pretty(parser.tasks_dictionary)) - f.write('\n\n\n\n\n\n') - f.write(pretty(parser.workflows_dictionary)) diff --git a/src/toil/wdl/versions/__init__.py b/src/toil/wdl/versions/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/toil/wdl/versions/dev.py b/src/toil/wdl/versions/dev.py deleted file mode 100644 index 6f1096b823..0000000000 --- a/src/toil/wdl/versions/dev.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (C) 2020-2021 Regents of the University of California -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging - -from wdlparse.dev.WdlLexer import FileStream, WdlLexer -from wdlparse.dev.WdlParser import CommonTokenStream, WdlParser - -from toil.wdl.versions.v1 import AnalyzeV1WDL, is_context -from toil.wdl.wdl_types import WDLType - -logger = logging.getLogger(__name__) - - -class AnalyzeDevelopmentWDL(AnalyzeV1WDL): # extend from 1.0 - """ - AnalyzeWDL implementation for the development version using ANTLR4. - - See: https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md - https://github.com/openwdl/wdl/blob/main/versions/development/parsers/antlr4/WdlParser.g4 - """ - - @property - def version(self) -> str: - return 'development' - - def analyze(self): - """ - Analyzes the WDL file passed into the constructor and generates the two - intermediate data structures: `self.workflows_dictionary` and - `self.tasks_dictionary`. - """ - lexer = WdlLexer(FileStream(self.wdl_file)) - parser = WdlParser(input=CommonTokenStream(lexer)) - tree = parser.document() - self.visit_document(tree) - - def visit_document(self, ctx: WdlParser.DocumentContext) -> None: - """ - Similar to version 1.0, except the 'workflow' element is included in - `ctx.document_element()`. - """ - for element in ctx.document_element(): - self.visit_document_element(element) - - def visit_document_element(self, ctx: WdlParser.Document_elementContext) -> None: - """ - Similar to version 1.0, except this also contains 'workflow'. - """ - element = ctx.children[0] - if is_context(element, 'WorkflowContext'): - self.visit_workflow(element) - else: - # let super take care of the rest. - super().visit_document_element(ctx) - - def visit_call(self, ctx: WdlParser.CallContext) -> dict: - """ - Similar to version 1.0, except `ctx.call_afters()` is added. - """ - # TODO: implement call_afters - # See: https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#call-statement - return super().visit_call(ctx) - - def visit_string_expr_part(self, ctx: WdlParser.String_expr_partContext) -> str: - """ - Similar to version 1.0, except `ctx.expression_placeholder_option()` - is removed. - """ - # expression placeholder options are removed in development - # See: https://github.com/openwdl/wdl/blob/main/versions/development/parsers/antlr4/WdlParser.g4#L55 - - return self.visit_expr(ctx.expr()) - - def visit_wdl_type(self, ctx: WdlParser.Wdl_typeContext) -> WDLType: - """ - Similar to version 1.0, except Directory type is added. - """ - identifier = ctx.type_base().children[0] - - if identifier == 'Directory': - # TODO: implement Directory type - raise NotImplementedError('Directory type is not implemented.') - else: - # let super take care of the rest. - return super().visit_wdl_type(ctx) - - def visit_expr_core(self, expr: WdlParser.Expr_coreContext) -> str: - """ - Similar to version 1.0, except struct literal is added. - """ - if is_context(expr, 'Struct_literalContext'): - # TODO: implement struct literal - raise NotImplementedError(f'WDL struct is not implemented.') - else: - # let super take care of the rest. - return super().visit_expr_core(expr) diff --git a/src/toil/wdl/versions/draft2.py b/src/toil/wdl/versions/draft2.py deleted file mode 100644 index 29f9db790d..0000000000 --- a/src/toil/wdl/versions/draft2.py +++ /dev/null @@ -1,980 +0,0 @@ -# Copyright (C) 2018-2020 UCSC Computational Genomics Lab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -from collections import OrderedDict - -from wdlparse.draft2 import wdl_parser - -from toil.wdl.wdl_analysis import AnalyzeWDL - -logger = logging.getLogger(__name__) - - -class AnalyzeDraft2WDL(AnalyzeWDL): - """ - AnalyzeWDL implementation for the draft-2 version. - """ - - @property - def version(self) -> str: - return 'draft-2' - - def analyze(self): - """ - Analyzes the WDL file passed into the constructor and generates the two - intermediate data structures: `self.workflows_dictionary` and - `self.tasks_dictionary`. - - :return: Returns nothing. - """ - # parse the wdl AST into 2 dictionaries - with open(self.wdl_file) as wdl: - wdl_string = wdl.read() - ast = wdl_parser.parse(wdl_string).ast() - self.create_tasks_dict(ast) - self.create_workflows_dict(ast) - - def write_AST(self, out_dir=None): - """ - Writes a file with the AST for a wdl file in the out_dir. - """ - if out_dir is None: - out_dir = os.getcwd() - with open(os.path.join(out_dir, 'AST.out'), 'w') as f: - with open(self.wdl_file) as wdl: - wdl_string = wdl.read() - ast = wdl_parser.parse(wdl_string).ast() - f.write(ast.dumps(indent=2)) - - def find_asts(self, ast_root, name): - """ - Finds an AST node with the given name and the entire subtree under it. - A function borrowed from scottfrazer. Thank you Scott Frazer! - - :param ast_root: The WDL AST. The whole thing generally, but really - any portion that you wish to search. - :param name: The name of the subtree you're looking for, like "Task". - :return: nodes representing the AST subtrees matching the "name" given. - """ - nodes = [] - if isinstance(ast_root, wdl_parser.AstList): - for node in ast_root: - nodes.extend(self.find_asts(node, name)) - elif isinstance(ast_root, wdl_parser.Ast): - if ast_root.name == name: - nodes.append(ast_root) - for attr_name, attr in ast_root.attributes.items(): - nodes.extend(self.find_asts(attr, name)) - return nodes - - def create_tasks_dict(self, ast): - """ - Parse each "Task" in the AST. This will create self.tasks_dictionary, - where each task name is a key. - - :return: Creates the self.tasks_dictionary necessary for much of the - parser. Returning it is only necessary for unittests. - """ - tasks = self.find_asts(ast, 'Task') - for task in tasks: - self.parse_task(task) - return self.tasks_dictionary - - def parse_task(self, task): - """ - Parses a WDL task AST subtree. - - Currently looks at and parses 4 sections: - 1. Declarations (e.g. string x = 'helloworld') - 2. Commandline (a bash command with dynamic variables inserted) - 3. Runtime (docker image; disk; CPU; RAM; etc.) - 4. Outputs (expected return values/files) - - :param task: An AST subtree of a WDL "Task". - :return: Returns nothing but adds a task to the self.tasks_dictionary - necessary for much of the parser. - """ - - task_name = task.attributes["name"].source_string - - # task declarations - declaration_array = [] - for declaration_subAST in task.attr("declarations"): - declaration_array.append(self.parse_declaration(declaration_subAST)) - self.tasks_dictionary.setdefault(task_name, OrderedDict())['inputs'] = declaration_array - - for section in task.attr("sections"): - - # task commandline entries section [command(s) to run] - if section.name == "RawCommand": - command_array = self.parse_task_rawcommand(section) - self.tasks_dictionary.setdefault(task_name, OrderedDict())['raw_commandline'] = command_array - - # task runtime section (docker image; disk; CPU; RAM; etc.) - if section.name == "Runtime": - runtime_dict = self.parse_task_runtime(section.attr("map")) - self.tasks_dictionary.setdefault(task_name, OrderedDict())['runtime'] = runtime_dict - - # task output filenames section (expected return values/files) - if section.name == "Outputs": - output_array = self.parse_task_outputs(section) - self.tasks_dictionary.setdefault(task_name, OrderedDict())['outputs'] = output_array - - def parse_task_rawcommand_attributes(self, code_snippet): - """ - - :param code_snippet: - :return: - """ - attr_dict = OrderedDict() - if isinstance(code_snippet, wdl_parser.Terminal): - raise NotImplementedError - if isinstance(code_snippet, wdl_parser.Ast): - raise NotImplementedError - if isinstance(code_snippet, wdl_parser.AstList): - for ast in code_snippet: - if ast.name == 'CommandParameterAttr': - # TODO rewrite - if ast.attributes['value'].str == 'string': - attr_dict[ast.attributes['key'].source_string] = "'" + ast.attributes['value'].source_string + "'" - else: - attr_dict[ast.attributes['key'].source_string] = ast.attributes['value'].source_string - return attr_dict - - def parse_task_rawcommand(self, rawcommand_subAST): - """ - Parses the rawcommand section of the WDL task AST subtree. - - Task "rawcommands" are divided into many parts. There are 2 types of - parts: normal strings, & variables that can serve as changeable inputs. - - The following example command: - 'echo ${variable1} ${variable2} > output_file.txt' - - Has 5 parts: - Normal String: 'echo ' - Variable Input: variable1 - Normal String: ' ' - Variable Input: variable2 - Normal String: ' > output_file.txt' - - Variables can also have additional conditions, like 'sep', which is like - the python ''.join() function and in WDL looks like: ${sep=" -V " GVCFs} - and would be translated as: ' -V '.join(GVCFs). - - :param rawcommand_subAST: A subAST representing some bash command. - :return: A list=[] of tuples=() representing the parts of the command: - e.g. [(command_var, command_type, additional_conditions_list), ...] - Where: command_var = 'GVCFs' - command_type = 'variable' - command_actions = {'sep': ' -V '} - """ - command_array = [] - for code_snippet in rawcommand_subAST.attributes["parts"]: - - # normal string - if isinstance(code_snippet, wdl_parser.Terminal): - command_var = "r'''" + code_snippet.source_string + "'''" - - # a variable like ${dinosaurDNA} - if isinstance(code_snippet, wdl_parser.Ast): - if code_snippet.name == 'CommandParameter': - # change in the future? seems to be a different parameter but works for all cases it seems? - code_expr = self.parse_declaration_expressn(code_snippet.attr('expr'), es='') - code_attributes = self.parse_task_rawcommand_attributes(code_snippet.attr('attributes')) - command_var = self.modify_cmd_expr_w_attributes(code_expr, code_attributes) - - if isinstance(code_snippet, wdl_parser.AstList): - raise NotImplementedError - command_array.append(command_var) - - return command_array - - def modify_cmd_expr_w_attributes(self, code_expr, code_attr): - """ - - :param code_expr: - :param code_attr: - :return: - """ - for param in code_attr: - if param == 'sep': - code_expr = f"{code_attr[param]}.join(str(x) for x in {code_expr})" - elif param == 'default': - code_expr = "{expr} if {expr} else {default}".format(default=code_attr[param], expr=code_expr) - else: - raise NotImplementedError - return code_expr - - def parse_task_runtime_key(self, i): - """ - - :param runtime_subAST: - :return: - """ - if isinstance(i, wdl_parser.Terminal): - return i.source_string - if isinstance(i, wdl_parser.Ast): - raise NotImplementedError - if isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - def parse_task_runtime(self, runtime_subAST): - """ - Parses the runtime section of the WDL task AST subtree. - - The task "runtime" section currently supports context fields for a - docker container, CPU resources, RAM resources, and disk resources. - - :param runtime_subAST: A subAST representing runtime parameters. - :return: A list=[] of runtime attributes, for example: - runtime_attributes = [('docker','quay.io/encode-dcc/map:v1.0'), - ('cpu','2'), - ('memory','17.1 GB'), - ('disks','local-disk 420 HDD')] - """ - runtime_attributes = OrderedDict() - if isinstance(runtime_subAST, wdl_parser.Terminal): - raise NotImplementedError - elif isinstance(runtime_subAST, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(runtime_subAST, wdl_parser.AstList): - for ast in runtime_subAST: - key = self.parse_task_runtime_key(ast.attr('key')) - value = self.parse_declaration_expressn(ast.attr('value'), es='') - if value.startswith('"'): - value = self.translate_wdl_string_to_python_string(value[1:-1]) - runtime_attributes[key] = value - return runtime_attributes - - def parse_task_outputs(self, i): - """ - Parse the WDL output section. - - Outputs are like declarations, with a type, name, and value. Examples: - - ------------ - Simple Cases - ------------ - - 'Int num = 7' - var_name: 'num' - var_type: 'Int' - var_value: 7 - - String idea = 'Lab grown golden eagle burgers.' - var_name: 'idea' - var_type: 'String' - var_value: 'Lab grown golden eagle burgers.' - - File ideaFile = 'goldenEagleStemCellStartUpDisrupt.txt' - var_name: 'ideaFile' - var_type: 'File' - var_value: 'goldenEagleStemCellStartUpDisrupt.txt' - - ------------------- - More Abstract Cases - ------------------- - - Array[File] allOfMyTerribleIdeas = glob(*.txt)[0] - var_name: 'allOfMyTerribleIdeas' - var_type**: 'File' - var_value: [*.txt] - var_actions: {'index_lookup': '0', 'glob': 'None'} - - **toilwdl.py converts 'Array[File]' to 'ArrayFile' - - :return: output_array representing outputs generated by the job/task: - e.g. x = [(var_name, var_type, var_value, var_actions), ...] - """ - output_array = [] - for j in i.attributes['attributes']: - if j.name == 'Output': - output_array.append(self.parse_declaration(j)) - else: - raise NotImplementedError - return output_array - - def translate_wdl_string_to_python_string(self, some_string): - """ - Parses a string representing a given job's output filename into something - python can read. Replaces ${string}'s with normal variables and the rest - with normal strings all concatenated with ' + '. - - Will not work with additional parameters, such as: - ${default="foo" bar} - or - ${true="foo" false="bar" Boolean baz} - - This method expects to be passed only strings with some combination of - "${abc}" and "abc" blocks. - - :param job: A list such that: - (job priority #, job ID #, Job Skeleton Name, Job Alias) - :param some_string: e.g. '${sampleName}.vcf' - :return: output_string, e.g. 'sampleName + ".vcf"' - """ - - try: - # add support for 'sep' - output_string = '' - edited_string = some_string.strip() - - if edited_string.find('${') != -1: - continue_loop = True - while continue_loop: - index_start = edited_string.find('${') - index_end = edited_string.find('}', index_start) - - stringword = edited_string[:index_start] - - if index_start != 0: - output_string = output_string + "'" + stringword + "' + " - - keyword = edited_string[index_start + 2:index_end] - output_string = output_string + "str(" + keyword + ") + " - - edited_string = edited_string[index_end + 1:] - if edited_string.find('${') == -1: - continue_loop = False - if edited_string: - output_string = output_string + "'" + edited_string + "' + " - else: - output_string = "'" + edited_string + "'" - - if output_string.endswith(' + '): - output_string = output_string[:-3] - - return output_string - except: - return '' - - def create_workflows_dict(self, ast): - """ - Parse each "Workflow" in the AST. This will create self.workflows_dictionary, - where each called job is a tuple key of the form: (priority#, job#, name, alias). - - :return: Creates the self.workflows_dictionary necessary for much of the - parser. Returning it is only necessary for unittests. - """ - workflows = self.find_asts(ast, 'Workflow') - for workflow in workflows: - self.parse_workflow(workflow) - return self.workflows_dictionary - - def parse_workflow(self, workflow): - """ - Parses a WDL workflow AST subtree. - - Returns nothing but creates the self.workflows_dictionary necessary for much - of the parser. - - :param workflow: An AST subtree of a WDL "Workflow". - :return: Returns nothing but adds a workflow to the - self.workflows_dictionary necessary for much of the parser. - """ - workflow_name = workflow.attr('name').source_string - self.workflows_dictionary[workflow_name] = self.parse_workflow_body(workflow.attr("body")) - - def parse_workflow_body(self, i): - """ - Currently looks at and parses 3 sections: - 1. Declarations (e.g. String x = 'helloworld') - 2. Calls (similar to a python def) - 3. Scatter (which expects to map to a Call or multiple Calls) - 4. Conditionals - """ - subworkflow_dict = OrderedDict() - if isinstance(i, wdl_parser.Terminal): - raise NotImplementedError - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - for ast in i: - if ast.name == "Declaration": - declaration = self.parse_declaration(ast) - subworkflow_dict['declaration' + str(self.declaration_number)] = declaration - self.declaration_number += 1 - - elif ast.name == "Scatter": - scattertask = self.parse_workflow_scatter(ast) - subworkflow_dict['scatter' + str(self.scatter_number)] = scattertask - self.scatter_number += 1 - - elif ast.name == "Call": - task = self.parse_workflow_call(ast) - subworkflow_dict['call' + str(self.call_number)] = task - self.call_number += 1 - - elif ast.name == "If": - task = self.parse_workflow_if(ast) - subworkflow_dict['if' + str(self.if_number)] = task - self.if_number += 1 - return subworkflow_dict - - def parse_workflow_if(self, ifAST): - expression = self.parse_workflow_if_expression(ifAST.attr('expression')) - body = self.parse_workflow_body(ifAST.attr('body')) - return {'expression': expression, 'body': body} - - def parse_workflow_if_expression(self, i): - return self.parse_declaration_expressn(i, es='') - - def parse_workflow_scatter(self, scatterAST): - item = self.parse_workflow_scatter_item(scatterAST.attr('item')) - collection = self.parse_workflow_scatter_collection(scatterAST.attr('collection')) - body = self.parse_workflow_body(scatterAST.attr('body')) - return {'item': item, 'collection': collection, 'body': body} - - def parse_workflow_scatter_item(self, i): - if isinstance(i, wdl_parser.Terminal): - return i.source_string - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - def parse_workflow_scatter_collection(self, i): - if isinstance(i, wdl_parser.Terminal): - return i.source_string - elif isinstance(i, wdl_parser.Ast): - return self.parse_declaration_expressn(i, es='') - elif isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - def parse_declaration(self, ast): - """ - Parses a WDL declaration AST subtree into a Python tuple. - - Examples: - - String my_name - String your_name - Int two_chains_i_mean_names = 0 - - :param ast: Some subAST representing a task declaration like: - 'String file_name' - :return: var_name, var_type, var_value - Example: - Input subAST representing: 'String file_name' - Output: var_name='file_name', var_type='String', var_value=None - """ - var_name = self.parse_declaration_name(ast.attr("name")) - var_type = self.parse_declaration_type(ast.attr("type")) - var_expressn = self.parse_declaration_expressn(ast.attr("expression"), es='') - - return var_name, var_type, var_expressn - - def parse_declaration_name(self, nameAST): - """ - Required. - - Nothing fancy here. Just the name of the workflow - function. For example: "rnaseqexample" would be the following - wdl workflow's name: - - workflow rnaseqexample {File y; call a {inputs: y}; call b;} - task a {File y} - task b {command{"echo 'ATCG'"}} - - :param nameAST: - :return: - """ - if isinstance(nameAST, wdl_parser.Terminal): - return nameAST.source_string - elif isinstance(nameAST, wdl_parser.Ast): - return nameAST.source_string - elif isinstance(nameAST, wdl_parser.AstList): - raise NotImplementedError - - def parse_declaration_type(self, typeAST): - """ - Required. - - Currently supported: - Types are: Boolean, Float, Int, File, String, Array[subtype], - Pair[subtype, subtype], and Map[subtype, subtype]. - OptionalTypes are: Boolean?, Float?, Int?, File?, String?, Array[subtype]?, - Pair[subtype, subtype]?, and Map[subtype, subtype]?. - - Python is not typed, so we don't need typing except to identify type: "File", - which Toil needs to import, so we recursively travel down to the innermost - type which will tell us if the variables are files that need importing. - - For Pair and Map compound types, we recursively travel down the subtypes and - store them as attributes of a `WDLType` string. This way, the type structure is - preserved, which will allow us to import files appropriately. - - :param typeAST: - :return: a WDLType instance - """ - if isinstance(typeAST, wdl_parser.Terminal): - return self.create_wdl_primitive_type(typeAST.source_string) - elif isinstance(typeAST, wdl_parser.Ast): - if typeAST.name == 'Type': - subtype = typeAST.attr('subtype') - optional = False - elif typeAST.name == 'OptionalType': - subtype = typeAST.attr('innerType') - optional = True - else: - raise NotImplementedError - - if isinstance(subtype, wdl_parser.AstList): - # we're looking at a compound type - name = typeAST.attr('name').source_string - elements = [self.parse_declaration_type(element) for element in subtype] - return self.create_wdl_compound_type(name, elements, optional=optional) - else: - # either a primitive optional type OR deeply recursive types - # TODO: add tests #3331 - wdl_type = self.parse_declaration_type(subtype) - wdl_type.optional = optional - return wdl_type - else: - raise NotImplementedError - - def parse_declaration_expressn(self, expressionAST, es): - """ - Expressions are optional. Workflow declaration valid examples: - - File x - - or - - File x = '/x/x.tmp' - - :param expressionAST: - :return: - """ - if not expressionAST: - return None - else: - if isinstance(expressionAST, wdl_parser.Terminal): - if expressionAST.str == 'boolean': - if expressionAST.source_string == 'false': - return 'False' - elif expressionAST.source_string == 'true': - return 'True' - else: - raise TypeError('Parsed boolean ({}) must be expressed as "true" or "false".' - ''.format(expressionAST.source_string)) - elif expressionAST.str == 'string': - parsed_string = self.translate_wdl_string_to_python_string(expressionAST.source_string) - return f'{parsed_string}' - else: - # integers, floats, and variables - return f'{expressionAST.source_string}' - elif isinstance(expressionAST, wdl_parser.Ast): - if expressionAST.name == 'Add': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' + ') - elif expressionAST.name == 'Subtract': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' - ') - elif expressionAST.name == 'Multiply': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' * ') - elif expressionAST.name == 'Divide': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' / ') - elif expressionAST.name == 'GreaterThan': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' > ') - elif expressionAST.name == 'LessThan': - es = es + self.parse_declaration_expressn_operator(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es, - operator=' < ') - elif expressionAST.name == 'FunctionCall': - es = es + self.parse_declaration_expressn_fncall(expressionAST.attr('name'), - expressionAST.attr('params'), - es) - elif expressionAST.name == 'TernaryIf': - es = es + self.parse_declaration_expressn_ternaryif(expressionAST.attr('cond'), - expressionAST.attr('iftrue'), - expressionAST.attr('iffalse'), - es) - elif expressionAST.name == 'MemberAccess': - es = es + self.parse_declaration_expressn_memberaccess(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es) - elif expressionAST.name == 'ArrayLiteral': - es = es + self.parse_declaration_expressn_arrayliteral(expressionAST.attr('values'), - es) - elif expressionAST.name == 'TupleLiteral': - es = es + self.parse_declaration_expressn_tupleliteral(expressionAST.attr('values'), - es) - elif expressionAST.name == 'ArrayOrMapLookup': - es = es + self.parse_declaration_expressn_arraymaplookup(expressionAST.attr('lhs'), - expressionAST.attr('rhs'), - es) - elif expressionAST.name == 'LogicalNot': - es = es + self.parse_declaration_expressn_logicalnot(expressionAST.attr('expression'), - es) - else: - raise NotImplementedError - elif isinstance(expressionAST, wdl_parser.AstList): - raise NotImplementedError - return '(' + es + ')' - - def parse_declaration_expressn_logicalnot(self, exprssn, es): - if isinstance(exprssn, wdl_parser.Terminal): - es = es + exprssn.source_string - elif isinstance(exprssn, wdl_parser.Ast): - es = es + self.parse_declaration_expressn(exprssn, es='') - elif isinstance(exprssn, wdl_parser.AstList): - raise NotImplementedError - return ' not ' + es - - def parse_declaration_expressn_arraymaplookup(self, lhsAST, rhsAST, es): - """ - - :param lhsAST: - :param rhsAST: - :param es: - :return: - """ - if isinstance(lhsAST, wdl_parser.Terminal): - es = es + lhsAST.source_string - elif isinstance(lhsAST, wdl_parser.Ast): - # parenthesis must be removed because 'i[0]' works, but '(i)[0]' does not - es = es + self.parse_declaration_expressn(lhsAST, es='')[1:-1] - elif isinstance(lhsAST, wdl_parser.AstList): - raise NotImplementedError - - if isinstance(rhsAST, wdl_parser.Terminal): - indexnum = rhsAST.source_string - elif isinstance(rhsAST, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(rhsAST, wdl_parser.AstList): - raise NotImplementedError - - return es + f'[{indexnum}]' - - def parse_declaration_expressn_memberaccess(self, lhsAST, rhsAST, es): - """ - Instead of "Class.variablename", use "Class.rv('variablename')". - - :param lhsAST: - :param rhsAST: - :param es: - :return: - """ - if isinstance(lhsAST, wdl_parser.Terminal): - es = es + lhsAST.source_string - elif isinstance(lhsAST, wdl_parser.Ast): - es = es + self.parse_declaration_expressn(lhsAST, es) - elif isinstance(lhsAST, wdl_parser.AstList): - raise NotImplementedError - - # hack-y way to make sure pair.left and pair.right are parsed correctly. - if isinstance(rhsAST, wdl_parser.Terminal) and ( - rhsAST.source_string == 'left' or rhsAST.source_string == 'right'): - es = es + '.' - else: - es = es + '_' - - if isinstance(rhsAST, wdl_parser.Terminal): - es = es + rhsAST.source_string - elif isinstance(rhsAST, wdl_parser.Ast): - es = es + self.parse_declaration_expressn(rhsAST, es) - elif isinstance(rhsAST, wdl_parser.AstList): - raise NotImplementedError - - return es - - def parse_declaration_expressn_ternaryif(self, cond, iftrue, iffalse, es): - """ - Classic if statement. This needs to be rearranged. - - In wdl, this looks like: - if then else - - In python, this needs to be: - if else - - :param cond: - :param iftrue: - :param iffalse: - :param es: - :return: - """ - es = es + self.parse_declaration_expressn(iftrue, es='') - es = es + ' if ' + self.parse_declaration_expressn(cond, es='') - es = es + ' else ' + self.parse_declaration_expressn(iffalse, es='') - return es - - def parse_declaration_expressn_tupleliteral(self, values, es): - """ - Same in python. Just a parenthesis enclosed tuple. - - :param values: - :param es: - :return: - """ - es = es + '(' - for ast in values: - es = es + self.parse_declaration_expressn(ast, es='') + ', ' - if es.endswith(', '): - es = es[:-2] - return es + ')' - - def parse_declaration_expressn_arrayliteral(self, values, es): - """ - Same in python. Just a square bracket enclosed array. - - :param values: - :param es: - :return: - """ - es = es + '[' - for ast in values: - es = es + self.parse_declaration_expressn(ast, es='') + ', ' - if es.endswith(', '): - es = es[:-2] - return es + ']' - - def parse_declaration_expressn_operator(self, lhsAST, rhsAST, es, operator): - """ - Simply joins the left and right hand arguments lhs and rhs with an operator. - - :param lhsAST: - :param rhsAST: - :param es: - :param operator: - :return: - """ - if isinstance(lhsAST, wdl_parser.Terminal): - if lhsAST.str == 'string': - es = es + f'"{lhsAST.source_string}"' - else: - es = es + f'{lhsAST.source_string}' - elif isinstance(lhsAST, wdl_parser.Ast): - es = es + self.parse_declaration_expressn(lhsAST, es='') - elif isinstance(lhsAST, wdl_parser.AstList): - raise NotImplementedError - - es = es + operator - - if isinstance(rhsAST, wdl_parser.Terminal): - if rhsAST.str == 'string': - es = es + f'"{rhsAST.source_string}"' - else: - es = es + f'{rhsAST.source_string}' - elif isinstance(rhsAST, wdl_parser.Ast): - es = es + self.parse_declaration_expressn(rhsAST, es='') - elif isinstance(rhsAST, wdl_parser.AstList): - raise NotImplementedError - return es - - def parse_declaration_expressn_fncall(self, name, params, es): - """ - Parses out cromwell's built-in function calls. - - Some of these are special and need minor adjustments, - for example size() requires a fileStore. - - :param name: - :param params: - :param es: - :return: - """ - # name of the function - if isinstance(name, wdl_parser.Terminal): - if name.str: - if name.source_string == 'stdout': - # let the stdout() function reference the generated stdout file path. - return es + '_toil_wdl_internal__stdout_file' - elif name.source_string == 'stderr': - return es + '_toil_wdl_internal__stderr_file' - elif name.source_string in ('range', 'zip'): - # replace python built-in functions - es += f'wdl_{name.source_string}(' - else: - es = es + name.source_string + '(' - else: - raise NotImplementedError - elif isinstance(name, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(name, wdl_parser.AstList): - raise NotImplementedError - - es_params = self.parse_declaration_expressn_fncall_normalparams(params) - - if name.source_string == 'glob': - return es + es_params + ', tempDir)' - elif name.source_string == 'size': - return es + (es_params + ', ' if es_params else '') + 'fileStore=fileStore)' - elif name.source_string in ('write_lines', 'write_tsv', 'write_json', 'write_map'): - return es + es_params + ', temp_dir=tempDir, file_store=fileStore)' - else: - return es + es_params + ')' - - def parse_declaration_expressn_fncall_normalparams(self, params): - - # arguments passed to the function - if isinstance(params, wdl_parser.Terminal): - raise NotImplementedError - elif isinstance(params, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(params, wdl_parser.AstList): - es_param = '' - for ast in params: - es_param = es_param + self.parse_declaration_expressn(ast, es='') + ', ' - if es_param.endswith(', '): - es_param = es_param[:-2] - return es_param - - def parse_workflow_call_taskname(self, i): - """ - Required. - - :param i: - :return: - """ - if isinstance(i, wdl_parser.Terminal): - return i.source_string - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - def parse_workflow_call_taskalias(self, i): - """ - Required. - - :param i: - :return: - """ - if isinstance(i, wdl_parser.Terminal): - return i.source_string - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - def parse_workflow_call_body_declarations(self, i): - """ - Have not seen this used, so expects to return "[]". - - :param i: - :return: - """ - declaration_array = [] - if isinstance(i, wdl_parser.Terminal): - declaration_array = [i.source_string] - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - for ast in i: - declaration_array.append(self.parse_declaration(ast)) - - # have not seen this used so raise to check - if declaration_array: - raise NotImplementedError - - return declaration_array - - def parse_workflow_call_body_io(self, i): - """ - Required. - - :param i: - :return: - """ - if isinstance(i, wdl_parser.Terminal): - raise NotImplementedError - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - for ast in i: - assert len(i) == 1 - if ast.name == 'Inputs': - return self.parse_workflow_call_body_io_map(ast.attr('map')) - else: - raise NotImplementedError - - def parse_workflow_call_body_io_map(self, i): - """ - Required. - - :param i: - :return: - """ - io_map = OrderedDict() - if isinstance(i, wdl_parser.Terminal): - raise NotImplementedError - elif isinstance(i, wdl_parser.Ast): - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - for ast in i: - if ast.name == 'IOMapping': - key = self.parse_declaration_expressn(ast.attr("key"), es='') - value = self.parse_declaration_expressn(ast.attr("value"), es='') - io_map[key] = value - else: - raise NotImplementedError - return io_map - - def parse_workflow_call_body(self, i): - """ - Required. - - :param i: - :return: - """ - io_map = OrderedDict() - - if isinstance(i, wdl_parser.Terminal): - return i.source_string # no io mappings; represents just a blank call - elif isinstance(i, wdl_parser.Ast): - if i.name == 'CallBody': - declarations = self.parse_workflow_call_body_declarations(i.attr("declarations")) # have not seen this used - io_map = self.parse_workflow_call_body_io(i.attr('io')) - else: - raise NotImplementedError - elif isinstance(i, wdl_parser.AstList): - raise NotImplementedError - - return io_map - - def parse_workflow_call(self, i): - """ - Parses a WDL workflow call AST subtree to give the variable mappings for - that particular job/task "call". - - :param i: WDL workflow job object - :return: python dictionary of io mappings for that job call - """ - task_being_called = self.parse_workflow_call_taskname(i.attr("task")) - task_alias = self.parse_workflow_call_taskalias(i.attr("alias")) - io_map = self.parse_workflow_call_body(i.attr("body")) - - if not task_alias: - task_alias = task_being_called - - return {'task': task_being_called, 'alias': task_alias, 'io': io_map} - diff --git a/src/toil/wdl/versions/v1.py b/src/toil/wdl/versions/v1.py deleted file mode 100644 index 0e997275d5..0000000000 --- a/src/toil/wdl/versions/v1.py +++ /dev/null @@ -1,794 +0,0 @@ -# Copyright (C) 2020-2021 Regents of the University of California -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from collections import OrderedDict -from typing import Union - -from wdlparse.v1.WdlV1Lexer import FileStream, WdlV1Lexer -from wdlparse.v1.WdlV1Parser import CommonTokenStream, WdlV1Parser - -from toil.wdl.wdl_analysis import AnalyzeWDL - -logger = logging.getLogger(__name__) - - -def is_context(ctx, classname: Union[str, tuple]) -> bool: - """ - Returns whether an ANTLR4 context object is of the precise type `classname`. - - :param ctx: An ANTLR4 context object. - :param classname: The class name(s) as a string or a tuple of strings. If a - tuple is provided, this returns True if the context object - matches one of the class names. - """ - # we check for `ctx.__class__.__name__` so that it's portable across multiple similar auto-generated parsers. - if isinstance(classname, str): - return ctx.__class__.__name__ == classname - return ctx.__class__.__name__ in classname - - -class AnalyzeV1WDL(AnalyzeWDL): - """ - AnalyzeWDL implementation for the 1.0 version using ANTLR4. - - See: https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md - https://github.com/openwdl/wdl/blob/main/versions/1.0/parsers/antlr4/WdlV1Parser.g4 - """ - - @property - def version(self) -> str: - return '1.0' - - def analyze(self): - """ - Analyzes the WDL file passed into the constructor and generates the two - intermediate data structures: `self.workflows_dictionary` and - `self.tasks_dictionary`. - """ - lexer = WdlV1Lexer(FileStream(self.wdl_file)) - parser = WdlV1Parser(input=CommonTokenStream(lexer)) - tree = parser.document() - self.visit_document(tree) - - def visit_document(self, ctx): - """ - Root of tree. Contains `version` followed by an optional workflow and - any number of `document_element`s. - """ - wf = ctx.workflow() - if wf: - self.visit_workflow(wf) - - for element in ctx.document_element(): - self.visit_document_element(element) - - def visit_document_element(self, ctx): - """ - Contains one of the following: 'import_doc', 'struct', or 'task'. - """ - element = ctx.children[0] - # task - if is_context(element, 'TaskContext'): - return self.visit_task(element) - # struct - elif is_context(element, 'StructContext'): - # TODO: add support for structs. - raise NotImplementedError('Struct is not supported.') - # import_doc - elif is_context(element, 'Import_docContext'): - # TODO: add support for imports. - raise NotImplementedError('Import is not supported.') - else: - raise RuntimeError(f'Unrecognized document element in visitDocument(): {type(element)}') - - # Workflow section - - def visit_workflow(self, ctx): - """ - Contains an 'identifier' and an array of `workflow_element`s. - """ - identifier = ctx.Identifier().getText() - wf = self.workflows_dictionary.setdefault(identifier, OrderedDict()) - - for element in ctx.workflow_element(): - section = element.children[0] - # input - if is_context(section, 'Workflow_inputContext'): - # loop through all inputs and add to the workflow dictionary. - # treating this the same as workflow declarations for now - for wf_input in self.visit_workflow_input(section): - wf[f'declaration{self.declaration_number}'] = wf_input - self.declaration_number += 1 - # output - elif is_context(section, 'Workflow_outputContext'): - # TODO: add support for workflow level outputs in wdl_synthesis - wf['wf_outputs'] = self.visit_workflow_output(section) - # inner_element - # i.e.: non-input declarations, scatters, calls, and conditionals - elif is_context(section, 'Inner_workflow_elementContext'): - wf_key, contents = self.visit_inner_workflow_element(section) - wf[wf_key] = contents - # parameter_meta and meta - elif is_context(section, ('Parameter_meta_elementContext', 'Meta_elementContext')): - # ignore additional metadata information for now. - pass - else: - raise RuntimeError(f'Unrecognized workflow element in visitWorkflow(): {type(section)}') - - def visit_workflow_input(self, ctx): - """ - Contains an array of 'any_decls', which can be unbound or bound declarations. - Example: - input { - String in_str = "twenty" - Int in_int - } - - Returns a list of tuple=(name, type, expr). - """ - return [self.visit_any_decls(decl) for decl in ctx.any_decls()] - - def visit_workflow_output(self, ctx): - """ - Contains an array of 'bound_decls' (unbound_decls not allowed). - Example: - output { - String out_str = "output" - } - - Returns a list of tuple=(name, type, expr). - """ - return [self.visit_bound_decls(decl) for decl in ctx.bound_decls()] - - def visit_inner_workflow_element(self, ctx): - """ - Returns a tuple=(unique_key, dict), where dict contains the contents of - the given inner workflow element. - """ - element = ctx.children[0] - - # bound_decls - # i.e.: declarations declared outside of input section - if is_context(element, 'Bound_declsContext'): - key = f'declaration{self.declaration_number}' - self.declaration_number += 1 - return key, self.visit_bound_decls(element) - # call - elif is_context(element, 'CallContext'): - key = f'call{self.call_number}' - self.call_number += 1 - return key, self.visit_call(element) - # scatter - elif is_context(element, 'ScatterContext'): - key = f'scatter{self.scatter_number}' - self.scatter_number += 1 - return key, self.visit_scatter(element) - # conditional - elif is_context(element, 'ConditionalContext'): - key = f'if{self.if_number}' - self.if_number += 1 - return key, self.visit_conditional(element) - else: - raise RuntimeError(f'Unrecognized workflow element in visitInner_workflow_element(): {type(element)}') - - def visit_call(self, ctx): - """ - Pattern: CALL call_name call_alias? call_body? - Example WDL syntax: call task_1 {input: arr=arr} - - Returns a dict={task, alias, io}. - """ - name = '.'.join(identifier.getText() for identifier in ctx.call_name().Identifier()) - alias = ctx.call_alias().Identifier().getText() if ctx.call_alias() else name - - body = OrderedDict() - # make sure that '{}' and '{input: ...}' are provided - if ctx.call_body() and ctx.call_body().call_inputs(): - for input_ in ctx.call_body().call_inputs().call_input(): - body[input_.Identifier().getText()] = self.visit_expr(input_.expr()) - - return { - 'task': name, - 'alias': alias, - 'io': body - } - - def visit_scatter(self, ctx): - """ - Pattern: SCATTER LPAREN Identifier In expr RPAREN LBRACE inner_workflow_element* RBRACE - Example WDL syntax: scatter ( i in items) { ... } - - Returns a dict={item, collection, body}. - """ - item = ctx.Identifier().getText() - expr = self.visit_expr(ctx.expr()) - body = OrderedDict() - for element in ctx.inner_workflow_element(): - body_key, contents = self.visit_inner_workflow_element(element) - body[body_key] = contents - return { - 'item': item, - 'collection': expr, - 'body': body - } - - def visit_conditional(self, ctx): - """ - Pattern: IF LPAREN expr RPAREN LBRACE inner_workflow_element* RBRACE - Example WDL syntax: if (condition) { ... } - - Returns a dict={expression, body}. - """ - # see https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#conditionals - expr = self.visit_expr(ctx.expr()) - - body = OrderedDict() - for element in ctx.inner_workflow_element(): - body_key, contents = self.visit_inner_workflow_element(element) - body[body_key] = contents - - return { - 'expression': expr, - 'body': body - } - - # Task section - - def visit_task(self, ctx): - """ - Root of a task definition. Contains an `identifier` and an array of - `task_element`s. - """ - identifier = ctx.Identifier().getText() - task = self.tasks_dictionary.setdefault(identifier, OrderedDict()) - # print(f'Visiting task: {identifier}') - - for element in ctx.task_element(): - section = element.children[0] - # input - if is_context(section, 'Task_inputContext'): - task.setdefault('inputs', []).extend(self.visit_task_input(section)) - # output - elif is_context(section, 'Task_outputContext'): - task['outputs'] = self.visit_task_output(section) - # command - elif is_context(section, 'Task_commandContext'): - task['raw_commandline'] = self.visit_task_command(section) - # runtime - elif is_context(section, 'Task_runtimeContext'): - task['runtime'] = self.visit_task_runtime(section) - # bound_decls - elif is_context(section, 'Bound_declsContext'): - # treating this the same as inputs for now - decl = self.visit_bound_decls(section) - task.setdefault('inputs', []).append(decl) - # parameter_meta, and meta - elif is_context(section, ('Parameter_meta_elementContext', 'Meta_elementContext')): - pass - else: - raise RuntimeError(f'Unrecognized task element in visitTask(): {type(section)}') - - def visit_task_input(self, ctx): - """ - Contains an array of 'any_decls', which can be unbound or bound declarations. - Example: - input { - String in_str = "twenty" - Int in_int - } - - Returns a list of tuple=(name, type, expr) - """ - return [self.visit_any_decls(decl) for decl in ctx.any_decls()] - - def visit_task_output(self, ctx): - """ - Contains an array of 'bound_decls' (unbound_decls not allowed). - Example: - output { - String out_str = read_string(stdout()) - } - - Returns a list of tuple=(name, type, expr) - """ - return [self.visit_bound_decls(decl) for decl in ctx.bound_decls()] - - def visit_task_command(self, ctx): - """ - Parses the command section of the WDL task. - - Contains a `string_part` plus any number of `expr_with_string`s. - The following example command: - 'echo ${var1} ${var2} > output_file.txt' - Has 3 parts: - 1. string_part: 'echo ' - 2. expr_with_string, which has two parts: - - expr_part: 'var1' - - string_part: ' ' - 1. expr_with_string, which has two parts: - - expr_part: 'var2' - - string_part: ' > output_file.txt' - - Returns a list=[] of strings representing the parts of the command. - e.g. [string_part, expr_part, string_part, ...] - """ - parts = [] - - # add the first part - str_part = self.visit_task_command_string_part(ctx.task_command_string_part()) - if str_part: - parts.append(f"r'''{str_part}'''") - - # add the rest - for group in ctx.task_command_expr_with_string(): - expr_part, str_part = self.visit_task_command_expr_with_string(group) - parts.append(expr_part) - if str_part: - parts.append(f"r'''{str_part}'''") - - return parts - - def visit_task_command_string_part(self, ctx): - """ - Returns a string representing the string_part. - """ - # join here because a string that contains '$', '{', or '}' is split - return ''.join(part.getText() for part in ctx.CommandStringPart()) - - def visit_task_command_expr_with_string(self, ctx): - """ - Returns a tuple=(expr_part, string_part). - """ - return (self.visit_task_command_expr_part(ctx.task_command_expr_part()), - self.visit_task_command_string_part(ctx.task_command_string_part())) - - def visit_task_command_expr_part(self, ctx): - """ - Contains the expression inside ${expr}. Same function as `self.visit_string_expr_part()`. - - Returns the expression. - """ - return self.visit_string_expr_part(ctx) - - def visit_task_runtime(self, ctx): - """ - Contains an array of `task_runtime_kv`s. - - Returns a dict={key: value} where key can be 'docker', 'cpu', 'memory', - 'cores', or 'disks'. - """ - return OrderedDict((kv.children[0].getText(), # runtime key - self.visit_expr(kv.expr())) # runtime value - for kv in ctx.task_runtime_kv()) - - # Shared - - def visit_any_decls(self, ctx): - """ - Contains a bound or unbound declaration. - """ - if ctx.bound_decls(): - return self.visit_bound_decls(ctx.bound_decls()) - elif ctx.unbound_decls(): - return self.visit_unbound_decls(ctx.unbound_decls()) - else: - raise RuntimeError(f'Unrecognized declaration: {type(ctx)}') - - def visit_unbound_decls(self, ctx): - """ - Contains an unbound declaration. E.g.: `String in_str`. - - Returns a tuple=(name, type, expr), where `expr` is None. - """ - name = ctx.Identifier().getText() - type_ = self.visit_wdl_type(ctx.wdl_type()) - return name, type_, None - - def visit_bound_decls(self, ctx): - """ - Contains a bound declaration. E.g.: `String in_str = "some string"`. - - Returns a tuple=(name, type, expr). - """ - name = ctx.Identifier().getText() - type_ = self.visit_wdl_type(ctx.wdl_type()) - expr = self.visit_expr(ctx.expr()) - - return name, type_, expr - - def visit_wdl_type(self, ctx): - """ - Returns a WDLType instance. - """ - identifier = ctx.type_base().children[0] - optional = ctx.OPTIONAL() is not None - - # primitives - if is_context(identifier, 'TerminalNodeImpl'): - - # TODO: implement Object type - return self.create_wdl_primitive_type(key=identifier.getText(), optional=optional) - # compound types - else: - name = identifier.children[0].getText() # the first child is the name of the type. - type_ = identifier.wdl_type() - if isinstance(type_, list): - elements = [self.visit_wdl_type(element) for element in type_] - else: - elements = [self.visit_wdl_type(type_)] - return self.create_wdl_compound_type(key=name, elements=elements, optional=optional) - - def visit_primitive_literal(self, ctx): - """ - Returns the primitive literal as a string. - """ - is_bool = ctx.BoolLiteral() - if is_bool: - val = is_bool.getText() - if val not in ('true', 'false'): - raise TypeError(f'Parsed boolean ({val}) must be expressed as "true" or "false".') - return val.capitalize() - elif is_context(ctx.children[0], 'StringContext'): - return self.visit_string(ctx.children[0]) - elif is_context(ctx.children[0], ('TerminalNodeImpl', # this also includes variables - 'NumberContext')): - return ctx.children[0].getText() - else: - raise RuntimeError(f'Primitive literal has unknown child: {type(ctx.children[0])}.') - - def visit_number(self, ctx): - """ - Contains an `IntLiteral` or a `FloatLiteral`. - """ - return ctx.children[0].getText() - - def visit_string(self, ctx): - """ - Contains a `string_part` followed by an array of `string_expr_with_string_part`s. - """ - string = self.visit_string_part(ctx.string_part()) - - for part in ctx.string_expr_with_string_part(): - string += f' + {self.visit_string_expr_with_string_part(part)}' - - return string - - def visit_string_expr_with_string_part(self, ctx): - """ - Contains a `string_expr_part` and a `string_part`. - """ - expr = self.visit_string_expr_part(ctx.string_expr_part()) - part = self.visit_string_part(ctx.string_part()) - - if not part: - return expr - - return f'{expr} + {part}' - - def visit_string_expr_part(self, ctx): - """ - Contains an array of `expression_placeholder_option`s and an `expr`. - """ - # See https://github.com/openwdl/wdl/blob/main/versions/1.0/parsers/antlr4/WdlV1Parser.g4#L56 - - options = {} - - for opt in ctx.expression_placeholder_option(): - key, val = self.visit_expression_placeholder_option(opt) - options[key] = val - - expr = self.visit_expr(ctx.expr()) - - if len(options) == 0: - return expr - elif 'sep' in options: - sep = options['sep'] - return f'{sep}.join(str(x) for x in {expr})' - elif 'default' in options: - default = options['default'] - return f'({expr} if {expr} else {default})' - else: - raise NotImplementedError(options) - - def visit_string_part(self, ctx): - """ - Returns a string representing the string_part. - """ - # join here because a string that contains '$', '{', or '}' is split - part = ''.join(part.getText() for part in ctx.StringPart()) - - if part: - return f"'{part}'" - return None - - def visit_expression_placeholder_option(self, ctx): - """ - Expression placeholder options. - - Can match one of the following: - BoolLiteral EQUAL (string | number) - DEFAULT EQUAL (string | number) - SEP EQUAL (string | number) - - See https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#expression-placeholder-options - - e.g.: ${sep=", " array_value} - e.g.: ${true="--yes" false="--no" boolean_value} - e.g.: ${default="foo" optional_value} - - Returns a tuple=(key, value) - """ - assert len(ctx.children) == 3 - - param = ctx.children[0].getText() - str_or_num = ctx.children[2] - val = self.visit_string(str_or_num) \ - if is_context(str_or_num, 'StringContext') else self.visit_number(str_or_num) - - return param, val - - def visit_expr(self, ctx): - """ - Expression root. - """ - return self.visit_infix0(ctx.expr_infix()) - - def visit_infix0(self, ctx): - """ - Expression infix0 (LOR). - """ - infix = ctx.expr_infix0() - if is_context(infix, 'LorContext'): - return self.visit_lor(infix) - return self.visit_infix1(infix) - - def visit_lor(self, ctx): - """ - Logical OR expression. - """ - lhs = self.visit_infix0(ctx) - rhs = self.visit_infix1(ctx) - return f'{lhs} or {rhs}' - - def visit_infix1(self, ctx): - """ - Expression infix1 (LAND). - """ - infix = ctx.expr_infix1() - if is_context(infix, 'LandContext'): - return self.visit_land(infix) - return self.visit_infix2(infix) - - def visit_land(self, ctx): - """ - Logical AND expresion. - """ - lhs = self.visit_infix1(ctx) - rhs = self.visit_infix2(ctx) - return f'{lhs} and {rhs}' - - def visit_infix2(self, ctx): - """ - Expression infix2 (comparisons). - """ - infix = ctx.expr_infix2() - if is_context(infix, 'EqeqContext'): - return self._visit_infix2(infix, '==') - elif is_context(infix, 'NeqContext'): - return self._visit_infix2(infix, '!=') - elif is_context(infix, 'LteContext'): - return self._visit_infix2(infix, '<=') - elif is_context(infix, 'GteContext'): - return self._visit_infix2(infix, '>=') - elif is_context(infix, 'LtContext'): - return self._visit_infix2(infix, '<') - elif is_context(infix, 'GtContext'): - return self._visit_infix2(infix, '>') - # continue down our path - return self.visit_infix3(infix) - - def _visit_infix2(self, ctx, operation: str): - """ - :param operation: Operation as a string. - """ - lhs = self.visit_infix2(ctx) - rhs = self.visit_infix3(ctx) - return f'{lhs} {operation} {rhs}' - - def visit_infix3(self, ctx): - """ - Expression infix3 (add/subtract). - """ - infix = ctx.expr_infix3() - if is_context(infix, 'AddContext'): - return self._visit_infix3(infix, '+') - elif is_context(infix, 'SubContext'): - return self._visit_infix3(infix, '-') - # continue down our path - return self.visit_infix4(infix) - - def _visit_infix3(self, ctx, operation: str): - """ - :param operation: Operation as a string. - """ - lhs = self.visit_infix3(ctx) - rhs = self.visit_infix4(ctx) - return f'{lhs} {operation} {rhs}' - - def visit_infix4(self, ctx): - """ - Expression infix4 (multiply/divide/modulo). - """ - infix = ctx.expr_infix4() - if is_context(infix, 'MulContext'): - return self._visit_infix4(infix, '*') - elif is_context(infix, 'DivideContext'): - return self._visit_infix4(infix, '/') - elif is_context(infix, 'ModContext'): - return self._visit_infix4(infix, '%') - # continue down our path - return self.visit_infix5(infix) - - def _visit_infix4(self, ctx, operation: str): - """ - :param operation: Operation as a string. - """ - lhs = self.visit_infix4(ctx) - rhs = self.visit_infix5(ctx) - return f'{lhs} {operation} {rhs}' - - def visit_infix5(self, ctx): - """ - Expression infix5. - """ - return self.visit_expr_core(ctx.expr_infix5().expr_core()) - - def visit_expr_core(self, expr): - """ - Expression core. - """ - # TODO: implement map_literal, object_literal, and left_name - - if is_context(expr, 'ApplyContext'): - return self.visit_apply(expr) - elif is_context(expr, 'Array_literalContext'): - return self.visit_array_literal(expr) - elif is_context(expr, 'Pair_literalContext'): - return self.visit_pair_literal(expr) - elif is_context(expr, 'IfthenelseContext'): - return self.visit_ifthenelse(expr) - elif is_context(expr, 'Expression_groupContext'): - return self.visit_expression_group(expr) - elif is_context(expr, 'AtContext'): - return self.visit_at(expr) - elif is_context(expr, 'Get_nameContext'): - return self.visit_get_name(expr) - elif is_context(expr, 'NegateContext'): - return self.visit_negate(expr) - elif is_context(expr, 'UnarysignedContext'): - return self.visit_unarysigned(expr) - elif is_context(expr, 'PrimitivesContext'): - return self.visit_primitives(expr) - - raise NotImplementedError(f"Expression context '{type(expr)}' is not supported.") - - # expr_core - - def visit_apply(self, ctx): - """ - A function call. - Pattern: Identifier LPAREN (expr (COMMA expr)*)? RPAREN - """ - fn = ctx.Identifier().getText() - params = ', '.join(self.visit_expr(expr) for expr in ctx.expr()) - - if fn == 'stdout': - return '_toil_wdl_internal__stdout_file' - elif fn == 'stderr': - return '_toil_wdl_internal__stderr_file' - elif fn in ('range', 'zip'): - # replace python built-in functions - return f'wdl_{fn}' - - call = f'{fn}({params}' - - # append necessary params for i/o functions - if fn == 'glob': - return call + ', tempDir)' - elif fn == 'size': - return call + (params + ', ' if params else '') + 'fileStore=fileStore)' - elif fn in ('write_lines', 'write_tsv', 'write_json', 'write_map'): - return call + ', temp_dir=tempDir, file_store=fileStore)' - else: - return call + ')' - - def visit_array_literal(self, ctx): - """ - Pattern: LBRACK (expr (COMMA expr)*)* RBRACK - """ - return f"[{', '.join(self.visit_expr(expr) for expr in ctx.expr())}]" - - def visit_pair_literal(self, ctx): - """ - Pattern: LPAREN expr COMMA expr RPAREN - """ - return f"({self.visit_expr(ctx.expr(0))}, {self.visit_expr(ctx.expr(1))})" - - def visit_ifthenelse(self, ctx): - """ - Ternary expression. - Pattern: IF expr THEN expr ELSE expr - """ - if_true = self.visit_expr(ctx.expr(0)) - condition = self.visit_expr(ctx.expr(1)) - if_false = self.visit_expr(ctx.expr(2)) - - return f'({condition} if {if_true} else {if_false})' - - def visit_expression_group(self, ctx): - """ - Pattern: LPAREN expr RPAREN - """ - return f'({self.visit_expr(ctx.expr())})' - - def visit_at(self, ctx): - """ - Array or map lookup. - Pattern: expr_core LBRACK expr RBRACK - """ - expr_core = self.visit_expr_core(ctx.expr_core()) - expr = self.visit_expr(ctx.expr()) - - # parenthesis must be removed because 'i[0]' works, but '(i)[0]' does not - if expr_core[0] == '(' and expr_core[-1] == ')': - expr_core = expr_core[1:-1] - - return f'{expr_core}[{expr}]' - - def visit_get_name(self, ctx): - """ - Member access. - Pattern: expr_core DOT Identifier - """ - expr_core = self.visit_expr_core(ctx.expr_core()) - identifier = ctx.Identifier().getText() - - if identifier in ('left', 'right'): - # hack-y way to make sure pair.left and pair.right are parsed correctly. - return f'({expr_core}.{identifier})' - - return f'({expr_core}_{identifier})' - - def visit_negate(self, ctx): - """ - Pattern: NOT expr - """ - return f'(not {self.visit_expr(ctx.expr())})' - - def visit_unarysigned(self, ctx): - """ - Pattern: (PLUS | MINUS) expr - """ - plus: bool = ctx.PLUS() is not None - expr = self.visit_expr(ctx.expr()) - - if plus: - return f'(+{expr})' - return f'(-{expr})' - - def visit_primitives(self, ctx): - """ - Expression alias for primitive literal. - """ - return self.visit_primitive_literal(ctx.primitive_literal()) diff --git a/src/toil/wdl/wdl_analysis.py b/src/toil/wdl/wdl_analysis.py deleted file mode 100644 index bebfc83e8a..0000000000 --- a/src/toil/wdl/wdl_analysis.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (C) 2018-2021 UCSC Computational Genomics Lab -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -from collections import OrderedDict - -from toil.wdl.wdl_types import (WDLArrayType, - WDLBooleanType, - WDLFileType, - WDLFloatType, - WDLIntType, - WDLMapType, - WDLPairType, - WDLStringType) - -logger = logging.getLogger(__name__) - - -class AnalyzeWDL: - """ - An interface to analyze a WDL file. Each version corresponds to a subclass that - restructures the WDL document into 2 intermediate data structures (python - dictionaries): - "workflows_dictionary": containing the parsed workflow information. - "tasks_dictionary": containing the parsed task information. - - These are then fed into wdl_synthesis.py which uses them to write a native python - script for use with Toil. - - Requires a WDL file. The WDL file contains ordered commands. - """ - def __init__(self, wdl_file: str): - self.wdl_file = wdl_file - - # holds task skeletons from WDL task objects - self.tasks_dictionary = OrderedDict() - - # holds workflow structure from WDL workflow objects - self.workflows_dictionary = OrderedDict() - - # unique iterator to add to declaration names - self.declaration_number = 0 - - # unique iterator to add to call names - self.call_number = 0 - - # unique iterator to add to scatter names - self.scatter_number = 0 - - # unique iterator to add to if names - self.if_number = 0 - - @property - def version(self) -> str: - """ - Returns the version of the WDL document as a string. - """ - raise NotImplementedError - - def analyze(self): - """ - Analyzes the WDL file passed into the constructor and generates the two - intermediate data structures: `self.workflows_dictionary` and - `self.tasks_dictionary`. - - :return: Returns nothing. - """ - - def write_AST(self, out_dir): - """ - Writes a file with the AST for a wdl file in the out_dir. - """ - - primitive_types = { - 'String': WDLStringType, - 'Int': WDLIntType, - 'Float': WDLFloatType, - 'Boolean': WDLBooleanType, - 'File': WDLFileType - } - - compound_types = { - 'Array': WDLArrayType, - 'Pair': WDLPairType, - 'Map': WDLMapType - } - - def create_wdl_primitive_type(self, key: str, optional: bool = False): - """ - Returns an instance of WDLType. - """ - type_ = self.primitive_types.get(key) - if type_: - return type_(optional=optional) - else: - raise RuntimeError(f'Unsupported primitive type: {key}') - - def create_wdl_compound_type(self, key: str, elements: list, optional: bool = False): - """ - Returns an instance of WDLCompoundType. - """ - type_ = self.compound_types.get(key) - if type_: - return type_(*elements, optional=optional) - else: - raise RuntimeError(f'Unsupported compound type: {key}') diff --git a/src/toil/wdl/wdl_functions.py b/src/toil/wdl/wdl_functions.py deleted file mode 100644 index 6b66b3f403..0000000000 --- a/src/toil/wdl/wdl_functions.py +++ /dev/null @@ -1,1007 +0,0 @@ -# Copyright (C) 2015-2021 Regents of the University of California -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import csv -import json -import logging -import math -import os -import re -import subprocess -import textwrap -import uuid -from typing import Any, Dict, List, Optional, Tuple, Union - -from toil.fileStores.abstractFileStore import AbstractFileStore -from toil.lib.conversions import bytes_in_unit -from toil.lib.resources import glob # type: ignore -from toil.wdl.wdl_types import WDLFile, WDLPair - -logger = logging.getLogger(__name__) - - -class WDLRuntimeError(Exception): - """ WDL-related run-time error.""" - - def __init__(self, message): - super().__init__(message) - - -class WDLJSONEncoder(json.JSONEncoder): - """ - Extended JSONEncoder to support WDL-specific JSON encoding. - """ - - def default(self, obj): - if isinstance(obj, WDLPair): - return obj.to_dict() - return json.JSONEncoder.default(self, obj) - - -def generate_docker_bashscript_file(temp_dir, docker_dir, globs, cmd, job_name): - ''' - Creates a bashscript to inject into a docker container for the job. - - This script wraps the job command(s) given in a bash script, hard links the - outputs and returns an "rc" file containing the exit code. All of this is - done in an effort to parallel the Broad's cromwell engine, which is the - native WDL runner. As they've chosen to write and then run a bashscript for - every command, so shall we. - - :param temp_dir: The current directory outside of docker to deposit the - bashscript into, which will be the bind mount that docker - loads files from into its own containerized filesystem. - This is usually the tempDir created by this individual job - using 'tempDir = job.fileStore.getLocalTempDir()'. - :param docker_dir: The working directory inside of the docker container - which is bind mounted to 'temp_dir'. By default this is - 'data'. - :param globs: A list of expected output files to retrieve as glob patterns - that will be returned as hard links to the current working - directory. - :param cmd: A bash command to be written into the bash script and run. - :param job_name: The job's name, only used to write in a file name - identifying the script as written for that job. - Will be used to call the script later. - :return: Nothing, but it writes and deposits a bash script in temp_dir - intended to be run inside of a docker container for this job. - ''' - wdl_copyright = heredoc_wdl(''' \n - # Borrowed/rewritten from the Broad's Cromwell implementation. As - # that is under a BSD-ish license, I include here the license off - # of their GitHub repo. Thank you Broadies! - - # Copyright (c) 2015, Broad Institute, Inc. - # All rights reserved. - - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions are met: - - # * Redistributions of source code must retain the above copyright notice, this - # list of conditions and the following disclaimer. - - # * Redistributions in binary form must reproduce the above copyright notice, - # this list of conditions and the following disclaimer in the documentation - # and/or other materials provided with the distribution. - - # * Neither the name Broad Institute, Inc. nor the names of its - # contributors may be used to endorse or promote products derived from - # this software without specific prior written permission. - - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE - - # make a temp directory w/identifier - ''') - prefix_dict = {"docker_dir": docker_dir, - "cmd": cmd} - bashfile_prefix = heredoc_wdl(''' - tmpDir=$(mktemp -d /{docker_dir}/execution/tmp.XXXXXX) - chmod 777 $tmpDir - # set destination for java to deposit all of its files - export _JAVA_OPTIONS=-Djava.io.tmpdir=$tmpDir - export TMPDIR=$tmpDir - - ( - cd /{docker_dir}/execution - {cmd} - ) - - # gather the input command return code - echo $? > "$tmpDir/rc.tmp" - - ''', prefix_dict) - - bashfile_string = '#!/bin/bash' + wdl_copyright + bashfile_prefix - - begin_globbing_string = heredoc_wdl(''' - ( - mkdir "$tmpDir/globs" - ''') - - bashfile_string = bashfile_string + begin_globbing_string - - for glob_input in globs: - add_this_glob = \ - '( ln -L ' + glob_input + \ - ' "$tmpDir/globs" 2> /dev/null ) || ( ln ' + glob_input + \ - ' "$tmpDir/globs" )\n' - bashfile_string = bashfile_string + add_this_glob - - bashfile_suffix = heredoc_wdl(''' - ) - - # flush RAM to disk - sync - - mv "$tmpDir/rc.tmp" "$tmpDir/rc" - chmod -R 777 $tmpDir - ''') - - bashfile_string = bashfile_string + bashfile_suffix - - with open(os.path.join(temp_dir, job_name + '_script.sh'), 'w') as bashfile: - bashfile.write(bashfile_string) - - -def process_single_infile(wdl_file: WDLFile, fileStore: AbstractFileStore) -> WDLFile: - f = wdl_file.file_path - logger.info(f'Importing {f} into the jobstore.') - if f.startswith('http://') or f.startswith('https://') or \ - f.startswith('file://') or f.startswith('wasb://'): - filepath = fileStore.importFile(f) - preserveThisFilename = os.path.basename(f) - elif f.startswith('s3://'): - try: - filepath = fileStore.importFile(f) - preserveThisFilename = os.path.basename(f) - except: - from toil.lib.ec2nodes import EC2Regions - success = False - for region in EC2Regions: - try: - html_path = f'http://s3.{region}.amazonaws.com/' + f[5:] - filepath = fileStore.importFile(html_path) - preserveThisFilename = os.path.basename(f) - success = True - except: - pass - if not success: - raise RuntimeError('Unable to import: ' + f) - elif f.startswith('gs://'): - f = 'https://storage.googleapis.com/' + f[5:] - filepath = fileStore.importFile(f) - preserveThisFilename = os.path.basename(f) - else: - filepath = fileStore.importFile("file://" + os.path.abspath(f)) - preserveThisFilename = os.path.basename(f) - return WDLFile(file_path=filepath, file_name=preserveThisFilename, imported=True) - - -def process_infile(f: Any, fileStore: AbstractFileStore): - """ - Takes any input and imports the WDLFile into the fileStore. - - This returns the input importing all WDLFile instances to the fileStore. Toil - does not preserve a file's original name upon import and so the WDLFile also keeps - track of this. - - :param f: A primitive, WDLFile, or a container. A file needs to be a WDLFile instance - to be imported. - :param fileStore: The fileStore object that is called to load files into the fileStore. - """ - if isinstance(f, WDLFile): - # check if this has already been imported into the fileStore - if f.imported: - return f - else: - return process_single_infile(f, fileStore) - elif isinstance(f, list): - # recursively call process_infile() to handle cases like Array[Map[String, File]] - return [process_infile(sf, fileStore) for sf in f] - elif isinstance(f, WDLPair): - f.left = process_infile(f.left, fileStore) - f.right = process_infile(f.right, fileStore) - return f - elif isinstance(f, dict): - return {process_infile(k, fileStore): process_infile(v, fileStore) for k, v in f.items()} - elif isinstance(f, (int, str, bool, float)): - return f - else: - raise WDLRuntimeError(f'Error processing file: {str(f)}') - - -def sub(input_str: str, pattern: str, replace: str) -> str: - """ - Given 3 String parameters `input`, `pattern`, `replace`, this function will - replace any occurrence matching `pattern` in `input` by `replace`. - `pattern` is expected to be a regular expression. Details of regex evaluation - will depend on the execution engine running the WDL. - - WDL syntax: String sub(String, String, String) - """ - - if isinstance(input_str, WDLFile): - input_str = input_str.file_name - if isinstance(pattern, WDLFile): - pattern = pattern.file_name - if isinstance(replace, WDLFile): - replace = replace.file_name - - return re.sub(pattern=str(pattern), repl=str(replace), string=str(input_str)) - - -def defined(i): - if i: - return True - return False - - -def process_single_outfile(wdl_file: WDLFile, fileStore, workDir, outDir) -> WDLFile: - f = wdl_file.file_path - if os.path.exists(f): - output_f_path = f - elif os.path.exists(os.path.abspath(f)): - output_f_path = os.path.abspath(f) - elif os.path.exists(os.path.join(workDir, 'execution', f)): - output_f_path = os.path.join(workDir, 'execution', f) - elif os.path.exists(os.path.join('execution', f)): - output_f_path = os.path.join('execution', f) - elif os.path.exists(os.path.join(workDir, f)): - output_f_path = os.path.join(workDir, f) - elif os.path.exists(os.path.join(outDir, f)): - output_f_path = os.path.join(outDir, f) - else: - tmp = subprocess.check_output(['ls', '-lha', workDir]).decode('utf-8') - exe = subprocess.check_output(['ls', '-lha', os.path.join(workDir, 'execution')]).decode('utf-8') - for std_file in ('stdout', 'stderr'): - std_file = os.path.join(workDir, 'execution', std_file) - if os.path.exists(std_file): - with open(std_file, 'rb') as f: - logger.info(f.read()) - - raise RuntimeError('OUTPUT FILE: {} was not found in {}!\n' - '{}\n\n' - '{}\n'.format(f, os.getcwd(), tmp, exe)) - output_file = fileStore.writeGlobalFile(output_f_path) - preserveThisFilename = os.path.basename(output_f_path) - fileStore.export_file(output_file, "file://" + os.path.join(os.path.abspath(outDir), preserveThisFilename)) - return WDLFile(file_path=output_file, file_name=preserveThisFilename, imported=True) - - -def process_outfile(f, fileStore, workDir, outDir): - if isinstance(f, WDLFile): - return process_single_outfile(f, fileStore, workDir, outDir) - elif isinstance(f, list): - # recursively call process_outfile() to handle cases like Array[Map[String, File]] - return [process_outfile(sf, fileStore, workDir, outDir) for sf in f] - elif isinstance(f, WDLPair): - f.left = process_outfile(f.left, fileStore, workDir, outDir) - f.right = process_outfile(f.right, fileStore, workDir, outDir) - return f - elif isinstance(f, dict): - return {process_outfile(k, fileStore, workDir, outDir): - process_outfile(v, fileStore, workDir, outDir) for k, v in f.items()} - elif isinstance(f, (int, str, bool, float)): - return f - else: - raise WDLRuntimeError(f'Error processing file: {str(f)}') - - -def abspath_single_file(f: WDLFile, cwd: str) -> WDLFile: - path = f.file_path - if path != os.path.abspath(path): - f.file_path = os.path.join(cwd, path) - return f - - -def abspath_file(f: Any, cwd: str): - if not f: - # in the case of "optional" files (same treatment in 'process_and_read_file()') - # TODO: handle this at compile time, not here - return '' - if isinstance(f, WDLFile): - # check if this has already been imported into the fileStore - if f.imported: - return f - path = f.file_path - if path.startswith('s3://') or path.startswith('http://') or path.startswith('https://') or \ - path.startswith('file://') or path.startswith('wasb://') or path.startswith('gs://'): - return f - return abspath_single_file(f, cwd) - elif isinstance(f, list): - # recursively call abspath_file() to handle cases like Array[Map[String, File]] - return [abspath_file(sf, cwd) for sf in f] - elif isinstance(f, WDLPair): - f.left = abspath_file(f.left, cwd) - f.right = abspath_file(f.right, cwd) - return f - elif isinstance(f, dict): - return {abspath_file(k, cwd): abspath_file(v, cwd) for k, v in f.items()} - elif isinstance(f, (int, str, bool, float)): - return f - else: - raise WDLRuntimeError(f'Error processing file: ({str(f)}) of type: ({str(type(f))}).') - - -def read_single_file(f: WDLFile, tempDir, fileStore, docker=False) -> str: - import os - try: - fpath = fileStore.readGlobalFile(f.file_path, userPath=os.path.join(tempDir, f.file_name)) - except: - fpath = os.path.join(tempDir, f.file_name) - return fpath - - -def read_file(f: Any, tempDir: str, fileStore: AbstractFileStore, docker: bool = False): - if isinstance(f, WDLFile): - return read_single_file(f, tempDir, fileStore, docker=docker) - elif isinstance(f, list): - # recursively call read_file() to handle cases like Array[Map[String, File]] - return [read_file(sf, tempDir, fileStore, docker=docker) for sf in f] - elif isinstance(f, WDLPair): - f.left = read_file(f.left, tempDir, fileStore, docker=docker) - f.right = read_file(f.right, tempDir, fileStore, docker=docker) - return f - elif isinstance(f, dict): - return {read_file(k, tempDir, fileStore, docker=docker): - read_file(v, tempDir, fileStore, docker=docker) for k, v in f.items()} - elif isinstance(f, (int, str, bool, float)): - return f - else: - raise WDLRuntimeError(f'Error processing file: {str(f)}') - - -def process_and_read_file(f, tempDir, fileStore, docker=False): - if not f: - # in the case of "optional" files (same treatment in 'abspath_file()') - # TODO: handle this at compile time, not here and change to the empty string - return None - processed_file = process_infile(f, fileStore) - return read_file(processed_file, tempDir, fileStore, docker=docker) - - -def generate_stdout_file(output, tempDir, fileStore, stderr=False): - """ - Create a stdout (or stderr) file from a string or bytes object. - - :param str|bytes output: A str or bytes object that holds the stdout/stderr text. - :param str tempDir: The directory to write the stdout file. - :param fileStore: A fileStore object. - :param bool stderr: If True, a stderr instead of a stdout file is generated. - :return: The file path to the generated file. - """ - if output is None: - # write an empty file if there's no stdout/stderr. - output = b'' - elif isinstance(output, str): - output = bytes(output, encoding='utf-8') - - # TODO: we need a way to differentiate the stdout/stderr files in the workflow after execution. - # Cromwell generates a folder for each task so the file is simply named stdout and lives in - # the task execution folder. This is not the case with Toil. Though, this would not be a - # problem with intermediate stdout files as each task has its own temp folder. - name = 'stderr' if stderr else 'stdout' - local_path = os.path.join(tempDir, 'execution', name) - - # import to fileStore then read to local temp file - with fileStore.writeGlobalFileStream(cleanup=True, basename=name) as (stream, file_id): - stream.write(output) - - if file_id is None: - raise RuntimeError("No file ID written. Importing to fileStore likely failed.") - return fileStore.readGlobalFile(fileStoreID=file_id, userPath=local_path) - - -def parse_memory(memory): - """ - Parses a string representing memory and returns - an integer # of bytes. - - :param memory: - :return: - """ - memory = str(memory) - if 'None' in memory: - return 2147483648 # toil's default - try: - import re - raw_mem_split = re.split('([a-zA-Z]+)', memory) - mem_split = [] - - for r in raw_mem_split: - if r: - mem_split.append(r.replace(' ', '')) - - if len(mem_split) == 1: - return int(memory) - - if len(mem_split) == 2: - num = mem_split[0] - unit = mem_split[1] - return int(float(num) * bytes_in_unit(unit)) - else: - raise RuntimeError(f'Memory parsing failed: {memory}') - except: - return 2147483648 # toil's default - - -def parse_cores(cores): - cores = str(cores) - if 'None' in cores: - return 1 # toil's default - if cores: - return float(cores) - else: - return 1 - - -def parse_disk(disk): - disk = str(disk) - if 'None' in disk: - return 2147483648 # toil's default - try: - total_disk = 0 - disks = disk.split(',') - for d in disks: - d = d.strip().split(' ') - if len(d) > 1: - for part in d: - if is_number(part): - total_disk += parse_memory(f'{part} GB') - else: - return parse_memory(d[0]) if parse_memory(d[0]) > 2147483648 else 2147483648 - return total_disk if total_disk > 2147483648 else 2147483648 - except: - return 2147483648 # toil's default - - -def is_number(s): - try: - float(s) - return True - except ValueError: - return False - - -def size(f: Optional[Union[str, WDLFile, List[Union[str, WDLFile]]]] = None, - unit: Optional[str] = 'B', - fileStore: Optional[AbstractFileStore] = None) -> float: - """ - Given a `File` and a `String` (optional), returns the size of the file in Bytes - or in the unit specified by the second argument. - - Supported units are KiloByte ("K", "KB"), MegaByte ("M", "MB"), GigaByte - ("G", "GB"), TeraByte ("T", "TB") (powers of 1000) as well as their binary version - (https://en.wikipedia.org/wiki/Binary_prefix) "Ki" ("KiB"), "Mi" ("MiB"), - "Gi" ("GiB"), "Ti" ("TiB") (powers of 1024). Default unit is Bytes ("B"). - - WDL syntax: Float size(File, [String]) - Varieties: Float size(File?, [String]) - Float size(Array[File], [String]) - Float size(Array[File?], [String]) - """ - - if f is None: - return 0 - - # it is possible that size() is called directly (e.g.: size('file')) and so it is not treated as a file. - if isinstance(f, str): - f = WDLFile(file_path=f) - elif isinstance(f, list): - f = [WDLFile(file_path=sf) if isinstance(sf, str) else sf for sf in f] - - if not isinstance(f, (WDLFile, list)): - raise RuntimeError(f'size() excepts a "File" or "File?" argument! Not: {type(f)}') - - # validate the input. fileStore is only required if the input is not processed. - f = process_infile(f, fileStore) - - divisor = bytes_in_unit(unit) - - if isinstance(f, list): - total_size = sum(file.file_path.size for file in f) - return total_size / divisor - - fileID = f.file_path - return fileID.size / divisor - - -def select_first(values): - for var in values: - if var: - return var - raise ValueError(f'No defined variables found for select_first array: {str(values)}') - - -def combine_dicts(dict1, dict2): - combineddict= {} - for k, v in dict1.items(): - counter1 = 0 - while isinstance(v, list): - counter1 += 1 - v = v[0] - break - - for k, v in dict2.items(): - counter2 = 0 - while isinstance(v, list): - counter2 += 1 - v = v[0] - break - - for k in dict1: - if counter1 > counter2: - combineddict[k] = dict1[k] - combineddict[k].append(dict2[k]) - elif counter1 < counter2: - combineddict[k] = dict2[k] - combineddict[k].append(dict1[k]) - else: - combineddict[k] = [dict1[k], dict2[k]] - return combineddict - - -def basename(path, suffix=None): - """https://software.broadinstitute.org/wdl/documentation/article?id=10554""" - path = path.strip() - if suffix: - suffix = suffix.strip() - if path.endswith(suffix): - path = path[:-len(suffix)] - return os.path.basename(path) - - -def heredoc_wdl(template, dictionary={}, indent=''): - template = textwrap.dedent(template).format(**dictionary) - return template.replace('\n', '\n' + indent) + '\n' - - -def floor(i: Union[int, float]) -> int: - """ - Converts a Float value into an Int by rounding down to the next lower integer. - """ - return math.floor(i) - - -def ceil(i: Union[int, float]) -> int: - """ - Converts a Float value into an Int by rounding up to the next higher integer. - """ - return math.ceil(i) - - -def read_lines(path: str) -> List[str]: - """ - Given a file-like object (`String`, `File`) as a parameter, this will read each - line as a string and return an `Array[String]` representation of the lines in - the file. - - WDL syntax: Array[String] read_lines(String|File) - """ - # file should already be imported locally via `process_and_read_file` - with open(path) as f: - return f.read().rstrip('\n').split('\n') - - -def read_tsv(path: str, delimiter: str = '\t') -> List[List[str]]: - """ - Take a tsv filepath and return an array; e.g. [[],[],[]]. - - For example, a file containing: - - 1 2 3 - 4 5 6 - 7 8 9 - - would return the array: [['1','2','3'], ['4','5','6'], ['7','8','9']] - - WDL syntax: Array[Array[String]] read_tsv(String|File) - """ - tsv_array = [] - with open(path) as f: - data_file = csv.reader(f, delimiter=delimiter) - for line in data_file: - tsv_array.append(line) - return tsv_array - - -def read_csv(path: str) -> List[List[str]]: - """ - Take a csv filepath and return an array; e.g. [[],[],[]]. - - For example, a file containing: - - 1,2,3 - 4,5,6 - 7,8,9 - - would return the array: [['1','2','3'], ['4','5','6'], ['7','8','9']] - """ - return read_tsv(path, delimiter=",") - - -def read_json(path: str) -> Any: - """ - The `read_json()` function takes one parameter, which is a file-like object - (`String`, `File`) and returns a data type which matches the data - structure in the JSON file. See - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#mixed-read_jsonstringfile - - WDL syntax: mixed read_json(String|File) - """ - with open(path) as f: - return json.load(f) - - -def read_map(path: str) -> Dict[str, str]: - """ - Given a file-like object (`String`, `File`) as a parameter, this will read each - line from a file and expect the line to have the format `col1\tcol2`. In other - words, the file-like object must be a two-column TSV file. - - WDL syntax: Map[String, String] read_map(String|File) - """ - d = dict() - with open(path) as f: - for line in f: - line = line.rstrip() - if not line: - # remove extra lines - continue - key, value = line.split('\t', 1) - d[key] = value.strip() - return d - - -def read_int(path: Union[str, WDLFile]) -> int: - """ - The `read_int()` function takes a file path which is expected to contain 1 - line with 1 integer on it. This function returns that integer. - - WDL syntax: Int read_int(String|File) - """ - if isinstance(path, WDLFile): - path = path.file_path - - with open(path) as f: - return int(f.read().strip()) - - -def read_string(path: Union[str, WDLFile]) -> str: - """ - The `read_string()` function takes a file path which is expected to contain 1 - line with 1 string on it. This function returns that string. - - WDL syntax: String read_string(String|File) - """ - if isinstance(path, WDLFile): - path = path.file_path - - with open(path) as f: - return str(f.read().strip()) - - -def read_float(path: Union[str, WDLFile]) -> float: - """ - The `read_float()` function takes a file path which is expected to contain 1 - line with 1 floating point number on it. This function returns that float. - - WDL syntax: Float read_float(String|File) - """ - if isinstance(path, WDLFile): - path = path.file_path - - with open(path) as f: - return float(f.read().strip()) - - -def read_boolean(path: Union[str, WDLFile]) -> bool: - """ - The `read_boolean()` function takes a file path which is expected to contain 1 - line with 1 Boolean value (either "true" or "false" on it). This function - returns that Boolean value. - - WDL syntax: Boolean read_boolean(String|File) - """ - if isinstance(path, WDLFile): - path = path.file_path - - with open(path) as f: - return f.read().strip().lower() == 'true' - - -def _get_temp_file_path(function_name: str, temp_dir: Optional[str] = None) -> str: - """ - Get a unique path with basename in the format of "{function_name}_{UUID}.tmp". - """ - - if not temp_dir: - temp_dir = os.getcwd() - - # Cromwell uses the MD5 checksum of the content as part of the file name. We use a UUID instead - # for now, since we're writing line by line via a context manager. - # md5sum = hashlib.md5(content).hexdigest() - # name = f'{function_name}_{md5sum}.tmp' - - name = f'{function_name}_{uuid.uuid4()}.tmp' - return os.path.join(temp_dir, 'execution', name) - - -def write_lines(in_lines: List[str], - temp_dir: Optional[str] = None, - file_store: Optional[AbstractFileStore] = None) -> str: - """ - Given something that's compatible with `Array[String]`, this writes each element - to it's own line on a file. with newline `\n` characters as line separators. - - WDL syntax: File write_lines(Array[String]) - """ - if not isinstance(in_lines, list): - raise RuntimeError(f'write_lines() requires "{in_lines}" to be a list! Not: {type(in_lines)}') - - path = _get_temp_file_path('write_lines', temp_dir) - - with open(path, 'w') as file: - for line in in_lines: - file.write(f'{line}\n') - - if file_store: - file_store.writeGlobalFile(path, cleanup=True) - - return path - - -def write_tsv(in_tsv: List[List[str]], - delimiter: str = '\t', - temp_dir: Optional[str] = None, - file_store: Optional[AbstractFileStore] = None) -> str: - """ - Given something that's compatible with `Array[Array[String]]`, this writes a TSV - file of the data structure. - - WDL syntax: File write_tsv(Array[Array[String]]) - """ - if not isinstance(in_tsv, list): - raise RuntimeError(f'write_tsv() requires "{in_tsv}" to be a list! Not: {type(in_tsv)}') - - path = _get_temp_file_path('write_tsv', temp_dir) - - with open(path, 'w') as file: - tsv_writer = csv.writer(file, delimiter=delimiter) - for row in in_tsv: - tsv_writer.writerow(row) - - if file_store: - file_store.writeGlobalFile(path, cleanup=True) - - return path - - -def write_json(in_json: Any, - indent: Union[None, int, str] = None, - separators: Optional[Tuple[str, str]] = (',', ':'), - temp_dir: Optional[str] = None, - file_store: Optional[AbstractFileStore] = None) -> str: - """ - Given something with any type, this writes the JSON equivalent to a file. See - the table in the definition of - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#mixed-read_jsonstringfile - - WDL syntax: File write_json(mixed) - """ - - path = _get_temp_file_path('write_json', temp_dir) - - with open(path, 'w') as file: - file.write(json.dumps(in_json, indent=indent, separators=separators, cls=WDLJSONEncoder)) - - if file_store: - file_store.writeGlobalFile(path, cleanup=True) - - return path - - -def write_map(in_map: Dict[str, str], - temp_dir: Optional[str] = None, - file_store: Optional[AbstractFileStore] = None) -> str: - """ - Given something that's compatible with `Map[String, String]`, this writes a TSV - file of the data structure. - - WDL syntax: File write_map(Map[String, String]) - """ - if not isinstance(in_map, dict): - raise RuntimeError(f'write_map() requires "{in_map}" to be a dict! Not: {type(in_map)}') - - path = _get_temp_file_path('write_map', temp_dir) - - with open(path, 'w') as file: - for key, val in in_map.items(): - file.write(f'{key}\t{val}\n') - - if file_store: - file_store.writeGlobalFile(path, cleanup=True) - - return path - - -def wdl_range(num: int) -> List[int]: - """ - Given an integer argument, the range function creates an array of integers of - length equal to the given argument. - - WDL syntax: Array[Int] range(Int) - """ - if not (isinstance(num, int) and num >= 0): - raise WDLRuntimeError(f'range() requires an integer greater than or equal to 0 (but got {num})') - - return list(range(num)) - - -def transpose(in_array: List[List[Any]]) -> List[List[Any]]: - """ - Given a two dimensional array argument, the transpose function transposes the - two dimensional array according to the standard matrix transpose rules. - - WDL syntax: Array[Array[X]] transpose(Array[Array[X]]) - """ - if not isinstance(in_array, list): - raise RuntimeError(f'transpose() requires "{in_array}" to be a list! Not: {type(in_array)}') - - for arr in in_array: - if not isinstance(arr, list): - raise RuntimeError(f'transpose() requires all collections to be a list! Not: {type(arr)}') - # zip() can handle this but Cromwell can not. - if len(arr) != len(in_array[0]): - raise RuntimeError('transpose() requires all collections have the same size!') - - return [list(i) for i in zip(*in_array)] - - -def length(in_array: List[Any]) -> int: - """ - Given an Array, the `length` function returns the number of elements in the Array - as an Int. - """ - if not isinstance(in_array, list): - # Cromwell throws an exception for anything other than a WDL Array - raise WDLRuntimeError(f'length() requires ${in_array} to be a list! Not: {type(in_array)}') - - return len(in_array) - - -def wdl_zip(left: List[Any], right: List[Any]) -> List[WDLPair]: - """ - Return the dot product of the two arrays. If the arrays have different lengths - it is an error. - - WDL syntax: Array[Pair[X,Y]] zip(Array[X], Array[Y]) - """ - if not isinstance(left, list) or not isinstance(right, list): - raise WDLRuntimeError(f'zip() requires both inputs to be lists! Not: {type(left)} and {type(right)}') - - if len(left) != len(right): - raise WDLRuntimeError('zip() requires that input values have the same size!') - - return list(WDLPair(left=left_val, right=right_val) for left_val, right_val in zip(left, right)) - - -def cross(left: List[Any], right: List[Any]) -> List[WDLPair]: - """ - Return the cross product of the two arrays. Array[Y][1] appears before - Array[X][1] in the output. - - WDL syntax: Array[Pair[X,Y]] cross(Array[X], Array[Y]) - """ - if not isinstance(left, list) or not isinstance(right, list): - raise WDLRuntimeError(f'cross() requires both inputs to be Array[]! Not: {type(left)} and {type(right)}') - - return list(WDLPair(left=left_val, right=right_val) for left_val in left for right_val in right) - - -def as_pairs(in_map: dict) -> List[WDLPair]: - """ - Given a Map, the `as_pairs` function returns an Array containing each element - in the form of a Pair. The key will be the left element of the Pair and the - value the right element. The order of the the Pairs in the resulting Array - is the same as the order of the key/value pairs in the Map. - - WDL syntax: Array[Pair[X,Y]] as_pairs(Map[X,Y]) - """ - if not isinstance(in_map, dict): - raise WDLRuntimeError(f'as_pairs() requires "{in_map}" to be Map[]! Not: {type(in_map)}') - - return list(WDLPair(left=k, right=v) for k, v in in_map.items()) - - -def as_map(in_array: List[WDLPair]) -> dict: - """ - Given an Array consisting of Pairs, the `as_map` function returns a Map in - which the left elements of the Pairs are the keys and the right elements the - values. - - WDL syntax: Map[X,Y] as_map(Array[Pair[X,Y]]) - """ - if not isinstance(in_array, list): - raise WDLRuntimeError(f'as_map() requires "{in_array}" to be a list! Not: {type(in_array)}') - - map = {} - - for pair in in_array: - if map.get(pair.left): - raise WDLRuntimeError('Cannot evaluate "as_map()" with duplicated keys.') - - map[pair.left] = pair.right - - return map - - -def keys(in_map: dict) -> list: - """ - Given a Map, the `keys` function returns an Array consisting of the keys in - the Map. The order of the keys in the resulting Array is the same as the - order of the Pairs in the Map. - - WDL syntax: Array[X] keys(Map[X,Y]) - """ - - return list(in_map.keys()) - - -def collect_by_key(in_array: List[WDLPair]) -> dict: - """ - Given an Array consisting of Pairs, the `collect_by_key` function returns a Map - in which the left elements of the Pairs are the keys and the right elements the - values. - - WDL syntax: Map[X,Array[Y]] collect_by_key(Array[Pair[X,Y]]) - """ - if not isinstance(in_array, list): - raise WDLRuntimeError(f'as_map() requires "{in_array}" to be a list! Not: {type(in_array)}') - - map = {} - - for pair in in_array: - map.setdefault(pair.left, []).append(pair.right) - - return map - - -def flatten(in_array: List[list]) -> list: - """ - Given an array of arrays, the `flatten` function concatenates all the member - arrays in the order to appearance to give the result. It does not deduplicate - the elements. - - WDL syntax: Array[X] flatten(Array[Array[X]]) - """ - if not isinstance(in_array, list): - raise RuntimeError(f'flatten() requires "{in_array}" to be a list! Not: {type(in_array)}') - - arr = [] - - for element in in_array: - if not isinstance(element, list): - raise RuntimeError(f'flatten() requires all collections to be a list! Not: {type(element)}') - arr.extend(element) - - return arr diff --git a/src/toil/wdl/wdl_synthesis.py b/src/toil/wdl/wdl_synthesis.py deleted file mode 100644 index 1ea8efec70..0000000000 --- a/src/toil/wdl/wdl_synthesis.py +++ /dev/null @@ -1,1011 +0,0 @@ -# Copyright (C) 2015-2021 Regents of the University of California -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -from typing import Optional - -from toil.lib.io import mkdtemp -from toil.wdl.wdl_functions import heredoc_wdl -from toil.wdl.wdl_types import (WDLArrayType, - WDLCompoundType, - WDLFileType, - WDLMapType, - WDLPairType, - WDLType) - -logger = logging.getLogger(__name__) - - -class SynthesizeWDL: - """ - SynthesizeWDL takes the "workflows_dictionary" and "tasks_dictionary" produced by - wdl_analysis.py and uses them to write a native python script for use with Toil. - - A WDL "workflow" section roughly corresponds to the python "main()" function, where - functions are wrapped as Toil "jobs", output dependencies specified, and called. - - A WDL "task" section corresponds to a unique python function, which will be wrapped - as a Toil "job" and defined outside of the "main()" function that calls it. - - Generally this handles breaking sections into their corresponding Toil counterparts. - - For example: write the imports, then write all functions defining jobs (which have subsections - like: write header, define variables, read "File" types into the jobstore, docker call, etc.), - then write the main and all of its subsections. - """ - - def __init__(self, - version: str, - tasks_dictionary: dict, - workflows_dictionary: dict, - output_directory: str, - json_dict: dict, - docker_user: str, - jobstore: Optional[str] = None, - destBucket: Optional[str] = None): - - self.version = version - self.output_directory = output_directory - if not os.path.exists(self.output_directory): - try: - os.makedirs(self.output_directory) - except: - raise OSError( - 'Could not create directory. Insufficient permissions or disk space most likely.') - - self.output_file = os.path.join(self.output_directory, 'toilwdl_compiled.py') - - if jobstore: - self.jobstore = jobstore - else: - self.jobstore = mkdtemp(prefix=f"{os.getcwd()}{os.sep}toilWorkflowRun") - os.rmdir(self.jobstore) - - if docker_user != 'None': - self.docker_user = "'" + docker_user + "'" - else: - self.docker_user = docker_user - - # only json is required; tsv/csv are optional - self.json_dict = json_dict - - # holds task skeletons from WDL task objects - self.tasks_dictionary = tasks_dictionary - # holds workflow structure from WDL workflow objects - self.workflows_dictionary = workflows_dictionary - - # keep track of which workflow is being written - self.current_workflow = None - - # unique iterator to add to cmd names - self.cmd_num = 0 - - # deposit WDL outputs into a cloud bucket; optional - self.destBucket = destBucket - - def write_modules(self): - # string used to write imports to the file - module_string = heredoc_wdl(''' - from toil.job import Job - from toil.common import Toil - from toil.lib.docker import apiDockerCall - from toil.wdl.wdl_types import WDLType - from toil.wdl.wdl_types import WDLStringType - from toil.wdl.wdl_types import WDLIntType - from toil.wdl.wdl_types import WDLFloatType - from toil.wdl.wdl_types import WDLBooleanType - from toil.wdl.wdl_types import WDLFileType - from toil.wdl.wdl_types import WDLArrayType - from toil.wdl.wdl_types import WDLPairType - from toil.wdl.wdl_types import WDLMapType - from toil.wdl.wdl_types import WDLFile - from toil.wdl.wdl_types import WDLPair - from toil.wdl.wdl_functions import generate_docker_bashscript_file - from toil.wdl.wdl_functions import generate_stdout_file - from toil.wdl.wdl_functions import select_first - from toil.wdl.wdl_functions import sub - from toil.wdl.wdl_functions import size - from toil.wdl.wdl_functions import glob - from toil.wdl.wdl_functions import process_and_read_file - from toil.wdl.wdl_functions import process_infile - from toil.wdl.wdl_functions import process_outfile - from toil.wdl.wdl_functions import abspath_file - from toil.wdl.wdl_functions import combine_dicts - from toil.wdl.wdl_functions import parse_memory - from toil.wdl.wdl_functions import parse_cores - from toil.wdl.wdl_functions import parse_disk - from toil.wdl.wdl_functions import read_lines - from toil.wdl.wdl_functions import read_tsv - from toil.wdl.wdl_functions import read_csv - from toil.wdl.wdl_functions import read_json - from toil.wdl.wdl_functions import read_map - from toil.wdl.wdl_functions import read_int - from toil.wdl.wdl_functions import read_string - from toil.wdl.wdl_functions import read_float - from toil.wdl.wdl_functions import read_boolean - from toil.wdl.wdl_functions import write_lines - from toil.wdl.wdl_functions import write_tsv - from toil.wdl.wdl_functions import write_json - from toil.wdl.wdl_functions import write_map - from toil.wdl.wdl_functions import defined - from toil.wdl.wdl_functions import basename - from toil.wdl.wdl_functions import floor - from toil.wdl.wdl_functions import ceil - from toil.wdl.wdl_functions import wdl_range - from toil.wdl.wdl_functions import transpose - from toil.wdl.wdl_functions import length - from toil.wdl.wdl_functions import wdl_zip - from toil.wdl.wdl_functions import cross - from toil.wdl.wdl_functions import as_pairs - from toil.wdl.wdl_functions import as_map - from toil.wdl.wdl_functions import keys - from toil.wdl.wdl_functions import collect_by_key - from toil.wdl.wdl_functions import flatten - import fnmatch - import textwrap - import subprocess - import os - import errno - import time - import shutil - import shlex - import uuid - import logging - - _toil_wdl_internal__current_working_dir = os.getcwd() - - logger = logging.getLogger(__name__) - - - ''', {'jobstore': self.jobstore})[1:] - return module_string - - def write_main(self): - """ - Writes out a huge string representing the main section of the python - compiled toil script. - - Currently looks at and writes 5 sections: - 1. JSON Variables (includes importing and preparing files as tuples) - 2. TSV Variables (includes importing and preparing files as tuples) - 3. CSV Variables (includes importing and preparing files as tuples) - 4. Wrapping each WDL "task" function as a toil job - 5. List out children and encapsulated jobs by priority, then start job0. - - This should create variable declarations necessary for function calls. - Map file paths appropriately and store them in the toil fileStore so - that they are persistent from job to job. Create job wrappers for toil. - And finally write out, and run the jobs in order of priority using the - addChild and encapsulate commands provided by toil. - - :return: giant string containing the main def for the toil script. - """ - - main_section = '' - - # write out the main header - main_header = self.write_main_header() - main_section = main_section + main_header - - # write toil job wrappers with input vars - jobs_to_write = self.write_main_jobwrappers() - main_section = main_section + jobs_to_write - - # loop to export all outputs to a cloud bucket - if self.destBucket: - main_destbucket = self.write_main_destbucket() - main_section = main_section + main_destbucket - - return main_section - - def write_main_header(self): - main_header = heredoc_wdl(''' - if __name__=="__main__": - options = Job.Runner.getDefaultOptions("{jobstore}") - options.clean = 'always' - with Toil(options) as fileStore: - ''', {'jobstore': self.jobstore}) - return main_header - - def write_main_jobwrappers(self): - """ - Writes out 'jobs' as wrapped toil objects in preparation for calling. - - :return: A string representing this. - """ - main_section = '' - - # toil cannot technically start with multiple jobs, so an empty - # 'initialize_jobs' function is always called first to get around this - main_section = main_section + ' job0 = Job.wrapJobFn(initialize_jobs)\n' - - # declare each job in main as a wrapped toil function in order of priority - for wf in self.workflows_dictionary: - self.current_workflow = wf - for assignment in self.workflows_dictionary[wf]: - if assignment.startswith('declaration'): - main_section += self.write_main_jobwrappers_declaration(self.workflows_dictionary[wf][assignment]) - if assignment.startswith('call'): - main_section += ' job0 = job0.encapsulate()\n' - main_section += self.write_main_jobwrappers_call(self.workflows_dictionary[wf][assignment]) - if assignment.startswith('scatter'): - main_section += ' job0 = job0.encapsulate()\n' - main_section += self.write_main_jobwrappers_scatter(self.workflows_dictionary[wf][assignment], - assignment) - if assignment.startswith('if'): - main_section += ' if {}:\n'.format(self.workflows_dictionary[wf][assignment]['expression']) - main_section += self.write_main_jobwrappers_if(self.workflows_dictionary[wf][assignment]['body']) - - main_section += '\n fileStore.start(job0)\n' - - return main_section - - def write_main_jobwrappers_declaration(self, declaration): - - main_section = '' - var_name, var_type, var_expr = declaration - - # check the json file for the expression's value - # this is a higher priority and overrides anything written in the .wdl - json_expressn = self.json_var(wf=self.current_workflow, var=var_name) - if json_expressn is not None: - var_expr = json_expressn - - main_section += ' {} = {}.create(\n {})\n' \ - .format(var_name, self.write_declaration_type(var_type), var_expr) - - # import filepath into jobstore - if self.needs_file_import(var_type) and var_expr: - main_section += f' {var_name} = process_infile({var_name}, fileStore)\n' - - return main_section - - def write_main_destbucket(self): - """ - Writes out a loop for exporting outputs to a cloud bucket. - - :return: A string representing this. - """ - main_section = heredoc_wdl(''' - outdir = '{outdir}' - onlyfiles = [os.path.join(outdir, f) for f in os.listdir(outdir) if os.path.isfile(os.path.join(outdir, f))] - for output_f_path in onlyfiles: - output_file = fileStore.writeGlobalFile(output_f_path) - preserveThisFilename = os.path.basename(output_f_path) - destUrl = '/'.join(s.strip('/') for s in [destBucket, preserveThisFilename]) - fileStore.exportFile(output_file, destUrl) - ''', {'outdir': self.output_directory}, indent=' ') - return main_section - - def fetch_ignoredifs(self, assignments, breaking_assignment): - ignore_ifs = [] - for assignment in assignments: - if assignment.startswith('call'): - pass - elif assignment.startswith('scatter'): - pass - elif assignment.startswith('if'): - if not self.fetch_ignoredifs_chain(assignments[assignment]['body'], breaking_assignment): - ignore_ifs.append(assignment) - return ignore_ifs - - def fetch_ignoredifs_chain(self, assignments, breaking_assignment): - for assignment in assignments: - if assignment.startswith('call'): - if assignment == breaking_assignment: - return True - if assignment.startswith('scatter'): - if assignment == breaking_assignment: - return True - if assignment.startswith('if'): - return self.fetch_ignoredifs_chain(assignments[assignment]['body'], breaking_assignment) - return False - - def write_main_jobwrappers_if(self, if_statement): - # check for empty if statement - if not if_statement: - return self.indent(' pass') - - main_section = '' - for assignment in if_statement: - if assignment.startswith('declaration'): - main_section += self.write_main_jobwrappers_declaration(if_statement[assignment]) - if assignment.startswith('call'): - main_section += ' job0 = job0.encapsulate()\n' - main_section += self.write_main_jobwrappers_call(if_statement[assignment]) - if assignment.startswith('scatter'): - main_section += ' job0 = job0.encapsulate()\n' - main_section += self.write_main_jobwrappers_scatter(if_statement[assignment], assignment) - if assignment.startswith('if'): - main_section += ' if {}:\n'.format(if_statement[assignment]['expression']) - main_section += self.write_main_jobwrappers_if(if_statement[assignment]['body']) - main_section = self.indent(main_section) - return main_section - - def write_main_jobwrappers_scatter(self, task, assignment): - scatter_inputs = self.fetch_scatter_inputs(assignment) - - main_section = ' {scatter} = job0.addChild({scatter}Cls('.format(scatter=assignment) - for var in scatter_inputs: - main_section += var + '=' + var + ', ' - if main_section.endswith(', '): - main_section = main_section[:-2] - main_section += '))\n' - - scatter_outputs = self.fetch_scatter_outputs(task) - for var in scatter_outputs: - main_section += ' {var} = {scatter}.rv("{var}")\n'.format(var=var['task'] + '_' + var['output'], scatter=assignment) - - return main_section - - def fetch_scatter_outputs(self, task): - scatteroutputs = [] - - for var in task['body']: - # TODO variable support - if var.startswith('call'): - if 'outputs' in self.tasks_dictionary[task['body'][var]['task']]: - for output in self.tasks_dictionary[task['body'][var]['task']]['outputs']: - scatteroutputs.append({'task': task['body'][var]['alias'], 'output': output[0]}) - return scatteroutputs - - def fetch_scatter_inputs(self, assigned): - - for wf in self.workflows_dictionary: - ignored_ifs = self.fetch_ignoredifs(self.workflows_dictionary[wf], assigned) - # TODO support additional wfs - break - - scatternamespace = [] - - for wf in self.workflows_dictionary: - for assignment in self.workflows_dictionary[wf]: - if assignment == assigned: - return scatternamespace - elif assignment.startswith('declaration'): - name, _, _ = self.workflows_dictionary[wf][assignment] - scatternamespace.append(name) - elif assignment.startswith('call'): - if 'outputs' in self.tasks_dictionary[self.workflows_dictionary[wf][assignment]['task']]: - for output in self.tasks_dictionary[self.workflows_dictionary[wf][assignment]['task']]['outputs']: - scatternamespace.append(self.workflows_dictionary[wf][assignment]['alias'] + '_' + output[0]) - elif assignment.startswith('scatter'): - for var in self.fetch_scatter_outputs(self.workflows_dictionary[wf][assignment]): - scatternamespace.append(var['task'] + '_' + var['output']) - elif assignment.startswith('if') and assignment not in ignored_ifs: - new_list, cont_or_break = self.fetch_scatter_inputs_chain(self.workflows_dictionary[wf][assignment]['body'], - assigned, - ignored_ifs, - inputs_list=[]) - scatternamespace += new_list - if not cont_or_break: - return scatternamespace - return scatternamespace - - def fetch_scatter_inputs_chain(self, inputs, assigned, ignored_ifs, inputs_list): - for i in inputs: - if i == assigned: - return inputs_list, False - elif i.startswith('call'): - if 'outputs' in self.tasks_dictionary[inputs[i]['task']]: - for output in self.tasks_dictionary[inputs[i]['task']]['outputs']: - inputs_list.append(inputs[i]['alias'] + '_' + output[0]) - elif i.startswith('scatter'): - for var in self.fetch_scatter_outputs(inputs[i]): - inputs_list.append(var['task'] + '_' + var['output']) - elif i.startswith('if') and i not in ignored_ifs: - inputs_list, cont_or_break = self.fetch_scatter_inputs_chain(inputs[i]['body'], assigned, ignored_ifs, inputs_list) - if not cont_or_break: - return inputs_list, False - return inputs_list, True - - def write_main_jobwrappers_call(self, task): - main_section = ' {} = job0.addChild({}Cls('.format(task['alias'], task['task']) - for var in task['io']: - main_section += var + '=' + task['io'][var] + ', ' - if main_section.endswith(', '): - main_section = main_section[:-2] - main_section += '))\n' - - call_outputs = self.fetch_call_outputs(task) - for var in call_outputs: - main_section += ' {var} = {task}.rv("{output}")\n'.format(var=var['task'] + '_' + var['output'], - task=var['task'], - output=var['output']) - return main_section - - def fetch_call_outputs(self, task): - calloutputs = [] - if 'outputs' in self.tasks_dictionary[task['task']]: - for output in self.tasks_dictionary[task['task']]['outputs']: - calloutputs.append({'task': task['alias'], 'output': output[0]}) - return calloutputs - - def write_functions(self): - """ - Writes out a python function for each WDL "task" object. - - :return: a giant string containing the meat of the job defs. - """ - - # toil cannot technically start with multiple jobs, so an empty - # 'initialize_jobs' function is always called first to get around this - fn_section = 'def initialize_jobs(job):\n' + \ - ' job.fileStore.logToMaster("initialize_jobs")\n' - - for job in self.tasks_dictionary: - fn_section += self.write_function(job) - - for wf in self.workflows_dictionary: - for assignment in self.workflows_dictionary[wf]: - if assignment.startswith('scatter'): - fn_section += self.write_scatterfunction(self.workflows_dictionary[wf][assignment], assignment) - if assignment.startswith('if'): - fn_section += self.write_scatterfunctions_within_if(self.workflows_dictionary[wf][assignment]['body']) - - return fn_section - - def write_scatterfunctions_within_if(self, ifstatement): - fn_section = '' - for assignment in ifstatement: - if assignment.startswith('scatter'): - fn_section += self.write_scatterfunction(ifstatement[assignment], assignment) - if assignment.startswith('if'): - fn_section += self.write_scatterfunctions_within_if(ifstatement[assignment]['body']) - return fn_section - - def write_scatterfunction(self, job, scattername): - """ - Writes out a python function for each WDL "scatter" object. - """ - - scatter_outputs = self.fetch_scatter_outputs(job) - - # write the function header - fn_section = self.write_scatterfunction_header(scattername) - - # write the scatter definitions - fn_section += self.write_scatterfunction_lists(scatter_outputs) - - # write - fn_section += self.write_scatterfunction_loop(job, scatter_outputs) - - # write the outputs for the task to return - fn_section += self.write_scatterfunction_outputreturn(scatter_outputs) - - return fn_section - - def write_scatterfunction_header(self, scattername): - """ - - :return: - """ - scatter_inputs = self.fetch_scatter_inputs(scattername) - - fn_section = f'\n\nclass {scattername}Cls(Job):\n' - fn_section += ' def __init__(self, ' - for input in scatter_inputs: - fn_section += f'{input}=None, ' - fn_section += '*args, **kwargs):\n' - fn_section += ' Job.__init__(self)\n\n' - - for input in scatter_inputs: - fn_section += ' self.id_{input} = {input}\n'.format(input=input) - - fn_section += heredoc_wdl(''' - - def run(self, fileStore): - fileStore.logToMaster("{jobname}") - tempDir = fileStore.getLocalTempDir() - - try: - os.makedirs(os.path.join(tempDir, 'execution')) - except OSError as e: - if e.errno != errno.EEXIST: - raise - ''', {'jobname': scattername}, indent=' ')[1:] - for input in scatter_inputs: - fn_section += ' {input} = self.id_{input}\n'.format(input=input) - return fn_section - - def write_scatterfunction_outputreturn(self, scatter_outputs): - """ - - :return: - """ - fn_section = '\n rvDict = {' - for var in scatter_outputs: - fn_section += '"{var}": {var}, '.format(var=var['task'] + '_' + var['output']) - if fn_section.endswith(', '): - fn_section = fn_section[:-2] - fn_section += '}\n' - fn_section += ' return rvDict\n\n' - - return fn_section[:-1] - - def write_scatterfunction_lists(self, scatter_outputs): - """ - - :return: - """ - fn_section = '\n' - for var in scatter_outputs: - fn_section += ' {var} = []\n'.format(var=var['task'] + '_' + var['output']) - - return fn_section - - def write_scatterfunction_loop(self, job, scatter_outputs): - """ - - :return: - """ - collection = job['collection'] - item = job['item'] - - fn_section = f' for {item} in {collection}:\n' - - previous_dependency = 'self' - for statement in job['body']: - if statement.startswith('declaration'): - # reusing write_main_jobwrappers_declaration() here, but it needs to be indented one more level. - fn_section += self.indent( - self.write_main_jobwrappers_declaration(job['body'][statement])) - elif statement.startswith('call'): - fn_section += self.write_scatter_callwrapper(job['body'][statement], previous_dependency) - previous_dependency = 'job_' + job['body'][statement]['alias'] - elif statement.startswith('scatter'): - raise NotImplementedError('nested scatter not implemented.') - elif statement.startswith('if'): - fn_section += ' if {}:\n'.format(job['body'][statement]['expression']) - # reusing write_main_jobwrappers_if() here, but it needs to be indented one more level. - fn_section += self.indent(self.write_main_jobwrappers_if(job['body'][statement]['body'])) - - # check for empty scatter section - if len(job['body']) == 0: - fn_section += ' pass' - - for var in scatter_outputs: - fn_section += ' {var}.append({task}.rv("{output}"))\n'.format(var=var['task'] + '_' + var['output'], - task='job_' + var['task'], - output=var['output']) - return fn_section - - def write_scatter_callwrapper(self, job, previous_dependency): - fn_section = ' job_{alias} = {pd}.addFollowOn({task}Cls('.format(alias=job['alias'], - pd=previous_dependency, - task=job['task']) - for var in job['io']: - fn_section += var + '=' + job['io'][var] + ', ' - if fn_section.endswith(', '): - fn_section = fn_section[:-2] - fn_section += '))\n' - return fn_section - - def write_function(self, job): - """ - Writes out a python function for each WDL "task" object. - - Each python function is a unit of work written out as a string in - preparation to being written out to a file. In WDL, each "job" is - called a "task". Each WDL task is written out in multiple steps: - - 1: Header and inputs (e.g. 'def mapping(self, input1, input2)') - 2: Log job name (e.g. 'job.fileStore.logToMaster('initialize_jobs')') - 3: Create temp dir (e.g. 'tempDir = fileStore.getLocalTempDir()') - 4: import filenames and use readGlobalFile() to get files from the - jobStore - 5: Reformat commandline variables (like converting to ' '.join(files)). - 6: Commandline call using subprocess.Popen(). - 7: Write the section returning the outputs. Also logs stats. - - :return: a giant string containing the meat of the job defs for the toil script. - """ - - # write the function header - fn_section = self.write_function_header(job) - - # write out commandline keywords - fn_section += self.write_function_cmdline(job) - - if self.needsdocker(job): - # write a bash script to inject into the docker - fn_section += self.write_function_bashscriptline(job) - # write a call to the docker API - fn_section += self.write_function_dockercall(job) - else: - # write a subprocess call - fn_section += self.write_function_subprocesspopen() - - # write the outputs for the definition to return - fn_section += self.write_function_outputreturn(job, docker=self.needsdocker(job)) - - return fn_section - - def write_function_header(self, job): - """ - Writes the header that starts each function, for example, this function - can write and return: - - 'def write_function_header(self, job, job_declaration_array):' - - :param job: A list such that: - (job priority #, job ID #, Job Skeleton Name, Job Alias) - :param job_declaration_array: A list of all inputs that job requires. - :return: A string representing this. - """ - fn_section = f'\n\nclass {job}Cls(Job):\n' - fn_section += ' def __init__(self, ' - if 'inputs' in self.tasks_dictionary[job]: - for i in self.tasks_dictionary[job]['inputs']: - var = i[0] - vartype = i[1] - if vartype == 'String': - fn_section += f'{var}="", ' - else: - fn_section += f'{var}=None, ' - fn_section += '*args, **kwargs):\n' - fn_section += f' super({job}Cls, self).__init__(*args, **kwargs)\n' - - # TODO: Resolve inherent problems resolving resource requirements - # In WDL, "local-disk " + 500 + " HDD" cannot be directly converted to python. - # This needs a special handler. - if 'runtime' in self.tasks_dictionary[job]: - runtime_resources = [] - if 'memory' in self.tasks_dictionary[job]['runtime']: - runtime_resources.append('memory=memory') - memory = self.tasks_dictionary[job]['runtime']['memory'] - fn_section += f' memory=parse_memory({memory})\n' - if 'cpu' in self.tasks_dictionary[job]['runtime']: - runtime_resources.append('cores=cores') - cores = self.tasks_dictionary[job]['runtime']['cpu'] - fn_section += f' cores=parse_cores({cores})\n' - if 'disks' in self.tasks_dictionary[job]['runtime']: - runtime_resources.append('disk=disk') - disk = self.tasks_dictionary[job]['runtime']['disks'] - fn_section += f' disk=parse_disk({disk})\n' - runtime_resources = ['self'] + runtime_resources - fn_section += ' Job.__init__({})\n\n'.format(', '.join(runtime_resources)) - - if 'inputs' in self.tasks_dictionary[job]: - for i in self.tasks_dictionary[job]['inputs']: - var = i[0] - var_type = i[1] - var_expressn = i[2] - json_expressn = self.json_var(task=job, var=var) - - # json declarations have priority and can overwrite - # whatever is in the wdl file - if json_expressn is not None: - var_expressn = json_expressn - - if var_expressn is None: - # declarations from workflow - fn_section += f' self.id_{var} = {var}\n' - else: - # declarations from a WDL or JSON file - fn_section += ' self.id_{} = {}.create(\n {})\n'\ - .format(var, self.write_declaration_type(var_type), var_expressn) - - fn_section += heredoc_wdl(''' - - def run(self, fileStore): - fileStore.logToMaster("{jobname}") - tempDir = fileStore.getLocalTempDir() - - _toil_wdl_internal__stdout_file = os.path.join(tempDir, 'stdout') - _toil_wdl_internal__stderr_file = os.path.join(tempDir, 'stderr') - - try: - os.makedirs(os.path.join(tempDir, 'execution')) - except OSError as e: - if e.errno != errno.EEXIST: - raise - ''', {'jobname': job}, indent=' ')[1:] - if 'inputs' in self.tasks_dictionary[job]: - for i in self.tasks_dictionary[job]['inputs']: - var = i[0] - var_type = i[1] - - docker_bool = str(self.needsdocker(job)) - - if self.needs_file_import(var_type): - args = ', '.join( - [ - f'abspath_file(self.id_{var}, _toil_wdl_internal__current_working_dir)', - 'tempDir', - 'fileStore', - f'docker={docker_bool}' - ]) - fn_section += f' {var} = process_and_read_file({args})\n' - else: - fn_section += f' {var} = self.id_{var}\n' - - return fn_section - - def json_var(self, var, task=None, wf=None): - """ - - :param var: - :param task: - :param wf: - :return: - """ - # default to the last workflow in the list - if wf is None: - for workflow in self.workflows_dictionary: - wf = workflow - - for identifier in self.json_dict: - # check task declarations - if task: - if identifier == f'{wf}.{task}.{var}': - return self.json_dict[identifier] - # else check workflow declarations - else: - if identifier == f'{wf}.{var}': - return self.json_dict[identifier] - - return None - - def needs_file_import(self, var_type: WDLType) -> bool: - """ - Check if the given type contains a File type. A return value of True - means that the value with this type has files to import. - """ - if isinstance(var_type, WDLFileType): - return True - - if isinstance(var_type, WDLCompoundType): - if isinstance(var_type, WDLArrayType): - return self.needs_file_import(var_type.element) - elif isinstance(var_type, WDLPairType): - return self.needs_file_import(var_type.left) or self.needs_file_import(var_type.right) - elif isinstance(var_type, WDLMapType): - return self.needs_file_import(var_type.key) or self.needs_file_import(var_type.value) - else: - raise NotImplementedError - return False - - def write_declaration_type(self, var_type: WDLType): - """ - Return a string that preserves the construction of the given WDL type - so it can be passed into the compiled script. - """ - section = var_type.__class__.__name__ + '(' # e.g.: 'WDLIntType(' - - if isinstance(var_type, WDLCompoundType): - if isinstance(var_type, WDLArrayType): - section += self.write_declaration_type(var_type.element) - elif isinstance(var_type, WDLPairType): - section += self.write_declaration_type(var_type.left) + ', ' - section += self.write_declaration_type(var_type.right) - elif isinstance(var_type, WDLMapType): - section += self.write_declaration_type(var_type.key) + ', ' - section += self.write_declaration_type(var_type.value) - else: - raise ValueError(var_type) - - if var_type.optional: - if isinstance(var_type, WDLCompoundType): - section += ', ' - section += 'optional=True' - return section + ')' - - def write_function_bashscriptline(self, job): - """ - Writes a function to create a bashscript for injection into the docker - container. - - :param job_task_reference: The job referenced in WDL's Task section. - :param job_alias: The actual job name to be written. - :return: A string writing all of this. - """ - fn_section = " generate_docker_bashscript_file(temp_dir=tempDir, docker_dir=tempDir, globs=[" - # TODO: Add glob - # if 'outputs' in self.tasks_dictionary[job]: - # for output in self.tasks_dictionary[job]['outputs']: - # fn_section += '({}), '.format(output[2]) - if fn_section.endswith(', '): - fn_section = fn_section[:-2] - fn_section += f"], cmd=cmd, job_name='{str(job)}')\n\n" - - return fn_section - - def write_function_dockercall(self, job): - """ - Writes a string containing the apiDockerCall() that will run the job. - - :param job_task_reference: The name of the job calling docker. - :param docker_image: The corresponding name of the docker image. - e.g. "ubuntu:latest" - :return: A string containing the apiDockerCall() that will run the job. - """ - docker_dict = {"docker_image": self.tasks_dictionary[job]['runtime']['docker'], - "job_task_reference": job, - "docker_user": str(self.docker_user)} - docker_template = heredoc_wdl(''' - # apiDockerCall() with demux=True returns a tuple of bytes objects (stdout, stderr). - _toil_wdl_internal__stdout, _toil_wdl_internal__stderr = \\ - apiDockerCall(self, - image={docker_image}, - working_dir=tempDir, - parameters=[os.path.join(tempDir, "{job_task_reference}_script.sh")], - entrypoint="/bin/bash", - user={docker_user}, - stderr=True, - demux=True, - volumes={{tempDir: {{"bind": tempDir}}}}) - with open(os.path.join(_toil_wdl_internal__current_working_dir, '{job_task_reference}.log'), 'wb') as f: - if _toil_wdl_internal__stdout: - f.write(_toil_wdl_internal__stdout) - if _toil_wdl_internal__stderr: - f.write(_toil_wdl_internal__stderr) - ''', docker_dict, indent=' ')[1:] - - return docker_template - - def write_function_cmdline(self, job): - """ - Write a series of commandline variables to be concatenated together - eventually and either called with subprocess.Popen() or with - apiDockerCall() if a docker image is called for. - - :param job: A list such that: - (job priority #, job ID #, Job Skeleton Name, Job Alias) - :return: A string representing this. - """ - - fn_section = '\n' - cmd_array = [] - if 'raw_commandline' in self.tasks_dictionary[job]: - for cmd in self.tasks_dictionary[job]['raw_commandline']: - if not cmd.startswith("r'''"): - cmd = 'str({i} if not isinstance({i}, WDLFile) else process_and_read_file({i}, tempDir, fileStore)).strip("{nl}")'.format(i=cmd, nl=r"\n") - fn_section = fn_section + heredoc_wdl(''' - try: - # Intended to deal with "optional" inputs that may not exist - # TODO: handle this better - command{num} = {cmd} - except: - command{num} = ''\n''', {'cmd': cmd, 'num': self.cmd_num}, indent=' ') - cmd_array.append('command' + str(self.cmd_num)) - self.cmd_num = self.cmd_num + 1 - - if cmd_array: - fn_section += '\n cmd = ' - for command in cmd_array: - fn_section += f'{command} + ' - if fn_section.endswith(' + '): - fn_section = fn_section[:-3] - fn_section += '\n cmd = textwrap.dedent(cmd.strip("{nl}"))\n'.format(nl=r"\n") - else: - # empty command section - fn_section += ' cmd = ""' - - return fn_section - - def write_function_subprocesspopen(self): - """ - Write a subprocess.Popen() call for this function and write it out as a - string. - - :param job: A list such that: - (job priority #, job ID #, Job Skeleton Name, Job Alias) - :return: A string representing this. - """ - fn_section = heredoc_wdl(''' - this_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - _toil_wdl_internal__stdout, _toil_wdl_internal__stderr = this_process.communicate()\n''', indent=' ') - - return fn_section - - def write_function_outputreturn(self, job, docker=False): - """ - Find the output values that this function needs and write them out as a - string. - - :param job: A list such that: - (job priority #, job ID #, Job Skeleton Name, Job Alias) - :param job_task_reference: The name of the job to look up values for. - :return: A string representing this. - """ - - fn_section = '' - - fn_section += heredoc_wdl(''' - _toil_wdl_internal__stdout_file = generate_stdout_file(_toil_wdl_internal__stdout, - tempDir, - fileStore=fileStore) - _toil_wdl_internal__stderr_file = generate_stdout_file(_toil_wdl_internal__stderr, - tempDir, - fileStore=fileStore, - stderr=True) - ''', indent=' ')[1:] - - if 'outputs' in self.tasks_dictionary[job]: - return_values = [] - for output in self.tasks_dictionary[job]['outputs']: - output_name = output[0] - output_type = output[1] - output_value = output[2] - - if self.needs_file_import(output_type): - nonglob_dict = { - "output_name": output_name, - "output_type": self.write_declaration_type(output_type), - "expression": output_value, - "out_dir": self.output_directory} - - nonglob_template = heredoc_wdl(''' - {output_name} = {output_type}.create( - {expression}, output=True) - {output_name} = process_outfile({output_name}, fileStore, tempDir, '{out_dir}') - ''', nonglob_dict, indent=' ')[1:] - fn_section += nonglob_template - return_values.append(output_name) - else: - fn_section += f' {output_name} = {output_value}\n' - return_values.append(output_name) - - if return_values: - fn_section += ' rvDict = {' - for return_value in return_values: - fn_section += '"{rv}": {rv}, '.format(rv=return_value) - if fn_section.endswith(', '): - fn_section = fn_section[:-2] - if return_values: - fn_section = fn_section + '}\n' - - if return_values: - fn_section += ' return rvDict\n\n' - - return fn_section - - def indent(self, string2indent: str) -> str: - """ - Indent the input string by 4 spaces. - """ - split_string = string2indent.split('\n') - return '\n'.join(f' {line}' for line in split_string) - - def needsdocker(self, job): - """ - - :param job: - :return: - """ - if 'runtime' in self.tasks_dictionary[job]: - if 'docker' in self.tasks_dictionary[job]['runtime']: - return True - - return False - - def write_python_file(self, - module_section, - fn_section, - main_section, - output_file): - """ - Just takes three strings and writes them to output_file. - - :param module_section: A string of 'import modules'. - :param fn_section: A string of python 'def functions()'. - :param main_section: A string declaring toil options and main's header. - :param job_section: A string import files into toil and declaring jobs. - :param output_file: The file to write the compiled toil script to. - """ - with open(output_file, 'w') as file: - file.write(module_section) - file.write(fn_section) - file.write(main_section) diff --git a/src/toil/wdl/wdl_types.py b/src/toil/wdl/wdl_types.py deleted file mode 100644 index 2231e17f51..0000000000 --- a/src/toil/wdl/wdl_types.py +++ /dev/null @@ -1,243 +0,0 @@ -from abc import ABC -from typing import Any, Dict, Optional - -from toil.job import Promise - - -class WDLRuntimeError(RuntimeError): - pass - - -class WDLType: - """ - Represents a primitive or compound WDL type: - - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#types - """ - - def __init__(self, optional: bool = False): - self.optional = optional - - @property - def name(self) -> str: - """ - Type name as string. Used in display messages / 'mappings.out' if dev - mode is enabled. - """ - raise NotImplementedError - - @property - def default_value(self) -> Optional[str]: - """ - Default value if optional. - """ - return None - - def create(self, value: Any, output: bool = False) -> Any: - """ - Calls at runtime. Returns an instance of the current type. An error may - be raised if the value is not in the correct format. - - :param value: a Python object - """ - if value is None: - # check if input is in fact an optional. - if self.optional: - return self.default_value - else: - raise WDLRuntimeError(f"Required input for '{self.name}' type not specified.") - - if isinstance(value, Promise): - return value - - return self._create(value) - - def _create(self, value: Any) -> Any: - raise NotImplementedError - - def __eq__(self, other: Any) -> bool: - return self.name.__eq__(other) - - def __str__(self) -> str: - return self.name.__str__() - - def __repr__(self) -> str: - return self.name.__repr__() - - -class WDLCompoundType(WDLType, ABC): - """ - Represents a WDL compound type. - """ - - -class WDLStringType(WDLType): - """ Represents a WDL String primitive type.""" - - @property - def name(self) -> str: - return 'String' - - @property - def default_value(self) -> str: - return '' - - def _create(self, value: Any) -> Any: - return str(value) - - -class WDLIntType(WDLType): - """ Represents a WDL Int primitive type.""" - - @property - def name(self) -> str: - return 'Int' - - def _create(self, value: Any) -> Any: - return int(value) - - -class WDLFloatType(WDLType): - """ Represents a WDL Float primitive type.""" - - @property - def name(self) -> str: - return 'Float' - - def _create(self, value: Any) -> Any: - return float(value) - - -class WDLBooleanType(WDLType): - """ Represents a WDL Boolean primitive type.""" - - @property - def name(self) -> str: - return 'Boolean' - - def _create(self, value: Any) -> Any: - return True if value else False - - -class WDLFileType(WDLType): - """ Represents a WDL File primitive type.""" - - @property - def name(self) -> str: - return 'File' - - @property - def default_value(self) -> str: - return '' - - def _create(self, value: Any) -> Any: - if isinstance(value, (WDLFile, Promise)): - # return the original file if it's passed from task to task. - return value - - return WDLFile(file_path=value, imported=False) - - -class WDLArrayType(WDLCompoundType): - """ Represents a WDL Array compound type.""" - - def __init__(self, element: WDLType, optional: bool = False): - super().__init__(optional) - self.element = element - - @property - def name(self) -> str: - return f'Array[{self.element.name}]' - - def _create(self, value: Any) -> Any: - if not isinstance(value, list): - raise WDLRuntimeError(f"Expected an array input for Array, but got '{type(value)}'") - - return [self.element.create(val) for val in value] - - -class WDLPairType(WDLCompoundType): - """ Represents a WDL Pair compound type.""" - - def __init__(self, left: WDLType, right: WDLType, optional: bool = False): - super().__init__(optional) - self.left = left - self.right = right - - @property - def name(self) -> str: - return f'Pair[{self.left.name}, {self.right.name}]' - - def _create(self, value: Any) -> Any: - if isinstance(value, WDLPair): - return value - elif isinstance(value, tuple): - if len(value) != 2: - raise WDLRuntimeError('Only support Pair len == 2') - left, right = value - elif isinstance(value, dict): - if 'left' not in value or 'right' not in value: - raise WDLRuntimeError('Pair needs \'left\' and \'right\' keys') - left = value.get('left') - right = value.get('right') - else: - raise WDLRuntimeError(f"Expected a pair input for Pair, but got '{type(value)}'") - - return WDLPair(self.left.create(left), self.right.create(right)) - - -class WDLMapType(WDLCompoundType): - """ Represents a WDL Map compound type.""" - - def __init__(self, key: WDLType, value: WDLType, optional: bool = False): - super().__init__(optional) - self.key = key - self.value = value - - @property - def name(self) -> str: - return f'Map[{self.key.name}, {self.value.name}]' - - def _create(self, value: Any) -> Any: - if not isinstance(value, dict): - raise WDLRuntimeError(f"Expected a map input for Map, but got '{type(value)}'") - - return {self.key.create(k): self.value.create(v) for k, v in value.items()} - - -class WDLFile: - """ - Represents a WDL File. - """ - def __init__(self, file_path: str, file_name: Optional[str] = None, imported: bool = False): - """ - :param file_path: Path to file. - :param file_name: Optional. Preserved file name. - :param imported: If True, this file has been imported to the fileStore - via fileStore.importFile(). - """ - self.file_path = file_path - self.file_name = file_name - self.imported = imported - - -class WDLPair: - """ - Represents a WDL Pair literal defined at - https://github.com/openwdl/wdl/blob/main/versions/development/SPEC.md#pair-literals - """ - - def __init__(self, left: Any, right: Any): - self.left = left - self.right = right - - def to_dict(self) -> Dict[str, Any]: - return {'left': self.left, 'right': self.right} - - def __eq__(self, other: Any) -> Any: - if not isinstance(other, WDLPair): - return False - return self.left == other.left and self.right == other.right - - def __repr__(self) -> str: - return str(self.to_dict()) diff --git a/src/toil/wdl/wdltoil.py b/src/toil/wdl/wdltoil.py index 810ec0cada..bfc02e1012 100755 --- a/src/toil/wdl/wdltoil.py +++ b/src/toil/wdl/wdltoil.py @@ -75,6 +75,7 @@ logger = logging.getLogger(__name__) + @contextmanager def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Generator[None, None, None]: """