diff --git a/.circleci/config.yml b/.circleci/config.yml index a56a4a8..6139a51 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -619,12 +619,6 @@ workflows: - test_transcriptclean_singularity: requires: - build - - test_filter_transcriptclean_docker: - requires: - - build - - test_filter_transcriptclean_singularity: - requires: - - build - test_init_talon_database_docker: requires: - build @@ -662,7 +656,7 @@ workflows: requires: - build - test_workflow_onerep_docker: - <<: *only_dev_and_master +# <<: *only_dev_and_master requires: - build - test_workflow_onerep_singularity: @@ -670,7 +664,7 @@ workflows: requires: - build - test_workflow_tworep_docker: - <<: *only_dev_and_master +# <<: *only_dev_and_master requires: - build - test_workflow_tworep_singularity: @@ -685,8 +679,6 @@ workflows: - test_minimap2_singularity - test_transcriptclean_docker - test_transcriptclean_singularity - - test_filter_transcriptclean_docker - - test_filter_transcriptclean_singularity - test_init_talon_database_docker - test_init_talon_database_singularity - test_talon_docker diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..ee49b2e --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501,W503, W605, E203 diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..ce7d60b --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,7 @@ +[settings] +known_third_party = dataframe_utils,pandas,qc_utils +multi_line_output=3 +include_trailing_comma=True +force_grid_wrap=0 +use_parentheses=True +line_length=88 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..479f405 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +- repo: https://github.com/psf/black + rev: 19.3b0 + hooks: + - id: black + language_version: python3.7 + +- repo: https://github.com/asottile/seed-isort-config + rev: v1.9.2 + hooks: + - id: seed-isort-config + +- repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.21 + hooks: + - id: isort + language_version: python3.7 + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.2.3 + hooks: + - id: flake8 + - id: trailing-whitespace + exclude: docs/\w+\.md + - id: end-of-file-fixer + - id: debug-statements + - id: check-json + - id: pretty-format-json + args: + - --autofix + - id: check-yaml diff --git a/Dockerfile b/Dockerfile index 0251d4f..e045a1b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM ubuntu:16.04 MAINTAINER Otto Jolanki RUN apt-get update && apt-get install -y software-properties-common -RUN add-apt-repository -y ppa:deadsnakes/ppa +RUN add-apt-repository -y ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y \ python \ cython \ @@ -14,12 +14,14 @@ RUN apt-get update && apt-get install -y \ gdebi \ #pybedtools dependency libz-dev \ - bedtools=2.25.0-1 \ #samtools dependencies libbz2-dev \ libncurses5-dev \ git \ - python3.7 + python3.7 \ + python3.7-dev \ + libssl-dev \ + build-essential RUN mkdir /software WORKDIR /software @@ -46,6 +48,9 @@ RUN yes | gdebi r-recommended_3.3.2-1xenial0_all.deb RUN wget https://cran.r-project.org/bin/linux/ubuntu/xenial/r-base_3.3.2-1xenial0_all.deb RUN yes | gdebi r-base_3.3.2-1xenial0_all.deb +# clear apt lists +RUN rm -rf /var/lib/apt/lists/* + # Install R packages RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; options(repos = r);" > ~/.Rprofile && \ @@ -53,10 +58,15 @@ RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; o Rscript -e 
"install.packages('gridExtra')" && \ Rscript -e "install.packages('readr')" -# Install Intervaltree 2.1.0 +# Install TC dependencies +RUN python3.7 -m pip install --upgrade pip +RUN python3.7 -m pip install cython +RUN python3.7 -m pip install pybedtools==0.8.0 pyfasta==0.5.2 numpy pandas + +# splice junction finding accessory script from TC still runs in python2 and requires pyfasta, which in turn requires numpy -RUN pip install --upgrade pip -RUN pip install intervaltree==2.1.0 pybedtools==0.7.8 pyfasta==0.5.2 numpy pandas +RUN python -m pip install --upgrade pip +RUN python -m pip install pyfasta==0.5.2 numpy # Install qc-utils to python 3.7 @@ -66,9 +76,16 @@ RUN python3.7 -m pip install qc-utils==19.8.1 RUN python3.7 -m pip install pandas scipy -# Get transcriptclean v1.0.8 +# Install bedtools 2.29 + +RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.29.0/bedtools-2.29.0.tar.gz +RUN tar xzvf bedtools-2.29.0.tar.gz +RUN cd bedtools2/ && make +ENV PATH="/software/bedtools2/bin:${PATH}" -RUN git clone -b 'v1.0.8' --single-branch https://github.com/dewyman/TranscriptClean.git +# Get transcriptclean v2.0.2 + +RUN git clone -b 'v2.0.2' --single-branch https://github.com/dewyman/TranscriptClean.git RUN chmod 755 TranscriptClean/accessory_scripts/* TranscriptClean/TranscriptClean.py TranscriptClean/generate_report.R ENV PATH "/software/TranscriptClean/accessory_scripts:/software/TranscriptClean:${PATH}" @@ -94,4 +111,3 @@ ARG BRANCH ENV BUILD_BRANCH=${BRANCH} ARG BUILD_TAG ENV MY_TAG=${BUILD_TAG} - diff --git a/docs/howto.md b/docs/howto.md index 479c230..387a3b7 100644 --- a/docs/howto.md +++ b/docs/howto.md @@ -1,14 +1,14 @@ # HOWTO -Here are concrete instructions for running analyses on different platforms. -Before following these instructions, make sure you have completed installation and possible account setup detailed in [installation instructions](installation.md). These instructions show how to use Cromwell directly. Consider running the pipeline using [Caper](https://github.com/ENCODE-DCC/caper) which is more user friendly way. +Here are recipes for running analyses on different platforms. +Before following these instructions, make sure you have completed installation and possible account setup detailed in [installation instructions](installation.md). Note that although running the pipeline directly with Cromwell is still possible, using [caper](https://github.com/ENCODE-DCC/caper) is the canonical, supported and official way to use ENCODE Uniform Processing Pipelines. The examples below use command `caper run`, which is the simplest way to run a single pipeline instance. For running multiple pipelines in production setting we recommend using caper server. To find details on setting up the server, refer to [caper documentation](https://github.com/ENCODE-DCC/caper/blob/master/DETAILS.md#usage). -# CONTENTS +Note that the files used in these examples are first restricted to reads from chromosome 19, and then further subsampled to 10000 reads. The cpu and memory resources reflect the size of inputs. For resource guidelines with full sized data, see discussion [here](reference.md#note-about-resources). -## Running Workflows +# CONTENTS [Google Cloud](howto.md#google-cloud) -[SLURM](howto.md#slurm-singularity) +[Other Platforms](howto.md#other-platforms) [Splice Junctions](howto.md#splice-junctions) @@ -66,84 +66,39 @@ The goal is to run the pipeline with test data using Google Cloud Platform. } ``` -5. Get cromwell 40: +5. 
Run the pipeline using caper: ```bash - wget -N -c https://github.com/broadinstitute/cromwell/releases/download/40/cromwell-40.jar + $ caper run long-read-rna-pipeline.wdl -i input.json -b gcp -m testrun_metadata.json ``` -6. Run the pipeline: +6. Run croo to make finding the outputs easier: ```bash - $ java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=google -Dbackend.providers.google.config.project=YOUR_PROJECT -Dbackend.providers.google.config.root=gs://YOUR_BUCKET_NAME/output cromwell-40.jar run long-read-rna-pipeline.wdl -i input.json -o workflow_opts/docker.json -m metadata.json + $ croo testrun_metadata.json --out-dir gs://[YOUR_BUCKET_NAME]/croo_out ``` -7. See the outputs in `gs://YOUR_BUCKET_NAME/output`. You can also use [croo](https://github.com/ENCODE-DCC/croo) to organize the outputs before taking a look. The required configuration json file `output_definition.json` is provided with this repo. +This command outputs an HTML table into the bucket that shows the locations of the outputs, nicely organized. Note that if your output bucket is not public, you need to be logged into your Google account to be able to follow the links. -## SLURM Singularity +## Other platforms -For this example you need to have Singularity installed. For details see [installation instructions](installation.md). The goal is to run the pipeline with testdata using Singularity on a SLURM cluster. Login into your cluster first and then follow the instructions. -When running workflows on SLURM (or other) HPC clusters, use [Caper](https://github.com/ENCODE-DCC/caper), it takes care of backend configuration for you. +Running on other platforms is similar, because caper takes care of the details for you. See the [caper documentation](https://github.com/ENCODE-DCC/caper#installation) for further details. -1. Get the code and move into the code directory: - -```bash - git clone https://github.com/ENCODE-DCC/long-read-rna-pipeline.git - cd long-read-rna-pipeline -``` - -3. Build the singularity image for the pipeline. The following pulls the pipeline docker image, and uses that to construct the singularity image. The image will be stored in `~/.singularity`. It is bad practice to build images (or do any other intensive work) on login nodes. For this reason we will first invoke an interactive session on a different node by running `sdev` command, and building there (It will take few seconds to get back into the shell after running `sdev`). - -```bash - sdev - mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name long_read_rna_pipeline-v1.0.simg -F docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.0 - exit #this takes you back to the login node -``` - -Note: If you want to store your inputs `/in/some/data/directory1`and `/in/some/data/directory2`you must edit `workflow_opts/singularity.json` in the following way: -``` -{ - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/long-read-rna-pipeline-v1.0.simg", - "singularity_bindpath" : "~/, /in/some/data/directory1/, /in/some/data/directory2/" - } -} -``` +## Using Singularity -4. Install caper. Python 3.4.1 or newer is required. - -```bash - pip install caper -``` - -5. Follow [Caper configuration instructions](https://github.com/ENCODE-DCC/caper#configuration-file). - -Note: In Caper configuration file, you will need to give a value to `--time` parameter by editing `slurm-extra-param` line.
For example: ``` - slurm-extra-param=--time=01:00:00 ``` -to give one hour of runtime. -6. Edit the input file `test/test_workflow/test_workflow_2reps_input.json` so that all the input file paths are absolute. -For example replace `test_data/chr19_test_10000_reads.fastq.gz` in fastq inputs with `[PATH-TO-REPO]/test_data/chr19_test_10000_reads.fastq.gz`. You can find out the `[PATH-TO-REPO]` by running `pwd` command in the `long-read-rna-pipeline` directory. -7. Run the pipeline using Caper: -```bash - caper run -i test/test_workflow/test_workflow_2reps_input.json -o workflow_opts/singularity.json -m metadata.json -``` +Caper comes with built-in support for singularity via the `--singularity` option. See the [caper documentation](https://github.com/ENCODE-DCC/caper/blob/master/DETAILS.md) for more information. ## Splice junctions You may want to run the pipeline using other references than the ones used by ENCODE. In this case you must prepare your own splice junctions file. The workflows for this is in this repo and it is `get-splice-junctions.wdl`. This workflow uses the same Docker/Singularity images as the main pipeline and running this workflow is done in exactly same way as the running of the main pipeline. -`input.json` for splice junction workflow with gencode v24 annotation, and GRCh38 reference genome looks like this: +An `input.json` for the splice junction workflow with gencode V29 annotation and the GRCh38 reference genome looks like this: ``` { - "get_splice_junctions.annotation" : "gs://long_read_rna/splice_junctions/inputs/gencode.v24.primary_assembly.annotation.gtf.gz", + "get_splice_junctions.annotation" : "gs://long_read_rna/splice_junctions/inputs/gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz", "get_splice_junctions.reference_genome" : "gs://long_read_rna/splice_junctions/inputs/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz", - "get_splice_junctions.output_prefix" : "gencode_V24_splice_junctions", + "get_splice_junctions.output_prefix" : "gencode_V29_splice_junctions", "get_splice_junctions.ncpus" : 2, "get_splice_junctions.ramGB" : 7, "get_splice_junctions.disks" : "local-disk 50 SSD" diff --git a/docs/installation.md b/docs/installation.md index 2e6a960..df6102d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,10 +1,28 @@ # INSTALLATION -To run the pipeline you need to install following software. Running the pipeline on Google Cloud requires additional setup detailed below. +To run the pipeline you need to do some setup. The exact steps you need to take depend on the platform you are running the pipeline on, and are detailed below and in [HOWTO](howto.md). Independent of platform, running the pipeline is done using [caper](https://github.com/ENCODE-DCC/caper) and (optional but recommended) output organization is done using [croo](https://github.com/ENCODE-DCC/croo). Both `caper` and `croo` require `python` version 3.4.1 or newer. + +## Caper + +Using the execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution) directly involves complicated backend configuration, workflow options, and command line parameters. Caper hides this complexity and consolidates the configuration in one file. Caper is available in [PyPI](https://pypi.org/project/caper/) and is installed by running: + +```bash + $ pip install caper +``` + +Note that the conda run mode described in the caper documentation is not supported by this pipeline.
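If you are installing without administrator rights, a per-user install works as well; this is a general `pip` pattern rather than anything caper-specific, and assumes `~/.local/bin` is on your `PATH`:

```bash
# install caper into the user site instead of system-wide
$ pip install --user caper
# confirm the entrypoint resolves
$ command -v caper
```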
+ +## Croo + +The way [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution) organizes pipeline outputs is not always the clearest. Croo is a tool that reorganizes the files in a more readable manner. Croo is available in [PyPI](https://pypi.org/project/croo/) and is installed by running: + +```bash + $ pip install croo +``` ## Java 8 -Java is required to run execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution). +Java is required to run the execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution), which `caper` uses under the hood. To check which Java version you already have, run: ```bash $ java -version @@ -12,54 +30,24 @@ To check which Java version you already have, run: You are looking for 1.8 or higher. If the requirement is not fulfilled follow installation instructions for [mac](https://java.com/en/download/help/mac_install.xml) or [linux](http://openjdk.java.net/install/) or use your favorite installation method. -## Cromwell - -Download WDL runner Cromwell from [here](https://github.com/broadinstitute/cromwell/releases). The pipeline has been tested using version 40. - ## Docker -Pipeline code is packaged and distributed in Docker containers, and thus Docker installation is needed. +Pipeline code is packaged and distributed in Docker containers, and thus Docker installation is needed. Follow the instructions for [mac](https://docs.docker.com/docker-for-mac/install/) or [linux](https://docs.docker.com/install/linux/docker-ce/ubuntu/#upgrade-docker-after-using-the-convenience-script). -## Caper +## Singularity -For running the pipeline we recommend using [Caper](https://github.com/ENCODE-DCC/caper) that wraps Cromwell in an easier to use package. +If you want to use Singularity instead of Docker, install [singularity](https://www.sylabs.io/guides/3.1/user-guide/installation.html). The pipeline requires singularity version `>=2.5.2`; the link takes you to version `3.1`. -## croo +## Google Cloud -For organizing pipeline outputs we recommend using [croo](https://github.com/ENCODE-DCC/croo) that makes a nicely organized directory from the complicated output tree Cromwell defaults to. The configuration file for `croo` is named `output_definition.json` and can be found in the root of this repository. +If you are intending to run the pipeline on the Google Cloud Platform, follow the [caper setup instructions for GCP](https://github.com/ENCODE-DCC/caper/blob/master/docs/conf_gcp.md). +* For an example of how to run the pipeline on Google Cloud, see [HOWTO](howto.md#google-cloud). -## Singularity +## AWS -If for some reason you cannot run Docker, install [singularity](https://www.sylabs.io/guides/3.1/user-guide/installation.html) and have a look at [HOWTO](howto.md#local-with-singularity) for an example of how to run pipeline with singularity. Pipeline requires singularity version `>=2.5.2`, the link takes you to version `3.1`. +If you are intending to run the pipeline on AWS, follow the [caper setup instructions for AWS](https://github.com/ENCODE-DCC/caper/blob/master/docs/conf_aws.md). -## Google Cloud +## Cromwell (optional) -If you are intending to run the pipeline on Google Cloud platform, the following setup is needed: - -1. Sign up for a Google account. -2. Go to [Google Project](https://console.developers.google.com/project) page and click "SIGN UP FOR FREE TRIAL" on the top left and agree to terms. -3. Set up a payment method and click "START MY FREE TRIAL". -4.
Create a [Google Project](https://console.developers.google.com/project) `[YOUR_PROJECT_NAME]` and choose it on the top of the page. -5. Create a [Google Cloud Storage bucket](https://console.cloud.google.com/storage/browser) `gs://[YOUR_BUCKET_NAME]` by clicking on a button "CREATE BUCKET" and create it to store pipeline outputs. -6. Find and enable following APIs in your [API Manager](https://console.developers.google.com/apis/library). Click a back button on your web brower after enabling each. - * Compute Engine API - * Google Cloud Storage - * Google Cloud Storage JSON API - * Genomics API - -7. Install [Google Cloud Platform SDK](https://cloud.google.com/sdk/downloads) and authenticate through it. You will be asked to enter verification keys. Get keys from the URLs they provide. - ``` - $ gcloud auth login --no-launch-browser - $ gcloud auth application-default login --no-launch-browser - ``` - -8. If you see permission errors at runtime, then unset environment variable `GOOGLE_APPLICATION_CREDENTIALS` or add it to your BASH startup scripts (`$HOME/.bashrc` or `$HOME/.bash_profile`). - ``` - unset GOOGLE_APPLICATION_CREDENTIALS - ``` - -9. Set your default Google Cloud Project. Pipeline will provision instances on this project. - ``` - $ gcloud config set project [YOUR_PROJECT_NAME] - ``` +We recommend using `caper` for running the pipeline, although it is possible to use Cromwell directly. The backend file and workflow options files necessary for direct Cromwell use are included in the repository for local testing purposes, but they are not actively maintained to track cloud API changes and the like. diff --git a/docs/reference.md b/docs/reference.md index a21f9fa..2460c03 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -23,9 +23,9 @@ Transcriptclean runs on python 2.7, and other parts utilize 3.7. [Minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database. For publication describing the software in detail, see [Paper by Li, H](https://doi.org/10.1093/bioinformatics/bty191). -### Transcriptclean v1.0.8 +### Transcriptclean v2.0.2 -[Transcriptclean](https://github.com/dewyman/TranscriptClean) is a program that corrects for mismatches, microindels and non-canonical splice junctions. For publication describing the software in detail, see [Paper by Dana Wyman, Ali Mortazavi](https://doi.org/10.1093/bioinformatics/bty483). +[Transcriptclean](https://github.com/dewyman/TranscriptClean) is a program that corrects for mismatches, microindels and non-canonical splice junctions. For a publication describing the software in detail, see [Paper by Dana Wyman, Ali Mortazavi](https://doi.org/10.1093/bioinformatics/bty483). Version 2.x is an extensive rewrite of the first version, featuring parallel processing and significant improvements in memory efficiency.
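For orientation, the options that the pipeline's updated `transcriptclean` task passes (see the `long-read-rna-pipeline.wdl` hunk below) translate to a standalone TranscriptClean v2 invocation roughly like the following sketch; the input file names and the output prefix are placeholders:

```bash
# sketch of the TranscriptClean v2 call as wired into the WDL task below;
# mapped.sam, reference.fasta and splice_junctions.txt are placeholder names
python3.7 $(which TranscriptClean.py) --sam mapped.sam \
    --genome reference.fasta \
    --spliceJns splice_junctions.txt \
    --maxLenIndel 5 \
    --maxSJOffset 5 \
    -m true \
    -i true \
    --correctSJs true \
    --primaryOnly \
    --outprefix rep1EXPERIMENT \
    --threads 16 \
    --canonOnly
```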
### TALON v4.2 @@ -47,6 +47,7 @@ A typical `input.json` is structured in the following way: "long_read_rna_pipeline.genome_build" : "GRCh38_chr19", "long_read_rna_pipeline.annotation_name" : "gencode_V24_chr19", "long_read_rna_pipeline.talon_prefixes" : ["FOO", "BAR"], + "long_read_rna_pipeline.canonical_only" : true, "long_read_rna_pipeline.init_talon_db_ncpus" : 2, "long_read_rna_pipeline.init_talon_db_ramGB" : 4, "long_read_rna_pipeline.init_talon_db_disks" : "local-disk 20 HDD", @@ -56,9 +57,6 @@ A typical `input.json` is structured in the following way: "long_read_rna_pipeline.transcriptclean_ncpus" : 1, "long_read_rna_pipeline.transcriptclean_ramGB" : 4, "long_read_rna_pipeline.transcriptclean_disks": "local-disk 20 HDD", - "long_read_rna_pipeline.filter_transcriptclean_ncpus" : 1, - "long_read_rna_pipeline.filter_transcriptclean_ramGB" : 4, - "long_read_rna_pipeline.filter_transcriptclean_disks" : "local-disk 20 HDD", "long_read_rna_pipeline.talon_ncpus" : 1, "long_read_rna_pipeline.talon_ramGB" : 4, "long_read_rna_pipeline.talon_disks" : "local-disk 20 HDD", @@ -79,19 +77,16 @@ The following elaborates on the meaning of each line in the input file. * `long_read_rna_pipeline.fastqs` Is a list of gzipped input fastqs, one file per replicate. * `long_read_rna_pipeline.reference_genome` Is the gzipped fasta file containing the reference genome used in mapping. Files for [human](https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/) and [mouse](https://www.encodeproject.org/files/mm10_no_alt_analysis_set_ENCODE/) are available on the [ENCODE Portal](https://https://www.encodeproject.org/). * `long_read_rna_pipeline.annotation` Is the gzipped gtf file containing the annotations. Files for [human V29](https://www.encodeproject.org/files/gencode.v29.primary_assembly.annotation_UCSC_names/) and [mouse M21](https://www.encodeproject.org/files/gencode.vM21.primary_assembly.annotation_UCSC_names/) are available on the [ENCODE Portal](https://https://www.encodeproject.org/). -* `long_read_rna_pipeline.variants` Is the gzipped vcf file containing variants. File for [human](https://www.encodeproject.org/files/dbsnp-variants-00-common_all/) available on the [ENCODE Portal](https://https://www.encodeproject.org/). Not used for mouse, this input is optional and can be left undefined. +* `long_read_rna_pipeline.variants` Is the gzipped vcf file containing variants. A file for [human](https://www.encodeproject.org/files/ENCFF911UGW/) is available on the [ENCODE Portal](https://www.encodeproject.org/). The variants file used in the pipeline is derived from the dbsnp variants [file](https://www.encodeproject.org/files/ENCFF744NWL/) by modifying the chromosome names to match the ones in the reference genome. The original file uses short chromosome names (1, 2, 3, etc.) and the reference uses longer names (chr1, chr2, chr3, etc.). This input is optional and can be left undefined. Defining the variants input prevents TranscriptClean from correcting away known variants. * `long_read_rna_pipeline.splice_junctions` Is the splice junctions file, generated with `get-splice-junctions.wdl` workflow based on the annotation and reference genome. Files for [human](https://www.encodeproject.org/files/ENCFF055LPJ/) and [mouse](https://www.encodeproject.org/files/ENCFF495CGH/) are available on the [ENCODE Portal](https://https://www.encodeproject.org/). * `long_read_rna_pipeline.experiment_prefix` This will be a prefix for the output files.
* `long_read_rna_pipeline.input_type` Platform that was used for generating the data. Options are `pacbio` and `nanopore`. * `long_read_rna_pipeline.genome_build` Genome build name in the initial TALON database. This is internal metadata variable you typically do not need to touch. * `long_read_rna_pipeline.annotation_name` Annotation name in the initial TALON database. This is internal metadata variable you typically do not need to touch. * `long_read_rna_pipeline.talon_prefixes` This is a list of strings that, if provided, will be prefixes to the transcript names in the gtf generated by `create_gtf_from_talon_db`. If this is not defined, "TALON" will be the default prefix. Note, that if this list is defined, its length must be equal to the number of replicates. +* `long_read_rna_pipeline.canonical_only` If this option is set to true, TranscriptClean will output only transcripts that are either canonical or that contain annotated noncanonical junctions to the clean SAM and Fasta files at the end of the run. Set this parameter to false to output all transcripts. -Rest of the variables are for adjusting the computational resources of the pipeline tasks. - -### Note about resources - -The resources required by mapping task are quite typical for the mapping and we find that 16 cores with 60GB of memory get the job done quite fast. The resources required by TALON related tasks and filter transcriptclean are roughly 2cpus with 12GB memory. Right now we Transcriptclean is very memory intensive, using up to over 100GB of memory on full sized data, and we are working on making the task more memory efficient. +The rest of the variables are for adjusting the computational resources of the pipeline tasks. See [notes about resources](reference.md#note-about-resources) below for more details. ## Outputs @@ -110,16 +105,12 @@ The resources required by mapping task are quite typical for the mapping and we #### Task Transcriptclean * `corrected_sam` SAM file of corrected transcripts. Unmapped/non-primary transcript alignments from the input file are included in their original form. +* `corrected_bam` BAM file of corrected transcripts. Unmapped/non-primary transcript alignments from the input file are included in their original form. * `corrected_fasta` Fasta file of corrected transcript sequences. Unmapped transcripts from the input file are included in their original form. * `transcript_log ` Each row represents a transcript. The columns track the mapping status of the transcript, as well as how many errors of each type were found and corrected/not corrected in the transcript. * `transcript_error_log` Each row represents a potential error in a given transcript. The column values track whether the error was corrected or not and why. * `report` Report of the cleaning process in .pdf format. -#### Task Filter_transcriptclean - -* `filtered_sam` sam with noncanonical reads filtered, duplicates are removed and sorting is performed. Input to the TALON step. -* `filtered_bam` bam with noncanonical reads filtered, duplicates are removed and sorting is performed. - #### Task TALON * `talon_log` talon log file. @@ -138,6 +129,95 @@ The resources required by mapping task are quite typical for the mapping and we * `spearman` .json file with spearman correlation metric between the replicates. -#### Crowell output directory structure +### Note about resources + +The hardware resources needed to run the pipeline depend on the sequencing depth, so it is hard to give definitive values that suit everyone.
Further, some users may value time more than money, and vice versa. +The resources that get the mapping task finished in a reasonable amount of time are 16 cores with 60GB of RAM. The resources required by TALON-related tasks are roughly 2 CPUs with 12GB of memory. TranscriptClean should be given 16 CPUs and 60GB of memory. See the example inputs below, tested with real ENCODE data on Google Cloud. + +Splice junctions: + +``` +{ + "get_splice_junctions.annotation" : "gs://long_read_rna/splice_junctions/inputs/gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz", + "get_splice_junctions.reference_genome" : "gs://long_read_rna/splice_junctions/inputs/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz", + "get_splice_junctions.output_prefix" : "gencode_V29_splice_junctions", + "get_splice_junctions.ncpus" : 2, + "get_splice_junctions.ramGB" : 7, + "get_splice_junctions.disks" : "local-disk 50 SSD" +} +``` + +Main pipeline with mouse data: + +``` +{ + "long_read_rna_pipeline.fastqs": ["https://www.encodeproject.org/files/ENCFF103DSA/@@download/ENCFF103DSA.fastq.gz", "https://www.encodeproject.org/files/ENCFF309DMQ/@@download/ENCFF309DMQ.fastq.gz"], + "long_read_rna_pipeline.reference_genome": "https://www.encodeproject.org/files/mm10_no_alt_analysis_set_ENCODE/@@download/mm10_no_alt_analysis_set_ENCODE.fasta.gz", + "long_read_rna_pipeline.annotation": "https://www.encodeproject.org/files/gencode.vM21.primary_assembly.annotation_UCSC_names/@@download/gencode.vM21.primary_assembly.annotation_UCSC_names.gtf.gz", + "long_read_rna_pipeline.splice_junctions": "https://www.encodeproject.org/files/ENCFF495CGH/@@download/ENCFF495CGH.tsv", + "long_read_rna_pipeline.experiment_prefix": "ENCSR214HSG", + "long_read_rna_pipeline.input_type": "pacbio", + "long_read_rna_pipeline.genome_build": "mm10", + "long_read_rna_pipeline.annotation_name": "M21", + "long_read_rna_pipeline.talon_prefixes" : ["ENCLB278HSI", "ENCLB067AJF"], + "long_read_rna_pipeline.init_talon_db_ncpus" : 2, + "long_read_rna_pipeline.init_talon_db_ramGB" : 13, + "long_read_rna_pipeline.init_talon_db_disks" : "local-disk 150 SSD", + "long_read_rna_pipeline.minimap2_ncpus": 16, + "long_read_rna_pipeline.minimap2_ramGB": 60, + "long_read_rna_pipeline.minimap2_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.transcriptclean_ncpus": 16, + "long_read_rna_pipeline.transcriptclean_ramGB": 60, + "long_read_rna_pipeline.transcriptclean_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.talon_ncpus": 2, + "long_read_rna_pipeline.talon_ramGB": 13, + "long_read_rna_pipeline.talon_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.create_gtf_from_talon_db_ncpus" : 2, + "long_read_rna_pipeline.create_gtf_from_talon_db_ramGB" : 13, + "long_read_rna_pipeline.create_gtf_from_talon_db_disks" : "local-disk 150 HDD", + "long_read_rna_pipeline.create_abundance_from_talon_db_ncpus": 2, + "long_read_rna_pipeline.create_abundance_from_talon_db_ramGB": 13, + "long_read_rna_pipeline.create_abundance_from_talon_db_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.calculate_spearman_ncpus": 2, + "long_read_rna_pipeline.calculate_spearman_ramGB": 7, + "long_read_rna_pipeline.calculate_spearman_disks": "local-disk 100 SSD" +} +``` + +Main pipeline with human data: -Cromwell: Cromwell will store outputs for each task under directory
cromwell-executions/[WORKFLOW_ID]/call-[TASK_NAME]/shard-[IDX]. For all tasks [IDX] means a zero-based index for each replicate. +``` +{ + "long_read_rna_pipeline.fastqs": ["https://www.encodeproject.org/files/ENCFF281TNJ/@@download/ENCFF281TNJ.fastq.gz", "https://www.encodeproject.org/files/ENCFF475ORL/@@download/ENCFF475ORL.fastq.gz"], + "long_read_rna_pipeline.reference_genome": "https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz", + "long_read_rna_pipeline.annotation": "https://www.encodeproject.org/files/gencode.v29.primary_assembly.annotation_UCSC_names/@@download/gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz", + "long_read_rna_pipeline.variants": "https://storage.googleapis.com/documentation_runs/dbsnp-variants-00-common_all_long_chrnames.vcf.gz", + "long_read_rna_pipeline.splice_junctions": "https://www.encodeproject.org/files/ENCFF055LPJ/@@download/ENCFF055LPJ.tsv", + "long_read_rna_pipeline.experiment_prefix": "ENCSR706ANY", + "long_read_rna_pipeline.input_type": "pacbio", + "long_read_rna_pipeline.genome_build": "GRCh38", + "long_read_rna_pipeline.annotation_name": "V29", + "long_read_rna_pipeline.talon_prefixes" : ["ENCLB027JID", "ENCLB133UBC"], + "long_read_rna_pipeline.init_talon_db_ncpus" : 2, + "long_read_rna_pipeline.init_talon_db_ramGB" : 13, + "long_read_rna_pipeline.init_talon_db_disks" : "local-disk 150 SSD", + "long_read_rna_pipeline.minimap2_ncpus": 16, + "long_read_rna_pipeline.minimap2_ramGB": 60, + "long_read_rna_pipeline.minimap2_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.transcriptclean_ncpus": 16, + "long_read_rna_pipeline.transcriptclean_ramGB": 60, + "long_read_rna_pipeline.transcriptclean_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.talon_ncpus": 2, + "long_read_rna_pipeline.talon_ramGB": 13, + "long_read_rna_pipeline.talon_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.create_gtf_from_talon_db_ncpus" : 2, + "long_read_rna_pipeline.create_gtf_from_talon_db_ramGB" : 13, + "long_read_rna_pipeline.create_gtf_from_talon_db_disks" : "local-disk 150 HDD", + "long_read_rna_pipeline.create_abundance_from_talon_db_ncpus": 2, + "long_read_rna_pipeline.create_abundance_from_talon_db_ramGB": 13, + "long_read_rna_pipeline.create_abundance_from_talon_db_disks": "local-disk 150 SSD", + "long_read_rna_pipeline.calculate_spearman_ncpus": 2, + "long_read_rna_pipeline.calculate_spearman_ramGB": 7, + "long_read_rna_pipeline.calculate_spearman_disks": "local-disk 100 SSD" +} +``` diff --git a/get-splice-junctions.wdl b/get-splice-junctions.wdl index 24dc0f6..0d52432 100644 --- a/get-splice-junctions.wdl +++ b/get-splice-junctions.wdl @@ -1,8 +1,8 @@ # ENCODE long read rna pipeline: get splice junctions # Maintainer: Otto Jolanki -#CAPER docker quay.io/encode-dcc/long-read-rna-pipeline:v1.2 -#CAPER singularity docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.2 +#CAPER docker quay.io/encode-dcc/long-read-rna-pipeline:v1.3 +#CAPER singularity docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.3 workflow get_splice_junctions { # Inputs @@ -46,7 +46,7 @@ task get_splice_junctions_ { command <<< gzip -cd ${reference_genome} > ref.fasta rm ${reference_genome} - + if [ $(head -n 1 ref.fasta | awk '{print NF}') -gt 1 ]; then cat ref.fasta | awk
'{print $1}' > reference.fasta else diff --git a/long-read-rna-pipeline.wdl b/long-read-rna-pipeline.wdl index 5b238b7..d32601d 100644 --- a/long-read-rna-pipeline.wdl +++ b/long-read-rna-pipeline.wdl @@ -1,8 +1,8 @@ # ENCODE long read rna pipeline # Maintainer: Otto Jolanki -#CAPER docker quay.io/encode-dcc/long-read-rna-pipeline:v1.2 -#CAPER singularity docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.2 +#CAPER docker quay.io/encode-dcc/long-read-rna-pipeline:v1.3 +#CAPER singularity docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.3 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/longreadrna.output_definition.json workflow long_read_rna_pipeline { # Inputs @@ -10,7 +10,7 @@ workflow long_read_rna_pipeline { # File inputs # Input fastqs, gzipped. - Array[File] fastqs + Array[File] fastqs # Reference genome. Fasta format, gzipped. File reference_genome @@ -43,6 +43,11 @@ workflow long_read_rna_pipeline { String annotation_name + # If this option is set, TranscriptClean will only output transcripts that are either canonical + # or that contain annotated noncanonical junctions to the clean SAM and Fasta files at the end + # of the run. + Boolean canonical_only=true + # Resouces # Task init_talon_db @@ -63,12 +68,6 @@ workflow long_read_rna_pipeline { Int transcriptclean_ramGB String transcriptclean_disks - # Task filter_transcriptclean - - Int filter_transcriptclean_ncpus - Int filter_transcriptclean_ramGB - String filter_transcriptclean_disks - # Task talon Int talon_ncpus @@ -123,22 +122,15 @@ workflow long_read_rna_pipeline { splice_junctions = splice_junctions, variants = variants, output_prefix = "rep"+(i+1)+experiment_prefix, + canonical_only = canonical_only, ncpus = transcriptclean_ncpus, ramGB = transcriptclean_ramGB, disks = transcriptclean_disks, } - call filter_transcriptclean { input: - sam = transcriptclean.corrected_sam, - output_prefix = "rep"+(i+1)+experiment_prefix, - ncpus = filter_transcriptclean_ncpus, - ramGB = filter_transcriptclean_ramGB, - disks = filter_transcriptclean_disks, - } - call talon { input: talon_db = init_talon_db.database, - sam = filter_transcriptclean.filtered_sam, + sam = transcriptclean.corrected_sam, genome_build = genome_build, output_prefix = "rep"+(i+1)+experiment_prefix, platform = input_type, @@ -179,7 +171,7 @@ workflow long_read_rna_pipeline { rep2_idprefix = rep2_idprefix, output_prefix = experiment_prefix, ncpus = calculate_spearman_ncpus, - ramGB = calculate_spearman_ramGB, + ramGB = calculate_spearman_ramGB, disks = calculate_spearman_disks, } } @@ -190,7 +182,7 @@ task init_talon_db { String annotation_name String ref_genome_name String output_prefix - String? idprefix + String? idprefix Int ncpus Int ramGB String disks @@ -240,7 +232,7 @@ task minimap2 { > ${output_prefix}.sam \ 2> ${output_prefix}_minimap2.log fi - + if [ "${input_type}" == "nanopore" ]; then minimap2 -t ${ncpus} -ax splice -uf -k14 \ ${reference_genome} \ @@ -259,7 +251,7 @@ task minimap2 { File sam = glob("*.sam")[0] File bam = glob("*.bam")[0] File log = glob("*_minimap2.log")[0] - File mapping_qc = glob("*_mapping_qc.json")[0] + File mapping_qc = glob("*_mapping_qc.json")[0] } runtime { @@ -275,6 +267,7 @@ task transcriptclean { File splice_junctions File? 
variants String output_prefix + Boolean canonical_only Int ncpus Int ramGB String disks @@ -288,22 +281,29 @@ task transcriptclean { mv ref.fasta reference.fasta fi - python $(which TranscriptClean.py) --sam ${sam} \ + test -f ${variants} && gzip -cd ${variants} > variants.vcf + + + python3.7 $(which TranscriptClean.py) --sam ${sam} \ --genome reference.fasta \ --spliceJns ${splice_junctions} \ - ${if defined(variants) then "--variants <(gzip -cd ${variants})" else ""} \ + ${if defined(variants) then "--variants variants.vcf" else ""} \ --maxLenIndel 5 \ --maxSJOffset 5 \ -m true \ -i true \ --correctSJs true \ --primaryOnly \ - --outprefix ${output_prefix} + --outprefix ${output_prefix} \ + --threads ${ncpus} \ + ${if canonical_only then "--canonOnly" else ""} + samtools view -S -b ${output_prefix}_clean.sam > ${output_prefix}_clean.bam Rscript $(which generate_report.R) ${output_prefix} >>> output { + File corrected_bam = glob("*_clean.bam")[0] File corrected_sam = glob("*_clean.sam")[0] File corrected_fasta = glob("*_clean.fa")[0] File transcript_log = glob("*_clean.log")[0] @@ -318,31 +318,6 @@ task transcriptclean { } } -task filter_transcriptclean { - File sam - String output_prefix - Int ncpus - Int ramGB - String disks - - command { - python $(which filter_transcriptclean_result.py) --f ${sam} --o ${output_prefix + "_filtered.sam"} - samtools view -S -b ${output_prefix}_filtered.sam > ${output_prefix}_filtered.bam - } - - output { - File filtered_sam = glob("*_filtered.sam")[0] - File filtered_bam = glob("*_filtered.bam")[0] - } - - runtime { - cpu: ncpus - memory: "${ramGB} GB" - disks: disks - } - -} - task talon { File talon_db File sam @@ -488,4 +463,4 @@ task skipNfirstlines { memory: "${ramGB} GB" disks: disks } -} \ No newline at end of file +} diff --git a/output_definition.json b/output_definition.json index cc024f3..d127668 100644 --- a/output_definition.json +++ b/output_definition.json @@ -23,6 +23,10 @@ "path": "transcriptclean/rep${i+1}/${basename}", "table": "Transcriptclean/Replicate ${i+1}/cleaned sam" }, + "corrected_bam": { + "path": "transcriptclean/rep${i+1}/${basename}", + "table": "Transcriptclean/Replicate ${i+1}/cleaned bam" + }, "corrected_fasta": { "path": "transcriptclean/rep${i+1}/${basename}", "table": "Transcriptclean/Replicate ${i+1}/cleaned fasta" @@ -41,17 +45,6 @@ } }, - "long_read_rna_pipeline.filter_transcriptclean": { - "filtered_sam": { - "path": "filter_transcriptclean/rep${i+1}/${basename}", - "table": "Filter_Transcriptclean/Replicate ${i+1}/filtered cleaned sam" - }, - "filtered_bam": { - "path": "filter_transcriptclean/rep${i+1}/${basename}", - "table": "Filter_Transcriptclean/Replicate ${i+1}/filtered cleaned bam" - } - }, - "long_read_rna_pipeline.talon": { "talon_config": { "path": "talon/rep${i+1}/${basename}", diff --git a/test/test_task/test_filter_transcriptclean.wdl b/test/test_task/test_filter_transcriptclean.wdl deleted file mode 100644 index 24a2d08..0000000 --- a/test/test_task/test_filter_transcriptclean.wdl +++ /dev/null @@ -1,31 +0,0 @@ -# Test workflow for transcriptclean task in ENCODE long read rna pipeline - -import "../../long-read-rna-pipeline.wdl" as longrna - -workflow test_filter_transcriptclean { - File sam - String output_prefix - Int lines_to_skip - String output_fn - Int ncpus - Int ramGB - String disks - - call longrna.filter_transcriptclean { input: - sam = sam, - output_prefix = output_prefix, - ncpus = ncpus, - ramGB = ramGB, - disks = disks - } - - call longrna.skipNfirstlines { input: - input_file = 
filter_transcriptclean.filtered_sam, - output_fn = output_fn, - lines_to_skip = lines_to_skip, - ncpus = ncpus, - ramGB = ramGB, - disks = disks, - } - -} \ No newline at end of file diff --git a/test/test_task/test_filter_transcriptclean_input.json b/test/test_task/test_filter_transcriptclean_input.json deleted file mode 100644 index ec1b76e..0000000 --- a/test/test_task/test_filter_transcriptclean_input.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "test_filter_transcriptclean.sam" : "test_data/from_transcriptclean.sam", - "test_filter_transcriptclean.output_prefix" : "TEST_FILTER_TRANSCRIPTCLEAN", - "test_filter_transcriptclean.lines_to_skip" : 3, - "test_filter_transcriptclean.output_fn" : "TEST_FILTER_TRANSCRIPTCLEAN_noheader_filtered.sam", - "test_filter_transcriptclean.ncpus" : 2, - "test_filter_transcriptclean.ramGB" : 4, - "test_filter_transcriptclean.disks" : "local-disk 20 HDD" -} \ No newline at end of file diff --git a/test/test_task/test_filter_transcriptclean_reference_md5.json b/test/test_task/test_filter_transcriptclean_reference_md5.json deleted file mode 100644 index 7f0f841..0000000 --- a/test/test_task/test_filter_transcriptclean_reference_md5.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "TEST_FILTER_TRANSCRIPTCLEAN_noheader_filtered.sam" : "4c8a7891d6f2c5b54b8787a8a3303a5c" -} \ No newline at end of file diff --git a/test/test_task/test_transcriptclean.wdl b/test/test_task/test_transcriptclean.wdl index c2f3d10..865a3d0 100644 --- a/test/test_task/test_transcriptclean.wdl +++ b/test/test_task/test_transcriptclean.wdl @@ -6,6 +6,7 @@ workflow test_transcriptclean { File sam File reference_genome File splice_junctions + Boolean canonical_only File? variants String output_prefix Int lines_to_skip @@ -20,6 +21,7 @@ workflow test_transcriptclean { splice_junctions = splice_junctions, variants = variants, output_prefix = output_prefix, + canonical_only = canonical_only, ncpus = ncpus, ramGB = ramGB, disks = disks, diff --git a/test/test_task/test_transcriptclean_input.json b/test/test_task/test_transcriptclean_input.json index 61e0320..92abbe7 100644 --- a/test/test_task/test_transcriptclean_input.json +++ b/test/test_task/test_transcriptclean_input.json @@ -1,12 +1,14 @@ { - "test_transcriptclean.sam" : "test_data/from_minimap2.sam", - "test_transcriptclean.reference_genome" : "test_data/GRCh38_no_alt_analysis_set_GCA_000001405.15_chr19_only.fasta.gz", - "test_transcriptclean.splice_junctions" : "test_data/splice_junctions.txt", - "test_transcriptclean.variants" : "test_data/00-common_chr19_only.vcf.gz", - "test_transcriptclean.output_prefix" : "TEST_TRANSCRIPTCLEAN", - "test_transcriptclean.lines_to_skip" : 2, - "test_transcriptclean.output_fn" : "TEST_TRANSCRIPTCLEAN_noheader_clean.sam", - "test_transcriptclean.ncpus" : 2, - "test_transcriptclean.ramGB" : 4, - "test_transcriptclean.disks" : "local-disk 20 HDD" -} \ No newline at end of file + "test_transcriptclean.disks": "local-disk 20 HDD", + "test_transcriptclean.lines_to_skip": 2, + "test_transcriptclean.ncpus": 2, + "test_transcriptclean.output_fn": "TEST_TRANSCRIPTCLEAN_noheader_clean.sam", + "test_transcriptclean.output_prefix": "TEST_TRANSCRIPTCLEAN", + "test_transcriptclean.ramGB": 4, + "test_transcriptclean.reference_genome": "test_data/GRCh38_no_alt_analysis_set_GCA_000001405.15_chr19_only.fasta.gz", + "test_transcriptclean.sam": "test_data/from_minimap2.sam", + "test_transcriptclean.splice_junctions": "test_data/splice_junctions.txt", + "test_transcriptclean.variants": 
"test_data/00-common_chr19_only_expanded_chrnames.vcf.gz", + "test_transcriptclean.canonical_only" : true + +} diff --git a/test/test_task/test_transcriptclean_reference_md5.json b/test/test_task/test_transcriptclean_reference_md5.json index a812d86..57b0b7e 100644 --- a/test/test_task/test_transcriptclean_reference_md5.json +++ b/test/test_task/test_transcriptclean_reference_md5.json @@ -1,6 +1,6 @@ { - "TEST_TRANSCRIPTCLEAN_clean.log" : "0cac2cb51ba1839eaeb1078201aea191", - "TEST_TRANSCRIPTCLEAN_clean.TE.log" : "2dabfde5147e9cef8964334f52b13622", - "TEST_TRANSCRIPTCLEAN_clean.fa" : "d72fd30a4e0be28ddaff3713bf7a6b7b", - "TEST_TRANSCRIPTCLEAN_noheader_clean.sam" : "a39dc5d6661c20a51bab192c2419430a" + "TEST_TRANSCRIPTCLEAN_clean.log" : "eaf281d39087034e66c3fce4595d7898", + "TEST_TRANSCRIPTCLEAN_clean.TE.log" : "078be510c933242636a466091c075bb6", + "TEST_TRANSCRIPTCLEAN_clean.fa" : "58806053ec670ef74aa6326547ac165c", + "TEST_TRANSCRIPTCLEAN_noheader_clean.sam" : "08c8969787896ae3faf1dcd5890f8e9d" } \ No newline at end of file diff --git a/test/test_workflow/test_workflow_2reps_input.json b/test/test_workflow/test_workflow_2reps_input.json index 9760760..4bcdd92 100644 --- a/test/test_workflow/test_workflow_2reps_input.json +++ b/test/test_workflow/test_workflow_2reps_input.json @@ -2,7 +2,7 @@ "long_read_rna_pipeline.fastqs" : ["test_data/chr19_test_10000_reads.fastq.gz", "test_data/chr19_test_10000_reads_rep2.fastq.gz"], "long_read_rna_pipeline.reference_genome" : "test_data/GRCh38_no_alt_analysis_set_GCA_000001405.15_chr19_only.fasta.gz", "long_read_rna_pipeline.annotation" : "test_data/gencode.v24.annotation_chr19.gtf.gz", - "long_read_rna_pipeline.variants" : "test_data/00-common_chr19_only.vcf.gz", + "long_read_rna_pipeline.variants" : "test_data/00-common_chr19_only_expanded_chrnames.vcf.gz", "long_read_rna_pipeline.splice_junctions" : "test_data/splice_junctions.txt", "long_read_rna_pipeline.experiment_prefix" : "TEST_WORKFLOW", "long_read_rna_pipeline.input_type" : "pacbio", @@ -18,9 +18,7 @@ "long_read_rna_pipeline.transcriptclean_ncpus" : 1, "long_read_rna_pipeline.transcriptclean_ramGB" : 4, "long_read_rna_pipeline.transcriptclean_disks": "local-disk 20 HDD", - "long_read_rna_pipeline.filter_transcriptclean_ncpus" : 1, - "long_read_rna_pipeline.filter_transcriptclean_ramGB" : 4, - "long_read_rna_pipeline.filter_transcriptclean_disks" : "local-disk 20 HDD", + "long_read_rna_pipeline.canonical_only" : true, "long_read_rna_pipeline.talon_ncpus" : 1, "long_read_rna_pipeline.talon_ramGB" : 4, "long_read_rna_pipeline.talon_disks" : "local-disk 20 HDD", diff --git a/test/test_workflow/test_workflow_2reps_reference_md5.json b/test/test_workflow/test_workflow_2reps_reference_md5.json index 717ea91..8b21a11 100644 --- a/test/test_workflow/test_workflow_2reps_reference_md5.json +++ b/test/test_workflow/test_workflow_2reps_reference_md5.json @@ -1,19 +1,19 @@ { "rep1TEST_WORKFLOW_mapping_qc.json": "5745442e83a3559a2789ce7ea6c2ac42", "rep2TEST_WORKFLOW_mapping_qc.json": "5745442e83a3559a2789ce7ea6c2ac42", - "rep1TEST_WORKFLOW_clean.log": "0cac2cb51ba1839eaeb1078201aea191", - "rep2TEST_WORKFLOW_clean.log": "b64cd8514e6c6083ea860d1f256cc1ae", - "rep1TEST_WORKFLOW_clean.TE.log": "2dabfde5147e9cef8964334f52b13622", - "rep2TEST_WORKFLOW_clean.TE.log": "8c55ce40bf57ee916979c98d57a32c2e", - "rep1TEST_WORKFLOW_clean.fa": "d72fd30a4e0be28ddaff3713bf7a6b7b", - "rep2TEST_WORKFLOW_clean.fa": "d3d2ae80c0ddd1b4f71c72c7b9962e45", - "rep1TEST_WORKFLOW_talon_QC.log": "baf3bfe66412deaf633d7c9117bf0f8f", - 
"rep2TEST_WORKFLOW_talon_QC.log": "9e4439ac4296cf50c620ed47c9967c85", - "rep1TEST_WORKFLOW_number_of_genes_detected.json": "cdc07eedbe282f6f7d049250cae03044", - "rep2TEST_WORKFLOW_number_of_genes_detected.json": "8bde50c9df34bdb693be5e2254d5e13b", - "rep1TEST_WORKFLOW_talon_abundance.tsv": "ff8bca8ea57db7979d0b7dde1ec0478d", - "rep2TEST_WORKFLOW_talon_abundance.tsv": "4230cf2ab5c062ba76d3b02126bd7c06", - "rep1TEST_WORKFLOW_talon.gtf" : "68fcacc9dc483cf3c0a3a40f95ea227a", - "rep2TEST_WORKFLOW_talon.gtf" : "5d71b906fc4977a59b7915c0a24e3fee", - "TEST_WORKFLOW_spearman.json": "6810dfbebd880a657579c4bdf9ef50c6" + "rep1TEST_WORKFLOW_clean.log": "eaf281d39087034e66c3fce4595d7898", + "rep2TEST_WORKFLOW_clean.log": "7004a8970eab5d0c9559bda607316d37", + "rep1TEST_WORKFLOW_clean.TE.log": "078be510c933242636a466091c075bb6", + "rep2TEST_WORKFLOW_clean.TE.log": "6fd5820cfa7e69ec1a3b429178aaec05", + "rep1TEST_WORKFLOW_clean.fa": "58806053ec670ef74aa6326547ac165c", + "rep2TEST_WORKFLOW_clean.fa": "5509a5c5af095160b7b9f050b0caebf7", + "rep1TEST_WORKFLOW_talon_QC.log": "470eb3417e59eeb4c90cab9a7f7eba1c", + "rep2TEST_WORKFLOW_talon_QC.log": "fb2ff0c4c7f1abf920673d0e253c491f", + "rep1TEST_WORKFLOW_number_of_genes_detected.json": "dc7199a61f01492dca070d65103c73e4", + "rep2TEST_WORKFLOW_number_of_genes_detected.json": "46ceb49cb58e9a5ba3cee2496d509d81", + "rep1TEST_WORKFLOW_talon_abundance.tsv": "8eee04b5758586f3dc72341f80f61ca7", + "rep2TEST_WORKFLOW_talon_abundance.tsv": "e466111e7093d2521c101c711a074552", + "rep1TEST_WORKFLOW_talon.gtf" : "d984f55ac47b44bf3238fa407ed07a84", + "rep2TEST_WORKFLOW_talon.gtf" : "a6a12616e2cb8211edddbd61be32b4ff", + "TEST_WORKFLOW_spearman.json": "64056dc27e16b5f96e2ab5c0902b76cf" } \ No newline at end of file diff --git a/test/test_workflow/test_workflow_input.json b/test/test_workflow/test_workflow_input.json index a86a981..3858be7 100644 --- a/test/test_workflow/test_workflow_input.json +++ b/test/test_workflow/test_workflow_input.json @@ -2,12 +2,13 @@ "long_read_rna_pipeline.fastqs" : ["test_data/chr19_test_10000_reads.fastq.gz"], "long_read_rna_pipeline.reference_genome" : "test_data/GRCh38_no_alt_analysis_set_GCA_000001405.15_chr19_only.fasta.gz", "long_read_rna_pipeline.annotation" : "test_data/gencode.v24.annotation_chr19.gtf.gz", - "long_read_rna_pipeline.variants" : "test_data/00-common_chr19_only.vcf.gz", + "long_read_rna_pipeline.variants" : "test_data/00-common_chr19_only_expanded_chrnames.vcf.gz", "long_read_rna_pipeline.splice_junctions" : "test_data/splice_junctions.txt", "long_read_rna_pipeline.experiment_prefix" : "TEST_WORKFLOW", "long_read_rna_pipeline.input_type" : "pacbio", "long_read_rna_pipeline.genome_build" : "GRCh38_chr19", "long_read_rna_pipeline.annotation_name" : "gencode_V24_chr19", + "long_read_rna_pipeline.canonical_only" : true, "long_read_rna_pipeline.init_talon_db_ncpus" : 2, "long_read_rna_pipeline.init_talon_db_ramGB" : 4, "long_read_rna_pipeline.init_talon_db_disks" : "local-disk 20 HDD", @@ -17,9 +18,6 @@ "long_read_rna_pipeline.transcriptclean_ncpus" : 2, "long_read_rna_pipeline.transcriptclean_ramGB" : 4, "long_read_rna_pipeline.transcriptclean_disks": "local-disk 20 HDD", - "long_read_rna_pipeline.filter_transcriptclean_ncpus" : 2, - "long_read_rna_pipeline.filter_transcriptclean_ramGB" : 4, - "long_read_rna_pipeline.filter_transcriptclean_disks" : "local-disk 20 HDD", "long_read_rna_pipeline.talon_ncpus" : 2, "long_read_rna_pipeline.talon_ramGB" : 4, "long_read_rna_pipeline.talon_disks" : "local-disk 20 HDD", diff --git 
a/test/test_workflow/test_workflow_reference_md5.json b/test/test_workflow/test_workflow_reference_md5.json index 5db9945..7067394 100644 --- a/test/test_workflow/test_workflow_reference_md5.json +++ b/test/test_workflow/test_workflow_reference_md5.json @@ -1,10 +1,10 @@ { -"rep1TEST_WORKFLOW_mapping_qc.json" : "5745442e83a3559a2789ce7ea6c2ac42", -"rep1TEST_WORKFLOW_clean.log" : "0cac2cb51ba1839eaeb1078201aea191", -"rep1TEST_WORKFLOW_clean.TE.log" : "2dabfde5147e9cef8964334f52b13622", -"rep1TEST_WORKFLOW_clean.fa" : "d72fd30a4e0be28ddaff3713bf7a6b7b", -"rep1TEST_WORKFLOW_talon_QC.log" : "baf3bfe66412deaf633d7c9117bf0f8f", -"rep1TEST_WORKFLOW_number_of_genes_detected.json" : "cdc07eedbe282f6f7d049250cae03044", -"rep1TEST_WORKFLOW_talon_abundance.tsv" : "a5933d20c5e9f3fa54b4ff1a02cb0e63", -"rep1TEST_WORKFLOW_talon.gtf" : "9894ac64a90ee42a279d1249f5ee9795" -} \ No newline at end of file + "rep1TEST_WORKFLOW_clean.TE.log": "078be510c933242636a466091c075bb6", + "rep1TEST_WORKFLOW_clean.fa": "58806053ec670ef74aa6326547ac165c", + "rep1TEST_WORKFLOW_clean.log": "eaf281d39087034e66c3fce4595d7898", + "rep1TEST_WORKFLOW_mapping_qc.json": "5745442e83a3559a2789ce7ea6c2ac42", + "rep1TEST_WORKFLOW_number_of_genes_detected.json": "dc7199a61f01492dca070d65103c73e4", + "rep1TEST_WORKFLOW_talon.gtf": "815c9f718ea55018a48e851c02d27728", + "rep1TEST_WORKFLOW_talon_QC.log": "470eb3417e59eeb4c90cab9a7f7eba1c", + "rep1TEST_WORKFLOW_talon_abundance.tsv": "096ed5cbf46580fb05f37b64333e5aa8" +} diff --git a/test_data/00-common_chr19_only.vcf.gz b/test_data/00-common_chr19_only_expanded_chrnames.vcf.gz similarity index 75% rename from test_data/00-common_chr19_only.vcf.gz rename to test_data/00-common_chr19_only_expanded_chrnames.vcf.gz index 3f5247e..9a0239c 100644 Binary files a/test_data/00-common_chr19_only.vcf.gz and b/test_data/00-common_chr19_only_expanded_chrnames.vcf.gz differ
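As an aside on the `*_reference_md5.json` files updated above: each one maps an expected output file name to its md5 checksum. A minimal sketch for spot-checking a local test run against one of them, assuming GNU coreutils and `jq` are available and the outputs have been collected into the working directory:

```bash
# compare collected outputs against a reference md5 json
ref=test/test_workflow/test_workflow_reference_md5.json
for f in $(jq -r 'keys[]' "$ref"); do
    expected=$(jq -r --arg k "$f" '.[$k]' "$ref")
    actual=$(md5sum "$f" | cut -d ' ' -f 1)
    [ "$expected" = "$actual" ] && echo "OK   $f" || echo "FAIL $f"
done
```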