
Commit

Merge pull request #43 from ENCODE-DCC/dev1.3
Dev1.3
ottojolanki authored Oct 24, 2019
2 parents 3a9f89b + 1001b2c commit 59f971c
Showing 22 changed files with 297 additions and 296 deletions.
12 changes: 2 additions & 10 deletions .circleci/config.yml
@@ -619,12 +619,6 @@ workflows:
- test_transcriptclean_singularity:
requires:
- build
- test_filter_transcriptclean_docker:
requires:
- build
- test_filter_transcriptclean_singularity:
requires:
- build
- test_init_talon_database_docker:
requires:
- build
@@ -662,15 +656,15 @@ workflows:
requires:
- build
- test_workflow_onerep_docker:
<<: *only_dev_and_master
# <<: *only_dev_and_master
requires:
- build
- test_workflow_onerep_singularity:
<<: *only_dev_and_master
requires:
- build
- test_workflow_tworep_docker:
<<: *only_dev_and_master
# <<: *only_dev_and_master
requires:
- build
- test_workflow_tworep_singularity:
@@ -685,8 +679,6 @@ workflows:
- test_minimap2_singularity
- test_transcriptclean_docker
- test_transcriptclean_singularity
- test_filter_transcriptclean_docker
- test_filter_transcriptclean_singularity
- test_init_talon_database_docker
- test_init_talon_database_singularity
- test_talon_docker
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
[flake8]
ignore = E501, W503, W605, E203
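
A minimal sketch of how this file is consumed (assuming `flake8` is installed from PyPI; it picks up `.flake8` from the repository root automatically):

```bash
# Install flake8 and lint the repository; the ignore list above is applied automatically.
pip install flake8
flake8 .
```
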
7 changes: 7 additions & 0 deletions .isort.cfg
@@ -0,0 +1,7 @@
[settings]
known_third_party = dataframe_utils,pandas,qc_utils
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
30 changes: 30 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,30 @@
- repo: https://github.com/psf/black
  rev: 19.3b0
  hooks:
    - id: black
      language_version: python3.7

- repo: https://github.com/asottile/seed-isort-config
  rev: v1.9.2
  hooks:
    - id: seed-isort-config

- repo: https://github.com/pre-commit/mirrors-isort
  rev: v4.3.21
  hooks:
    - id: isort
      language_version: python3.7

- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v2.2.3
  hooks:
    - id: flake8
    - id: trailing-whitespace
      exclude: docs/\w+\.md
    - id: end-of-file-fixer
    - id: debug-statements
    - id: check-json
    - id: pretty-format-json
      args:
        - --autofix
    - id: check-yaml
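
A minimal sketch of using this hook configuration locally (assuming `pre-commit` is installed from PyPI):

```bash
# Register the hooks with git once, then run every hook against the whole tree.
pip install pre-commit
pre-commit install
pre-commit run --all-files
```
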
34 changes: 25 additions & 9 deletions Dockerfile
@@ -3,7 +3,7 @@ FROM ubuntu:16.04
MAINTAINER Otto Jolanki

RUN apt-get update && apt-get install -y software-properties-common
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
python \
cython \
@@ -14,12 +14,14 @@ RUN apt-get update && apt-get install -y \
gdebi \
#pybedtools dependency
libz-dev \
bedtools=2.25.0-1 \
#samtools dependencies
libbz2-dev \
libncurses5-dev \
git \
python3.7
python3.7 \
python3.7-dev \
libssl-dev \
build-essential

RUN mkdir /software
WORKDIR /software
@@ -46,17 +48,25 @@ RUN yes | gdebi r-recommended_3.3.2-1xenial0_all.deb
RUN wget https://cran.r-project.org/bin/linux/ubuntu/xenial/r-base_3.3.2-1xenial0_all.deb
RUN yes | gdebi r-base_3.3.2-1xenial0_all.deb

# clear apt lists
RUN rm -rf /var/lib/apt/lists/*

# Install R packages

RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; options(repos = r);" > ~/.Rprofile && \
Rscript -e "install.packages('ggplot2')" && \
Rscript -e "install.packages('gridExtra')" && \
Rscript -e "install.packages('readr')"

# Install Intervaltree 2.1.0
# Install TC dependencies
RUN python3.7 -m pip install --upgrade pip
RUN python3.7 -m pip install cython
RUN python3.7 -m pip install pybedtools==0.8.0 pyfasta==0.5.2 numpy pandas

# splice junction finding accessory script from TC still runs in python2 and requires pyfasta, which in turn requires numpy

RUN pip install --upgrade pip
RUN pip install intervaltree==2.1.0 pybedtools==0.7.8 pyfasta==0.5.2 numpy pandas
RUN python -m pip install --upgrade pip
RUN python -m pip install pyfasta==0.5.2 numpy

# Install qc-utils to python 3.7

@@ -66,9 +76,16 @@ RUN python3.7 -m pip install qc-utils==19.8.1

RUN python3.7 -m pip install pandas scipy

# Get transcriptclean v1.0.8
# Install bedtools 2.29

RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.29.0/bedtools-2.29.0.tar.gz
RUN tar xzvf bedtools-2.29.0.tar.gz
RUN cd bedtools2/ && make
ENV PATH="/software/bedtools2/bin:${PATH}"

RUN git clone -b 'v1.0.8' --single-branch https://github.com/dewyman/TranscriptClean.git
# Get transcriptclean v2.0.2

RUN git clone -b 'v2.0.2' --single-branch https://github.com/dewyman/TranscriptClean.git
RUN chmod 755 TranscriptClean/accessory_scripts/* TranscriptClean/TranscriptClean.py TranscriptClean/generate_report.R
ENV PATH "/software/TranscriptClean/accessory_scripts:/software/TranscriptClean:${PATH}"

@@ -94,4 +111,3 @@ ARG BRANCH
ENV BUILD_BRANCH=${BRANCH}
ARG BUILD_TAG
ENV MY_TAG=${BUILD_TAG}
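
As a rough sanity check of the updated image (a sketch; the tag `long-read-rna:dev` is illustrative and not part of this diff), the new bedtools 2.29 build and the python3.7 dependencies can be spot-checked like this:

```bash
# Build from the repository root, then confirm the tools added in this change are on PATH.
docker build -t long-read-rna:dev .
docker run --rm long-read-rna:dev bedtools --version
docker run --rm long-read-rna:dev python3.7 -c "import pybedtools, pandas; print('imports ok')"
```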

79 changes: 17 additions & 62 deletions docs/howto.md
@@ -1,14 +1,14 @@
# HOWTO

Here are concrete instructions for running analyses on different platforms.
Before following these instructions, make sure you have completed installation and possible account setup detailed in [installation instructions](installation.md). These instructions show how to use Cromwell directly. Consider running the pipeline using [Caper](https://github.com/ENCODE-DCC/caper), which is a more user-friendly way.
Here are recipes for running analyses on different platforms.
Before following these instructions, make sure you have completed installation and possible account setup detailed in [installation instructions](installation.md). Note that although running the pipeline directly with Cromwell is still possible, using [caper](https://github.com/ENCODE-DCC/caper) is the canonical, supported, and official way to run ENCODE Uniform Processing Pipelines. The examples below use the command `caper run`, which is the simplest way to run a single pipeline instance. For running multiple pipelines in a production setting, we recommend using a caper server. For details on setting up the server, refer to the [caper documentation](https://github.com/ENCODE-DCC/caper/blob/master/DETAILS.md#usage).

# CONTENTS
Note that the files used in these examples are first restricted to reads from chromosome 19 and then further subsampled to 10,000 reads. The CPU and memory resources reflect the size of the inputs. For resource guidelines with full-sized data, see the discussion [here](reference.md#note-about-resources).

## Running Workflows
# CONTENTS

[Google Cloud](howto.md#google-cloud)
[SLURM](howto.md#slurm-singularity)
[Other Platforms](howto.md#other-platforms)
[Splice Junctions](howto.md#splice-junctions)


@@ -66,84 +66,39 @@ The goal is to run the pipeline with test data using Google Cloud Platform.
}
```

5. Get cromwell 40:
5. Run the pipeline using caper:

```bash
wget -N -c https://github.com/broadinstitute/cromwell/releases/download/40/cromwell-40.jar
$ caper run long-read-rna-pipeline.wdl -i input.json -b gcp -m testrun_metadata.json
```

6. Run the pipeline:
6. Run croo to make finding outputs easier:

```bash
$ java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=google -Dbackend.providers.google.config.project=YOUR_PROJECT -Dbackend.providers.google.config.root=gs://YOUR_BUCKET_NAME/output cromwell-40.jar run long-read-rna-pipeline.wdl -i input.json -o workflow_opts/docker.json -m metadata.json
$ croo testrun_metadata.json --out-dir gs://[YOUR_BUCKET_NAME]/croo_out
```

7. See the outputs in `gs://YOUR_BUCKET_NAME/output`. You can also use [croo](https://github.com/ENCODE-DCC/croo) to organize the outputs before taking a look. The required configuration json file `output_definition.json` is provided with this repo.
This command writes an HTML table into the bucket that shows the locations of the outputs, nicely organized. Note that if your output bucket is not public, you need to be logged into your Google account to be able to follow the links.

## SLURM Singularity
## Other platforms

For this example you need to have Singularity installed. For details see [installation instructions](installation.md). The goal is to run the pipeline with test data using Singularity on a SLURM cluster. Log in to your cluster first and then follow the instructions.
When running workflows on SLURM (or other) HPC clusters, use [Caper](https://github.com/ENCODE-DCC/caper); it takes care of the backend configuration for you.
Running on other platforms is similar, because caper takes care of the details for you. See the [caper documentation](https://github.com/ENCODE-DCC/caper#installation) for further details.
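
For example, a sketch of a test run on a SLURM cluster, assuming caper has already been configured for the cluster as described in its documentation (the backend name `slurm` is taken from caper's docs, not from this diff):

```bash
# Run the two-replicate test workflow through caper's SLURM backend.
caper run long-read-rna-pipeline.wdl \
  -i test/test_workflow/test_workflow_2reps_input.json \
  -b slurm
```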

1. Get the code and move into the code directory:

```bash
git clone https://github.com/ENCODE-DCC/long-read-rna-pipeline.git
cd long-read-rna-pipeline
```

3. Build the singularity image for the pipeline. The following pulls the pipeline docker image, and uses that to construct the singularity image. The image will be stored in `~/.singularity`. It is bad practice to build images (or do any other intensive work) on login nodes. For this reason we will first invoke an interactive session on a different node by running the `sdev` command, and build there (it will take a few seconds to get back into the shell after running `sdev`).

```bash
sdev
mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name long_read_rna_pipeline-v1.0.simg -F docker://quay.io/encode-dcc/long-read-rna-pipeline:v1.0
exit #this takes you back to the login node
```

Note: If you want to store your inputs in `/in/some/data/directory1` and `/in/some/data/directory2` you must edit `workflow_opts/singularity.json` in the following way:
```
{
  "default_runtime_attributes" : {
    "singularity_container" : "~/.singularity/long-read-rna-pipeline-v1.0.simg",
    "singularity_bindpath" : "~/, /in/some/data/directory1/, /in/some/data/directory2/"
  }
}
```
## Using Singularity

4. Install caper. Python 3.4.1 or newer is required.

```bash
pip install caper
```

5. Follow [Caper configuration instructions](https://github.com/ENCODE-DCC/caper#configuration-file).

Note: In the Caper configuration file, you will need to give a value to the `--time` parameter by editing the `slurm-extra-param` line. For example:
```
slurm-extra-param=--time=01:00:00
```
to give one hour of runtime.

6. Edit the input file `test/test_workflow/test_workflow_2reps_input.json` so that all the input file paths are absolute.
For example, replace `test_data/chr19_test_10000_reads.fastq.gz` in the fastq inputs with `[PATH-TO-REPO]/test_data/chr19_test_10000_reads.fastq.gz`. You can find out the `[PATH-TO-REPO]` by running the `pwd` command in the `long-read-rna-pipeline` directory.

7. Run the pipeline using Caper:

```bash
caper run -i test/test_workflow/test_workflow_2reps_input.json -o workflow_opts/singularity.json -m metadata.json
```
Caper comes with built-in support for Singularity via the `--singularity` option. See the [caper documentation](https://github.com/ENCODE-DCC/caper/blob/master/DETAILS.md) for more information.
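
A minimal sketch of a local run with Singularity (the test input file is the one used elsewhere in this repo; the `--singularity` flag as noted above):

```bash
# Same caper invocation as before, but executing tasks in the Singularity image instead of Docker.
caper run long-read-rna-pipeline.wdl \
  -i test/test_workflow/test_workflow_2reps_input.json \
  --singularity
```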

## Splice junctions

You may want to run the pipeline using references other than the ones used by ENCODE. In this case you must prepare your own splice junctions file. The workflow for this, `get-splice-junctions.wdl`, is included in this repo. It uses the same Docker/Singularity images as the main pipeline, and it is run in exactly the same way as the main pipeline.
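
For example, once an `input.json` like the one below is filled in, a run on Google Cloud is the same kind of caper invocation as for the main pipeline (a sketch, not part of this diff):

```bash
# Run the splice junction workflow with the same caper command used for the main pipeline.
caper run get-splice-junctions.wdl -i input.json -b gcp
```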

`input.json` for the splice junction workflow with gencode v24 annotation and the GRCh38 reference genome looks like this:
`input.json` for the splice junction workflow with gencode V29 annotation and the GRCh38 reference genome looks like this:

```
{
"get_splice_junctions.annotation" : "gs://long_read_rna/splice_junctions/inputs/gencode.v24.primary_assembly.annotation.gtf.gz",
"get_splice_junctions.annotation" : "gs://long_read_rna/splice_junctions/inputs/gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz",
"get_splice_junctions.reference_genome" : "gs://long_read_rna/splice_junctions/inputs/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz",
"get_splice_junctions.output_prefix" : "gencode_V24_splice_junctions",
"get_splice_junctions.output_prefix" : "gencode_V29_splice_junctions",
"get_splice_junctions.ncpus" : 2,
"get_splice_junctions.ramGB" : 7,
"get_splice_junctions.disks" : "local-disk 50 SSD"
72 changes: 30 additions & 42 deletions docs/installation.md
@@ -1,65 +1,53 @@
# INSTALLATION

To run the pipeline you need to install the following software. Running the pipeline on Google Cloud requires additional setup, detailed below.
To run the pipeline you need to do some setup. The exact steps you need to take depend on the platform you are running the pipeline on, and are detailed below and in [HOWTO](howto.md). Independent of platform, the pipeline is run using [caper](https://github.com/ENCODE-DCC/caper), and (optional but recommended) output organization is done using [croo](https://github.com/ENCODE-DCC/croo). Both `caper` and `croo` require `python` version 3.4.1 or newer.

## Caper

Using the execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution) directly involves complicated backend configuration, workflow options, and command-line parameters. Caper hides this complexity and consolidates the configuration into one file. Caper is available on [PyPI](https://pypi.org/project/caper/) and is installed by running:

```bash
$ pip install caper
```

Note that the conda run mode described in the caper documentation is not supported by this pipeline.

## Croo

The way [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution) organizes pipeline outputs is not always the clearest or easiest to navigate. Croo is a tool that reorganizes the files in a more readable manner. Croo is available on [PyPI](https://pypi.org/project/croo/) and is installed by running:

```bash
$ pip install croo
```

## Java 8

Java is required to run the execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution).
Java is required to run the execution engine [Cromwell](https://software.broadinstitute.org/wdl/documentation/execution) that `caper` uses under the hood.
To check which Java version you already have, run:
```bash
$ java -version
```
You are looking for version 1.8 or higher. If the requirement is not fulfilled, follow the installation instructions for [mac](https://java.com/en/download/help/mac_install.xml) or
[linux](http://openjdk.java.net/install/) or use your favorite installation method.

## Cromwell

Download the WDL runner Cromwell from [here](https://github.com/broadinstitute/cromwell/releases). The pipeline has been tested using version 40.

## Docker

Pipeline code is packaged and distributed in Docker containers, and thus Docker installation is needed.
Pipeline code is packaged and distributed in Docker containers, and thus Docker installation is needed.
Follow instructions for [mac](https://docs.docker.com/docker-for-mac/install/) or [linux](https://docs.docker.com/install/linux/docker-ce/ubuntu/#upgrade-docker-after-using-the-convenience-script).

## Caper
## Singularity

For running the pipeline we recommend using [Caper](https://github.com/ENCODE-DCC/caper), which wraps Cromwell in an easier-to-use package.
If you want to use Singularity instead of Docker, install [singularity](https://www.sylabs.io/guides/3.1/user-guide/installation.html). The pipeline requires singularity version `>=2.5.2`; the link takes you to version `3.1`.

## croo
## Google Cloud

For organizing pipeline outputs we recommend using [croo](https://github.com/ENCODE-DCC/croo), which builds a nicely organized directory from the complicated output tree Cromwell defaults to. The configuration file for `croo` is named `output_definition.json` and can be found in the root of this repository.
If you are intending to run the pipeline on Google Cloud platform, follow the [caper setup instructions for GCP](https://github.com/ENCODE-DCC/caper/blob/master/docs/conf_gcp.md).
* For an example on how to run the pipeline on Google Cloud, see [HOWTO](howto.md#google-cloud).

## Singularity
## AWS

If for some reason you cannot run Docker, install [singularity](https://www.sylabs.io/guides/3.1/user-guide/installation.html) and have a look at [HOWTO](howto.md#local-with-singularity) for an example of how to run the pipeline with singularity. The pipeline requires singularity version `>=2.5.2`; the link takes you to version `3.1`.
If you are intending to run the pipeline on AWS, follow the [caper setup instructions for AWS](https://github.com/ENCODE-DCC/caper/blob/master/docs/conf_aws.md).

## Google Cloud
## Cromwell (optional)

If you are intending to run the pipeline on Google Cloud platform, the following setup is needed:

1. Sign up for a Google account.
2. Go to [Google Project](https://console.developers.google.com/project) page and click "SIGN UP FOR FREE TRIAL" on the top left and agree to terms.
3. Set up a payment method and click "START MY FREE TRIAL".
4. Create a [Google Project](https://console.developers.google.com/project) `[YOUR_PROJECT_NAME]` and choose it on the top of the page.
5. Create a [Google Cloud Storage bucket](https://console.cloud.google.com/storage/browser) `gs://[YOUR_BUCKET_NAME]` by clicking the "CREATE BUCKET" button; it will be used to store pipeline outputs.
6. Find and enable the following APIs in your [API Manager](https://console.developers.google.com/apis/library). Click the back button in your web browser after enabling each.
* Compute Engine API
* Google Cloud Storage
* Google Cloud Storage JSON API
* Genomics API

7. Install the [Google Cloud Platform SDK](https://cloud.google.com/sdk/downloads) and authenticate through it. You will be asked to enter verification keys. Get the keys from the URLs they provide.
```
$ gcloud auth login --no-launch-browser
$ gcloud auth application-default login --no-launch-browser
```
8. If you see permission errors at runtime, unset the environment variable `GOOGLE_APPLICATION_CREDENTIALS`, or add the unset command to your BASH startup scripts (`$HOME/.bashrc` or `$HOME/.bash_profile`).
```
unset GOOGLE_APPLICATION_CREDENTIALS
```
9. Set your default Google Cloud Project. The pipeline will provision instances in this project.
```
$ gcloud config set project [YOUR_PROJECT_NAME]
```
We recommend using `caper` for running the pipeline, although it is possible to use Cromwell directly. The backend file and workflow options files necessary for direct Cromwell use are included in the repository for local testing purposes, but they are not actively maintained to follow cloud API changes, etc.
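
For reference, a sketch of such a direct local Cromwell invocation, mirroring the command removed from the howto above (the jar version and file names are taken from that removed command and may need adjusting):

```bash
# Direct Cromwell run for local testing; caper remains the recommended entry point.
java -jar -Dconfig.file=backends/backend.conf cromwell-40.jar run long-read-rna-pipeline.wdl \
  -i input.json -o workflow_opts/docker.json -m metadata.json
```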