diff --git a/.editorconfig b/.editorconfig index 9554950..b6b3190 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,12 +8,9 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{yml,yaml}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 -[*.json] -insert_final_newline = unset - # These files are edited and tested upstream in nf-core/modules [/modules/nf-core/**] charset = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d08056c..d6b4355 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -15,8 +15,7 @@ Contributions to the code are even more welcome ;) If you'd like to write some code for nf-core/epitopeprediction, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [nf-core/epitopeprediction issues](https://github.com/nf-core/epitopeprediction/issues) to avoid duplicating work - * If there isn't one already, please create one so that others know you're working on this +1. Check that there isn't already an issue about your idea in the [nf-core/epitopeprediction issues](https://github.com/nf-core/epitopeprediction/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/epitopeprediction repository](https://github.com/nf-core/epitopeprediction) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). @@ -49,9 +48,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -* On your own fork, make a new branch `patch` based on `upstream/master`. -* Fix the bug, and bump version (X.Y.Z+1). -* A PR should be made on `master` from patch to directly this particular bug. +- On your own fork, make a new branch `patch` based on `upstream/master`. +- Fix the bug, and bump version (X.Y.Z+1). +- A PR should be made on `master` from patch to directly this particular bug. ## Getting help @@ -73,7 +72,7 @@ If you wish to contribute a new step, please use the following coding standards: 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. 8. If applicable, add a new test command in `.github/workflow/ci.yml`. -9. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. +9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. ### Default values @@ -92,8 +91,8 @@ The process resources can be passed on to the tool dynamically within the proces Please use the following naming schemes, to make it easy to understand what is going where. 
-* initial process channel: `ch_output_from_<process>` -* intermediate and terminal channels: `ch_<previousprocess>_for_<nextprocess>` +- initial process channel: `ch_output_from_<process>` +- intermediate and terminal channels: `ch_<previousprocess>_for_<nextprocess>` ### Nextflow version bumping diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index ef71508..31eaa8d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,9 +1,7 @@ - name: Bug report description: Report something that is broken or incorrect labels: bug body: - - type: markdown attributes: value: | diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bfe81d3..6cf49e7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,16 +10,15 @@ Remember that PRs should be made against the dev branch, unless you're preparing Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/epitopeprediction/tree/master/.github/CONTRIBUTING.md) --> - ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/epitopeprediction/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/epitopeprediction _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/epitopeprediction/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/epitopeprediction _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated.
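The channel-naming scheme in the CONTRIBUTING.md hunk above is easier to see in a small example. Below is a minimal, hypothetical Nextflow DSL2 sketch of how those names are meant to be used; the processes `SAY_HELLO` and `SHOUT` are invented for illustration and are not part of this pipeline.

```nextflow
nextflow.enable.dsl = 2

// Hypothetical toy processes, used only to illustrate the naming convention.
process SAY_HELLO {
    output:
    stdout

    script:
    "echo Hello"
}

process SHOUT {
    input:
    val text

    output:
    stdout

    script:
    "echo '${text.toUpperCase()}'"
}

workflow {
    // initial process channel: ch_output_from_<process>
    ch_output_from_say_hello = SAY_HELLO()

    // intermediate/terminal channel: ch_<previousprocess>_for_<nextprocess>
    ch_say_hello_for_shout = ch_output_from_say_hello.map { it.trim() }

    SHOUT(ch_say_hello_for_shout).view()
}
```

The first name records which process produced the channel; the second records both where it came from and where it is going, which keeps larger workflows readable.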
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 6ed72f4..43c88f0 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,17 +14,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: nf-core/tower-action@v3 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} - revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/epitopeprediction/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/epitopeprediction/results-${{ github.sha }}" } profiles: test_full,aws_tower - pre_run_script: 'export NXF_VER=21.10.3' diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index ab71cca..2a4b889 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -10,19 +10,16 @@ jobs: if: github.repository == 'nf-core/epitopeprediction' runs-on: ubuntu-latest steps: + # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: nf-core/tower-action@v2 - + uses: nf-core/tower-action@v3 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} - revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/epitopeprediction/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/epitopeprediction/results-test-${{ github.sha }}" } profiles: test,aws_tower - pre_run_script: 'export NXF_VER=21.10.3' diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 9929180..28af178 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -15,7 +15,6 @@ jobs: run: | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/epitopeprediction ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] - # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment @@ -43,4 +42,3 @@ jobs: Thanks again for your contribution! 
repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - diff --git a/.github/workflows/ci-external.yml b/.github/workflows/ci-external.yml new file mode 100644 index 0000000..189daf0 --- /dev/null +++ b/.github/workflows/ci-external.yml @@ -0,0 +1,50 @@ +name: nf-core CI external +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + workflow_dispatch: + +env: + NXF_ANSI_LOG: false + CAPSULE_LOG: none + +jobs: + netmhc: + name: Run NetMHC tool family tests + runs-on: ubuntu-latest + env: + NXF_VER: ${{ matrix.nxf_ver }} + strategy: + matrix: + include: + # Test pipeline minimum Nextflow version + - NXF_VER: "21.10.3" + NXF_EDGE: "" + # Test latest edge release of Nextflow + - NXF_VER: "" + NXF_EDGE: "1" + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install non-free software + env: + DECRYPT_PASSPHRASE: ${{ secrets.TEST_NETMHC }} + run: | + mkdir -v non-free + curl -L https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/software/non-free-software.tar.gpg | ${GITHUB_WORKSPACE}/bin/decrypt | tar -C non-free -v -x + + - name: Install Nextflow + env: + NXF_VER: ${{ matrix.NXF_VER }} + # Uncomment only if the edge release is more recent than the latest stable release + # See https://github.com/nextflow-io/nextflow/issues/2467 + # NXF_EDGE: ${{ matrix.NXF_EDGE }} + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Run pipeline with NetMHC + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_netmhc,docker --outdir ./results + - name: Run pipeline with NetMHCpan + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_netmhcpan,docker --outdir ./results diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52dbd3e..e2b4aea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,20 +14,20 @@ env: jobs: test: - name: Run workflow tests + name: Run pipeline with test data # Only run on push if this is the nf-core dev branch (merged PRs) - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/epitopeprediction') }} + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/epitopeprediction') }}" runs-on: ubuntu-latest strategy: matrix: # Nextflow versions include: # Test pipeline minimum Nextflow version - - NXF_VER: '21.10.3' - NXF_EDGE: '' + - NXF_VER: "21.10.3" + NXF_EDGE: "" # Test latest edge release of Nextflow - - NXF_VER: '' - NXF_EDGE: '1' + - NXF_VER: "" + NXF_EDGE: "1" steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -44,8 +44,7 @@ jobs: - name: Run pipeline with test data run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker - + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results profile: name: Run profile tests if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/epitopeprediction') }} @@ -57,14 +56,15 @@ jobs: matrix: include: # Test pipeline minimum Nextflow version - - NXF_VER: '21.10.3' - NXF_EDGE: '' + - NXF_VER: "21.10.3" + NXF_EDGE: "" # Test latest edge release of Nextflow - - NXF_VER: '' - NXF_EDGE: '1' + - NXF_VER: "" + NXF_EDGE: "1" tests: [ "test_variant_tsv", + "test_grch38_variant_tsv", "test_peptides", "test_peptides_h2", "test_proteins", @@ -85,11 +85,11 @@ jobs: sudo mv nextflow /usr/local/bin/ - name: Run pipeline with profile ${{ matrix.tests }} run: | - nextflow run 
${GITHUB_WORKSPACE} -profile ${{ matrix.tests }},docker + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.tests }},docker --outdir ./results nonfree: name: Run NetMHC tool family tests - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/epitopeprediction') }} + if: ${{ ( github.event_name == 'push' && github.repository == 'nf-core/epitopeprediction' ) || github.event.pull_request.head.repo.full_name == 'nf-core/epitopeprediction' }} runs-on: ubuntu-latest env: NXF_VER: ${{ matrix.nxf_ver }} @@ -97,11 +97,11 @@ jobs: matrix: include: # Test pipeline minimum Nextflow version - - NXF_VER: '21.10.3' - NXF_EDGE: '' + - NXF_VER: "21.10.3" + NXF_EDGE: "" # Test latest edge release of Nextflow - - NXF_VER: '' - NXF_EDGE: '1' + - NXF_VER: "" + NXF_EDGE: "1" steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -124,7 +124,7 @@ jobs: sudo mv nextflow /usr/local/bin/ - name: Run pipeline with NetMHC run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_netmhc,docker + nextflow run ${GITHUB_WORKSPACE} -profile test_netmhc,docker --outdir ./results - name: Run pipeline with NetMHCpan run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_netmhcpan,docker + nextflow run ${GITHUB_WORKSPACE} -profile test_netmhcpan,docker --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml new file mode 100644 index 0000000..f166298 --- /dev/null +++ b/.github/workflows/fix-linting.yml @@ -0,0 +1,55 @@ +name: Fix linting from a comment +on: + issue_comment: + types: [created] + +jobs: + deploy: + # Only run if comment is on a PR with the main repo, and if it contains the magic keywords + if: > + contains(github.event.comment.html_url, '/pull/') && + contains(github.event.comment.body, '@nf-core-bot fix linting') && + github.repository == 'nf-core/epitopeprediction' + runs-on: ubuntu-latest + steps: + # Use the @nf-core-bot token to check out so we can push later + - uses: actions/checkout@v3 + with: + token: ${{ secrets.nf_core_bot_auth_token }} + + # Action runs on the issue comment, so we don't get the PR by default + # Use the gh cli to check out the PR + - name: Checkout Pull Request + run: gh pr checkout ${{ github.event.issue.number }} + env: + GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + + - uses: actions/setup-node@v2 + + - name: Install Prettier + run: npm install -g prettier @prettier/plugin-php + + # Check that we actually need to fix something + - name: Run 'prettier --check' + id: prettier_status + run: | + if prettier --check ${GITHUB_WORKSPACE}; then + echo "::set-output name=result::pass" + else + echo "::set-output name=result::fail" + fi + + - name: Run 'prettier --write' + if: steps.prettier_status.outputs.result == 'fail' + run: prettier --write ${GITHUB_WORKSPACE} + + - name: Commit & push changes + if: steps.prettier_status.outputs.result == 'fail' + run: | + git config user.email "core@nf-co.re" + git config user.name "nf-core-bot" + git config push.default upstream + git add . + git status + git commit -m "[automated] Fix linting with Prettier" + git push diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3b44877..77358de 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,6 +1,7 @@ name: nf-core linting # This workflow is triggered on pushes and PRs to the repository. 
-# It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines +# It runs the `nf-core lint` and markdown lint tests to ensure +# that the code meets the nf-core guidelines. on: push: pull_request: @@ -8,100 +9,35 @@ on: types: [published] jobs: - Markdown: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 - with: - node-version: '10' - - name: Install markdownlint - run: npm install -g markdownlint-cli - - name: Run Markdownlint - run: markdownlint . - - # If the above check failed, post a comment on the PR explaining the failure - - name: Post PR comment - if: failure() - uses: mshick/add-pr-comment@v1 - with: - message: | - ## Markdown linting is failing - - To keep the code consistent with lots of contributors, we run automated code consistency checks. - To fix this CI test, please run: - - * Install `markdownlint-cli` - * On Mac: `brew install markdownlint-cli` - * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`) - * Fix the markdown errors - * Automatically: `markdownlint . --fix` - * Manually resolve anything left from `markdownlint .` - - Once you push these changes the test should pass, and you can hide this comment :+1: - - We highly recommend setting up markdownlint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! - - Thanks again for your contribution! - repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false - EditorConfig: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/setup-node@v2 - name: Install editorconfig-checker run: npm install -g editorconfig-checker - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(git ls-files | grep -v test) + run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') - YAML: + Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 - - uses: actions/setup-node@v1 - with: - node-version: '10' - - name: Install yaml-lint - run: npm install -g yaml-lint - - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") - - # If the above check failed, post a comment on the PR explaining the failure - - name: Post PR comment - if: failure() - uses: mshick/add-pr-comment@v1 - with: - message: | - ## YAML linting is failing - - To keep the code consistent with lots of contributors, we run automated code consistency checks. - To fix this CI test, please run: - - * Install `yaml-lint` - * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) - * Fix the markdown errors - * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` - * Fix any reported errors in your YAML files + - uses: actions/checkout@v2 - Once you push these changes the test should pass, and you can hide this comment :+1: + - uses: actions/setup-node@v2 - We highly recommend setting up yaml-lint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + - name: Install Prettier + run: npm install -g prettier - Thanks again for your contribution! 
- repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false + - name: Run Prettier --check + run: prettier --check ${GITHUB_WORKSPACE} nf-core: runs-on: ubuntu-latest steps: - - name: Check out pipeline code uses: actions/checkout@v2 @@ -112,10 +48,10 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v3 with: - python-version: '3.6' - architecture: 'x64' + python-version: "3.6" + architecture: "x64" - name: Install dependencies run: | @@ -142,4 +78,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 44d7299..04758f6 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,4 +1,3 @@ - name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork @@ -27,4 +26,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md - diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000..85d95ec --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,14 @@ +image: nfcore/gitpod:latest + +vscode: + extensions: # based on nf-core.nf-core-extensionpack + - codezombiech.gitignore # Language support for .gitignore files + # - cssho.vscode-svgviewer # SVG viewer + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + # - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code diff --git a/.markdownlint.yml b/.markdownlint.yml deleted file mode 100644 index 9e605fc..0000000 --- a/.markdownlint.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Markdownlint configuration file -default: true -line-length: false -ul-indent: - indent: 4 -no-duplicate-header: - siblings_only: true -no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary diff --git a/.nf-core.yml b/.nf-core.yml new file mode 100644 index 0000000..3805dc8 --- /dev/null +++ b/.nf-core.yml @@ -0,0 +1 @@ +repository_type: pipeline diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..d0e7ae5 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,9 @@ +email_template.html +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc diff --git a/.prettierrc.yml b/.prettierrc.yml new file mode 100644 index 0000000..c81f9a7 --- /dev/null +++ b/.prettierrc.yml @@ -0,0 +1 @@ +printWidth: 120 diff --git a/CHANGELOG.md b/CHANGELOG.md index 515ec6a..a9c57e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,22 +3,53 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## v2.1.0 - Nordring - 2022-08-02 + +### `Added` + +- [#145](https://github.com/nf-core/epitopeprediction/pull/145) - Add functionality for handling gzipped VCF files for [#143](https://github.com/nf-core/epitopeprediction/issues/143) +- [#155](https://github.com/nf-core/epitopeprediction/pull/155) - Add functionality for splitting input VCF files by the number of variants [#140](https://github.com/nf-core/epitopeprediction/issues/140) +- [#157](https://github.com/nf-core/epitopeprediction/pull/157) - Add JSON config for external prediction tools +- [#161](https://github.com/nf-core/epitopeprediction/pull/161) - Add rank values for prediction threshold (as default) and parameter `use_affinity_thresholds` to use affinity thresholds instead [#160](https://github.com/nf-core/epitopeprediction/issues/160) +- [#165](https://github.com/nf-core/epitopeprediction/pull/165) - Add tools to full size test, add MHC class II to MHCnuggets test +- [#166](https://github.com/nf-core/epitopeprediction/pull/166) - Add support for additional non-free `NetMHC` family tools +- [#168](https://github.com/nf-core/epitopeprediction/pull/168) - Add parameters to specify MHC class II peptide length (`max_peptide_length_class2` and `min_peptide_length_class2`) +- [#170](https://github.com/nf-core/epitopeprediction/pull/170) - Add `binder` column (binder to any specified MHC allele) + +### `Changed` + +- [#152](https://github.com/nf-core/epitopeprediction/pull/152) - Update MultiQC from `1.11` to `1.12` +- [#152](https://github.com/nf-core/epitopeprediction/pull/152) - Merge previous template updates up to `2.3.2` +- [#153](https://github.com/nf-core/epitopeprediction/pull/153) - Update to nf-core template `2.4` +- [#158](https://github.com/nf-core/epitopeprediction/pull/158) - CI tests for non-free tools are not run on PR to `dev`(the secret is not available there). 
+- [#162](https://github.com/nf-core/epitopeprediction/pull/162) - Use most recent `epytope` release (`3.1.0`) +- [#162](https://github.com/nf-core/epitopeprediction/pull/162) - Use more recent `Ensembl BioMart` archive release for `GRCh38` (`Ensembl 88`) +- [#163](https://github.com/nf-core/epitopeprediction/pull/163) - Save applied tool thresholds in prediction report +- [#168](https://github.com/nf-core/epitopeprediction/pull/168) - Use MHC class information specified in sample sheet +- [#169](https://github.com/nf-core/epitopeprediction/pull/169) - Update MultiQC to `1.13` + +### `Fixed` + +- [#135](https://github.com/nf-core/epitopeprediction/pull/135) - Fix unique variant annotation field handling [#136](https://github.com/nf-core/epitopeprediction/issues/136) +- [#144](https://github.com/nf-core/epitopeprediction/pull/144) - Fix VCF file parsing [#142](https://github.com/nf-core/epitopeprediction/issues/142) +- [#159](https://github.com/nf-core/epitopeprediction/pull/159) - Fix execution for multiple samples of same input type + ## v2.0.0 - Heuberg - 2021-12-20 ### `Added` -- [#73](https://github.com/nf-core/epitopeprediction/pull/73) - Add support for the non-free `NetMHC` tool family including `NetMHC 4.0`, `NetMHCpan 4.0`, `NetMHCII 2.2`, and `NetMHCIIpan 3.1` -- [#83](https://github.com/nf-core/epitopeprediction/pull/83) - Add option for threshold customization +- [#73](https://github.com/nf-core/epitopeprediction/pull/73) - Add support for the non-free `NetMHC` tool family including `NetMHC 4.0`, `NetMHCpan 4.0`, `NetMHCII 2.2`, and `NetMHCIIpan 3.1` +- [#83](https://github.com/nf-core/epitopeprediction/pull/83) - Add option for threshold customization - [#101](https://github.com/nf-core/epitopeprediction/pull/101) - Add local modules for DSL2 conversion ### `Changed` -- [#107](https://github.com/nf-core/epitopeprediction/pull/107) - Merge previous template updates up to `v2.1` +- [#107](https://github.com/nf-core/epitopeprediction/pull/107) - Merge previous template updates up to `v2.1` - [#110](https://github.com/nf-core/epitopeprediction/pull/110), [#113](https://github.com/nf-core/epitopeprediction/pull/113) - Port pipeline to Nextflow DSL2 syntax - [#114](https://github.com/nf-core/epitopeprediction/pull/114) - Update `python 2.7` to `python 3.8.9` in `split_peptides.nf` and `merge_json.nf`. -- [#117](https://github.com/nf-core/epitopeprediction/pull/117) - Bump minimal NXF version to `21.10.4` +- [#117](https://github.com/nf-core/epitopeprediction/pull/117) - Bump minimal NXF version to `21.10.3` - [#121](https://github.com/nf-core/epitopeprediction/pull/121) - Extend full test to cover more test cases -- [#122](https://github.com/nf-core/epitopeprediction/pull/122) - Update to nf-core template v2.2 +- [#122](https://github.com/nf-core/epitopeprediction/pull/122) - Update to nf-core template `v2.2` - [#123](https://github.com/nf-core/epitopeprediction/pull/123) - Remove support for outdated external tools `NetMHCII 2.2` and `NetMHCIIpan 3.1` ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 56daa84..76d1cf6 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,33 +10,66 @@ ## Pipeline tools -* [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. 
+- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -* [SnpSift](https://dx.doi.org/10.3389/fgene.2012.00035) - > Pablo Cingolani, Viral M. Patel, Melissa Coon, Tung Nguyen, Susan J. Land, Douglas M. Ruden and Xiangyi Lu. Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift. _Frontiers in Genetics_ 3, 35 (2012). doi: 10.3389/fgene.2012.00035. +- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) -* [FRED2](https://dx.doi.org/10.1093/bioinformatics/btw113) - > Benjamin Schubert, Mathias Walzer, Hans-Philipp Brachvogel, András Szolek, Christopher Mohr, Oliver Kohlbacher. FRED 2: an immunoinformatics framework for Pythonö Bioinformatics 32(13), 2044-2046 (2016). doi: 10.1093/bioinformatics/btw113. + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. -* [MHCflurry](https://dx.doi.org/10.1016/j.cels.2018.05.014) - > Timothy J. O’Donnell, Alex Rubinsteyn, Maria Bonsack, Angelika B. Riemer, Uri Laserson, Jeff Hammerbacher. MHC flurry: open-source class I MHC binding affinity prediction. Cell systems 7(1), 129-132 (2018). doi: 10.1016/j.cels.2018.05.014. +- [SnpSift](https://dx.doi.org/10.3389/fgene.2012.00035) -* [MHCnuggets](https://dx.doi.org/10.1158/2326-6066.CIR-19-0464) - > Xiaoshan M. Shao, Rohit Bhattacharya, Justin Huang, I.K. Ashok Sivakumar, Collin Tokheim, Lily Zheng, Dylan Hirsch, Benjamin Kaminow, Ashton Omdahl, Maria Bonsack, Angelika B. Riemer, Victor E. Velculescu, Valsamo Anagnostou, Kymberleigh A. Pagel and Rachel Karchin. High-throughput prediction of MHC class i and ii neoantigens with MHCnuggets. Cancer Immunology Research 8(3), 396-408 (2020). doi: 10.1158/2326-6066.CIR-19-0464. + > Pablo Cingolani, Viral M. Patel, Melissa Coon, Tung Nguyen, Susan J. Land, Douglas M. Ruden and Xiangyi Lu. Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift. _Frontiers in Genetics_ 3, 35 (2012). doi: 10.3389/fgene.2012.00035. + +- [Epytope (FRED2)](https://dx.doi.org/10.1093/bioinformatics/btw113) + + > Benjamin Schubert, Mathias Walzer, Hans-Philipp Brachvogel, András Szolek, Christopher Mohr, Oliver Kohlbacher. FRED 2: an immunoinformatics framework for Pythonö Bioinformatics 32(13), 2044-2046 (2016). doi: 10.1093/bioinformatics/btw113. + +- [MHCflurry](https://dx.doi.org/10.1016/j.cels.2018.05.014) + + > Timothy J. O’Donnell, Alex Rubinsteyn, Maria Bonsack, Angelika B. Riemer, Uri Laserson, Jeff Hammerbacher. MHC flurry: open-source class I MHC binding affinity prediction. Cell systems 7(1), 129-132 (2018). doi: 10.1016/j.cels.2018.05.014. + +- [MHCnuggets](https://dx.doi.org/10.1158/2326-6066.CIR-19-0464) + + > Xiaoshan M. Shao, Rohit Bhattacharya, Justin Huang, I.K. Ashok Sivakumar, Collin Tokheim, Lily Zheng, Dylan Hirsch, Benjamin Kaminow, Ashton Omdahl, Maria Bonsack, Angelika B. Riemer, Victor E. Velculescu, Valsamo Anagnostou, Kymberleigh A. Pagel and Rachel Karchin. High-throughput prediction of MHC class i and ii neoantigens with MHCnuggets. Cancer Immunology Research 8(3), 396-408 (2020). doi: 10.1158/2326-6066.CIR-19-0464. + +- [NetMHC-4.0](https://pubmed.ncbi.nlm.nih.gov/26515819/) + + > Andreatta M, Nielsen M. Gapped sequence alignment using artificial neural networks: application to the MHC class I system. 
Bioinformatics. 2016 Feb 15;32(4):511-7. doi: 10.1093/bioinformatics/btv639. Epub 2015 Oct 29. PMID: 26515819; PMCID: PMC6402319. + +- [NetMHCpan-4.0](https://pubmed.ncbi.nlm.nih.gov/28978689/) + + > Jurtz V, Paul S, Andreatta M, Marcatili P, Peters B, Nielsen M. NetMHCpan-4.0: Improved Peptide-MHC Class I Interaction Predictions Integrating Eluted Ligand and Peptide Binding Affinity Data. J Immunol. 2017 Nov 1;199(9):3360-3368. doi: 10.4049/jimmunol.1700893. Epub 2017 Oct 4. PMID: 28978689; PMCID: PMC5679736. + +- [NetMHCpan-4.1](https://pubmed.ncbi.nlm.nih.gov/32406916/) + + > Reynisson B, Alvarez B, Paul S, Peters B, Nielsen M. NetMHCpan-4.1 and NetMHCIIpan-4.0: improved predictions of MHC antigen presentation by concurrent motif deconvolution and integration of MS MHC eluted ligand data. Nucleic Acids Res. 2020 Jul 2;48(W1):W449-W454. doi: 10.1093/nar/gkaa379. PMID: 32406916; PMCID: PMC7319546. + +- [NetMHCII-2.3](https://pubmed.ncbi.nlm.nih.gov/29315598/) + + > Jensen KK, Andreatta M, Marcatili P, Buus S, Greenbaum JA, Yan Z, Sette A, Peters B, Nielsen M. Improved methods for predicting peptide binding affinity to MHC class II molecules. Immunology. 2018 Jul;154(3):394-406. doi: 10.1111/imm.12889. Epub 2018 Feb 6. PMID: 29315598; PMCID: PMC6002223. + +- [NetMHCII-2.3](https://pubmed.ncbi.nlm.nih.gov/29315598/) + + > Jensen KK, Andreatta M, Marcatili P, Buus S, Greenbaum JA, Yan Z, Sette A, Peters B, Nielsen M. Improved methods for predicting peptide binding affinity to MHC class II molecules. Immunology. 2018 Jul;154(3):394-406. doi: 10.1111/imm.12889. Epub 2018 Feb 6. PMID: 29315598; PMCID: PMC6002223. + +- [NetMHCIIpan-4.0](https://pubmed.ncbi.nlm.nih.gov/32406916/) + > Reynisson B, Alvarez B, Paul S, Peters B, Nielsen M. NetMHCpan-4.1 and NetMHCIIpan-4.0: improved predictions of MHC antigen presentation by concurrent motif deconvolution and integration of MS MHC eluted ligand data. Nucleic Acids Res. 2020 Jul 2;48(W1):W449-W454. doi: 10.1093/nar/gkaa379. PMID: 32406916; PMCID: PMC7319546. ## Software packaging/containerisation tools -* [Anaconda](https://anaconda.com) - > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. +- [Anaconda](https://anaconda.com) + + > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. + +- [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) + + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. -* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) - > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. +- [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) -* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. 
PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. -* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) +- [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) -* [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) - > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. +- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/README.md b/README.md index fdddaa4..c9aa3a0 100644 --- a/README.md +++ b/README.md @@ -33,26 +33,26 @@ On release, automated continuous integration tests run the pipeline on a full-si 1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`) -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ +2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: - ```console - nextflow run nf-core/epitopeprediction -profile test,YOURPROFILE - ``` + ```console + nextflow run nf-core/epitopeprediction -profile test,YOURPROFILE --outdir <OUTDIR> + ``` - Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. + Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
- > * The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. - > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > * If you are using `singularity` and are persistently observing issues downloading Singularity images directly due to timeout or network issues, then you can use the `--singularity_pull_docker_container` parameter to pull and convert the Docker image instead. Alternatively, you can use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. - > * If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. + > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. + > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. + > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! - ```console - nextflow run nf-core/epitopeprediction -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input samplesheet.csv - ``` + ```console + nextflow run nf-core/epitopeprediction -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input samplesheet.csv --outdir <OUTDIR> + ``` See [usage docs](https://nf-co.re/epitopeprediction/usage) for all of the available options when running the pipeline.
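The `-profile <institute>` and `NXF_SINGULARITY_CACHEDIR`/`singularity.cacheDir` notes in the README hunk above can be made concrete with a small custom configuration file. A hypothetical sketch follows; the profile name, paths, and resource values are invented for illustration and are not taken from nf-core/configs.

```nextflow
// custom.config — a hypothetical institutional profile (names and paths are assumptions)
profiles {
    my_institute {
        singularity.enabled  = true
        singularity.cacheDir = '/shared/containers'   // images stored here are re-used across pipeline runs
        process.executor     = 'slurm'
        params.max_memory    = '128.GB'
    }
}
```

Such a file would be passed with `nextflow run nf-core/epitopeprediction -c custom.config -profile my_institute --input samplesheet.csv --outdir <OUTDIR>`; profiles already published in nf-core/configs are picked up with `-profile <institute>` alone.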
@@ -74,7 +74,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations -If you use nf-core/epitopeprediction for your analysis, please cite it using the following doi: [10.5281/zenodo.3564666](https://doi.org/10.5281/zenodo.3564666) +If you use nf-core/epitopeprediction for your analysis, please cite it using the following doi: [10.5281/zenodo.3564666](https://doi.org/10.5281/zenodo.3564666) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/external_tools_meta.json b/assets/external_tools_meta.json new file mode 100644 index 0000000..70e7219 --- /dev/null +++ b/assets/external_tools_meta.json @@ -0,0 +1,70 @@ +{ + "netmhc": { + "4.0": { + "version": 4.0, + "software_md5": "132dc322da1e2043521138637dd31ebf", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHC-4.0/data.tar.gz", + "data_md5": "63c646fa0921d617576531396954d633", + "binary_name": "netMHC" + } + }, + "netmhc_darwin": { + "4.0": { + "version": 4.0, + "software_md5": "43519de644c67ee4c8ee04c673861568", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHC-4.0/data.tar.gz", + "data_md5": "63c646fa0921d617576531396954d633", + "binary_name": "netMHC" + } + }, + "netmhcpan": { + "4.0": { + "version": 4.0, + "software_md5": "94aa60f3dfd9752881c64878500e58f3", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCpan-4.0/data.Linux.tar.gz", + "data_md5": "26cbbd99a38f6692249442aeca48608f", + "binary_name": "netMHCpan" + }, + "4.1": { + "version": 4.1, + "software_md5": "105210d3de5525076347973e9391131c", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz", + "data_md5": "4bdd3944cb4c5b8ba4d8900dae074c85", + "binary_name": "netMHCpan" + } + }, + "netmhcpan_darwin": { + "4.0": { + "version": 4.0, + "software_md5": "042d16b89adcf5656ca4deae06b55cc6", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCpan-4.0/data.Darwin.tar.gz", + "data_md5": "b4eef3c82852d677a98e0f7d753a36e2", + "binary_name": "netMHCpan" + }, + "4.1": { + "version": 4.0, + "software_md5": "5f6eab43feb80a24e32eb02b22e9db5f", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz", + "data_md5": "4bdd3944cb4c5b8ba4d8900dae074c85", + "binary_name": "netMHCpan" + } + }, + "netmhciipan": { + "4.1": { + "version": 4.1, + "software_md5": "c70810798334e9f7614899b7ee965a3a", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz", + "data_md5": "ec3032cb66aee20ff0f53ff7f1795f47", + "binary_name": "netMHCIIpan" + } + }, + "netmhciipan_darwin": { + "4.1": { + "version": 4.1, + "software_md5": "eaa25d39710da3a8000a5cfd36707f7b", + "data_url": "https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz", + "data_md5": "ec3032cb66aee20ff0f53ff7f1795f47", + "binary_name": "netMHCIIpan" + } + } +} diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml deleted file mode 100644 index 2c6006d..0000000 --- a/assets/multiqc_config.yaml +++ /dev/null @@ -1,11 +0,0 @@ -report_comment: > - This report has been generated by the nf-core/epitopeprediction - analysis pipeline. For information about how to interpret these results, please see the - documentation. 
-report_section_order: - software_versions: - order: -1000 - nf-core-epitopeprediction-summary: - order: -1001 - -export_plots: true diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml new file mode 100644 index 0000000..ed096c5 --- /dev/null +++ b/assets/multiqc_config.yml @@ -0,0 +1,15 @@ +custom_logo: "nf-core-epitopeprediction_logo_light.png" +custom_logo_url: https://github.com/nf-core/epitopeprediction/ +custom_logo_title: "nf-core/epitopeprediction" + +report_comment: > + This report has been generated by the nf-core/epitopeprediction + analysis pipeline. For information about how to interpret these results, please see the + documentation. +report_section_order: + software_versions: + order: -1000 + "nf-core-epitopeprediction-summary": + order: -1001 + +export_plots: true diff --git a/assets/schema_input.json b/assets/schema_input.json index 2f1556c..b564a5c 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -25,16 +25,17 @@ ], "errorMessage": "Alleles must be provided as string or file with extension '.txt''" }, + "mhc_class": { + "type": "string", + "pattern": "^(I|II|H-2)$", + "errorMessage": "The MHC class must be provided. Valid values: " + }, "filename": { "type": "string", "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$", "errorMessage": "Variants/proteins/peptides for sample must be provided and have one of the following extensions: '.vcf', '.tsv', '.fasta', '.fa', '.txt'" } }, - "required": [ - "sample", - "alleles", - "filename" - ] + "required": ["sample", "alleles", "mhc_class", "filename"] } } diff --git a/bin/check_requested_models.py b/bin/check_requested_models.py index a58335b..e3b126f 100755 --- a/bin/check_requested_models.py +++ b/bin/check_requested_models.py @@ -5,10 +5,10 @@ import argparse import logging -from Fred2.Core.Allele import Allele -from Fred2.Core.Peptide import Peptide -from Fred2.IO import FileReader -from Fred2.EpitopePrediction import EpitopePredictorFactory +from epytope.Core.Allele import Allele +from epytope.Core.Peptide import Peptide +from epytope.IO import FileReader +from epytope.EpitopePrediction import EpitopePredictorFactory # instantiate global logger object logger = logging.getLogger(__name__) @@ -37,7 +37,7 @@ def read_peptide_input(filename): def convert_allele_back(allele): name = str(allele) if name.startswith("H-2-"): - # convert internal Fred2 representation back to the nf-core/epitopeprediction input allele format + # convert internal epytope representation back to the nf-core/epitopeprediction input allele format return name.replace("H-2-", "H2-") elif name.startswith("HLA-"): return name.replace("HLA-", "") @@ -46,7 +46,7 @@ def convert_allele_back(allele): def __main__(): - parser = argparse.ArgumentParser("Write out information about supported models by Fred2 for installed predictor tool versions.") + parser = argparse.ArgumentParser("Write out information about supported models by epytope for installed predictor tool versions.") parser.add_argument('-p', "--peptides", help="File with one peptide per line") parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II") parser.add_argument('-l', "--max_length", help="Maximum peptide length", type=int) @@ -55,10 +55,10 @@ def __main__(): parser.add_argument('-t', '--tools', help='Tools requested for peptide predictions', required=True, type=str) parser.add_argument('-v', '--versions', help=' File with used software versions.', required=True) args = parser.parse_args() - selected_methods = [item for item in args.tools.split(',')] 
+ selected_methods = [item.split('-')[0] if "mhcnuggets" not in item else item for item in args.tools.split(',')] with open(args.versions, 'r') as versions_file: tool_version = [ (row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter = ":") ] - # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future + # NOTE this needs to be updated, if a newer version will be available via epytope and should be used in the future tool_version.append(('syfpeithi', '1.0')) # how to handle this? # get for each method the corresponding tool version methods = { method.strip():version.strip() for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower() } @@ -77,7 +77,7 @@ def __main__(): # check if requested tool versions are supported for method, version in methods.items(): if version not in EpitopePredictorFactory.available_methods()[method.lower()]: - raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.") + raise ValueError("The specified version " + version + " for " + method + " is not supported by epytope.") # check if requested alleles are supported support_all_alleles = True diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index a559aa7..408ebfa 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,21 +1,195 @@ #!/usr/bin/env python +import argparse +import logging import os import re import sys import errno -import argparse import re +from pathlib import Path + + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".fq.gz", + ".fastq.gz", + ) -def parse_args(args=None): - Description = "Reformat nf-core/epitopeprediction samplesheet file and check its contents." - Epilog = "Example usage: python check_samplesheet.py " + def __init__( + self, + sample_col="sample", + first_col="fastq_1", + second_col="fastq_2", + single_col="single_end", + **kwargs, + ): + """ + Initialize the row checker with the expected column names. + + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._sample_col = sample_col + self._first_col = first_col + self._second_col = second_col + self._single_col = single_col + self._seen = set() + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). 
+ + """ + self._validate_sample(row) + self._validate_first(row) + self._validate_second(row) + self._validate_pair(row) + self._seen.add((row[self._sample_col], row[self._first_col])) + self.modified.append(row) + + def _validate_sample(self, row): + """Assert that the sample name exists and convert spaces to underscores.""" + assert len(row[self._sample_col]) > 0, "Sample input is required." + # Sanitize samples slightly. + row[self._sample_col] = row[self._sample_col].replace(" ", "_") + + def _validate_first(self, row): + """Assert that the first FASTQ entry is non-empty and has the right format.""" + assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required." + self._validate_fastq_format(row[self._first_col]) + + def _validate_second(self, row): + """Assert that the second FASTQ entry has the right format if it exists.""" + if len(row[self._second_col]) > 0: + self._validate_fastq_format(row[self._second_col]) + + def _validate_pair(self, row): + """Assert that read pairs have the same file extension. Report pair status.""" + if row[self._first_col] and row[self._second_col]: + row[self._single_col] = False + assert ( + Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:] + ), "FASTQ pairs must have the same file extensions." + else: + row[self._single_col] = True + + def _validate_fastq_format(self, filename): + """Assert that a given filename has one of the expected FASTQ extensions.""" + assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), ( + f"The FASTQ file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) + def validate_unique_samples(self): + """ + Assert that the combination of sample name and FASTQ filename is unique. + + In addition to the validation, also rename the sample if more than one sample, + FASTQ file combination exists. + + """ + assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." + if len({pair[0] for pair in self._seen}) < len(self._seen): + counts = Counter(pair[0] for pair in self._seen) + seen = Counter() + for row in self.modified: + sample = row[self._sample_col] + seen[sample] += 1 + if counts[sample] > 1: + row[self._sample_col] = f"{sample}_T{seen[sample]}" + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. 
_text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + if not sniffer.has_header(peek): + logger.critical(f"The given sample sheet does not appear to contain a header.") + sys.exit(1) + dialect = sniffer.sniff(peek) + return dialect + + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) def print_error(error, context="Line", context_str=""): @@ -23,7 +197,7 @@ def print_error(error, context="Line", context_str=""): if context != "" and context_str != "": error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( error, context.strip(), context_str.strip() - ) + ) print(error_str) sys.exit(1) @@ -43,18 +217,17 @@ def make_dir(path): def check_samplesheet(file_in, file_out): - """ - This function checks that the samplesheet follows the following structure: - sample,alleles,filename - GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta - GBM_2,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + """ + sample,alleles,mhc_class,filename + GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta + GBM_2,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta or - sample,alleles,filename - GBM_1,gbm_1_alleles.txt,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta - GBM_2,gbm_2_alleles.txt,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta + sample,alleles,mhc_class,filename + GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta + GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins) @@ -67,16 +240,18 @@ def check_samplesheet(file_in, file_out): - Peptide format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/peptides/peptides.tsv - Variant TSV format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.tsv - Variant VCF format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.vcf - """ sample_run_dict = {} with open(file_in, "r") as fin: ## Check header - COL_NUM = 3 - HEADER = ["sample", "alleles", "filename"] + COL_NUM = 4 + HEADER = ["sample", "alleles", "mhc_class", "filename"] header = [x.strip('"') for x in fin.readline().strip().split(",")] + valid_classes = "I,II,H-2" + valid_class1_loci = ['A*','B*','C*','E*','G*'] + valid_class2_loci = ['DR','DP','DQ'] if header[: len(HEADER)] != HEADER: print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER))) sys.exit(1) @@ 
-100,13 +275,21 @@ def check_samplesheet(file_in, file_out): ) ## Check sample name entries - sample, alleles, filename = lspl[: len(HEADER)] + sample, alleles, mhcclass, filename = lspl[: len(HEADER)] ## Check given file types if not filename.lower().endswith((".vcf", ".vcf.gz", ".tsv", ".GSvar", ".fasta", ".txt")): print_error("Samplesheet contains unsupported file type!", "Line", line) - sample_info = [sample, alleles, filename] + # Check given mhc classes + if mhcclass not in valid_classes: + print_error("Samplesheet contains unsupported mhc class!", "Line", line) + + # Check mhc class and allele combintion + if not os.path.isfile(alleles) and mhcclass == 'I' and any(substring in alleles for substring in valid_class2_loci) or mhcclass == 'II' and any(substring in alleles for substring in valid_class1_loci): + print_error("Samplesheet contains invalid mhc class and allele combination!", "Line", line) + + sample_info = [sample, alleles, mhcclass, filename] ## Create sample mapping dictionary if sample not in sample_run_dict: sample_run_dict[sample] = [sample_info] @@ -121,18 +304,24 @@ def check_samplesheet(file_in, file_out): out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "alleles", "filename"]) + "\n") + fout.write(",".join(["sample", "alleles","mhc_class","filename"]) + "\n") for sample in sorted(sample_run_dict.keys()): for val in sample_run_dict[sample]: fout.write(",".join(val) + "\n") else: - print_error(f"No entries to process!", "Samplesheet: {file_in}") + print_error("No entries to process!", context="File", context_str="Samplesheet: {}".format(file_in)) -def main(args=None): - args = parse_args(args) - check_samplesheet(args.FILE_IN, args.FILE_OUT) +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) if __name__ == "__main__": diff --git a/bin/epaa.py b/bin/epaa.py index 4e1db8a..92296b6 100755 --- a/bin/epaa.py +++ b/bin/epaa.py @@ -7,23 +7,23 @@ import re import vcf import argparse -import urllib2 +import urllib import itertools import pandas as pd import numpy as np -import Fred2.Core.Generator as generator +import epytope.Core.Generator as generator import math import json from collections import defaultdict -from Fred2.IO.MartsAdapter import MartsAdapter -from Fred2.Core.Variant import Variant, VariationType, MutationSyntax -from Fred2.EpitopePrediction import EpitopePredictorFactory -from Fred2.IO.ADBAdapter import EIdentifierTypes -from Fred2.IO.UniProtAdapter import UniProtDB -from Fred2.Core.Allele import Allele -from Fred2.Core.Peptide import Peptide -from Fred2.IO import FileReader +from epytope.IO.MartsAdapter import MartsAdapter +from epytope.Core.Variant import Variant, VariationType, MutationSyntax +from epytope.EpitopePrediction import EpitopePredictorFactory +from epytope.IO.ADBAdapter import EIdentifierTypes +from epytope.IO.UniProtAdapter import UniProtDB +from epytope.Core.Allele import Allele +from epytope.Core.Peptide import Peptide +from epytope.IO import FileReader from Bio import SeqUtils from datetime import datetime from string import Template @@ -41,7 +41,8 @@ transcriptProteinMap = {} transcriptSwissProtMap = {} -def 
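
A rough, self-contained restatement of the MHC class/allele consistency rule added to check_samplesheet.py above, with explicit parentheses around the two branches; the loci tuples mirror valid_class1_loci and valid_class2_loci from the diff, the function name is invented for this sketch, and the os.path.isfile(alleles) guard used when an allele file is supplied is deliberately omitted.

VALID_CLASS1_LOCI = ("A*", "B*", "C*", "E*", "G*")
VALID_CLASS2_LOCI = ("DR", "DP", "DQ")


def alleles_conflict_with_class(alleles, mhc_class):
    """Return True if the allele string names loci of the other MHC class."""
    if mhc_class == "I":
        return any(locus in alleles for locus in VALID_CLASS2_LOCI)
    if mhc_class == "II":
        return any(locus in alleles for locus in VALID_CLASS1_LOCI)
    return False


assert alleles_conflict_with_class("A*01:01;B*07:02", "II")
assert not alleles_conflict_with_class("A*01:01;B*07:02", "I")
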
get_fred2_annotation(vt, p, r, alt): + +def get_epytope_annotation(vt, p, r, alt): if vt == VariationType.SNP: return p, r, alt elif vt == VariationType.DEL or vt == VariationType.FSDEL: @@ -83,13 +84,15 @@ def read_GSvar(filename, pass_only=True): reads GSvar and tsv files (tab sep files in context of genetic variants), omitting and warning about rows missing mandatory columns :param filename: /path/to/file - :return: list FRED2 variants + :return: list epytope variants """ global ID_SYSTEM_USED - RE = re.compile("(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)") + RE = re.compile( + "(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)") # list of mandatory (meta)data - exclusion_list = ["start", "end", "#chr", "ref", "obs", "gene", "tumour_genotype", "coding_and_splicing_details", "variant_details", "variant_type", "coding_and_splicing"] + exclusion_list = ["start", "end", "#chr", "ref", "obs", "gene", "tumour_genotype", + "coding_and_splicing_details", "variant_details", "variant_type", "coding_and_splicing"] list_vars = list() lines = list() @@ -98,11 +101,13 @@ def read_GSvar(filename, pass_only=True): cases = 0 - with open(filename, 'rb') as tsvfile: - tsvreader = csv.DictReader((row for row in tsvfile if not row.startswith('##')), delimiter='\t') + with open(filename, 'rt') as tsvfile: + tsvreader = csv.DictReader( + (row for row in tsvfile if not row.startswith('##')), delimiter='\t') for row in tsvreader: if not check_min_req_GSvar(row): - logger.warning("read_GSvar: Omitted row! Mandatory columns not present in: \n"+str(row)) + logger.warning( + "read_GSvar: Omitted row! Mandatory columns not present in: \n"+str(row)+".") continue lines.append(row) @@ -119,7 +124,8 @@ def read_GSvar(filename, pass_only=True): alt = line["obs"] gene = line.get("gene", '') - isHomozygous = True if (('tumour_genotype' in line) and (line['tumour_genotype'].split('/')[0] == line['tumour_genotype'].split('/')[1])) else False + isHomozygous = True if (('tumour_genotype' in line) and (line['tumour_genotype'].split( + '/')[0] == line['tumour_genotype'].split('/')[1])) else False # old GSvar version if "coding_and_splicing_details" in line: mut_type = line.get("variant_details", '') @@ -159,10 +165,12 @@ def read_GSvar(filename, pass_only=True): mut_type = a_mut_type nm_id = nm_id.split(".")[0] - coding[nm_id] = MutationSyntax(nm_id, int(trans_pos.split('_')[0])-1, int(prot_start)-1, trans_coding, prot_coding) + coding[nm_id] = MutationSyntax(nm_id, int(trans_pos.split( + '_')[0])-1, int(prot_start)-1, trans_coding, prot_coding) transcript_ids.append(nm_id) if coding: - var = Variant(mut_id, vt, chrom.strip('chr'), int(genome_start), ref.upper(), alt.upper(), coding, isHomozygous, isSynonymous=isyn) + var = Variant(mut_id, vt, chrom.strip('chr'), int(genome_start), ref.upper( + ), alt.upper(), coding, isHomozygous, isSynonymous=isyn) var.gene = gene # metadata logging @@ -176,14 +184,15 @@ def read_GSvar(filename, pass_only=True): # fix because of memory/timing issues due to combinatoric explosion for v in list_vars: - for trans_id in v.coding.iterkeys(): + for trans_id in v.coding.keys(): transToVar.setdefault(trans_id, []).append(v) - for tId, vs in transToVar.iteritems(): + for tId, vs in transToVar.items(): if len(vs) > 10: cases += 1 for v in vs: - vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous) + vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, + v.ref, v.obs, v.coding, True, 
v.isSynonymous) vs_new.gene = v.gene for m in metadata_list: vs_new.log_metadata(m, v.get_metadata(m)[0]) @@ -194,14 +203,14 @@ def read_GSvar(filename, pass_only=True): def read_vcf(filename, pass_only=True): """ reads vcf files - returns a list of FRED2 variants + returns a list of epytope variants :param filename: /path/to/file - :return: list of FRED2 variants + :return: list of epytope variants """ global ID_SYSTEM_USED vl = list() - with open(filename, 'rb') as tsvfile: + with open(filename, 'rt') as tsvfile: vcf_reader = vcf.Reader(tsvfile) vl = [r for r in vcf_reader] @@ -275,8 +284,13 @@ def read_vcf(filename, pass_only=True): isSynonymous = False coding = dict() types = [] - for annraw in record.INFO['ANN']: # for each ANN only add a new coding! see GSvar + # for each ANN only add a new coding! see GSvar + for annraw in record.INFO['ANN']: annots = annraw.split('|') + if len(annots) != 16: + logger.warning( + "read_vcf: Omitted row! Mandatory columns not present in annotation field (ANN). \n Have you annotated your VCF file with SnpEff?") + continue obs, a_mut_type, impact, a_gene, a_gene_id, feature_type, transcript_id, exon, tot_exon, trans_coding, prot_coding, cdna, cds, aa, distance, warnings = annots types.append(a_mut_type) @@ -284,7 +298,7 @@ def read_vcf(filename, pass_only=True): ppos = 0 positions = '' - # get cds/protein positions and convert mutation syntax to FRED2 format + # get cds/protein positions and convert mutation syntax to epytope format if trans_coding != '': positions = re.findall(r'\d+', trans_coding) ppos = int(positions[0]) - 1 @@ -302,30 +316,40 @@ def read_vcf(filename, pass_only=True): if 'NM' in transcript_id: ID_SYSTEM_USED = EIdentifierTypes.REFSEQ - #take online coding variants into account, FRED2 cannot deal with stopgain variants right now + # take online coding variants into account, epytope cannot deal with stopgain variants right now if not prot_coding or 'stop_gained' in a_mut_type: continue - coding[transcript_id] = MutationSyntax(transcript_id, ppos, tpos, trans_coding, prot_coding) + coding[transcript_id] = MutationSyntax( + transcript_id, ppos, tpos, trans_coding, prot_coding) transcript_ids.append(transcript_id) if coding: - pos, reference, alternative = get_fred2_annotation(vt, p, r, str(alt)) - var = Variant("line" + str(num), vt, c, pos, reference, alternative, coding, isHomozygous, isSynonymous) + pos, reference, alternative = get_epytope_annotation( + vt, p, r, str(alt)) + var = Variant("line" + str(num), vt, c, pos, reference, + alternative, coding, isHomozygous, isSynonymous) var.gene = gene var.log_metadata("vardbid", variation_dbid) final_metadata_list.append("vardbid") for metadata_name in metadata_list: if metadata_name in record.INFO: final_metadata_list.append(metadata_name) - var.log_metadata(metadata_name, record.INFO[metadata_name]) + var.log_metadata( + metadata_name, record.INFO[metadata_name]) for sample in record.samples: for format_key in format_list: - format_header = '{}.{}'.format(sample.sample, format_key) + if getattr(sample.data, format_key, None) is None: + logger.warning("FORMAT entry {entry} not defined for {genotype}. 
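
A schematic illustration of the ANN sanity check introduced in read_vcf above: each SnpEff ANN entry is a single '|'-separated record and is skipped with a warning unless it carries exactly 16 sub-fields. The annotation string below is entirely made up.

ann_entry = "T|missense_variant|MODERATE|GENE1|ENSG00000000001|transcript|ENST00000000001|1|5|c.100A>T|p.Lys34Asn|100|100|34|0|"
fields = ann_entry.split("|")
assert len(fields) == 16  # anything else is logged and the entry is skipped
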
Skipping.".format( + entry=format_key, genotype=sample.sample)) + continue + format_header = '{}.{}'.format( + sample.sample, format_key) final_metadata_list.append(format_header) if isinstance(sample[format_key], list): - format_value = ','.join([str(i) for i in sample[format_key]]) + format_value = ','.join( + [str(i) for i in sample[format_key]]) else: format_value = sample[format_key] var.log_metadata(format_header, format_value) @@ -336,13 +360,14 @@ def read_vcf(filename, pass_only=True): # fix because of memory/timing issues due to combinatoric explosion for v in list_vars: - for trans_id in v.coding.iterkeys(): + for trans_id in v.coding.keys(): transToVar.setdefault(trans_id, []).append(v) - for tId, vs in transToVar.iteritems(): + for tId, vs in transToVar.items(): if len(vs) > 10: for v in vs: - vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous) + vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, + v.ref, v.obs, v.coding, True, v.isSynonymous) vs_new.gene = v.gene for m in metadata_name: vs_new.log_metadata(m, v.get_metadata(m)) @@ -387,7 +412,8 @@ def read_protein_quant(filename): valuedict = {} for key, val in row.iteritems(): if 'LFQ intensity' in key: - valuedict[key.replace('LFQ intensity ', '').split('/')[-1]] = val + valuedict[key.replace( + 'LFQ intensity ', '').split('/')[-1]] = val for p in row['Protein IDs'].split(';'): if 'sp' in p: intensities[p.split('|')[1]] = valuedict @@ -422,25 +448,22 @@ def read_lig_ID_values(filename): return intensities -def create_length_column_value(pep): - return int(len(pep[0])) - - def create_protein_column_value(pep): - all_proteins = [transcriptProteinMap[x.transcript_id.split(':')[0]] for x in set(pep[0].get_all_transcripts())] + all_proteins = [transcriptProteinMap[x.transcript_id.split( + ':')[0]] for x in set(pep.get_all_transcripts())] return ','.join(set([item for sublist in all_proteins for item in sublist])) def create_transcript_column_value(pep): - return ','.join(set([x.transcript_id.split(':')[0] for x in set(pep[0].get_all_transcripts())])) + return ','.join(set([x.transcript_id.split(':')[0] for x in set(pep.get_all_transcripts())])) def create_mutationsyntax_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] syntaxes = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) transcript_ids = set([t.split(':')[0] for t in transcript_ids]) for v in set(variants): for c in v.coding: @@ -450,11 +473,11 @@ def create_mutationsyntax_column_value(pep): def create_mutationsyntax_genome_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] syntaxes = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) transcript_ids = set([t.split(':')[0] for t in transcript_ids]) for v in set(variants): for c in v.coding: @@ -464,75 +487,78 @@ def create_mutationsyntax_genome_column_value(pep): def create_variationfilelinenumber_column_value(pep): - v = [x.vars.values() for x in pep[0].get_all_transcripts()] + v = [x.vars.values() for x in pep.get_all_transcripts()] vf = list(itertools.chain.from_iterable(v)) return 
','.join([str(int(y.id.replace('line', ''))+1) for y in vf]) def create_gene_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set([y.gene for y in set(variants)])) def create_variant_pos_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set(['{}'.format(y.genomePos) for y in set(variants)])) def create_variant_chr_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set(['{}'.format(y.chrom) for y in set(variants)])) def create_variant_type_column_value(pep): - types = {0: 'SNP', 1: 'DEL', 2: 'INS', 3: 'FSDEL', 4: 'FSINS', 5: 'UNKNOWN'} + types = {0: 'SNP', 1: 'DEL', 2: 'INS', + 3: 'FSDEL', 4: 'FSINS', 5: 'UNKNOWN'} - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set([types[y.type] for y in set(variants)])) def create_variant_syn_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set([str(y.isSynonymous) for y in set(variants)])) def create_variant_hom_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set([str(y.isHomozygous) for y in set(variants)])) def create_coding_column_value(pep): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())] variants = [] for t in transcript_ids: - variants.extend([v for v in pep[0].get_variants_by_protein(t)]) + variants.extend([v for v in pep.get_variants_by_protein(t)]) return ','.join(set([str(y.coding) for y in set(variants)])) def create_metadata_column_value(pep, c): - transcript_ids = [x.transcript_id for x in set(pep[0].get_all_transcripts())] + transcript_ids = [x.transcript_id for x in set( + pep[0].get_all_transcripts())] variants = [] for t in transcript_ids: variants.extend([v for v in pep[0].get_variants_by_protein(t)]) - meta = 
set([str(y.get_metadata(c)[0]) for y in set(variants)]) + meta = set([str(y.get_metadata(c)[0]) + for y in set(variants) if len(y.get_metadata(c)) != 0]) if len(meta) is 0: return np.nan else: @@ -540,8 +566,9 @@ def create_metadata_column_value(pep, c): def create_wt_seq_column_value(pep, wtseqs): - transcripts = [x for x in set(pep[0].get_all_transcripts())] - wt = set([str(wtseqs['{}_{}'.format(str(pep[0]), t.transcript_id)]) for t in transcripts if bool(t.vars) and '{}_{}'.format(str(pep[0]), t.transcript_id) in wtseqs]) + transcripts = [x for x in set(pep.get_all_transcripts())] + wt = set([str(wtseqs['{}_{}'.format(str(pep), t.transcript_id)]) for t in transcripts if bool( + t.vars) and '{}_{}'.format(str(pep), t.transcript_id) in wtseqs]) if len(wt) is 0: return np.nan else: @@ -555,10 +582,12 @@ def create_quant_column_value(row, dict): value = np.nan return value -#defined as : RPKM = (10^9 * C)/(N * L) +# defined as : RPKM = (10^9 * C)/(N * L) # L = exon length in base-pairs for a gene # C = Number of reads mapped to a gene in a single sample # N = total (unique)mapped reads in the sample + + def create_expression_column_value_for_result(row, dict, deseq, gene_id_lengths): ts = row['gene'].split(',') values = [] @@ -572,10 +601,13 @@ def create_expression_column_value_for_result(row, dict, deseq, gene_id_lengths) for t in ts: if t in dict: if t in gene_id_lengths: - values.append((10.0**9 * float(dict[t])) / (float(gene_id_lengths[t]) * sum([float(dict[k]) for k in dict.keys() if ((not k.startswith('__')) & (k in gene_id_lengths))]))) + values.append((10.0**9 * float(dict[t])) / (float(gene_id_lengths[t]) * sum([float( + dict[k]) for k in dict.keys() if ((not k.startswith('__')) & (k in gene_id_lengths))]))) else: - values.append((10.0**9 * float(dict[t])) / (float(len(row[0].get_all_transcripts()[0])) * sum([float(dict[k]) for k in dict.keys() if ((not k.startswith('__')) & (k in gene_id_lengths))]))) - logger.warning("FKPM value will be based on transcript length for {gene}. Because gene could not be found in the DB".format(gene=t)) + values.append((10.0**9 * float(dict[t])) / (float(len(row[0].get_all_transcripts()[0])) * sum( + [float(dict[k]) for k in dict.keys() if ((not k.startswith('__')) & (k in gene_id_lengths))]))) + logger.warning( + "FKPM value will be based on transcript length for {gene}. 
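
A quick numeric sanity check of the RPKM formula restated in the comment above; the counts and lengths are invented.

C = 500          # reads mapped to the gene in this sample
N = 20_000_000   # total uniquely mapped reads in the sample
L = 2_500        # summed exon length of the gene in base pairs
rpkm = (10**9 * C) / (N * L)
print(rpkm)      # 10.0
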
Because gene could not be found in the DB".format(gene=t)) else: values.append(np.nan) values = ["{0:.2f}".format(value) for value in values] @@ -583,13 +615,15 @@ def create_expression_column_value_for_result(row, dict, deseq, gene_id_lengths) def create_quant_column_value_for_result(row, dict, swissProtDict, key): - all_proteins = [swissProtDict[x.transcript_id.split(':')[0]] for x in set(row[0].get_all_transcripts())] - all_proteins_filtered = set([item for sublist in all_proteins for item in sublist]) + all_proteins = [swissProtDict[x.transcript_id.split( + ':')[0]] for x in set(row[0].get_all_transcripts())] + all_proteins_filtered = set( + [item for sublist in all_proteins for item in sublist]) values = [] for p in all_proteins_filtered: if p in dict: if int(dict[p][key]) > 0: - values.append(math.log(int(dict[p][key]),2)) + values.append(math.log(int(dict[p][key]), 2)) else: values.append(int(dict[p][key])) if len(values) is 0: @@ -637,13 +671,14 @@ def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference) input_lists = [] # too long requests will fail - if len(transcripts) > 400: - input_lists = [transcripts[i:i + 3] for i in xrange(0, len(transcripts), 3)] + if len(transcripts) > 200: + input_lists = [transcripts[i:i + 3] + for i in range(0, len(transcripts), 3)] else: input_lists += [transcripts] - attribut_swissprot = "uniprot_swissprot_accession" if reference == 'GRCh37' else 'uniprot_swissprot' + attribut_swissprot = "uniprot_swissprot_accession" if reference == 'GRCh37' else 'uniprotswissprot' tsvselect = [] for l in input_lists: @@ -655,19 +690,22 @@ def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference) + biomart_attribute % (idname) \ + biomart_tail - tsvreader = csv.DictReader(urllib2.urlopen(biomart_url + urllib2.quote(rq_n)).read().splitlines(), dialect='excel-tab') + # DictReader returns byte object that is transformed into a string by '.decode('utf-8')' + tsvreader = csv.DictReader(urllib.request.urlopen( + biomart_url + urllib.parse.quote(rq_n)).read().decode('utf-8').splitlines(), dialect='excel-tab') tsvselect += [x for x in tsvreader] - swissProtKey = 'UniProt/SwissProt Accession' + swissProtKey = 'UniProt/SwissProt Accession' if reference == 'GRCh37' else 'UniProtKB/Swiss-Prot ID' if(ENSEMBL): - key = 'Ensembl Transcript ID' if reference == 'GRCh37' else 'Transcript ID' - protein_key = 'Ensembl Protein ID' if reference == 'GRCh37' else 'Protein ID' + key = 'Ensembl Transcript ID' if reference == 'GRCh37' else 'Transcript stable ID' + protein_key = 'Ensembl Protein ID' if reference == 'GRCh37' else 'Protein stable ID' for dic in tsvselect: if dic[key] in result: merged = result[dic[key]] + [dic[protein_key]] - merged_swissProt = result_swissProt[dic[key]] + [dic[swissProtKey]] + merged_swissProt = result_swissProt[dic[key] + ] + [dic[swissProtKey]] result[dic[key]] = merged result_swissProt[dic[key]] = merged_swissProt else: @@ -677,12 +715,15 @@ def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference) key = 'RefSeq mRNA [e.g. NM_001195597]' for dic in tsvselect: if dic[key] in result: - merged = result[dic[key]] + [dic['RefSeq Protein ID [e.g. NP_001005353]']] - merged_swissProt = result_swissProt[dic[key]] + [dic[swissProtKey]] + merged = result[dic[key]] + \ + [dic['RefSeq Protein ID [e.g. 
NP_001005353]']] + merged_swissProt = result_swissProt[dic[key] + ] + [dic[swissProtKey]] result[dic[key]] = merged result_swissProt[dic[key]] = merged_swissProt else: - result[dic[key]] = [dic['RefSeq Protein ID [e.g. NP_001005353]']] + result[dic[key]] = [ + dic['RefSeq Protein ID [e.g. NP_001005353]']] result_swissProt[dic[key]] = [dic[swissProtKey]] return result, result_swissProt @@ -691,49 +732,30 @@ def get_protein_ids_for_transcripts(idtype, transcripts, ensembl_url, reference) def get_matrix_max_score(allele, length): allele_model = "%s_%i" % (allele, length) try: - pssm = getattr(__import__("Fred2.Data.pssms.syfpeithi.mat."+allele_model, fromlist=[allele_model]), allele_model) - return sum([max(scrs.values()) for pos, scrs in pssm.iteritems()]) + pssm = getattr(__import__("epytope.Data.pssms.syfpeithi.mat." + + allele_model, fromlist=[allele_model]), allele_model) + return sum([max(scrs.values()) for pos, scrs in pssm.items()]) except: return np.nan -def create_score_values(j, method): - if not pd.isnull(j): - if 'syf' in method: - return round(j, 4) - elif any(m in method for m in ['mhcnuggets','mhcflurry']): - # convert IC50 -> affinity score in [0,1] - min_ic50 = 1.0 - return round(1.0 - math.log(max(j,min_ic50), 50000), 4) - else: - # currently not applied (for methods such as netMHC) - return round(j, 4) - - else: - return np.nan - - def create_affinity_values(allele, length, j, method, max_scores, allele_strings): if not pd.isnull(j): if 'syf' in method: return max(0, round((100.0 / float(max_scores[allele_strings[('%s_%s' % (str(allele), length))]]) * float(j)), 2)) - elif any(m in method for m in ['mhcnuggets','mhcflurry']): - # mhcnuggets and mhcflurry return already IC50 affinity values - return round(j, 2) else: # convert given affinity score in range [0,1] back to IC50 affinity value - # currently not applied (for methods such as netMHC) return round((50000**(1.0-float(j))), 2) else: return np.nan -def create_binder_values(aff, method, thresholds): - if not pd.isnull(aff): +def create_binder_values(pred_value, method, thresholds): + if not pd.isnull(pred_value): if 'syf' in method: - return True if aff > thresholds[method] else False + return True if pred_value > thresholds[method] else False else: - return True if aff <= thresholds[method.lower()] else False + return True if pred_value <= thresholds[method.lower()] else False else: return np.nan @@ -751,12 +773,14 @@ def generate_wt_seqs(peptides): not_available = False variant_available = False for p in protein_pos: - variant_dic = x.get_variants_by_protein_position(t.transcript_id, p) + variant_dic = x.get_variants_by_protein_position( + t.transcript_id, p) variant_available = bool(variant_dic) for key in variant_dic: var_list = variant_dic[key] for v in var_list: - mut_syntax = v.coding[t.transcript_id.split(':')[0]].aaMutationSyntax + mut_syntax = v.coding[t.transcript_id.split( + ':')[0]].aaMutationSyntax if v.type in [3, 4, 5] or '?' 
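
A small worked example of the score/affinity relation used by create_affinity_values above: for non-SYFPEITHI methods a prediction score s in [0, 1] is mapped back to an IC50-like affinity of 50000**(1 - s) nM (the inverse of the common 1 - log50000(IC50) transform), while SYFPEITHI scores are instead expressed as a percentage of the matrix-specific maximum. The numbers below are examples, not tool output.

import math


def score_to_ic50(score):
    return 50000 ** (1.0 - score)


def ic50_to_score(ic50_nm):
    return 1.0 - math.log(max(ic50_nm, 1.0), 50000)


print(round(score_to_ic50(0.5), 1))                   # ~223.6 nM
print(round(ic50_to_score(500.0), 3))                 # ~0.426
print(round(score_to_ic50(ic50_to_score(500.0)), 1))  # ~500.0 (round trip)
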
in mut_syntax: not_available = True elif v.type in [1]: @@ -773,13 +797,14 @@ def generate_wt_seqs(peptides): wt = SeqUtils.seq1(m.groups()[0]) mut_seq[key] = wt if not_available: - wt_dict['{}_{}'.format(str(x),t.transcript_id)] = np.nan + wt_dict['{}_{}'.format(str(x), t.transcript_id)] = np.nan elif variant_available: - wt_dict['{}_{}'.format(str(x),t.transcript_id)] = ''.join(mut_seq) + wt_dict['{}_{}'.format( + str(x), t.transcript_id)] = ''.join(mut_seq) return wt_dict -def make_predictions_from_variants(variants_all, methods, tool_thresholds, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap): +def make_predictions_from_variants(variants_all, methods, tool_thresholds, use_affinity_thresholds, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap): # list for all peptides and filtered peptides all_peptides = [] all_peptides_filtered = [] @@ -791,7 +816,8 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, allel # list to hold dataframes for all predictions pred_dataframes = [] - prots = [p for p in generator.generate_proteins_from_transcripts(generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED))] + prots = [p for p in generator.generate_proteins_from_transcripts( + generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED))] for peplen in range(minlength, maxlength): peptide_gen = generator.generate_peptides_from_proteins(prots, peplen) @@ -799,7 +825,8 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, allel peptides_var = [x for x in peptide_gen] # remove peptides which are not 'variant relevant' - peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())] + peptides = [x for x in peptides_var if any( + x.get_variants_by_protein(y) for y in x.proteins.keys())] # filter out self peptides selfies = [str(p) for p in peptides if protein_db.exists(str(p))] @@ -813,61 +840,83 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, allel if len(filtered_peptides) > 0: for method, version in methods.items(): try: - predictor = EpitopePredictorFactory(method, version=version) - results.extend([predictor.predict(filtered_peptides, alleles=alleles)]) + predictor = EpitopePredictorFactory( + method, version=version) + results.extend( + [predictor.predict(filtered_peptides, alleles=alleles)]) except: - logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version)) - - if(len(results) == 0): + logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}.".format( + length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version)) + + # merge dataframes for multiple predictors + if len(results) > 1: + df = results[0].merge_results(results[1:]) + elif len(results) == 1: + df = results[0] + else: continue df = pd.concat(results) + # create method index and remove it from multi-column + df = df.stack(level=1) + + # merge remaining multi-column Allele and ScoreType + df.columns = df.columns.map('{0[0]} {0[1]}'.format) + + # reset index to have indices as columns + df.reset_index(inplace=True) + df = df.rename(columns={'Method': 'method', 'Peptides': 'sequence'}) for a in alleles: conv_allele = "%s_%s%s" 
% (a.locus, a.supertype, a.subtype) - allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen) - max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen) - - df.insert(0, 'length', df.index.map(create_length_column_value)) - df['chr'] = df.index.map(create_variant_chr_column_value) - df['pos'] = df.index.map(create_variant_pos_column_value) - df['gene'] = df.index.map(create_gene_column_value) - df['transcripts'] = df.index.map(create_transcript_column_value) - df['proteins'] = df.index.map(create_protein_column_value) - df['variant type'] = df.index.map(create_variant_type_column_value) - df['synonymous'] = df.index.map(create_variant_syn_column_value) - df['homozygous'] = df.index.map(create_variant_hom_column_value) - df['variant details (genomic)'] = df.index.map(create_mutationsyntax_genome_column_value) - df['variant details (protein)'] = df.index.map(create_mutationsyntax_column_value) - - # reset index to have index as columns - df.reset_index(inplace=True) + allele_string_map['%s_%s' % + (a, peplen)] = '%s_%i' % (conv_allele, peplen) + max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score( + conv_allele, peplen) + + df['length'] = df['sequence'].map(len) + df['chr'] = df['sequence'].map(create_variant_chr_column_value) + df['pos'] = df['sequence'].map(create_variant_pos_column_value) + df['gene'] = df['sequence'].map(create_gene_column_value) + df['transcripts'] = df['sequence'].map(create_transcript_column_value) + df['proteins'] = df['sequence'].map(create_protein_column_value) + df['variant type'] = df['sequence'].map( + create_variant_type_column_value) + df['synonymous'] = df['sequence'].map(create_variant_syn_column_value) + df['homozygous'] = df['sequence'].map(create_variant_hom_column_value) + df['variant details (genomic)'] = df['sequence'].map( + create_mutationsyntax_genome_column_value) + df['variant details (protein)'] = df['sequence'].map( + create_mutationsyntax_column_value) for c in df.columns: - if ('HLA-' in str(c)) or ('H-2-' in str(c)): + if ('HLA-' in str(c) or 'H-2-' in str(c)) and 'Score' in str(c): idx = df.columns.get_loc(c) - df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1)) - df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method'], tool_thresholds), axis=1)) - df = df.rename(columns={c: '%s score' % c}) - df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1) + allele = c.rstrip(' Score') + df[c] = df[c].round(4) + df.insert(idx + 1, '%s affinity' % allele, df.apply(lambda x: create_affinity_values(allele, + int(x['length']), float(x[c]), x['method'], max_values_matrices, allele_string_map), axis=1)) + df.insert(idx + 2, '%s binder' % allele, df.apply(lambda x: create_binder_values(float(x['%s Rank' % allele]), x['method'], tool_thresholds) if 'netmhc' in x['method'] and not use_affinity_thresholds + else create_binder_values(float(x['%s affinity' % allele]), x['method'], tool_thresholds), axis=1)) - for c in metadata: - df[c] = df.apply(lambda row: create_metadata_column_value(row, c), axis=1) + df.columns = df.columns.str.replace('Score', 'score') + df.columns = df.columns.str.replace('Rank', 'rank') - df = df.rename(columns={'Seq': 'sequence'}) - df = df.rename(columns={'Method': 'method'}) + for c in metadata: + df[c] = 
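
A pandas-only sketch (invented allele names and values) of the column flattening performed above, where the remaining (allele, score type) MultiIndex is collapsed into single 'ALLELE ScoreType' column names; the preceding df.stack(level=1) step on the prediction result is skipped here.

import pandas as pd

cols = pd.MultiIndex.from_product([["HLA-A*01:01", "HLA-B*07:02"], ["Score", "Rank"]])
df = pd.DataFrame([[0.8, 1.5, 0.3, 12.0]], index=pd.Index(["SYFPEITHI"], name="Method"), columns=cols)
df.columns = df.columns.map("{0[0]} {0[1]}".format)
df.reset_index(inplace=True)
print(df.columns.tolist())
# ['Method', 'HLA-A*01:01 Score', 'HLA-A*01:01 Rank', 'HLA-B*07:02 Score', 'HLA-B*07:02 Rank']
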
df.apply( + lambda row: create_metadata_column_value(row, c), axis=1) pred_dataframes.append(df) - statistics = {'prediction_methods': [ method + "-" + version for method, version in methods.items() ], - 'number_of_variants': len(variants_all), - 'number_of_unique_peptides': [str(p) for p in all_peptides], - 'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered] - } + statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()], + 'number_of_variants': len(variants_all), + 'number_of_unique_peptides': [str(p) for p in all_peptides], + 'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered] + } return pred_dataframes, statistics, all_peptides_filtered, prots -def make_predictions_from_peptides(peptides, methods, tool_thresholds, alleles, protein_db, identifier, metadata): +def make_predictions_from_peptides(peptides, methods, tool_thresholds, use_affinity_thresholds, alleles, protein_db, identifier, metadata): # dictionaries for syfpeithi matrices max values and allele mapping max_values_matrices = {} allele_string_map = {} @@ -895,92 +944,134 @@ def make_predictions_from_peptides(peptides, methods, tool_thresholds, alleles, for method, version in methods.items(): try: predictor = EpitopePredictorFactory(method, version=version) - results.extend([predictor.predict(all_peptides_filtered, alleles=alleles)]) + results.extend( + [predictor.predict(all_peptides_filtered, alleles=alleles)]) except: - logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}. No model available.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version)) - # merge dataframes of the performed predictions - if(len(results) == 0): + logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}. 
No model available.".format( + length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version)) + + # merge dataframes for multiple predictors + if len(results) > 1: + df = results[0].merge_results(results[1:]) + elif len(results) == 1: + df = results[0] + else: continue - df = pd.concat(results, sort=True) + # create method index and remove it from multi-column + df = df.stack(level=1) + + # merge remaining multi-column Allele and ScoreType + df.columns = df.columns.map('{0[0]} {0[1]}'.format) + + # reset index to have indices as columns + df.reset_index(inplace=True) + df = df.rename(columns={'Method': 'method', 'Peptides': 'sequence'}) - df.insert(0, 'length', df.index.map(create_length_column_value)) + # create column containing the peptide lengths + df.insert(2, 'length', df['sequence'].map(len)) for a in alleles: conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype) - allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen) - max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele,peplen) + allele_string_map['%s_%s' % + (a, peplen)] = '%s_%i' % (conv_allele, peplen) + max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score( + conv_allele, peplen) - # reset index to have index as columns - df.reset_index(inplace=True) - - mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'synonymous', 'homozygous', 'variant details (genomic)', 'variant details (protein)'] + mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', + 'synonymous', 'homozygous', 'variant details (genomic)', 'variant details (protein)'] for header in mandatory_columns: if header not in metadata: df[header] = np.nan else: - df[header] = df.apply(lambda row: row[0].get_metadata(header)[0], axis=1) + df[header] = df.apply( + lambda row: row[0].get_metadata(header)[0], axis=1) for c in list(set(metadata) - set(mandatory_columns)): df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1) for c in df.columns: - if ('HLA-' in str(c)) or ('H-2-' in str(c)): + if ('HLA-' in str(c) or 'H-2-' in str(c)) and 'Score' in str(c): idx = df.columns.get_loc(c) - df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1)) - df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method'], tool_thresholds), axis=1)) - df = df.rename(columns={c: '%s score' % c}) - df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1) + allele = c.rstrip(' Score') + df[c] = df[c].round(4) + df.insert(idx + 1, '%s affinity' % allele, df.apply(lambda x: create_affinity_values(allele, + int(x['length']), float(x[c]), x['method'], max_values_matrices, allele_string_map), axis=1)) + df.insert(idx + 2, '%s binder' % allele, df.apply(lambda x: create_binder_values(float(x['%s Rank' % allele]), x['method'], tool_thresholds) if 'netmhc' in x['method'] and not use_affinity_thresholds + else create_binder_values(float(x['%s affinity' % allele]), x['method'], tool_thresholds), axis=1)) + + df.columns = df.columns.str.replace('Score', 'score') + df.columns = df.columns.str.replace('Rank', 'rank') - df = df.rename(columns={'Seq': 'sequence'}) - df = df.rename(columns={'Method': 'method'}) pred_dataframes.append(df) # write prediction statistics - statistics = 
{'prediction_methods': [ method + "-" + version for method, version in methods.items() ], - 'number_of_variants': 0, - 'number_of_unique_peptides': [str(p) for p in peptides], - 'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered] - } + statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()], + 'number_of_variants': 0, + 'number_of_unique_peptides': [str(p) for p in peptides], + 'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered] + } return pred_dataframes, statistics + def __main__(): parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. - Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION) + Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""") parser.add_argument('-s', "--somatic_mutations", help='Somatic variants') parser.add_argument('-g', "--germline_mutations", help="Germline variants") parser.add_argument('-i', "--identifier", help="Dataset identifier") - parser.add_argument('-p', "--peptides", help="File with one peptide per line") - parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II", type=int) + parser.add_argument('-p', "--peptides", + help="File with one peptide per line") parser.add_argument('-l', "--max_length", help="Maximum peptide length") parser.add_argument('-ml', "--min_length", help="Minimum peptide length") - parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str) - parser.add_argument('-tt', "--tool_thresholds", help="Customize affinity threshold of given tools using a json file", required=False, type=str) - parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True) - parser.add_argument('-a', "--alleles", help=" MHC Alleles", required=True, type=str) - parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38']) - parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true') - parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true') - parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true') - parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False) - parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False) - parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values") - parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results") - parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)") - parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and 
median intensity of a ligandomics identification run.") + parser.add_argument( + '-t', "--tools", help="Tools used for peptide predictions", required=True, type=str) + parser.add_argument('-tt', "--tool_thresholds", + help="Customize thresholds of given tools using a json file", required=False, type=str) + parser.add_argument('-at', "--use_affinity_thresholds", + help="Use affinity instead of rank for thresholding", required=False, action='store_true') + parser.add_argument( + '-sv', "--versions", help="File containing parsed software version numbers.", required=True) + parser.add_argument('-a', "--alleles", + help=" MHC Alleles", required=True, type=str) + parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", + required=False, default='GRCh37', choices=['GRCh37', 'GRCh38']) + parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", + required=False, action='store_true') + parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", + required=False, action='store_true') + parser.add_argument('-fo', "--fasta_output", + help="Create FASTA file with protein sequences", required=False, action='store_true') + parser.add_argument('-rp', "--reference_proteome", + help="Reference proteome for self-filtering", required=False) + parser.add_argument('-gr', "--gene_reference", + help="List of gene IDs for ID mapping.", required=False) + parser.add_argument('-pq', "--protein_quantification", + help="File with protein quantification values") + parser.add_argument('-ge', "--gene_expression", + help="File with expression analysis results") + parser.add_argument('-de', "--diff_gene_expression", + help="File with differential expression analysis results (DESeq2)") + parser.add_argument('-li', "--ligandomics_id", + help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.") + parser.add_argument('-v', "--version", help="Script version", + action='version', version=VERSION) args = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) - logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier))) - logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) + logger.addHandler(logging.FileHandler( + '{}_prediction.log'.format(args.identifier))) + logger.info("Starting predictions at " + + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) metadata = [] proteins = [] - references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'} + references = {'GRCh37': 'http://feb2014.archive.ensembl.org', + 'GRCh38': 'http://mar2017.archive.ensembl.org'} global transcriptProteinMap global transcriptSwissProtMap @@ -994,11 +1085,12 @@ def __main__(): vl, transcripts, metadata = read_vcf(args.somatic_mutations) transcripts = list(set(transcripts)) - transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference) + transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts( + ID_SYSTEM_USED, transcripts, references[args.reference], args.reference) # get the alleles # alleles = FileReader.read_lines(args.alleles, in_type=Allele) - alleles= [Allele(a) for a in args.alleles.split(";")] + alleles = [Allele(a) for a in args.alleles.split(";")] # initialize MartsAdapter, GRCh37 or GRCh38 based ma = 
MartsAdapter(biomart=references[args.reference]) @@ -1011,23 +1103,35 @@ def __main__(): if os.path.isdir(args.reference_proteome): for filename in os.listdir(args.reference_proteome): if filename.endswith(".fasta") or filename.endswith(".fsa"): - up_db.read_seqs(os.path.join(args.reference_proteome, filename)) + up_db.read_seqs(os.path.join( + args.reference_proteome, filename)) else: up_db.read_seqs(args.reference_proteome) - selected_methods = [item for item in args.tools.split(',')] + selected_methods = [item.split( + '-')[0] if "mhcnuggets" not in item else item for item in args.tools.split(',')] with open(args.versions, 'r') as versions_file: - tool_version = [ (row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter = ":") ] - # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future + tool_version = [(row[0].split()[0], str(row[1])) + for row in csv.reader(versions_file, delimiter=":")] + # NOTE this needs to be updated, if a newer version will be available via epytope and should be used in the future tool_version.append(('syfpeithi', '1.0')) # get for each selected method the corresponding tool version - methods = { method.lower().strip() : version.strip() for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower() } + methods = {method.lower().strip(): version.strip( + ) for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()} for method, version in methods.items(): if version not in EpitopePredictorFactory.available_methods()[method]: - raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.") + raise ValueError("The specified version " + version + + " for " + method + " is not supported by epytope.") + + thresholds = {"syfpeithi": 50, "mhcflurry": 500, "mhcnuggets-class-1": 500, + "mhcnuggets-class-2": 500, "netmhc": 500, "netmhcpan": 500, "netmhcii": 500, "netmhciipan": 500} + # Define binders based on the rank metric for netmhc family tools + # NOTE these recommended thresholds might change in the future with new versions of the tools + if "netmhc" in ''.join(methods.keys()) and not args.use_affinity_thresholds: + thresholds.update({"netmhc": 2, "netmhcpan": 2, + "netmhcii": 10, "netmhciipan": 5}) - thresholds = {"syfpeithi":50, "mhcflurry":500, "mhcnuggets-class-1":500, "mhcnuggets-class-2":500, "netmhc":500, "netmhcpan":500, "netmhcii":500, "netmhciipan":500} if args.tool_thresholds: with open(args.tool_thresholds, 'r') as json_file: threshold_file = json.load(json_file) @@ -1035,37 +1139,41 @@ def __main__(): if tool in thresholds.keys(): thresholds[tool] = thresh else: - raise ValueError('Tool ' + tool +' in specified threshold file is not supported') + raise ValueError( + 'Tool ' + tool + ' in specified threshold file is not supported') - # MHC class I or II predictions - if args.mhcclass is 1: - if args.peptides: - pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, thresholds, alleles, up_db, args.identifier, metadata) - else: - pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, thresholds, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap) + # Distinguish between prediction for peptides and variants + if args.peptides: + pred_dataframes, statistics = make_predictions_from_peptides( + peptides, methods, thresholds, 
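
For reference, a minimal sketch of what a custom --tool_thresholds file could look like given the loading code above: a flat JSON object whose keys must match entries of the default thresholds dictionary. The cut-off values here are arbitrary examples, not recommendations.

import json

custom_thresholds = {"syfpeithi": 60, "netmhcpan": 2.5}
with open("tool_thresholds.json", "w") as handle:
    json.dump(custom_thresholds, handle)
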
args.use_affinity_thresholds, alleles, up_db, args.identifier, metadata) else: - if args.peptides: - pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, thresholds, alleles, up_db, args.identifier, metadata) - else: - pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, thresholds, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap) + pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, thresholds, args.use_affinity_thresholds, alleles, int( + args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap) + # concat dataframes for all peptide lengths try: complete_df = pd.concat(pred_dataframes, sort=True) # replace method names with method names with version # complete_df.replace({'method': methods}, inplace=True) - complete_df['method'] = complete_df['method'].apply(lambda x : x.lower() + '-' + methods[x.lower()] ) + complete_df['method'] = complete_df['method'].apply( + lambda x: x.lower() + '-' + methods[x.lower()]) + predictions_available = True except: complete_df = pd.DataFrame() + predictions_available = False logger.error("No predictions available.") # include wild type sequences to dataframe if specified if args.wild_type: wt_sequences = generate_wt_seqs(all_peptides_filtered) - complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1) - columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method'] + complete_df['wt sequence'] = complete_df.apply( + lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1) + columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', + 'gene', 'transcripts', 'proteins', 'variant type', 'method'] # Change the order (the index) of the columns else: - columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method'] + columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', + 'transcripts', 'proteins', 'variant type', 'method'] for c in complete_df.columns: if c not in columns_tiles: columns_tiles.append(c) @@ -1096,7 +1204,8 @@ def __main__(): protein_quant = read_protein_quant(args.protein_quantification) first_entry = protein_quant[protein_quant.keys()[0]] for k in first_entry.keys(): - complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1) + complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply( + lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1) # parse (differential) expression analysis results, annotate features (genes/transcripts) if args.gene_expression is not None: fold_changes = read_diff_expression_values(args.gene_expression) @@ -1113,7 +1222,8 @@ def __main__(): gene_id_lengths[ids[1]] = float(ids[2].strip()) deseq = False # add column to result dataframe - complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1) + complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result( + row, fold_changes, deseq, gene_id_lengths), axis=1) if args.diff_gene_expression is not None: gene_id_lengths = 
{} fold_changes = read_diff_expression_values(args.diff_gene_expression) @@ -1121,19 +1231,24 @@ def __main__(): deseq = True # add column to result dataframe - complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1) + complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result( + row, fold_changes, deseq, gene_id_lengths), axis=1) # parse ligandomics identification results, annotate peptides for samples if args.ligandomics_id is not None: lig_id = read_lig_ID_values(args.ligandomics_id) # add columns to result dataframe - complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1) - complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1) + complete_df['ligand score'] = complete_df.apply( + lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1) + complete_df['ligand intensity'] = complete_df.apply( + lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1) if args.wild_type != None: - complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1) - complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1) + complete_df['wt ligand score'] = complete_df.apply( + lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1) + complete_df['wt ligand intensity'] = complete_df.apply( + lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1) # write mutated protein sequences to fasta file - if args.fasta_output: + if args.fasta_output and predictions_available: with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile: for p in proteins: variants = [] @@ -1143,23 +1258,33 @@ def __main__(): cf = list(itertools.chain.from_iterable(c)) cds = ','.join([y.cdsMutationSyntax for y in set(cf)]) aas = ','.join([y.aaMutationSyntax for y in set(cf)]) - protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds)) + protein_outfile.write( + '>{}:{}:{}\n'.format(p.transcript_id, aas, cds)) protein_outfile.write('{}\n'.format(str(p))) + complete_df["binder"] = complete_df[[ + col for col in complete_df.columns if 'binder' in col]].any(axis=1) + # write dataframe to tsv complete_df.fillna('') - complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False) + if predictions_available: + complete_df.to_csv("{}_prediction_result.tsv".format( + args.identifier), '\t', index=False) + statistics['tool_thresholds'] = thresholds statistics['number_of_predictions'] = len(complete_df) statistics['number_of_binders'] = len(pos_predictions) statistics['number_of_nonbinders'] = len(neg_predictions) statistics['number_of_unique_binders'] = list(set(binders)) - statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders)) + statistics['number_of_unique_nonbinders'] = list( + set(non_binders) - set(binders)) with open('{}_report.json'.format(args.identifier), 'w') as json_out: json.dump(statistics, json_out) - logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) + logger.info("Finished predictions at " + + str(datetime.now().strftime("%Y-%m-%d 
%H:%M:%S"))) + if __name__ == "__main__": __main__() diff --git a/bin/gen_peptides.py b/bin/gen_peptides.py index 7fa2d64..5e3b656 100755 --- a/bin/gen_peptides.py +++ b/bin/gen_peptides.py @@ -5,7 +5,7 @@ import pandas as pd from Bio.SeqIO.FastaIO import SimpleFastaParser -from Fred2.Core import Allele, Peptide, Protein, generate_peptides_from_proteins +from epytope.Core import Allele, Peptide, Protein, generate_peptides_from_proteins parser = argparse.ArgumentParser("Generating peptides from protein sequences.") diff --git a/bin/merge_jsons.py b/bin/merge_jsons.py index 38a855b..6f3af5c 100755 --- a/bin/merge_jsons.py +++ b/bin/merge_jsons.py @@ -44,15 +44,20 @@ def combine_dicts(a, b): data = dict() if args.single_input: with open(args.single_input, "r") as infile: - data = combine_dicts(data, json.load(infile)) + json_content = json.load(infile) + data = combine_dicts(data, json_content) + else: for file in os.listdir(args.input): if file.endswith(".json"): with open(os.path.join(args.input, file), "r") as infile: - data = combine_dicts(data, json.load(infile)) + json_content = json.load(infile) + data = combine_dicts(data, json_content) # merge and write json report - data["prediction_methods"] = "".join(set(list(flatten(data["prediction_methods"])))) + data["prediction_methods"] = ",".join(set(list(flatten(data["prediction_methods"])))) + # tool thresholds is the same for all runs i.e. for all JSON parts + data["tool_thresholds"] = json_content["tool_thresholds"] data["number_of_unique_peptides"] = len( set(list(flatten(data["number_of_unique_peptides"]))) ) diff --git a/bin/split_vcf_by_variants.py b/bin/split_vcf_by_variants.py new file mode 100755 index 0000000..11f1dc2 --- /dev/null +++ b/bin/split_vcf_by_variants.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +import argparse +import logging +import csv +import os +from typing import Optional + + +def determine_split_size(input_file, size): + with open(input_file, "r") as variants: + input_lines = variants.readlines() + num_variants = sum(1 for i in input_lines if not i.startswith('#')) + if not size: + return max( int(num_variants / 10), 1) + elif num_variants < size: + logging.warning( + 'Provided split size larger than number of variants. Using default value.') + return max( int(num_variants / 10), 1) + else: + return size + + +def main(): + parser = argparse.ArgumentParser("Split vcf file into multiple files.") + parser.add_argument('-i', '--input', metavar='FILE', + type=str, help='Input VCF file containing variants.', ) + parser.add_argument('-s', '--size', metavar='N', type=int, required=False, + help='Number of variants that should be written into one file. Default: number of variants / 10') + parser.add_argument('-d', '--distance', metavar='N', type=int, default=110000, + help='Number of nucleotides between previous and current variant. 
Default: 110000') + parser.add_argument('-o', '--output', metavar='N', + help='Output directory') + + args = parser.parse_args() + + split_size = determine_split_size(args.input, args.size) + file_name = args.input.split('.')[0] + var_group_count = 0 + file_count = 1 + metadata = '' + var_group = '' + + with open(args.input, 'r') as input_file: + vcf_file = csv.reader(input_file, delimiter="\t") + + for line in vcf_file: + if line[0].startswith('#'): + metadata += ('\t').join(line) + metadata += '\n' + else: + var_group += ('\t').join(line) + var_group += '\n' + # Get chromosome and positional information of the transcript + transcript = (line[0], int(line[1])) + # Check if the current number of saved variants is lower the number of desired variants per split + if var_group_count < split_size: + var_group_count += 1 + # Check if previous variant might be in the same transcript by checking chr and pos information + elif transcript[0] == previous_transcript[0] and transcript[1] < previous_transcript[1] + args.distance: + var_group_count += 1 + # write split to new VCF file + else: + with open(os.path.join(args.output, f'{file_name}_chunk_{file_count}.vcf'), 'w') as output_file: + output_file.write(metadata + var_group) + var_group = "" + var_group_count = 0 + file_count += 1 + previous_transcript = transcript + if var_group: + with open(os.path.join(args.output, f'{file_name}_chunk_{file_count}.vcf'), 'w') as output_file: + output_file.write(metadata + var_group) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index 018d0a1..82472d4 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,8 +1,8 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/epitopeprediction Nextflow base config file -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. diff --git a/conf/igenomes.config b/conf/igenomes.config index 855948d..7a1b3ac 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -1,7 +1,7 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for iGenomes paths -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines reference genomes using iGenome paths. 
Can be used by any config that customises the base path using: $params.igenomes_base / --igenomes_base @@ -13,7 +13,7 @@ params { genomes { 'GRCh37' { fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" @@ -26,7 +26,7 @@ params { } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" @@ -38,7 +38,7 @@ params { } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" @@ -51,7 +51,7 @@ params { } 'TAIR10' { fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" @@ -62,7 +62,7 @@ params { } 'EB2' { fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" @@ -72,7 +72,7 @@ params { } 'UMD3.1' { fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" bismark = 
"${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" @@ -83,7 +83,7 @@ params { } 'WBcel235' { fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" @@ -94,7 +94,7 @@ params { } 'CanFam3.1' { fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" @@ -105,7 +105,7 @@ params { } 'GRCz10' { fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" @@ -115,7 +115,7 @@ params { } 'BDGP6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" @@ -126,7 +126,7 @@ params { } 'EquCab2' { fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" @@ -137,7 +137,7 @@ params { } 'EB1' { fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" bowtie2 = 
"${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" @@ -147,7 +147,7 @@ params { } 'Galgal4' { fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" @@ -157,7 +157,7 @@ params { } 'Gm01' { fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" @@ -167,7 +167,7 @@ params { } 'Mmul_1' { fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" @@ -178,7 +178,7 @@ params { } 'IRGSP-1.0' { fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" @@ -188,7 +188,7 @@ params { } 'CHIMP2.1.4' { fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" @@ -199,7 +199,7 @@ params { } 'Rnor_5.0' { fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/genome.fa" + bwa 
= "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" @@ -209,7 +209,7 @@ params { } 'Rnor_6.0' { fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" @@ -219,7 +219,7 @@ params { } 'R64-1-1' { fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" @@ -230,7 +230,7 @@ params { } 'EF2' { fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" @@ -242,7 +242,7 @@ params { } 'Sbi1' { fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" @@ -252,7 +252,7 @@ params { } 'Sscrofa10.2' { fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" @@ -263,7 +263,7 @@ params { } 'AGPv3' { fasta = 
"${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" @@ -273,7 +273,7 @@ params { } 'hg38' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" @@ -285,7 +285,7 @@ params { } 'hg19' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" @@ -298,7 +298,7 @@ params { } 'mm10' { fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" @@ -311,7 +311,7 @@ params { } 'bosTau8' { fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" @@ -321,7 +321,7 @@ params { } 'ce10' { fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" @@ -333,7 +333,7 @@ params { } 'canFam3' { fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/genome.fa" + bwa = 
"${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" @@ -344,7 +344,7 @@ params { } 'danRer10' { fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" @@ -355,7 +355,7 @@ params { } 'dm6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" @@ -366,7 +366,7 @@ params { } 'equCab2' { fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" @@ -377,7 +377,7 @@ params { } 'galGal4' { fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" @@ -388,7 +388,7 @@ params { } 'panTro4' { fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" @@ -399,7 +399,7 @@ params { } 'rn6' { fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/genome.fa" + bwa = 
"${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" @@ -409,7 +409,7 @@ params { } 'sacCer3' { fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" @@ -419,7 +419,7 @@ params { } 'susScr3' { fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/genome.fa" + bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" diff --git a/conf/modules.config b/conf/modules.config index bbcafc6..bba5fea 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,12 +1,12 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Config file for defining DSL2 per module options and publishing paths -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. ---------------------------------------------------------------------------------------- */ @@ -14,7 +14,7 @@ process { publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: 'copy', + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -25,33 +25,41 @@ process { withName: SAMPLESHEET_CHECK { publishDir = [ path: { "${params.outdir}/pipeline_info" }, - mode: 'copy', + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, pattern: '*_versions.yml' ] } - withName: CHECK_REQUESTED_MODELS { + withName: EPYTOPE_CHECK_REQUESTED_MODELS { publishDir = [ path: { "${params.outdir}/reports" }, + mode: params.publish_dir_mode ] - ext.args = "--tools ${params.tools} --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} " + ext.args = "--tools ${params.tools} " } - withName: CHECK_REQUESTED_MODELS_PEP { + withName: EPYTOPE_CHECK_REQUESTED_MODELS_PEP { publishDir = [ path: { "${params.outdir}/reports" }, + mode: params.publish_dir_mode ] - ext.args = "--tools ${params.tools} --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} --peptides " + ext.args = "--tools ${params.tools} --peptides " } - withName: FRED2_GENERATEPEPTIDES { - ext.args = "--max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} " + withName: EPYTOPE_GENERATE_PEPTIDES { + publishDir = [ + path: { "${params.outdir}/generated_peptides/${meta.sample}" }, + mode: params.publish_dir_mode + ] + ext.args = '' } withName: SPLIT_PEPTIDES { @@ -62,28 +70,31 @@ process { ext.args = "--min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks} " } - withName: PEPTIDE_PREDICTION_PROTEIN { + withName: EPYTOPE_PEPTIDE_PREDICTION_PROTEIN { publishDir = [ - path: { "${params.outdir}/predictions" }, + path: { "${params.outdir}/split_predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] // Argument list needs to end with --peptides - ext.args = "--tools ${params.tools} --mhcclass ${params.mhc_class} --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} --reference ${params.genome_version} --peptides" + ext.args = "--reference ${params.genome_version} --peptides" } - withName: PEPTIDE_PREDICTION_PEP { + withName: EPYTOPE_PEPTIDE_PREDICTION_PEP { publishDir = [ - path: { "${params.outdir}/predictions" }, + path: { "${params.outdir}/split_predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] // Argument list needs to end with --peptides - ext.args = "--tools ${params.tools} --mhcclass ${params.mhc_class} --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} --reference ${params.genome_version} --peptides" + ext.args = "--reference ${params.genome_version} --peptides" } - withName: PEPTIDE_PREDICTION_VAR { + withName: EPYTOPE_PEPTIDE_PREDICTION_VAR { publishDir = [ - path: { "${params.outdir}/predictions" }, + path: { "${params.outdir}/split_predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] // Argument list needs to end with --somatic_mutation - ext.args = "--tools ${params.tools} --mhcclass ${params.mhc_class} --max_length ${params.max_peptide_length} --min_length ${params.min_peptide_length} --reference ${params.genome_version} --somatic_mutation" + ext.args = "--reference ${params.genome_version} --somatic_mutation" } withName: MERGE_JSON_SINGLE { @@ -96,55 +107,77 @@ process { withName: CAT_TSV { publishDir = [ - path: { "${params.outdir}/merged_predictions" }, + path: { "${params.outdir}/predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: CAT_FASTA { publishDir = [ - path: { "${params.outdir}/merged_predictions" }, + path: { "${params.outdir}/predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: CSVTK_CONCAT { publishDir = [ - path: { 
"${params.outdir}/merged_predictions" }, + path: { "${params.outdir}/predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: CSVTK_SPLIT { publishDir = [ - path: { "${params.outdir}/splitted" }, + path: { "${params.outdir}/split_input/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: GET_PREDICTION_VERSIONS { publishDir = [ path: { "${params.outdir}/reports" }, + mode: params.publish_dir_mode + ] + } + + withName: GUNZIP_VCF { + publishDir = [ + enabled: false ] } withName: MERGE_JSON { publishDir = [ - path: { "${params.outdir}/merged_predictions" }, + path: { "${params.outdir}/predictions/${meta.sample}" }, + mode: params.publish_dir_mode ] } - withName: SHOW_SUPPORTED_MODELS { + withName: EPYTOPE_SHOW_SUPPORTED_MODELS { publishDir = [ path: { "${params.outdir}/supported_models" }, + mode: params.publish_dir_mode + ] + } + + withName: VARIANT_SPLIT { + publishDir = [ + path: { "${params.outdir}/split_input/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: SNPSIFT_SPLIT { publishDir = [ - path: { "${params.outdir}/splitted" }, + path: { "${params.outdir}/split_input/${meta.sample}" }, + mode: params.publish_dir_mode ] } withName: SPLIT_PEPTIDES { publishDir = [ - path: { "${params.outdir}/splitted" }, + path: { "${params.outdir}/split_input/${meta.sample}" }, + mode: params.publish_dir_mode ] } } diff --git a/conf/test.config b/conf/test.config index d42bf6f..681012d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,11 +1,11 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running minimal tests -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/epitopeprediction -profile test, + nextflow run nf-core/epitopeprediction -profile test, --outdir ---------------------------------------------------------------------------------------- */ diff --git a/conf/test_full.config b/conf/test_full.config index f54a9af..f1fa9ef 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,11 +1,11 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. 
Use as follows: - nextflow run nf-core/epitopeprediction -profile test_full, + nextflow run nf-core/epitopeprediction -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ @@ -17,4 +17,5 @@ params { // Input data for full size test input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_full_test.csv' schema_ignore_params = 'genomes,input_paths' + tools = 'syfpeithi,mhcflurry,mhcnuggets-class-1,mhcnuggets-class-2' } diff --git a/conf/test_grch38_variant_tsv.config b/conf/test_grch38_variant_tsv.config new file mode 100644 index 0000000..61d804a --- /dev/null +++ b/conf/test_grch38_variant_tsv.config @@ -0,0 +1,18 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/epitopeprediction -profile test_grch38_variant_tsv, --outdir + */ + +params { + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv' + genome_version = 'GRCh38' +} diff --git a/conf/test_mhcflurry.config b/conf/test_mhcflurry.config index c591756..7558693 100644 --- a/conf/test_mhcflurry.config +++ b/conf/test_mhcflurry.config @@ -4,18 +4,18 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_mhcflurry, + * nextflow run nf-core/epitopeprediction -profile test_mhcflurry, --outdir */ params { - config_profile_name = 'Test MHCflurry profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + config_profile_name = 'Test MHCflurry profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv' - tools = 'mhcflurry' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv' + tools = 'mhcflurry' } diff --git a/conf/test_mhcnuggets.config b/conf/test_mhcnuggets.config index 18fa1c8..90774b5 100644 --- a/conf/test_mhcnuggets.config +++ b/conf/test_mhcnuggets.config @@ -4,18 +4,18 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_mhcnuggets, + * nextflow run nf-core/epitopeprediction -profile test_mhcnuggets, --outdir */ params { - config_profile_name = 'Test MHCnuggets profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + config_profile_name = 'Test MHCnuggets profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv' - tools = 'mhcnuggets-class-1' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_class1_and_2.csv' + tools = 'mhcnuggets-class-1,mhcnuggets-class-2' } diff --git a/conf/test_netmhc.config b/conf/test_netmhc.config index 7ddb759..1ad878a 100644 --- a/conf/test_netmhc.config +++ b/conf/test_netmhc.config @@ -4,20 +4,20 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_netmhc, + * nextflow run nf-core/epitopeprediction -profile test_netmhc, --outdir */ params { - config_profile_name = 'NetMHC Test Profile' - config_profile_description = 'Peptide list based test profile for NetMHC' + config_profile_name = 'NetMHC Test Profile' + config_profile_description = 'Peptide list based test profile for NetMHC' - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' - tools = 'netmhc' - netmhc_path = './non-free/netmhc.tar.gz' + tools = 'netmhc-4.0' + netmhc_path = './non-free/netmhc.tar.gz' } diff --git a/conf/test_netmhcii.config b/conf/test_netmhcii.config index 02cf0d6..2cc2804 100644 --- a/conf/test_netmhcii.config +++ b/conf/test_netmhcii.config @@ -4,20 +4,20 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_netmhcii, + * nextflow run nf-core/epitopeprediction -profile test_netmhcii, --outdir */ params { - config_profile_name = 'NetMHCII Test Profile' - config_profile_description = 'Peptide list based test profile for NetMHCII' + config_profile_name = 'NetMHCII Test Profile' + config_profile_description = 'Peptide list based test profile for NetMHCII' - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_class2.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_class2.csv' - tools = 'netmhcii' - netmhcii_path = './non-free/netmhcii.tar.Z' + tools = 'netmhcii-2.2' + netmhcii_path = './non-free/netmhcii.tar.Z' } diff --git a/conf/test_netmhciipan.config b/conf/test_netmhciipan.config index 330b90a..07b2781 100644 --- a/conf/test_netmhciipan.config +++ b/conf/test_netmhciipan.config @@ -4,20 +4,20 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_netmhciipan, + * nextflow run nf-core/epitopeprediction -profile test_netmhciipan, --outdir */ params { - config_profile_name = 'NetMHCIIpan Test Profile' - config_profile_description = 'Peptide list based test profile for NetMHCIIpan' + config_profile_name = 'NetMHCIIpan Test Profile' + config_profile_description = 'Peptide list based test profile for NetMHCIIpan' - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_class2.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_class2.csv' - tools = 'netmhciipan' - netmhciipan_path = './non-free/netmhciipan.tar.gz' + tools = 'netmhciipan-4.1' + netmhciipan_path = './non-free/netmhciipan.tar.gz' } diff --git a/conf/test_netmhcpan.config b/conf/test_netmhcpan.config index e9681d7..2e9fb4f 100644 --- a/conf/test_netmhcpan.config +++ b/conf/test_netmhcpan.config @@ -4,20 +4,20 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_netmhcpan, + * nextflow run nf-core/epitopeprediction -profile test_netmhcpan, --outdir */ params { - config_profile_name = 'NetMHCpan Test Profile' - config_profile_description = 'Peptide list based test profile for NetMHCpan' + config_profile_name = 'NetMHCpan Test Profile' + config_profile_description = 'Peptide list based test profile for NetMHCpan' - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' - tools = 'netmhcpan' - netmhcpan_path = './non-free/netmhcpan.tar.gz' + tools = 'netmhcpan-4.0' + netmhcpan_path = './non-free/netmhcpan.tar.gz' } diff --git a/conf/test_peptides.config b/conf/test_peptides.config index 35dc0a3..74c98f3 100644 --- a/conf/test_peptides.config +++ b/conf/test_peptides.config @@ -4,14 +4,14 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_peptides, + * nextflow run nf-core/epitopeprediction -profile test_peptides, --outdir */ params { - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv' } diff --git a/conf/test_peptides_h2.config b/conf/test_peptides_h2.config index a7ea63e..26a4e00 100644 --- a/conf/test_peptides_h2.config +++ b/conf/test_peptides_h2.config @@ -4,7 +4,7 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_peptides_h2, + * nextflow run nf-core/epitopeprediction -profile test_peptides_h2, --outdir */ params { diff --git a/conf/test_proteins.config b/conf/test_proteins.config index b0e7b70..7515999 100644 --- a/conf/test_proteins.config +++ b/conf/test_proteins.config @@ -4,14 +4,14 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_proteins, + * nextflow run nf-core/epitopeprediction -profile test_proteins, --outdir */ params { - max_cpus = 2 - max_memory = 6.GB - max_time = 4.h + max_cpus = 2 + max_memory = 6.GB + max_time = 4.h - // Input data - input = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/sample_sheets/sample_sheet_proteins.csv' + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/epitopeprediction/testdata/sample_sheets/sample_sheet_proteins.csv' } diff --git a/conf/test_variant_tsv.config b/conf/test_variant_tsv.config index 0c9ad26..f54d1bc 100644 --- a/conf/test_variant_tsv.config +++ b/conf/test_variant_tsv.config @@ -4,14 +4,14 @@ * ------------------------------------------------- * Defines bundled input files and everything required * to run a fast and simple test. Use as follows: - * nextflow run nf-core/epitopeprediction -profile test_variant_tsv, + * nextflow run nf-core/epitopeprediction -profile test_variant_tsv, --outdir */ params { - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h - // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv' } diff --git a/docs/README.md b/docs/README.md index de2bf68..6f87a50 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,9 +2,9 @@ The nf-core/epitopeprediction documentation is split into the following pages: -* [Usage](usage.md) - * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. -* [Output](output.md) - * An overview of the different results produced by the pipeline and how to interpret them. +- [Usage](usage.md) + - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. +- [Output](output.md) + - An overview of the different results produced by the pipeline and how to interpret them. You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/docs/output.md b/docs/output.md index c4eb938..ebb2f96 100644 --- a/docs/output.md +++ b/docs/output.md @@ -14,9 +14,9 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [Epitope Prediction](#epitope-prediction) - Predict MHC-binding peptides -* [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline -* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [Epitope Prediction](#epitope-prediction) - Predict MHC-binding peptides +- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ## Epitope Prediction @@ -24,10 +24,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d **Output directory: `merged_predictions/`** -* `[input_base_name]_prediction_report.json` - * The statistics of the performed prediction in JSON format. 
-* `[input_base_name]_prediction_result.tsv` - * The predicted epitopes in TSV format for further processing. +- `[input_base_name]_prediction_report.json` + - The statistics of the performed prediction in JSON format. +- `[input_base_name]_prediction_result.tsv` + - The predicted epitopes in TSV format for further processing. Partial results, e.g. predictions per chromosome or of individual peptide chunks can be found in `predictions/`. @@ -44,48 +44,59 @@ HVYLFLSNL 9 17 3336962 ENSG00000127780 ENST00000248384 ENSP00000248384 SNP syfpe An example prediction report looks like this in JSON format: ```json -{"prediction_methods": "syfpeithi-1.0", "number_of_unique_peptides_after_filtering": 199, "number_of_nonbinders": 196, "number_of_variants": 0, "number_of_binders": 3, "number_of_unique_peptides": 199, "number_of_unique_binders": 3, "number_of_unique_nonbinders": 196, "number_of_predictions": 199} +{ + "prediction_methods": "syfpeithi-1.0", + "number_of_unique_peptides_after_filtering": 199, + "number_of_nonbinders": 196, + "number_of_variants": 0, + "number_of_binders": 3, + "number_of_unique_peptides": 199, + "number_of_unique_binders": 3, + "number_of_unique_nonbinders": 196, + "number_of_predictions": 199 +} ``` The prediction results are given as allele-specific score and affinity values per peptide. The computation of these values depends on the applied prediction method: -* [`Syfpeithi`](http://www.syfpeithi.de) : - * **Affinity**: Calculated based on the score as the percentage of the maximum value of the corresponding matrix: `score(peptide) divided by the maximum score of the allele/length-specific matrix * 100`. - * **Score**: Sum of the values given by the allele-specific position-specific scoring matrix (PSSM) for the respective peptide sequence. -Peptides are considered binders if the affinity is higher than 50. -* [`MHCflurry`](https://github.com/openvax/mhcflurry), [`MHCnuggets`](https://github.com/KarchinLab/mhcnuggets) and [`NetMHC` tool family](https://services.healthtech.dtu.dk/): - * **Affinity**: Predicted IC50 (threshold for binders: `<500 nmol/L`). - * **Score**: The provided score is calculated from the log-transformed predicted binding affinity and scaled to an interval of 0 to 1: `1-log50000(aff)`. +- [`Syfpeithi`](http://www.syfpeithi.de) : + - **Affinity**: Calculated based on the score as the percentage of the maximum value of the corresponding matrix: `score(peptide) divided by the maximum score of the allele/length-specific matrix * 100`. + - **Score**: Sum of the values given by the allele-specific position-specific scoring matrix (PSSM) for the respective peptide sequence. + Peptides are considered binders if the affinity is higher than 50. +- [`MHCflurry`](https://github.com/openvax/mhcflurry), [`MHCnuggets`](https://github.com/KarchinLab/mhcnuggets) and [`NetMHC` tool family](https://services.healthtech.dtu.dk/): + - **Affinity**: Predicted IC50 (threshold for binders: `<500 nmol/L`). + - **Score**: The provided score is calculated from the log-transformed predicted binding affinity and scaled to an interval of 0 to 1: `1-log50000(aff)`. When the parameter `--fasta_output` is specified, a `FASTA` file will be generated containing the protein sequences that are affected by the provided genomic variants. The resulting `FASTA` file will contain the wild-type and mutated protein sequences. **Output directory: `merged_predictions/`** -* `[input_base_name]_prediction.fasta` - * The sequences of proteins, affected by provided variants, in FASTA format. 
+- `[input_base_name]_prediction.fasta` + - The sequences of proteins, affected by provided variants, in FASTA format. ### Supported models -When running the pipeline using the `--show_supported_models` parameter, information about supported models for the available predictor tool versions will be written to the results folder. +When running the pipeline using the `--show_supported_models` parameter, information about supported models for the available predictor tool versions will be written to the results folder. **Output directory: `supported_models/`** -* `[tool].[version].supported_alleles.txt` - * A list of all supported alleles by the corresponding predictor method. -* `[tool].[version].supported_lengths.txt` - * A list of all supported peptide lengths by the corresponding predictor method. +- `[tool].[version].supported_alleles.txt` + - A list of all supported alleles by the corresponding predictor method. +- `[tool].[version].supported_lengths.txt` -* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + - A list of all supported peptide lengths by the corresponding predictor method. + +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### Pipeline information
Output files -* `pipeline_info/` - * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - * Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. +- `pipeline_info/` + - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline. + - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
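(Editor's aside on the scoring formula described in the `docs/output.md` changes above.) The score reported for MHCflurry, MHCnuggets and the NetMHC tool family is documented as `1-log50000(aff)`, i.e. a log-transform of the predicted IC50 scaled to the interval 0–1. The snippet below is a minimal illustrative sketch of that relationship, not code from the pipeline; the 50000 nM scaling constant and the 500 nmol/L binder threshold come from the documentation text, while the function name and the clamping of extreme IC50 values are assumptions made here for illustration.

```python
import math

def affinity_to_score(ic50_nm: float) -> float:
    """Scale a predicted IC50 (in nM) to [0, 1] via 1 - log50000(ic50)."""
    # Clamp to [1, 50000] nM so the transform stays within [0, 1]
    # (illustrative assumption, not taken from the pipeline code).
    ic50_nm = min(max(ic50_nm, 1.0), 50000.0)
    return 1.0 - math.log(ic50_nm, 50000)

# Per docs/output.md, peptides with IC50 < 500 nmol/L are considered binders;
# that threshold corresponds to a score of roughly 0.43.
print(round(affinity_to_score(500.0), 3))  # ~0.426
print(round(affinity_to_score(50.0), 3))   # ~0.638
```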
diff --git a/docs/usage.md b/docs/usage.md index 93fe48e..86fe53d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,17 +14,45 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` +### Input Formats + +The pipeline currently accepts three different types of input that are genomic variants, peptides and proteins. The supported file formats for genomic variants are `.vcf`, `.vcf.gz` and `tsv`. + +:warning: Please note that genomic variants have to be annotated. Currently, we support variants that have been annotated using [SnpEff](http://pcingola.github.io/SnpEff/). Support for [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html) will be available with one of the upcoming versions. + +#### Genomic variants + +`tsv` files with genomic variants have to provide the following columns: + +```console +start, end, #chr, ref, obs, gene, tumour_genotype, coding_and_splicing_details, variant_details, variant_type, coding_and_splicing +... +chr1 12954870 12954870 C T . 0 NORMAL:414,TUMOR:8 . missense_variant 0.5 transcript PRAMEF10 missense_variant PRAMEF10:ENST00000235347:missense_variant:MODERATE:exon3:c.413G>A:p.Cys138Tyr +... + +``` + +#### Peptide sequences + +Peptide sequences have to be provided in `tsv` format with two mandatory columns `id` and `sequence`. Additional columns will be added as metadata to results. + +#### Protein sequences + +Protein input is supported in `FASTA` format. + ### Multiple runs of the same sample The `sample` identifiers are used to determine which sample belongs to the input file. Below is an example for the same sample with different input files that can be used: ```console -sample,alleles,filename -GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_variants.vcf(.gz) -GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_peptides.tsv -GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_proteins.fasta +sample,alleles,mhc_class,filename +GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_variants.vcf(.gz) +GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_peptides.tsv +GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_proteins.fasta ``` +You can also perform predictions for multiple MHC classes (`I`, `II` and `H-2`) in the same run by specifying the value in the corresponding column (one value per row). Please make sure to select the alleles accordingly. + ### Full samplesheet The pipeline accepts allele information in a file or as string in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. @@ -32,18 +60,19 @@ The pipeline accepts allele information in a file or as string in the sampleshee A final samplesheet file consisting of both allele data and different input types of two samples may look something like the one below. 
```console -sample,alleles,filename -GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_variants.vcf -GBM_1,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,gbm_1_peptides.vcf -GBM_1,A*01:01;A*24:01;B*07:02;B*08:01;C*03:01;C*07:01,gbm_1_proteins.vcf -GBM_2,alleles.txt,gbm_2_variants.vcf +sample,alleles,mhc_class,filename +GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,I,gbm_1_variants.vcf +GBM_1,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,I,gbm_1_peptides.vcf +GBM_1,A*01:01;A*24:01;B*07:02;B*08:01;C*03:01;C*07:01,I,gbm_1_proteins.vcf +GBM_2,alleles.txt,I,gbm_2_variants.vcf ``` -| Column | Description | -|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `alleles` | A string that consists of the patient's alleles (separated by ";"), or a full path to a allele ".txt" file where each allele is saved on a row. | -| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar"). | +| Column | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `alleles` | A string that consists of the patient's alleles (separated by ";"), or a full path to an allele ".txt" file where each allele is saved on a row. | +| `mhc_class` | Specifies the MHC class for which the prediction should be performed. Valid values are: `I`, `II` and `H-2` (mouse). | +| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar"). | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -52,7 +81,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p  The typical command for running the pipeline is as follows: ```console -nextflow run nf-core/epitopeprediction --input samplesheet.csv -profile docker +nextflow run nf-core/epitopeprediction --input samplesheet.csv -profile docker --outdir <OUTDIR> ``` This will launch the pipeline with the `docker` configuration profile and default options (`syfpeithi` by default). See below for more information about profiles. @@ -60,12 +89,25 @@ This will launch the pipeline with the `docker` configuration profile and defaul  Note that the pipeline will create the following files in your working directory: ```console -work # Directory containing the nextflow working files -results # Finished results (configurable, see below) -.nextflow_log # Log file from Nextflow +work # Directory containing the nextflow working files +<OUTDIR> # Finished results in specified location (defined with --outdir) +.nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +### Running the pipeline with external prediction tools + +The pipeline can be used with external prediction tools that cannot be provided with the pipeline due to license restrictions.
+ +Currently, we support prediction tools of the `netMHC` family. Please refer to the [parameter docs](https://nf-co.re/epitopeprediction/latest/parameters#tools) for the list of supported tools. If one of the external tools is specified, the path to the corresponding tarball has to be provided. +When using `conda`, the parameter `--netmhc_system` (if the default value `linux` is not applicable) must also be specified. + +A typical command is as follows: + +```console +nextflow run nf-core/epitopeprediction --input samplesheet.csv -profile docker --tools netmhcpan-4.1 --netmhcpan_path /path/to/netMHCpan-4.1.Linux.tar.gz --outdir <OUTDIR> +``` + ### Updating the pipeline  When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. After this, it will use the cached version if available - even if the pipeline has been updated since. To ensure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -101,25 +143,25 @@ They are loaded in sequence, so later profiles can overwrite earlier profiles.  If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.  -* `docker` - * A generic configuration profile to be used with [Docker](https://docker.com/) -* `singularity` - * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) -* `podman` - * A generic configuration profile to be used with [Podman](https://podman.io/) -* `shifter` - * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) -* `charliecloud` - * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -* `conda` - * A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. -* `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters +- `docker` + - A generic configuration profile to be used with [Docker](https://docker.com/) +- `singularity` + - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) +- `podman` + - A generic configuration profile to be used with [Podman](https://podman.io/) +- `shifter` + - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) +- `charliecloud` + - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `conda` + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. +- `test` + - A profile with a complete configuration for automated testing + - Includes links to test data so needs no other parameters ### `-resume`  -Specify this parameter when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from the previous run. +Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously.
For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html). You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. @@ -136,11 +178,11 @@ Whilst the default requirements, set within the pipeline, will hopefully work fo For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: ```console -[62/149eb0] NOTE: Process `RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' +[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) +Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' Caused by: - Process `RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) + Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) Command executed: STAR \ @@ -164,17 +206,25 @@ Work dir: Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` ``` -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so based on the search results the file we want is `modules/nf-core/software/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. Providing you haven't set any other standard nf-core parameters to __cap__ the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. +To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). 
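With a local clone of the pipeline, the same lookup can also be done from the command line; a rough sketch (assuming nf-core/rnaseq has been cloned into the current directory):

```console
grep -rn "process STAR_ALIGN" modules/
```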
+We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/software/star/align/main.nf`. +If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). +The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. +The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. +Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. +The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. ```nextflow process { - withName: STAR_ALIGN { + withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { memory = 100.GB } } ``` -> **NB:** We specify just the process name i.e. `STAR_ALIGN` in the config file and not the full task name string that is printed to screen in the error message or on the terminal whilst the pipeline is running i.e. `RNASEQ:ALIGN_STAR:STAR_ALIGN`. You may get a warning suggesting that the process selector isn't recognized but you can ignore that if the process name has been specified correctly. This is something that needs to be fixed upstream in core Nextflow. +> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. +> +> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. ### Updating containers @@ -184,35 +234,35 @@ The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementatio 2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) 3. 
Create the custom config accordingly: - * For Docker: - - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` - - * For Singularity: - - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` - - * For Conda: - - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` + - For Docker: + + ```nextflow + process { + withName: PANGOLIN { + container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + + - For Singularity: + + ```nextflow + process { + withName: PANGOLIN { + container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + + - For Conda: + + ```nextflow + process { + withName: PANGOLIN { + conda = 'bioconda::pangolin=3.0.5' + } + } + ``` > **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin), generated by the pipeline, then you must ensure to keep the `work/` directory. Otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 40ab65f..b3d092f 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -27,7 +27,7 @@ class NfcoreSchema { /* groovylint-disable-next-line UnusedPrivateMethodParameter */ public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { def has_error = false - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Check for nextflow core params and unexpected params def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') @@ -135,7 +135,7 @@ class NfcoreSchema { } } - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Validate parameters against the schema InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 1b88aec..28567bd 100755 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -29,12 +29,12 @@ class Utils { conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) if (conda_check_failed) { - log.warn "=============================================================================\n" + + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + " NB: The order of the channels matters!\n" + - "===================================================================================" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } } diff --git a/main.nf b/main.nf index 6a01b07..deab6f3 100644 --- a/main.nf +++ b/main.nf @@ -1,8 +1,8 @@ #!/usr/bin/env nextflow /* -======================================================================================== 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/epitopeprediction -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/epitopeprediction Website: https://nf-co.re/epitopeprediction Slack : https://nfcore.slack.com/channels/epitopeprediction @@ -12,17 +12,17 @@ nextflow.enable.dsl = 2 /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ WorkflowMain.initialise(workflow, params, log) /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NAMED WORKFLOW FOR PIPELINE -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ include { EPITOPEPREDICTION } from './workflows/epitopeprediction' @@ -35,9 +35,9 @@ workflow NFCORE_EPITOPEPREDICTION { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN ALL WORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -49,7 +49,7 @@ workflow { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/modules.json b/modules.json index 3b7e20c..881f4ad 100644 --- a/modules.json +++ b/modules.json @@ -4,11 +4,14 @@ "repos": { "nf-core/modules": { "custom/dumpsoftwareversions": { - "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" + "git_sha": "682f789f93070bd047868300dd018faf3d434e7c" + }, + "gunzip": { + "git_sha": "fa37e0662690c4ec4260dae282fbce08777503e6" }, "multiqc": { - "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" + "git_sha": "5138acca0985ca01c38a1c4fba917d83772b1106" } } } -} \ No newline at end of file +} diff --git a/modules/local/cat_files.nf b/modules/local/cat_files.nf index fb52a97..b002bfa 100644 --- a/modules/local/cat_files.nf +++ b/modules/local/cat_files.nf @@ -1,7 +1,7 @@ process CAT_FILES { label 'process_low' - conda (params.enable_conda ? "conda-forge::cat=5.2.3" : null) + conda (params.enable_conda ? "conda-forge:sed=4.8" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : 'quay.io/biocontainers/cat:5.2.3--hdfd78af_1' }" @@ -10,13 +10,15 @@ process CAT_FILES { tuple val(meta), path(input) output: - tuple val(meta), path("*_prediction.*"), emit: output + tuple val(meta), path("*_prediction*"), emit: output script: def fileExt = input[0].name.tokenize("\\.")[-1] - prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}" + def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}" + def type = fileExt == "tsv" ? "prediction_result" : "prediction_proteins" + """ - cat $input > ${prefix}_prediction.${fileExt} + cat $input > ${prefix}_${type}.${fileExt} """ } diff --git a/modules/local/csvtk_concat.nf b/modules/local/csvtk_concat.nf index 2abaaeb..2134337 100644 --- a/modules/local/csvtk_concat.nf +++ b/modules/local/csvtk_concat.nf @@ -15,7 +15,8 @@ process CSVTK_CONCAT { script: """ - csvtk concat -t $predicted > ${meta.sample}_prediction_result.tsv + csvtk concat -t $predicted > ${meta.sample}_prediction_result_unsorted.tmp + csvtk sort -k chr:n,length:n ${meta.sample}_prediction_result_unsorted.tmp -t --out-file ${meta.sample}_prediction_result.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/csvtk_split.nf b/modules/local/csvtk_split.nf index 2091b24..0d118ee 100644 --- a/modules/local/csvtk_split.nf +++ b/modules/local/csvtk_split.nf @@ -1,7 +1,7 @@ process CSVTK_SPLIT { label 'process_low' - conda (params.enable_conda ? "bioconda::csvtk=0.23.0" : null) + conda (params.enable_conda ? "conda-forge::sed=4.7 bioconda::csvtk=0.23.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : 'quay.io/biocontainers/csvtk:0.23.0--h9ee0642_0' }" diff --git a/modules/local/check_requested_models.nf b/modules/local/epytope_check_requested_models.nf similarity index 53% rename from modules/local/check_requested_models.nf rename to modules/local/epytope_check_requested_models.nf index cf6562c..5327cbe 100644 --- a/modules/local/check_requested_models.nf +++ b/modules/local/epytope_check_requested_models.nf @@ -1,13 +1,13 @@ -process CHECK_REQUESTED_MODELS { +process EPYTOPE_CHECK_REQUESTED_MODELS { label 'process_low' - conda (params.enable_conda ? "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" : null) + conda (params.enable_conda ? "bioconda::epytope=3.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'quay.io/biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" + 'https://depot.galaxyproject.org/singularity/epytope:3.1.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/epytope:3.1.0--pyh5e36f6f_0' }" input: - tuple val(alleles), path(input_file) + tuple val(meta), path(input_file) path(software_versions) output: @@ -15,7 +15,6 @@ process CHECK_REQUESTED_MODELS { path '*.log', emit: log // model_warnings.log path "versions.yml", emit: versions - script: def argument = task.ext.args @@ -27,17 +26,22 @@ process CHECK_REQUESTED_MODELS { argument += "--title \"$params.multiqc_title\"" } + def prefix = task.ext.suffix ? 
"${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" + def min_length = ("${meta.mhcclass}" == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = ("${meta.mhcclass}" == "I") ? params.max_peptide_length : params.max_peptide_length_class2 + """ check_requested_models.py ${argument} \ - --alleles '${alleles.join(';')}' \ - --mhcclass ${params.mhc_class} \ + --alleles '${meta.alleles}' \ + --max_length ${max_length} \ + --min_length ${min_length} \ --versions ${software_versions} > model_warnings.log cat <<-END_VERSIONS > versions.yml "${task.process}": mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) mhcnuggets: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('mhcnuggets').version)")) - fred2: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('Fred2').version)")) + epytope: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)")) END_VERSIONS """ } diff --git a/modules/local/fred2_generatepeptides.nf b/modules/local/epytope_generate_peptides.nf similarity index 50% rename from modules/local/fred2_generatepeptides.nf rename to modules/local/epytope_generate_peptides.nf index d9fb96c..eaaa997 100644 --- a/modules/local/fred2_generatepeptides.nf +++ b/modules/local/epytope_generate_peptides.nf @@ -1,11 +1,11 @@ -process FRED2_GENERATEPEPTIDES { +process EPYTOPE_GENERATE_PEPTIDES { label 'process_low' tag "${meta.sample}" - conda (params.enable_conda ? "conda-forge::fred2:2.0.7" : null) + conda (params.enable_conda ? "bioconda::epytope=3.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fred2:2.0.7--py_0' : - 'quay.io/biocontainers/fred2:2.0.7--py_0' }" + 'https://depot.galaxyproject.org/singularity/epytope:3.1.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/epytope:3.1.0--pyh5e36f6f_0' }" input: tuple val(meta), path(raw) @@ -16,15 +16,19 @@ process FRED2_GENERATEPEPTIDES { script: def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" + def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2 """ gen_peptides.py --input ${raw} \\ + --max_length ${max_length} \\ + --min_length ${min_length} \\ --output '${prefix}.tsv' \\ $task.ext.args cat <<-END_VERSIONS > versions.yml "${task.process}": - fred2: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('Fred2').version)") + epytope: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)") python: \$(python --version 2>&1 | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf new file mode 100644 index 0000000..b6a7eee --- /dev/null +++ b/modules/local/epytope_peptide_prediction.nf @@ -0,0 +1,90 @@ +process EPYTOPE_PEPTIDE_PREDICTION { + label 'process_low' + + conda (params.enable_conda ? "conda-forge::coreutils=9.1 conda-forge::tcsh=6.20.00 bioconda::epytope=3.1.0 conda-forge::gawk=5.1.0 conda-forge::perl=5.32.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-11bbf0d242ea96f7b9c08d5b5bc26f2cd5ac5943:3419f320edefe6077631798f50d7bd4f8dc4763f-0' : + 'quay.io/biocontainers/mulled-v2-11bbf0d242ea96f7b9c08d5b5bc26f2cd5ac5943:3419f320edefe6077631798f50d7bd4f8dc4763f-0' }" + + input: + tuple val(meta), path(splitted), path(software_versions) + val netmhc_paths + + output: + tuple val(meta), path("*.json"), emit: json + tuple val(meta), path("*.tsv"), emit: predicted optional true + tuple val(meta), path("*.fasta"), emit: fasta optional true + path "versions.yml", emit: versions + + script: + // Additions to the argument command need to go to the beginning. + // Argument list needs to end with --peptides or --somatic_mutation + def argument = task.ext.args + + if (params.proteome) { + argument = "--proteome ${params.proteome} " + argument + } + + if (params.wild_type) { + argument = "--wild_type " + argument + } + + if (params.fasta_output) { + argument = "--fasta_output " + argument + } + + if (params.tool_thresholds) { + argument = "--tool_thresholds ${params.tool_thresholds} " + argument + } + + if (params.use_affinity_thresholds) { + argument = "--use_affinity_thresholds " + argument + } + + def netmhc_paths_string = netmhc_paths.join(",") + def tools_split = params.tools.split(',') + def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') } + def class2_tools = tools_split.findAll { it.matches('.*(?i)(syf|class-2|ii).*') } + + if (((meta.mhcclass == "I") & class1_tools.empty) | ((meta.mhcclass == "II") & class2_tools.empty)) { + exit 1, "No tools specified for mhc class ${meta.mhcclass}" + } + + def min_length = (meta.mhcclass == "I") ? params.min_peptide_length : params.min_peptide_length_class2 + def max_length = (meta.mhcclass == "I") ? params.max_peptide_length : params.max_peptide_length_class2 + + def tools_to_use = ((meta.mhcclass == "I") | (meta.mhcclass == "H-2")) ? 
class1_tools.join(',') : class2_tools.join(',') + + """ + # create folder for MHCflurry downloads to avoid permission problems when running pipeline with docker profile and mhcflurry selected + mkdir -p mhcflurry-data + export MHCFLURRY_DATA_DIR=./mhcflurry-data + # specify MHCflurry release for which to download models, need to be updated here as well when MHCflurry will be updated + export MHCFLURRY_DOWNLOADS_CURRENT_RELEASE=1.4.0 + # Add non-free software to the PATH + shopt -s nullglob + IFS=',' read -r -a netmhc_paths_string <<< \"$netmhc_paths_string\" + for p in "\${netmhc_paths_string[@]}"; do + export PATH="\$(realpath -s "\$p"):\$PATH"; + done + shopt -u nullglob + + epaa.py --identifier ${splitted.baseName} \ + --alleles '${meta.alleles}' \ + --tools '${tools_to_use}' \ + --max_length ${max_length} \ + --min_length ${min_length} \ + --versions ${software_versions} \ + ${argument} ${splitted} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + epytope: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)") + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + pyvcf: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pyvcf').version)") + mhcflurry: \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') + mhcnuggets: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('mhcnuggets').version)") + END_VERSIONS + """ +} diff --git a/modules/local/show_supported_models.nf b/modules/local/epytope_show_supported_models.nf similarity index 56% rename from modules/local/show_supported_models.nf rename to modules/local/epytope_show_supported_models.nf index 34b876a..157758c 100644 --- a/modules/local/show_supported_models.nf +++ b/modules/local/epytope_show_supported_models.nf @@ -1,10 +1,10 @@ -process SHOW_SUPPORTED_MODELS { +process EPYTOPE_SHOW_SUPPORTED_MODELS { label 'process_low' - conda (params.enable_conda ? "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" : null) + conda (params.enable_conda ? "bioconda::epytope=3.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'quay.io/biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" + 'https://depot.galaxyproject.org/singularity/epytope:3.1.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/epytope:3.1.0--pyh5e36f6f_0' }" input: tuple val(meta), path(raw), path(software_versions) @@ -21,7 +21,7 @@ process SHOW_SUPPORTED_MODELS { "${task.process}": mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) mhcnuggets: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('mhcnuggets').version)")) - fred2: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('Fred2').version)")) + epytope: \$(echo \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)")) END_VERSIONS """ diff --git a/modules/local/external_tools_import.nf b/modules/local/external_tools_import.nf index a40f392..a764cbf 100644 --- a/modules/local/external_tools_import.nf +++ b/modules/local/external_tools_import.nf @@ -4,6 +4,11 @@ process EXTERNAL_TOOLS_IMPORT { label 'process_low' + conda (params.enable_conda ? "conda-forge::coreutils=9.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" + input: tuple val(toolname), val(toolversion), val(toolchecksum), path(tooltarball), file(datatarball), val(datachecksum), val(toolbinaryname) @@ -46,8 +51,13 @@ process EXTERNAL_TOOLS_IMPORT { sed -i.bak \ -e 's_bin/tcsh.*\$_usr/bin/env tcsh_' \ -e "s_/scratch_/tmp_" \ - -e "s_setenv[[:space:]]NMHOME.*_setenv NMHOME \\`realpath -s \\\$0 | sed -r 's/[^/]+\$//'\\`_" "${toolname}/${toolbinaryname}" + -e "s_setenv[[:space:]]NMHOME.*_setenv NMHOME \\`realpath -s \\\$0 | sed -r 's/[^/]+\$//'\\`_ " "${toolname}/${toolbinaryname}" + # MODIFY perl location in perl script for netmhcIIpan + if [ "$toolname" == "netmhciipan" ]; then + sed -i.bak \ + -e 's_bin/perl.*\$_local/bin/perl_' "${toolname}/NetMHCIIpan-${toolversion}.pl" + fi # # VALIDATE THE CHECKSUM OF THE DOWNLOADED MODEL DATA # @@ -66,5 +76,6 @@ process EXTERNAL_TOOLS_IMPORT { # CREATE VERSION FILE # echo "${toolname} ${toolversion}" > "v_${toolname}.txt" + """ } diff --git a/modules/local/get_prediction_versions.nf b/modules/local/get_prediction_versions.nf index e4653a8..7cf76c2 100644 --- a/modules/local/get_prediction_versions.nf +++ b/modules/local/get_prediction_versions.nf @@ -1,10 +1,10 @@ process GET_PREDICTION_VERSIONS { label 'process_low' - conda (params.enable_conda ? "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" : null) + conda (params.enable_conda ? "bioconda::epytope=3.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'quay.io/biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" + 'https://depot.galaxyproject.org/singularity/epytope:3.1.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/epytope:3.1.0--pyh5e36f6f_0' }" input: val external_tool_versions @@ -13,18 +13,20 @@ process GET_PREDICTION_VERSIONS { path "versions.csv", emit: versions script: - def external_tools = external_tool_versions.join('\n') + def external_tools = external_tool_versions.join(",") """ cat <<-END_VERSIONS > versions.csv mhcflurry: \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') mhcnuggets: \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' ) - fred2: \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//') + epytope: \$(python -c "import pkg_resources; print('epytope' + pkg_resources.get_distribution('epytope').version)" | sed 's/^epytope//; s/ .*\$//') END_VERSIONS - if ! [ -z "${external_tools}" ] - then - echo ${external_tools} >> versions.csv + IFS=',' read -r -a external_tools <<< \"$external_tools\" + if ! [ -z "${external_tool_versions}" ]; then + for TOOL in "\${external_tools[@]}"; do + echo "\$TOOL" >> versions.csv + done fi """ } diff --git a/modules/local/peptide_prediction.nf b/modules/local/peptide_prediction.nf deleted file mode 100644 index 92dd6a8..0000000 --- a/modules/local/peptide_prediction.nf +++ /dev/null @@ -1,55 +0,0 @@ -process PEPTIDE_PREDICTION { - label 'process_low' - - conda (params.enable_conda ? "bioconda::snpsift=4.3.1t bioconda::python=2.7.15 bioconda::pyvcf=0.6.8 conda-forge::pandas=0.24.2 bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'quay.io/biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(splitted), path(software_versions) - - output: - tuple val(meta), path("*.json"), emit: json - tuple val(meta), path("*.tsv"), emit: predicted - tuple val(meta), path("*.fasta"), emit: fasta optional true - path "versions.yml", emit: versions - - script: - // Additions to the argument command need to go to the beginning. 
- // Argument list needs to end with --peptides or --somatic_mutation - def argument = task.ext.args - - if (params.proteome) { - argument = "--proteome ${params.proteome} " + argument - } - - if (params.wild_type) { - argument = "--wild_type " + argument - } - - if (params.fasta_output) { - argument = "--fasta_output " + argument - } - - if (params.tool_thresholds) { - argument = "--tool_thresholds ${params.tool_thresholds} " + argument - } - - """ - epaa.py --identifier ${splitted.baseName} \ - --alleles '${meta.alleles}' \ - --versions ${software_versions} \ - ${argument} ${splitted} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - fred2: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('Fred2').version)") - pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") - pyvcf: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pyvcf').version)") - mhcflurry: \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') - mhcnuggets: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('mhcnuggets').version)") - END_VERSIONS - """ -} diff --git a/modules/local/snpsift_split.nf b/modules/local/snpsift_split.nf index 5c1db0e..8702345 100644 --- a/modules/local/snpsift_split.nf +++ b/modules/local/snpsift_split.nf @@ -1,7 +1,7 @@ process SNPSIFT_SPLIT { label 'process_low' - conda (params.enable_conda ? "conda-forge::snpsift:4.2" : null) + conda (params.enable_conda ? "bioconda::snpsift=4.2" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/snpsift:4.2--hdfd78af_5' : 'quay.io/biocontainers/snpsift:4.2--hdfd78af_5' }" @@ -18,8 +18,8 @@ process SNPSIFT_SPLIT { SnpSift split ${input_file} cat <<-END_VERSIONS > versions.yml - "${task.process}": - snpsift: \$(echo \$(snpsift -version 2>&1 | sed -n 3p | cut -d\$' ' -f3)) + "${task.process}": + snpsift: \$( echo \$(SnpSift split -h 2>&1) | sed 's/^.*version //' | sed 's/(.*//' | sed 's/t//g' ) END_VERSIONS """ } diff --git a/modules/local/variant_split.nf b/modules/local/variant_split.nf new file mode 100644 index 0000000..6074865 --- /dev/null +++ b/modules/local/variant_split.nf @@ -0,0 +1,30 @@ +process VARIANT_SPLIT { + label 'process_low' + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(input_file) + + output: + tuple val(meta), path("*.vcf"), emit: splitted + path "versions.yml", emit: versions + + script: + + def size_parameter = params.split_by_variants_size != 0 ? "--size ${params.split_by_variants_size}" : '' + def distance_parameter = params.split_by_variants_distance ? "--distance ${params.split_by_variants_distance}" : '' + + """ + split_vcf_by_variants.py --input ${input_file} ${size_parameter} ${distance_parameter} --output . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf index 934bb46..12293ef 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_low' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda (params.enable_conda ? "bioconda::multiqc=1.12" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" input: path versions @@ -15,6 +15,9 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { path "software_versions_mqc.yml", emit: mqc_yml path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' template 'dumpsoftwareversions.py' diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml index 5b5b8a6..60b546a 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml @@ -8,7 +8,7 @@ tools: description: Custom module used to dump software versions within the nf-core pipeline template homepage: https://github.com/nf-core/tools documentation: https://github.com/nf-core/tools - licence: ['MIT'] + licence: ["MIT"] input: - versions: type: file diff --git a/modules/nf-core/modules/gunzip/main.nf b/modules/nf-core/modules/gunzip/main.nf new file mode 100644 index 0000000..7036704 --- /dev/null +++ b/modules/nf-core/modules/gunzip/main.nf @@ -0,0 +1,44 @@ +process GUNZIP { + tag "$archive" + label 'process_low' + + conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + gunzip \\ + -f \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/gunzip/meta.yml b/modules/nf-core/modules/gunzip/meta.yml new file mode 100644 index 0000000..4d2ebc8 --- /dev/null +++ b/modules/nf-core/modules/gunzip/meta.yml @@ -0,0 +1,34 @@ +name: gunzip +description: Compresses and decompresses files. 
+keywords: + - gunzip + - compression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf index 3dceb16..1e7d6af 100644 --- a/modules/nf-core/modules/multiqc/main.nf +++ b/modules/nf-core/modules/multiqc/main.nf @@ -1,13 +1,14 @@ process MULTIQC { label 'process_medium' - conda (params.enable_conda ? 'bioconda::multiqc=1.11' : null) + conda (params.enable_conda ? 'bioconda::multiqc=1.13a' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : + 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" input: - path multiqc_files + path multiqc_files, stageAs: "?/*" + tuple path(multiqc_config), path(multiqc_logo) output: path "*multiqc_report.html", emit: report @@ -15,10 +16,30 @@ process MULTIQC { path "*_plots" , optional:true, emit: plots path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' + def config = multiqc_config ? "--config $multiqc_config" : '' + """ + multiqc \\ + --force \\ + $config \\ + $args \\ + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: """ - multiqc -f $args . + touch multiqc_data + touch multiqc_plots + touch multiqc_report.html cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/modules/multiqc/meta.yml index 63c75a4..bf3a27f 100644 --- a/modules/nf-core/modules/multiqc/meta.yml +++ b/modules/nf-core/modules/multiqc/meta.yml @@ -1,40 +1,48 @@ name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ['GPL-3.0-or-later'] + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. 
+ homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Config yml for MultiQC + pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Logo file for MultiQC + pattern: "*.{png}" output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: dir - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - report: + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + - data: + type: dir + description: MultiQC data dir + pattern: "multiqc_data" + - plots: + type: file + description: Plots created by MultiQC + pattern: "*_data" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" + - "@abhi18av" + - "@bunop" + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 974b282..0d812cb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,7 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/epitopeprediction Nextflow config file -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments ---------------------------------------------------------------------------------------- */ @@ -12,26 +12,35 @@ params { input = null peptides_split_maxchunks = 100 peptides_split_minchunksize = 5000 + split_by_variants = false + split_by_variants_size = 0 + split_by_variants_distance = 110000 // References genome_version = 'GRCh37' // Options: Predictions - mhc_class = 1 tools = 'syfpeithi' tool_thresholds = null + use_affinity_thresholds = false // Options: Filtering filter_self = false show_supported_models = false + + // External tools + external_tools_meta = "${projectDir}/assets/external_tools_meta.json" + netmhc_system = 'linux' netmhcpan_path = null netmhc_path = null netmhciipan_path = null netmhcii_path = null // Options: Peptides - max_peptide_length = (mhc_class == 1) ? 11 : 16 - min_peptide_length = (mhc_class == 1) ? 8 : 15 + max_peptide_length = 11 + min_peptide_length = 8 + max_peptide_length_class2 = 16 + min_peptide_length_class2 = 15 // Options: Annotation proteome = null @@ -41,8 +50,9 @@ params { fasta_output = false // Boilerplate options - outdir = './results' + outdir = null tracedir = "${params.outdir}/pipeline_info" + publish_dir_mode = 'copy' email = null email_on_fail = null plaintext_email = false @@ -85,6 +95,15 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } +// Load nf-core/epitopeprediction custom profiles from different institutions. 
+// Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! +// try { +// includeConfig "${params.custom_config_base}/pipeline/epitopeprediction.config" +// } catch (Exception e) { +// System.err.println("WARNING: Could not load nf-core/config/epitopeprediction profiles: ${params.custom_config_base}/pipeline/epitopeprediction.config") +// } + + profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { @@ -134,6 +153,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_variant_tsv { includeConfig 'conf/test_variant_tsv.config' } + test_grch38_variant_tsv { includeConfig 'conf/test_grch38_variant_tsv.config' } test_peptides { includeConfig 'conf/test_peptides.config' } test_peptides_h2 { includeConfig 'conf/test_peptides_h2.config' } test_proteins { includeConfig 'conf/test_proteins.config' } @@ -180,7 +200,7 @@ trace { } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" } // Load modules.config for DSL2 module specific options @@ -193,7 +213,7 @@ manifest { description = 'A fully reproducible and state of the art epitope prediction pipeline.' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '2.0.0' + version = '2.1.0' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 22591a4..10de656 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,9 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -26,8 +24,8 @@ }, "outdir": { "type": "string", - "description": "Path to the output directory where the results will be saved.", - "default": "./results", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, "email": { @@ -54,10 +52,7 @@ "type": "string", "default": "GRCh37", "help_text": "This defines against which human reference genome the pipeline performs the analysis including the incorporation of genetic variants e.g..", - "enum": [ - "GRCh37", - "GRCh38" - ], + "enum": ["GRCh37", "GRCh38"], "description": "Specifies the human reference genome version." }, "proteome": { @@ -93,16 +88,6 @@ "description": "Filter against human proteome.", "help_text": "Specifies that peptides should be filtered against the specified human proteome references." }, - "mhc_class": { - "type": "integer", - "default": 1, - "description": "MHC class for prediction.", - "help_text": "Specifies whether the predictions should be done for MHC class I or class II.", - "enum": [ - 1, - 2 - ] - }, "max_peptide_length": { "type": "integer", "default": 11, @@ -115,16 +100,31 @@ "description": "Specifies the minimum peptide length.", "help_text": "Specifies the minimum peptide length (not applied when `--peptides` is specified). Default: MCH class I: 8 aa, MHC class II: 15 aa" }, + "max_peptide_length_class2": { + "type": "integer", + "default": 16, + "description": "Specifies the maximum peptide length for MHC class II peptides." + }, + "min_peptide_length_class2": { + "type": "integer", + "default": 15, + "description": "Specifies the minimum peptide length for MHC class II peptides." 
+ }, "tools": { "type": "string", "default": "syfpeithi", - "help_text": "Specifies the tool(s) to use. Available are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`. Can be combined in a list separated by comma.", + "help_text": "Specifies the tool(s) to use. Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.1`.", "description": "Specifies the prediction tool(s) to use." }, "tool_thresholds": { "type": "string", "description": "Specifies tool-specific binder thresholds in a JSON file. This can be used to override the given default binder threshold values.", - "help_text": "Default affinity thresholds to determine whether a peptide is considered as a binder are the following:: `syfpeithi` > 50, `mhcflurry` <=500, `mhcnuggets-class-1` <= 500, `mhcnuggets-class-2` <= 500, `netmhc` <= 500, `netmhcpan` <= 500, `netmhcii` <= 500, `netmhciipan` <= 500. Thresholds can be customized in a JSON file: `tool-name:value`" + "help_text": "Default thresholds to determine whether a peptide is considered as a binder are the following: `syfpeithi` > 50, `mhcflurry` <=500, `mhcnuggets-class-1` <= 500, `mhcnuggets-class-2` <= 500, `netmhc` <= 2, `netmhcpan` <= 2, `netmhcii` <= 5, `netmhciipan` <= 5. Note that the default threshold for NetMHC tools is based on the rank metric. The remaining predictors thresholds are based on affinities. Thresholds can be customized in a JSON file: `tool-name:value`" + }, + "use_affinity_thresholds": { + "type": "boolean", + "description": "Specifies the affinity metric instead of the rank metric used for determining whether a peptide is considered as a binder.", + "help_text": "Switches the prediction metric of netMHC tools from rank to affinity. Default: `netmhc` <= 500, `netmhcpan` <= 500, `netmhcii` <= 500, `netmhciipan` <= 500." }, "wild_type": { "type": "boolean", @@ -149,6 +149,21 @@ "description": "Options for optimising the pipeline run execution.", "fa_icon": "fas fa-history", "properties": { + "split_by_variants": { + "type": "boolean", + "description": "Split VCF file into multiple files by number of variants." + }, + "split_by_variants_size": { + "type": "integer", + "default": 0, + "description": "Number of variants that should be written into one file. Default: number of variants divided by ten" + }, + "split_by_variants_distance": { + "type": "integer", + "default": 110000, + "description": "Number of nucleotides between previous and current variant across split.", + "help_text": "This can be used to avoid that variants end up in separate splits that fall onto the same transcript and therefore potentially contribute to the same mutated protein. " + }, "peptides_split_maxchunks": { "type": "integer", "default": 100, @@ -169,25 +184,35 @@ "description": "External MHC binding prediction software that is not shipped with the pipeline.", "default": null, "properties": { + "external_tools_meta": { + "type": "string", + "default": "${projectDir}/assets/external_tools_meta.json", + "description": "Specifies the path to the JSON file with meta information on external prediction tools." + }, + "netmhc_system": { + "type": "string", + "default": "linux", + "description": "Specifies the operating system in use (Linux or Darwin). This is only necessary if conda is used." 
+                },
                "netmhcpan_path": {
                    "type": "string",
                    "default": "None",
-                    "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 (Linux) here."
+                    "description": "To use the 'netmhcpan' tool, specify the path to the original software tarball for NetMHCpan 4.0 here."
                },
                "netmhc_path": {
                    "type": "string",
                    "default": "None",
-                    "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 (Linux) here."
+                    "description": "To use the 'netmhc' tool, specify the path to the original software tarball for NetMHC 4.0 here."
                },
                "netmhciipan_path": {
                    "type": "string",
                    "default": "None",
-                    "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 (Linux) here."
+                    "description": "To use the 'netmhciipan' tool, specify the path to the original software tarball for NetMHCIIpan 3.1 here."
                },
                "netmhcii_path": {
                    "type": "string",
                    "default": "None",
-                    "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 (Linux) here."
+                    "description": "To use the 'netmhcii' tool, specify the path to the original software tarball for NetMHCII 2.2 here."
                }
            }
        },
@@ -288,6 +313,15 @@
            "fa_icon": "fas fa-question-circle",
            "hidden": true
        },
+        "publish_dir_mode": {
+            "type": "string",
+            "default": "copy",
+            "description": "Method used to save pipeline results to output directory.",
+            "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
+            "fa_icon": "fas fa-copy",
+            "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+            "hidden": true
+        },
        "email_on_fail": {
            "type": "string",
            "description": "Email address for completion summary, only when pipeline fails.",
@@ -381,4 +415,4 @@
            "$ref": "#/definitions/generic_options"
        }
    ]
-}
\ No newline at end of file
+}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 92d9aab..1913eea 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -23,11 +23,12 @@ workflow INPUT_CHECK {
 // Function to get list of [ meta, filenames ]
 def get_samplesheet_paths(LinkedHashMap row) {
-    def allele_string = generate_allele_string(row.alleles)
+    def allele_string = generate_allele_string(row.alleles, row.mhc_class)
     def type = determine_input_type(row.filename)
     def meta = [:]
     meta.sample = row.sample
     meta.alleles = allele_string
+    meta.mhcclass = row.mhc_class
     meta.inputtype = type
     def array = []
@@ -39,11 +40,17 @@ def get_samplesheet_paths(LinkedHashMap row) {
     return array
 }

-def generate_allele_string(String alleles) {
+def generate_allele_string(String alleles, String mhcclass) {
     // Collect the allele information from the file
     def allele_string
+    def valid_class1_loci = ['A*','B*','C*','E*','G*']
+    def valid_class2_loci = ['DR','DP','DQ']
     if ( alleles.endsWith(".txt") || alleles.endsWith(".alleles") ) {
         allele_string = file(alleles).readLines().join(';')
+        if ((mhcclass == 'I' & valid_class2_loci.any { allele_string.contains(it)}) |
+        (mhcclass == 'II' & valid_class1_loci.any { allele_string.contains(it)})) {
+            exit 1, "ERROR: Please check input samplesheet -> invalid MHC class and allele combination found!\n${alleles}"
+        }
     }
     // or assign the information to a new variable
     else {
@@
-57,7 +64,10 @@ def determine_input_type(String filename) { def input_file = file(filename) def extension = input_file.extension - if ( extension == "vcf" | extension == "vcf.gz" ) { + if (filename.endsWith("vcf.gz") ) { + filetype = "variant_compressed" + } + else if (extension == "vcf") { filetype = "variant" } else if ( extension == "tsv" | extension == "GSvar" ) { diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index b0e5cad..64c59aa 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -1,7 +1,7 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE INPUTS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) @@ -17,48 +17,51 @@ for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_config = [file("$projectDir/assets/multiqc_config.yml", checkIfExists: true), file("$projectDir/assets/nf-core-epitopeprediction_logo_light.png", checkIfExists: true)] ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_configs = Channel.from(ch_multiqc_config).mix(ch_multiqc_custom_config).ifEmpty([]) /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // // MODULE: Local to the pipeline // -include { GET_PREDICTION_VERSIONS } from '../modules/local/get_prediction_versions' +include { GET_PREDICTION_VERSIONS } from '../modules/local/get_prediction_versions' -include { EXTERNAL_TOOLS_IMPORT } from '../modules/local/external_tools_import' +include { EXTERNAL_TOOLS_IMPORT } from '../modules/local/external_tools_import' -include { CHECK_REQUESTED_MODELS as CHECK_REQUESTED_MODELS_PEP } from '../modules/local/check_requested_models' -include { CHECK_REQUESTED_MODELS } from '../modules/local/check_requested_models' -include { SHOW_SUPPORTED_MODELS} from '../modules/local/show_supported_models' +include { EPYTOPE_CHECK_REQUESTED_MODELS as EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN } from '../modules/local/epytope_check_requested_models' +include { EPYTOPE_CHECK_REQUESTED_MODELS as EPYTOPE_CHECK_REQUESTED_MODELS_PEP } from '../modules/local/epytope_check_requested_models' +include { EPYTOPE_CHECK_REQUESTED_MODELS } from '../modules/local/epytope_check_requested_models' +include { EPYTOPE_SHOW_SUPPORTED_MODELS } from '../modules/local/epytope_show_supported_models' -include { SNPSIFT_SPLIT} from '../modules/local/snpsift_split' -include { CSVTK_SPLIT} from '../modules/local/csvtk_split' +include { VARIANT_SPLIT} from '../modules/local/variant_split' +include { SNPSIFT_SPLIT} from '../modules/local/snpsift_split' +include { CSVTK_SPLIT} from '../modules/local/csvtk_split' -include { FRED2_GENERATEPEPTIDES } from '../modules/local/fred2_generatepeptides' -include { SPLIT_PEPTIDES } from '../modules/local/split_peptides' -include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PROTEIN } from '../modules/local/split_peptides' +include { EPYTOPE_GENERATE_PEPTIDES } from '../modules/local/epytope_generate_peptides' +include { SPLIT_PEPTIDES } from '../modules/local/split_peptides' +include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PROTEIN } from '../modules/local/split_peptides' -include { PEPTIDE_PREDICTION as PEPTIDE_PREDICTION_PROTEIN } from '../modules/local/peptide_prediction' -include { PEPTIDE_PREDICTION as PEPTIDE_PREDICTION_PEP } from '../modules/local/peptide_prediction' -include { PEPTIDE_PREDICTION as PEPTIDE_PREDICTION_VAR } from '../modules/local/peptide_prediction' +include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_PROTEIN } from '../modules/local/epytope_peptide_prediction' +include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_PEP } from '../modules/local/epytope_peptide_prediction' +include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_VAR } from '../modules/local/epytope_peptide_prediction' -include { CAT_FILES as CAT_TSV } from '../modules/local/cat_files' -include { CAT_FILES as CAT_FASTA } from '../modules/local/cat_files' -include { CSVTK_CONCAT } from '../modules/local/csvtk_concat' +include { CAT_FILES as CAT_TSV } from '../modules/local/cat_files' +include { CAT_FILES as CAT_FASTA } from '../modules/local/cat_files' +include { CSVTK_CONCAT } from 
'../modules/local/csvtk_concat' -include { MERGE_JSON as MERGE_JSON_SINGLE } from '../modules/local/merge_json' -include { MERGE_JSON as MERGE_JSON_MULTI } from '../modules/local/merge_json' +include { MERGE_JSON as MERGE_JSON_SINGLE } from '../modules/local/merge_json' +include { MERGE_JSON as MERGE_JSON_MULTI } from '../modules/local/merge_json' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -66,9 +69,9 @@ include { MERGE_JSON as MERGE_JSON_MULTI } from '../mod include { INPUT_CHECK } from '../subworkflows/local/input_check' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -76,16 +79,22 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { GUNZIP as GUNZIP_VCF } from '../modules/nf-core/modules/gunzip/main' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // Info required for completion email and summary def multiqc_report = [] +// load meta data of external tools +import groovy.json.JsonSlurper +def jsonSlurper = new JsonSlurper() +def external_tools_meta = jsonSlurper.parse(file(params.external_tools_meta, checkIfExists: true)) + workflow EPITOPEPREDICTION { ch_versions = Channel.empty() @@ -104,7 +113,9 @@ workflow EPITOPEPREDICTION { INPUT_CHECK.out.reads .branch { meta_data, input_file -> - variant : meta_data.inputtype == 'variant' + variant_compressed : meta_data.inputtype == 'variant_compressed' + return [ meta_data, input_file ] + variant_uncompressed : meta_data.inputtype == 'variant' return [ meta_data, input_file ] peptide : meta_data.inputtype == 'peptide' return [ meta_data, input_file ] @@ -113,28 +124,35 @@ workflow EPITOPEPREDICTION { } .set { ch_samples_from_sheet } - //TODO think about a better how to handle these supported external versions, config file? 
(CM) - netmhc_meta_data = [ - netmhc : [ - version : "4.0", - software_md5 : "132dc322da1e2043521138637dd31ebf", - data_url : "https://services.healthtech.dtu.dk/services/NetMHC-4.0/data.tar.gz", - data_md5 : "63c646fa0921d617576531396954d633", - binary_name : "netMHC" - ], - netmhcpan: [ - version : "4.0", - software_md5 : "94aa60f3dfd9752881c64878500e58f3", - data_url : "https://services.healthtech.dtu.dk/services/NetMHCpan-4.0/data.Linux.tar.gz", - data_md5 : "26cbbd99a38f6692249442aeca48608f", - binary_name : "netMHCpan" - ] - ] + // gunzip variant files + GUNZIP_VCF ( + ch_samples_from_sheet.variant_compressed + ) + ch_versions = ch_versions.mix(GUNZIP_VCF.out.versions) + + ch_variants_uncompressed = GUNZIP_VCF.out.gunzip + .mix(ch_samples_from_sheet.variant_uncompressed) + + + // (re)combine different input file types + ch_samples_uncompressed = ch_samples_from_sheet.protein + .mix(ch_samples_from_sheet.peptide) + .mix(ch_variants_uncompressed) + .branch { + meta_data, input_file -> + variant : meta_data.inputtype == 'variant' | meta_data.inputtype == 'variant_compressed' + peptide : meta_data.inputtype == 'peptide' + protein : meta_data.inputtype == 'protein' + } tools = params.tools?.tokenize(',') if (tools.isEmpty()) { exit 1, "No valid tools specified." } + if (params.enable_conda && params.tools.contains("netmhc")) { + log.warn("Please note: if you want to use external prediction tools with conda it might be necessary to set --netmhc_system to darwin depending on your system.") + } + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; c_reset = params.monochrome_logs ? '' : "\033[0m"; @@ -142,7 +160,7 @@ workflow EPITOPEPREDICTION { ch_external_versions = Channel .from(tools) .filter( ~/(?i)^net.*/ ) - .map { it -> "${it}: ${netmhc_meta_data[it.toLowerCase()].version}"} + .map { it -> "${it.split('-')[0]}: ${it.split('-')[1]}"} .collect() // get versions of all prediction tools @@ -152,9 +170,9 @@ workflow EPITOPEPREDICTION { // TODO I guess it would be better to have two subworkflows for the if else parts (CM) if (params.show_supported_models) { SHOW_SUPPORTED_MODELS( - ch_samples_from_sheet + ch_samples_uncompressed .protein - .mix(ch_samples_from_sheet.variant, ch_samples_from_sheet.peptide) + .mix(ch_samples_uncompressed.variant, ch_samples_uncompressed.peptide) .combine(ch_prediction_tool_versions) .first() ) @@ -168,36 +186,32 @@ workflow EPITOPEPREDICTION { ======================================================================================== */ - // perform the check requested models on the protein and variant files - // we have to perform it on all alleles that are given in the sample sheet - ch_variants_protein_models = ch_samples_from_sheet - .variant - .mix(ch_samples_from_sheet.protein) - .map { meta_data, file -> meta_data.alleles } - .splitCsv(sep: ';') - .collect() - .toList() - .combine(ch_samples_from_sheet.variant.first()) - .map { it -> tuple(it[0].unique(), it[-1])} + // perform the check requested models on the variant files + EPYTOPE_CHECK_REQUESTED_MODELS( + ch_samples_uncompressed.variant, + ch_prediction_tool_versions + ) - CHECK_REQUESTED_MODELS( - ch_variants_protein_models, + // perform the check requested models on the protein files + EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN( + ch_samples_uncompressed.protein, ch_prediction_tool_versions ) // perform the check requested models on the peptide file where we need the input itself to determine the given peptide lengths - CHECK_REQUESTED_MODELS_PEP( - ch_samples_from_sheet + 
EPYTOPE_CHECK_REQUESTED_MODELS_PEP( + ch_samples_uncompressed .peptide - .map { meta_data, input_file -> tuple( meta_data.alleles.split(';'), input_file ) }, + .map { meta_data, input_file -> tuple( meta_data, input_file ) }, ch_prediction_tool_versions ) // Return a warning if this is raised - CHECK_REQUESTED_MODELS + EPYTOPE_CHECK_REQUESTED_MODELS .out .log - .mix( CHECK_REQUESTED_MODELS_PEP.out.log ) + .mix( EPYTOPE_CHECK_REQUESTED_MODELS_PEP.out.log ) + .mix (EPYTOPE_CHECK_REQUESTED_MODELS_PROTEIN.out.log ) .subscribe { model_log_file = file( it, checkIfExists: true ) def lines = model_log_file.readLines() @@ -210,31 +224,45 @@ workflow EPITOPEPREDICTION { } // Retrieve meta data for external tools - ["netmhc", "netmhcpan"].each { - // Check if the _path parameter was set for this tool - if (params["${it}_path"] as Boolean && ! tools.contains(it)) - { - log.warn("--${it}_path specified, but --tools does not contain ${it}. Both have to be specified to enable ${it}. Ignoring.") - } - else if (!params["${it}_path"] as Boolean && tools.contains(it)) - { - log.warn("--${it}_path not specified, but --tools contains ${it}. Both have to be specified to enable ${it}. Ignoring.") - tools.removeElement(it) - } - else if (params["${it}_path"]) - { - // If so, add the tool name and user installation path to the external tools import channel - ch_nonfree_paths.bind([ - it, - netmhc_meta_data[it].version, - netmhc_meta_data[it].software_md5, - file(params["${it}_path"], checkIfExists:true), - file(netmhc_meta_data[it].data_url), - netmhc_meta_data[it].data_md5, - netmhc_meta_data[it].binary_name - ]) + tools.each { + if(it.contains("net")) { + def tool_name = it.split('-')[0] + // Check if the _path parameter was set for this tool + if (params["${tool_name}_path"] as Boolean && ! tools.contains(it)) + { + log.warn("--${tool_name}_path specified, but --tools does not contain ${tool_name}. Both have to be specified to enable ${tool_name}. Ignoring.") + } + else if (!params["${tool_name}_path"] as Boolean && tools.contains(it)) + { + log.warn("--${tool_name}_path not specified, but --tools contains ${tool_name}. Both have to be specified to enable ${tool_name}. Ignoring.") + tools.removeElement(it) + } + else if (params["${tool_name}_path"]) + { + def external_tool_version = it.split('-')[1] + if (! 
external_tools_meta[tool_name].keySet().contains(external_tool_version)) { + exit 1, "Unsupported external prediction tool version specified: ${tool_name} ${external_tool_version}" + } + + def entry = external_tools_meta[tool_name][external_tool_version] + if (params["netmhc_system"] == 'darwin') { + entry = external_tools_meta["${tool_name}_darwin"][external_tool_version] + } + + // If so, add the tool name and user installation path to the external tools import channel + ch_nonfree_paths.bind([ + tool_name, + entry.version, + entry.software_md5, + file(params["${tool_name}_path"], checkIfExists:true), + file(entry.data_url), + entry.data_md5, + entry.binary_name + ]) + } } } + // import external tools EXTERNAL_TOOLS_IMPORT( ch_nonfree_paths @@ -247,7 +275,7 @@ workflow EPITOPEPREDICTION { */ // Make a division for the variant files and process them further accordingly - ch_samples_from_sheet + ch_samples_uncompressed .variant .branch { meta_data, input_file -> @@ -258,34 +286,45 @@ workflow EPITOPEPREDICTION { } .set { ch_variants } - // include the snpsift_split function (only vcf and vcf.gz variant files) - SNPSIFT_SPLIT( - ch_variants.vcf - ) + // decide between the split_by_variants and snpsift_split (by chromosome) function (only vcf and vcf.gz variant files) + if (params.split_by_variants) { + VARIANT_SPLIT( + ch_variants.vcf + ) + .set { ch_split_variants } + ch_versions = ch_versions.mix( VARIANT_SPLIT.out.versions.ifEmpty(null) ) + + } + else { + SNPSIFT_SPLIT( + ch_variants.vcf + ) + .set { ch_split_variants } + ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions.ifEmpty(null) ) + } // include the csvtk_split function (only variant files with an tsv and GSvar executable) CSVTK_SPLIT( ch_variants.tab ) - ch_versions = ch_versions.mix( SNPSIFT_SPLIT.out.versions.ifEmpty(null) ) ch_versions = ch_versions.mix( CSVTK_SPLIT.out.versions.ifEmpty(null) ) // process FASTA file and generated peptides - FRED2_GENERATEPEPTIDES( - ch_samples_from_sheet.protein + EPYTOPE_GENERATE_PEPTIDES( + ch_samples_uncompressed.protein ) SPLIT_PEPTIDES_PROTEIN( - FRED2_GENERATEPEPTIDES.out.splitted + EPYTOPE_GENERATE_PEPTIDES.out.splitted ) - ch_versions = ch_versions.mix( FRED2_GENERATEPEPTIDES.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_GENERATE_PEPTIDES.out.versions.ifEmpty(null) ) ch_versions = ch_versions.mix( SPLIT_PEPTIDES_PROTEIN.out.versions.ifEmpty(null) ) // split peptide data // TODO: Add the appropriate container to remove the warning SPLIT_PEPTIDES( - ch_samples_from_sheet.peptide + ch_samples_uncompressed.peptide ) ch_versions = ch_versions.mix( SPLIT_PEPTIDES.out.versions.ifEmpty(null) ) @@ -297,43 +336,46 @@ workflow EPITOPEPREDICTION { // not sure if this is the best solution to also have a extra process for protein, but I think we need it for cases when we have both in one sheet? 
(CM) // run epitope prediction for proteins - PEPTIDE_PREDICTION_PROTEIN( + EPYTOPE_PEPTIDE_PREDICTION_PROTEIN( SPLIT_PEPTIDES_PROTEIN .out .splitted .combine( ch_prediction_tool_versions ) - .transpose() + .transpose(), + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) // Run epitope prediction for peptides - PEPTIDE_PREDICTION_PEP( + EPYTOPE_PEPTIDE_PREDICTION_PEP( SPLIT_PEPTIDES .out .splitted .combine( ch_prediction_tool_versions ) - .transpose() + .transpose(), + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) // Run epitope prediction for variants - PEPTIDE_PREDICTION_VAR( + EPYTOPE_PEPTIDE_PREDICTION_VAR( CSVTK_SPLIT .out .splitted - .mix( SNPSIFT_SPLIT.out.splitted ) + .mix( ch_split_variants.splitted ) .combine( ch_prediction_tool_versions ) - .transpose() + .transpose(), + EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) ) // collect prediction script versions - ch_versions = ch_versions.mix( PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) ) - ch_versions = ch_versions.mix( PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) ) - ch_versions = ch_versions.mix( PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PEP.out.versions.ifEmpty(null) ) + ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions.ifEmpty(null) ) // Combine the predicted files and save them in a branch to make a distinction between samples with single and multi files - PEPTIDE_PREDICTION_PEP + EPYTOPE_PEPTIDE_PREDICTION_PEP .out .predicted - .mix( PEPTIDE_PREDICTION_VAR.out.predicted, PEPTIDE_PREDICTION_PROTEIN.out.predicted ) + .mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.predicted, EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.predicted ) .groupTuple() .flatMap { meta_data, predicted -> [[[ sample:meta_data.sample, alleles:meta_data.alleles, files:predicted.size() ], predicted ]] } .branch { @@ -356,18 +398,18 @@ workflow EPITOPEPREDICTION { // Combine protein sequences CAT_FASTA( - PEPTIDE_PREDICTION_PEP + EPYTOPE_PEPTIDE_PREDICTION_PEP .out .fasta - .mix( PEPTIDE_PREDICTION_VAR.out.fasta, PEPTIDE_PREDICTION_PROTEIN.out.fasta ) + .mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.fasta, EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.fasta ) .groupTuple() ) - PEPTIDE_PREDICTION_PEP + EPYTOPE_PEPTIDE_PREDICTION_PEP .out .json - .mix( PEPTIDE_PREDICTION_VAR.out.json ) - .mix( PEPTIDE_PREDICTION_PROTEIN.out.json ) + .mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.json ) + .mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.json ) .groupTuple() .flatMap { meta, json -> [[[ sample:meta.sample, alleles:meta.alleles, files:json.size() ], json ]] } .branch { @@ -403,22 +445,20 @@ workflow EPITOPEPREDICTION { ch_workflow_summary = Channel.value( workflow_summary ) ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix( Channel.from( ch_multiqc_config ) ) - ch_multiqc_files = ch_multiqc_files.mix( ch_multiqc_custom_config.collect().ifEmpty([]) ) ch_multiqc_files = ch_multiqc_files.mix( CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect() ) ch_multiqc_files = ch_multiqc_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect(), ch_multiqc_configs.collect() ) multiqc_report = MULTIQC.out.report.toList() } } /* -======================================================================================== 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     COMPLETION EMAIL AND SUMMARY
-========================================================================================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

 workflow.onComplete {
@@ -429,7 +469,7 @@
 }

 /*
-========================================================================================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     THE END
-========================================================================================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
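
The --tool_thresholds option described in the schema above expects a JSON file that maps tool names to custom binder thresholds (tool-name: value). A minimal sketch of such a file, using the tool names from the help text with purely illustrative threshold values:

    {
        "syfpeithi": 60,
        "mhcflurry": 400,
        "netmhcpan": 1
    }

Whether a value is interpreted as a rank or an affinity cutoff follows the tool's default metric: rank for the NetMHC tools, affinity for the remaining predictors (or for the NetMHC tools as well when --use_affinity_thresholds is set).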
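
The content of assets/external_tools_meta.json (the new --external_tools_meta default) is not part of this diff, but the lookups in the workflow (external_tools_meta[tool_name][external_tool_version], the version, software_md5, data_url, data_md5 and binary_name fields, and the <tool>_darwin variant selected when --netmhc_system is set to darwin) suggest a per-tool, per-version layout roughly like the following sketch. The NetMHC 4.0 values are carried over from the Groovy map removed in this patch; entries for the other tools and for the _darwin variants would follow the same pattern:

    {
        "netmhc": {
            "4.0": {
                "version": "4.0",
                "software_md5": "132dc322da1e2043521138637dd31ebf",
                "data_url": "https://services.healthtech.dtu.dk/services/NetMHC-4.0/data.tar.gz",
                "data_md5": "63c646fa0921d617576531396954d633",
                "binary_name": "netMHC"
            }
        }
    }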
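
The new MHC class II peptide length bounds and the VCF splitting options introduced in this patch can be set like any other pipeline parameter, for example through a Nextflow -params-file; the values below are illustrative only:

    {
        "min_peptide_length_class2": 15,
        "max_peptide_length_class2": 16,
        "split_by_variants": true,
        "split_by_variants_size": 500,
        "split_by_variants_distance": 110000
    }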