Skip to content

Commit

Permalink
Merge pull request #17 from sanger-tol/dp24_treeval_parity
Browse files Browse the repository at this point in the history
Updating everything
  • Loading branch information
DLBPointon authored Jul 8, 2024
2 parents 9514488 + 9d9f779 commit c0650ec
Show file tree
Hide file tree
Showing 145 changed files with 3,522 additions and 560 deletions.
43 changes: 35 additions & 8 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ on:

env:
NXF_ANSI_LOG: false
NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity

concurrency:
group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
Expand All @@ -19,14 +21,20 @@ jobs:
test:
name: Run pipeline with test data
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/curationpretextt') }}"
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/curationpretext') }}"
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "22.10.1"
- "latest-everything"
steps:
- name: Get branch names
# Pulls the names of current branches in repo
# steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM not to
id: branch-names
uses: tj-actions/branch-names@v8

- name: Check out pipeline code
uses: actions/checkout@v3

Expand All @@ -35,17 +43,36 @@ jobs:
with:
version: "${{ matrix.NXF_VER }}"

- name: Setup apptainer
uses: eWaterCycle/setup-apptainer@main

- name: Set up Singularity
run: |
mkdir -p $NXF_SINGULARITY_CACHEDIR
mkdir -p $NXF_SINGULARITY_LIBRARYDIR
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install nf-core
run: |
pip install nf-core
- name: NF-Core Download - download singularity containers
# Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found
# Must occur after singularity install or will crash trying to dl containers
# Zip up this fresh download and run the checked out version
run: |
nf-core download sanger-tol/curationpretext --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-curationpretext --container-cache-utilisation amend --container-system singularity
- name: Download test data
# Download A fungal test data set that is full enough to show some real output.
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
- name: Run MAPS_ONLY pipeline with test data
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results -entry MAPS_ONLY
- name: Run ALL_FILES pipeline with test data
- name: Singularity - Run ALL_FILES pipeline with test data
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
nextflow run ./sanger-curationpretext/${{ steps.branch-names.outputs.current_branch }}/main.nf -profile test,singularity --outdir ./Sing-res
3 changes: 3 additions & 0 deletions .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ repository_type: pipeline
lint:
files_exist:
- assets/multiqc_config.yml
- assets/nf-core-curationpretext_logo_light.png
- docs/images/nf-core-curationpretext_logo_light.png
- docs/images/nf-core-curationpretext_logo_dark.png
files_unchanged:
- .github/workflows/linting.yml
- LICENSE
Expand Down
55 changes: 54 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,60 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.0.0](https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0)] - UNSC Infinity - [2023-10-02]
## [[1.0.0](https://github.com/sanger-tol/curationpretext/releases/tag/1.0.0)] - UNSC Cradle - [2024-02-22]

### Added

- Subworkflows for both minimap2 and bwamem2 mapping.
- Subworkflow for Pretext accessory file ingestion.
- Considerations for other longread datatypes

### Parameters

| Old Version | New Versions |
| ----------- | --------------- |
| | --aligner |
| | --longread_type |
| --pacbio | --longread |

### Software Dependencies

Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.

| Module | Old Version | New Versions |
| ------------------------------------------------------------------- | -------------- | -------------- |
| bamtobed_sort ( bedtools + samtools ) | - | 2.31.0 + 1.17 |
| bedtools ( genomecov, bamtobed, intersect, map, merge, makewindows) | 2.31.0 | 2.31.1 |
| bwamem2 index | - | 2.2.1 |
| cram_filter_align_bwamem2_fixmate_sort | - | |
| ^ ( samtools + bwamem2 ) ^ | 1.16.1 + 2.2.1 | 1.17 + 2.2.1 |
| cram_filter_minimap2_filter5end_fixmate_sort | - | |
| ^ ( samtools + minimap2 ) ^ | - | 1.17 + 2.24 |
| extract_cov_id ( coreutils ) | - | 9.1 |
| extract_repeat ( perl ) | - | 5.26.2 |
| extract_telo ( coreutils ) | - | 9.1 |
| find_telomere_regions ( gcc ) | - | 7.1.0 |
| find_telomere_windows ( java-jdk ) | - | 8.0.112 |
| gap_length ( coreutils ) | - | 9.1 |
| generate_cram_csv ( samtools ) | - | 1.17 |
| get_largest_scaff ( coreutils ) | - | 9.1 |
| gnu-sort | - | 8.25 |
| pretextmap + samtools | 0.1.9 + 1.17 | 0.1.9\* + 1.18 |
| pretextgraph | | 0.0.4 |
| pretextsnapshot + UCSC | 0.0.6 + 447 | 0.0.6b + 447 |
| seqtk | - | 1.4 |
| samtools (faidx,merge,sort,view) | 1.17 | 1.18 |
| tabix | - | 1.11 |
| ucsc | 377 | 445 |
| windowmasker (blast) | - | 2.14.0 |

- This version has been modified by @yumisims in order to expose the texture buffer variable

### Dependencies

### Deprecated

## [[0.1.0](https://github.com/sanger-tol/curationpretext/releases/tag/0.1.0)] - UNSC Infinity - [2023-10-02]

Initial release of sanger-tol/curationpretext, created with the [sanger-tol](https://nf-co.re/) template.

Expand Down
5 changes: 5 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,8 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


The filter_five_end.pl script has been taken from the Arima Mapping Pipeline, has not been modified and is subject to the below license:

Copyright (c) 2017 Arima Genomics, Inc.
77 changes: 49 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# ![sanger-tol/curationpretext](docs/images/nf-core-curationpretext_logo_light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/nf-core-curationpretext_logo_dark.png#gh-dark-mode-only)
# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)

[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/curationpretext/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)

Expand Down Expand Up @@ -32,49 +32,64 @@ This is intended as a supplementary pipeline for the [treeval](https://github.co
Currently, the pipeline uses the following flags:

- --input
- `--input`

- The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`

- --pacbio
- `--longread`

- The directory of the fasta files generated from pacbio reads, e.g., `/path/to/fasta/`
- The directory of the fasta files generated from longread data, e.g., `/path/to/fasta/`

- --cram
- `--longread_type`

- The type of longread data you are utilising, e.g., ont, illumina, hifi.

- `--aligner`

- The aligner you wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.

- `--cram`

- The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`

- --teloseq
- `--teloseq`

- A telomeric sequence, e.g., `TTAGGG`

- -entry
- ALL_FILES generates all accessory files as well as pretext maps
- `-entry`
- ALL_FILES is the default and generates all accessory files as well as pretext maps
- MAPS_ONLY generates only the pretext maps and static images

Now, you can run the pipeline using:

<!-- TODO nf-core: update the following command to include all required parameters for a minimal example -->
#### For ALL_FILES run

```bash
// For ALL_FILES run
nextflow run sanger-tol/curationpretext \
-profile <docker/singularity/.../institute> \
--input path/to/assembly.fa \
--cram path/to/cram/ \
--pacbio path/to/pacbio/fasta/ \
--teloseq TTAGGG \
--sample { default is "pretext_rerun" }
--outdir path/to/outdir/

// For MAPS_ONLY run
--input { input.fasta } \
--cram { path/to/cram/ } \
--longread { path/to/longread/fasta/ } \
--longread_type { default is "hifi" }
--sample { default is "pretext_rerun" } \
--teloseq { default is "TTAGGG" } \
--outdir { OUTDIR } \
-profile <docker/singularity/{institute}>

```
#### For MAPS_ONLY run
```bash
nextflow run sanger-tol/curationpretext \
-profile <docker/singularity/.../institute> \
--input path/to/assembly.fa \
--cram path/to/cram/ \
--sample { default is "pretext_rerun" }
-entry MAPS_ONLY \
--outdir path/to/outdir/
--input { input.fasta } \
--cram { path/to/cram/ } \
--longread { path/to/longread/fasta/ } \
--longread_type { default is "hifi" }
--sample { default is "pretext_rerun" } \
--teloseq { default is "TTAGGG" } \
--outdir { OUTDIR } \
-profile <docker/singularity/{institute}> \
-entry MAPS_ONLY \
```
> **Warning:**
Expand All @@ -85,7 +100,7 @@ For more details, please refer to the [usage documentation](https://pipelines.to
## Pipeline output
To see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the nf-core website pipeline page.
To see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.
For more details about the output files and reports, please refer to the
[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).
Expand All @@ -95,9 +110,11 @@ sanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPo
We thank the following people for their extensive assistance in the development of this pipeline:
- @yumisims
- @yumisims - TreeVal and Software.
- @weaglesBio
- @weaglesBio - TreeVal and Software.
- @josieparis - Help with better docs and testing.
## Contributions and Support
Expand All @@ -121,3 +138,7 @@ You can cite the `nf-core` publication as follows:
> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
>
> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).
```
```
Binary file removed assets/nf-core-curationpretext_logo_light.png
Binary file not shown.
1 change: 1 addition & 0 deletions bin/awk_filter_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Pass SAM header lines (field 1 begins with '@') through unchanged.  For
# alignment records, clear flag bit 2048 (0x800, "supplementary alignment")
# in the FLAG column ($2), then print the rebuilt record starting from its
# second character.
# NOTE(review): substr($0,2) drops the record's leading character — this
# presumably strips a one-character prefix added by whatever feeds this
# script; confirm against the calling pipeline before reusing elsewhere.
awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
109 changes: 109 additions & 0 deletions bin/filter_five_end.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/perl
# filter_five_end.pl — taken (unmodified) from the Arima Genomics mapping
# pipeline; see the LICENSE addition in this commit.
#
# Reads SAM records from STDIN and writes SAM to STDOUT.  Header lines are
# echoed untouched.  Alignment records are grouped by consecutive identical
# read id (QNAME), and exactly ONE line is printed per read:
#   * if the group has exactly one record whose 5' read end is aligned
#     (and the group holds one or two records), that record is printed;
#   * otherwise the FIRST record of the group is re-emitted with its
#     "segment unmapped" flag bit (0x4) forced on.
# Input must therefore be name-grouped (e.g. direct aligner output).
use strict;
use warnings;

# Per-group accumulators, reset each time a new QNAME is seen.
my $prev_id = "";   # QNAME of the group currently being accumulated
my @five;           # records whose 5' read end is aligned
my @three;          # records whose 3' read end is aligned (collected, never printed)
my @unmap;          # records flagged unmapped (collected, never printed)
my @mid;            # records clipped at both ends (collected, never printed)
my @all;            # every record of the current group, in input order
my $counter = 0;    # number of records in the current group

while (<STDIN>){
chomp;
# SAM header lines start with '@' — pass them through unchanged.
if (/^@/){
print $_."\n";
next;
}
# Split the eleven mandatory SAM columns; @rest keeps any optional tags.
my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
# dec2bin gives the 32-bit MSB-first binary string of FLAG; reversing it
# makes $binary[$i] the bit worth 2**$i — so $binary[2] is 0x4 (segment
# unmapped) and $binary[4] is 0x10 (reverse strand).
my $bin = reverse(dec2bin($flag));
my @binary = split(//,$bin);
# A new QNAME means the previous group is complete — flush it now.
if ($prev_id ne $id && $prev_id ne ""){
if ($counter == 1){
# Single-record group: keep it if its 5' end aligned ...
if (@five == 1){
print $five[0]."\n";
}
else{
# ... otherwise re-emit the record with flag bit 0x4 (unmapped) set:
# re-split it, flip bit 2 of the LSB-first binary form, reverse back
# to MSB-first and convert to the new decimal FLAG.
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
# Two records with exactly one aligned 5' end: keep that one.
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
# Ambiguous group: emit its first record, forced to unmapped (0x4),
# using the same flag-rewrite sequence as above.
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

# Reset all accumulators for the group that starts with this record.
$counter = 0;
undef @unmap;
undef @five;
undef @three;
undef @mid;
undef @all;
}

# File the current record into the appropriate bucket(s).
$counter++;
$prev_id = $id;
push @all,$_;
if ($binary[2]==1){
# Flag bit 0x4: the segment itself is unmapped.
push @unmap,$_;
}
# 5' end aligned: forward strand with CIGAR starting in a match, or reverse
# strand with CIGAR ending in a match (&& binds tighter than ||).
elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
push @five, $_;
}
# 3' end aligned: the mirror image of the test above.
elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
push @three, $_;
}
# Clipped (hard/soft, H/S) on both sides with a match in the middle.
elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
push @mid, $_;
}
}

# End of input: flush the final group (same logic as the in-loop flush).
# NOTE(review): this flush also runs for empty or header-only input, where
# @all is empty — $all[0] is then undef and the split below raises
# "uninitialized value" warnings; confirm callers never feed record-free SAM.
if ($counter == 1){
if (@five == 1){
print $five[0]."\n";
}
else{
# Re-emit the lone record with flag bit 0x4 (unmapped) set.
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
# Two records with exactly one aligned 5' end: keep that one.
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
# Ambiguous final group: emit its first record, forced to unmapped (0x4).
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

# Render a decimal SAM FLAG value as its 32-character binary string,
# most significant bit first.
sub dec2bin {
    my ($decimal) = @_;
    # pack 'N' encodes a 32-bit big-endian unsigned integer; unpack 'B32'
    # renders those 32 bits as a string of '0'/'1' characters, MSB first.
    return unpack 'B32', pack 'N', $decimal;
}

# Convert a binary string (up to 32 bits, MSB first) back to its decimal value.
sub bin2dec {
    my ($bits) = @_;
    # Left-pad with zeros to exactly 32 bits so pack 'B32' consumes a full
    # word, then decode it as a 32-bit big-endian unsigned integer.
    my $padded = substr '0' x 32 . $bits, -32;
    return unpack 'N', pack 'B32', $padded;
}
Loading

0 comments on commit c0650ec

Please sign in to comment.