Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update dev #46

Merged
merged 26 commits into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0bd4469
update dev
priyanka-surana Nov 1, 2022
2680191
update input tol module
priyanka-surana Nov 1, 2022
4f2938f
update test config
priyanka-surana Nov 1, 2022
d07f240
Update bed_filter #47
priyanka-surana Nov 7, 2022
253fa85
Update bedtools_bamtobed config settings #48
priyanka-surana Nov 9, 2022
86d1170
From goat to ncbi and update table #49 #50
priyanka-surana Nov 24, 2022
7336e82
append merqury output to table #50
priyanka-surana Nov 24, 2022
ef0442b
add chrom and organelle info
priyanka-surana Nov 24, 2022
169ad1a
Make common name optional #49 #50
priyanka-surana Nov 24, 2022
ef12627
Remove bedtools_sort #51
priyanka-surana Nov 24, 2022
38b1db3
update samtools modules
priyanka-surana Nov 24, 2022
94e69c1
Reverting to original bed sort #51
priyanka-surana Nov 25, 2022
82e5607
Changing subworkflows from serial to parallel
priyanka-surana Nov 25, 2022
f369c52
remove extra modules and add full samplesheet
priyanka-surana Nov 28, 2022
f1efeec
break compound python statements
priyanka-surana Nov 28, 2022
e8dcf60
Reorder join statements
priyanka-surana Nov 28, 2022
abbb12c
modify join statements
priyanka-surana Nov 28, 2022
307ee71
break compound statements
priyanka-surana Nov 28, 2022
6e5aa0c
break compound statements
priyanka-surana Nov 28, 2022
68806e3
change file open
priyanka-surana Nov 28, 2022
38fbb9e
fixed typo
priyanka-surana Nov 29, 2022
f4e195e
create a single table
priyanka-surana Nov 29, 2022
20dd312
Fix typo
priyanka-surana Nov 30, 2022
a02332f
Update pacbio sample def
priyanka-surana Nov 30, 2022
78aa15b
version bump
priyanka-surana Nov 30, 2022
cb5c9bb
required flags
priyanka-surana Nov 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions bin/add_merqury.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3

import argparse
import os
import json
import sys
import csv

def parse_args(args=None):
    """Build and parse the command-line arguments for this script."""
    # The previous description was copy-pasted from summary_table.py and
    # referred to N50/BUSCO stats; this script appends MerquryFK stats.
    Description = "Append MerquryFK QV and completeness stats to an existing summary CSV table."

    parser = argparse.ArgumentParser(description=Description)
    parser.add_argument("FILE_IN", help="Input CSV file.")
    parser.add_argument("SAMPLE", help="PacBio sample ID used for MerquryFK.")
    parser.add_argument("QV", help="Input QV TSV file from MERQURYFK.")
    parser.add_argument("COMPLETENESS", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
    parser.add_argument("FILE_OUT", help="Output CSV file.")
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
    return parser.parse_args(args)

def make_dir(path):
    """Create directory *path* (and any parents) if a non-empty path was given."""
    # An empty string means the output goes to the current directory: nothing to create.
    if path:
        os.makedirs(path, exist_ok=True)

def extract_qv(file_in, writer):
    """Copy the QV value of every record in a MerquryFK TSV into the CSV writer."""
    with open(file_in, "r") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            writer.writerow(["QV", record["QV"]])

def extract_completeness(file_in, writer):
    """Copy the '% Covered' value of every record in a MerquryFK TSV into the CSV writer."""
    with open(file_in, "r") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            writer.writerow(["Completeness", record["% Covered"]])

def main(args=None):
    """Copy the input table to FILE_OUT, then append the MerquryFK section."""
    args = parse_args(args)

    make_dir(os.path.dirname(args.FILE_OUT))

    with open(args.FILE_OUT, "w") as fout:
        # Reproduce the existing table verbatim first ...
        with open(args.FILE_IN, "r") as fin:
            for line in fin:
                fout.write(line)

        # ... then append the MerquryFK stats. The trailing "_<suffix>" of
        # the sample ID is dropped to recover the specimen name.
        writer = csv.writer(fout)
        specimen = "_".join(args.SAMPLE.split("_")[:-1])
        writer.writerow(["##MerquryFK", specimen])
        extract_qv(args.QV, writer)
        extract_completeness(args.COMPLETENESS, writer)

if __name__ == "__main__":
    sys.exit(main())
2 changes: 1 addition & 1 deletion bin/bed_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
infile=$1
outfile=$2

paste -d '\t' - - < $infile | sed 's/-/_/g' | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else { print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '_+' '01' > $outfile
paste -d '\t' - - < $infile | sed 's/-/_/g' | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else { print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '_+' '01' | awk 'NF==11' > $outfile
muffato marked this conversation as resolved.
Show resolved Hide resolved
68 changes: 0 additions & 68 deletions bin/create_table.py

This file was deleted.

82 changes: 82 additions & 0 deletions bin/summary_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import argparse
import os
import json
import sys
import csv

def parse_args(args=None):
    """Build and parse the command-line arguments for this script."""
    parser = argparse.ArgumentParser(
        description="Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."
    )
    # Four positional file arguments, in fixed order.
    for name, helptext in (
        ("GENOME", "Input NCBI genome summary JSON file."),
        ("SEQUENCE", "Input NCBI sequence summary JSON file."),
        ("BUSCO", "Input BUSCO short summary JSON file."),
        ("FILE_OUT", "Output CSV file."),
    ):
        parser.add_argument(name, help=helptext)
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
    return parser.parse_args(args)

def make_dir(path):
    """Ensure the directory *path* exists; an empty path is a no-op."""
    if path != "":
        os.makedirs(path, exist_ok=True)

def ncbi_stats(genome_in, seq_in, writer):
    """Write assembly information, assembly statistics and per-sequence rows
    from the NCBI datasets JSON reports into the CSV writer.

    Parameters:
        genome_in: path to the NCBI genome summary JSON file.
        seq_in: path to the NCBI sequence summary JSON file.
        writer: an open csv.writer that receives the output rows.
    """
    with open(genome_in, "r") as fin1:
        data = json.load(fin1)
    with open(seq_in, "r") as fin2:
        seq = json.load(fin2)

    data = data["reports"][0]
    info = data["assembly_info"]
    attr = info["biosample"]["attributes"]
    stats = data["assembly_stats"]
    seq = seq["reports"]

    writer.writerow(["##Assembly_Information"])
    writer.writerow(["Accession", data["accession"]])
    # Not every organism record carries a common name, so this row is optional.
    if "common_name" in data["organism"]:
        writer.writerow(["Common_Name", data["organism"]["common_name"]])
    writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
    # Biosample attributes are a list of {"name": ..., "value": ...} pairs;
    # join() collapses to "" when the attribute is absent.
    writer.writerow(["ToL_ID", "".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid")])
    writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
    writer.writerow(["Assembly_Name", info["assembly_name"]])
    writer.writerow(["Assembly_Level", info["assembly_level"]])
    writer.writerow(["Life_Stage", "".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage")])
    writer.writerow(["Tissue", "".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue")])
    writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
    writer.writerow(["##Assembly_Statistics"])
    writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
    writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
    writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
    writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
    writer.writerow(["Contigs", stats["number_of_contigs"]])
    writer.writerow(["Contig_N50", stats["contig_n50"]])
    writer.writerow(["GC_Percent", stats["gc_percent"]])
    # Sequence records without a gc_percent are skipped — presumably
    # unlocalised/unplaced fragments; confirm against the NCBI report schema.
    writer.writerow(["##Chromosome", "Length", "GC_Percent"])
    for mol in seq:
        if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
            writer.writerow([mol["chr_name"], mol["length"], mol["gc_percent"]])
    writer.writerow(["##Organelle", "Length", "GC_Percent"])
    for mol in seq:
        if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
            writer.writerow([mol["assigned_molecule_location_type"], mol["length"], mol["gc_percent"]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could undergo a small rewrite like the other for+for+if line


def extract_busco(file_in, writer):
    """Write the BUSCO lineage name and one-line summary to the CSV writer."""
    with open(file_in, "r") as handle:
        report = json.load(handle)

    writer.writerow(["##BUSCO"])
    writer.writerow(["Lineage", report["lineage_dataset"]["name"]])
    writer.writerow(["Summary", report["results"]["one_line_summary"]])

def main(args=None):
    """Assemble the genome summary CSV from the NCBI and BUSCO reports."""
    opts = parse_args(args)

    make_dir(os.path.dirname(opts.FILE_OUT))

    with open(opts.FILE_OUT, "w") as fout:
        writer = csv.writer(fout)
        ncbi_stats(opts.GENOME, opts.SEQUENCE, writer)
        extract_busco(opts.BUSCO, writer)

if __name__ == "__main__":
    sys.exit(main())
2 changes: 1 addition & 1 deletion bin/tol_input.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ assembly=$(echo $genome | cut -f12 -d'/')
gca=$(echo $genome | cut -f14 -d'/' | sed 's/.fasta.gz//')

# Currently this will import a masked file, but once the `insdcdownload` pipeline goes in production, it will be unmasked
gunzip -c $genome > ${gca}.fasta
ln -s $genome
muffato marked this conversation as resolved.
Show resolved Hide resolved

analysis=$data/$taxon/$organism/analysis/$assembly

Expand Down
18 changes: 9 additions & 9 deletions conf/base.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
nf-core/genomenote Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sanger-tol/genomenote Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
Expand All @@ -14,7 +14,7 @@ process {
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
errorStrategy = { task.exitStatus in [140,143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'

Expand All @@ -24,6 +24,11 @@ process {
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
Expand All @@ -42,11 +47,6 @@ process {
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_samtools {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
}
Expand Down
22 changes: 14 additions & 8 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@
*/

process {
/*
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
*/
withName: SAMPLESHEET_CHECK {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
Expand Down Expand Up @@ -55,7 +48,20 @@ process {
ext.args = '--mode genome --tar'
}

withName: CREATE_TABLE {
withName: SUMMARYSEQUENCE {
ext.prefix = { "${meta.id}_sequence" }
ext.args = "--report sequence"
}

withName: SUMMARYTABLE {
publishDir = [
path: { "${meta.outdir}/genome_statistics" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: ADDMERQURY {
publishDir = [
path: { "${meta.outdir}/genome_statistics" },
mode: params.publish_dir_mode,
Expand Down
6 changes: 3 additions & 3 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ params {
max_time = '6.h'

// Input data
input = "mMelMel3"
project = "/lustre/scratch123/tol/projects/.sandbox"
lineage_db = "/lustre/scratch123/tol/resources/nextflow/test-data/busco_2021_06_reduced"
input = "${projectDir}/assets/samplesheet.csv"
fasta = "/lustre/scratch123/tol/projects/.sandbox/data/mammals/Meles_meles/assembly/release/mMelMel3.2_paternal_haplotype/insdc/GCA_922984935.2.subset.fasta.gz"
lineage_db = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced"
}
5 changes: 2 additions & 3 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
input = "gfLaeSulp1"
project = "/lustre/scratch123/tol/projects/.sandbox"
kmer = "/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/genomic_data/gfLaeSulp1/hic-arima2/kmer"
input = "${projectDir}/assets/samplesheet_full.csv"
muffato marked this conversation as resolved.
Show resolved Hide resolved
fasta = "/lustre/scratch124/tol/projects/darwin/data/insects/Ypsolopha_sequella/assembly/release/ilYpsSequ2.1/insdc/GCA_934047225.1.fasta.gz"
lineage_db = "/lustre/scratch123/tol/resources/busco/v5"
}
65 changes: 41 additions & 24 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,47 @@
"name": "nf-core/genomenote",
"homePage": "https://github.com/nf-core/genomenote",
"repos": {
"nf-core/modules": {
"bedtools/bamtobed": {
"git_sha": "90aef30f432332bdf0ce9f4b9004aa5d5c4960bb"
},
"busco": {
"git_sha": "89a84538bede7c6919f7c042fdb4c79e5e2d9d2a"
},
"cooler/cload": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"cooler/zoomify": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"custom/dumpsoftwareversions": {
"git_sha": "82501fe6d0d12614db67751d30af98d16e63dc59"
},
"merquryfk/merquryfk": {
"git_sha": "96602d03faf2b3a22323a19032f6e7e97b532d70"
},
"samtools/faidx": {
"git_sha": "3eb99152cedbb7280258858e5df08478a4670696"
},
"samtools/view": {
"git_sha": "83b05a8dc05b9dd2f1979d1a9f08411fb1a82b11"
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"bedtools/bamtobed": {
"branch": "master",
"git_sha": "219bbdaf9d08043a66f2b531609c1e8e4ac89021",
"patch": "modules/nf-core/bedtools/bamtobed/bedtools-bamtobed.diff"
},
"busco": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"cooler/cload": {
"branch": "master",
"git_sha": "73890978012600b41dcc1b44136287a172b3fb71"
},
"cooler/zoomify": {
"branch": "master",
"git_sha": "73890978012600b41dcc1b44136287a172b3fb71"
},
"custom/dumpsoftwareversions": {
"branch": "master",
"git_sha": "8022c68e7403eecbd8ba9c49496f69f8c49d50f0"
},
"gunzip": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"merquryfk/merquryfk": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"samtools/faidx": {
"branch": "master",
"git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01"
},
"samtools/view": {
"branch": "master",
"git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01"
}
}
}
}
}
Expand Down
Loading