Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GWAS formatdata-subworkflow #39

Draft
wants to merge 1 commit into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions modules/local/formatdata_processes/extract_chro_gwas.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
process ExtractChroGWAS{
memory params.mem_req
input :
file(gwas) from gwas_chrolist_ext
each chro from chrolist2
output :
set val(chro), file(gwas_out) into gwas_format_chro
script :
gwas_out=gwas.baseName+"_"+chro+".gwas"
sep=(params.sep!="") ? "" : ""
infofile="Chro:${params.head_chr}:${params.headnew_chr},Pos:${params.head_bp}:${params.headnew_bp},A2:${params.head_A2}:${params.headnew_A2},A1:${params.head_A1}:${params.headnew_A1},af:${params.head_freq}:${params.headnew_freq},Beta:${params.head_beta}:${params.headnew_beta},Se:${params.head_se}:${params.headnew_se},Pval:${params.head_pval}:${params.headnew_pval},N:${params.head_N}:${params.headnew_N}"
"""
extractandformat_gwas.py --input_file $gwas --out_file ${gwas_out} --chr $chro --info_file $infofile
"""
}


12 changes: 12 additions & 0 deletions modules/local/formatdata_processes/extract_rs_id_chro.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
process ExtractRsIDChro{
memory params.mem_req
input :
set val(chro), file(gwas), file(rsinfo) from gwas_format_chro_rs
output :
set val(chro), file(gwas),file(outrs) into rsinfo_chro
script :
outrs="info_rs_"+chro+".rs"
"""
zcat $rsinfo | extractrsid_bypos.py --file_chrbp $gwas --out_file $outrs --ref_file stdin --chr $chro --chro_ps ${params.poshead_chro_inforef} --bp_ps ${params.poshead_bp_inforef} --rs_ps ${params.poshead_rs_inforef} --a1_ps ${params.poshead_a1_inforef} --a2_ps ${params.poshead_a2_inforef}
"""
}
13 changes: 13 additions & 0 deletions modules/local/formatdata_processes/get_list_echro.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
process getListeChro{
input :
file(gwas_res) from gwas_chrolist
output :
file("filechro") into chrolist
script:
sep=(params.sep!="") ? " --sep ${params.sep}" : ""
"""
extractlistchro.py --input_file $gwas_res --chro_header ${params.head_chr} $sep > filechro
"""
}


20 changes: 20 additions & 0 deletions modules/local/formatdata_processes/merge_all.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
process MergeAll{
memory params.mem_req
input :
file(allfile) from gwas_rsmerge_all
publishDir "${params.output_dir}/", overwrite:true, mode:'copy'
output :
file(fileout)
script :
file1=allfile[0]
listefiles=allfile.join(" ")
fileout=params.output
"""
head -1 $file1 > $fileout
ls $listefiles | xargs -n 1 tail -n +2 >> $fileout
"""

}



21 changes: 21 additions & 0 deletions modules/local/formatdata_processes/merge_rs_gwas_chro.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
process MergeRsGwasChro{
memory params.mem_req
input :
set val(chro), file(gwas),file(chrors), file(bed), file(bim), file(fam) from rsinfo_chroplk
output :
file(outmerge) into gwas_rsmerge
script :
outmerge="merge_"+chro+".gwas"
bfileopt= (params.input_pat!="" || params.input_dir!="") ? " --bfile "+bed.baseName : ""
Nheadopt=(params.head_N!="") ? " --N_head ${params.head_N} " : ""
Freqheadopt=(params.head_freq!="") ? " --freq_head ${params.head_freq} " : ""

NheadNewopt=(params.headnew_N!="") ? " --Nnew_head ${params.headnew_N} " : ""
FreqNewheadopt=(params.headnew_freq!="") ? " --freqnew_head ${params.headnew_freq} " : ""
"""
mergeforrs.py --input_gwas $gwas --input_rs $chrors --out_file $outmerge --chro_head ${params.headnew_chr} --bp_head ${params.headnew_bp} --rs_head ${params.headnew_rs} --chro $chro $bfileopt $Nheadopt $Freqheadopt $NheadNewopt $FreqNewheadopt --a1_head ${params.headnew_A1} --a2_head ${params.headnew_A2}
"""

}


155 changes: 128 additions & 27 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -1,15 +1,60 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
nf-core/gwas Nextflow config file
nf-core/gwas Nextflow config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Default config options for all compute environments
Default config options for all compute environments
----------------------------------------------------------------------------------------
*/

// Global default params, used in configs
params {


//------------------------
// TODO: h3abionet qc params
// NOTE: h3abionet formatdata params
//------------------------


mem_req = '2GB' // how much plink needs for this
file_gwas=""
output_dir="out"
output="output"
input_dir=""
input_pat=""

head_pval = ""
head_freq = ""
head_bp = ""
head_chr = ""
head_rs = ""
head_beta=""
head_se=""
head_A1=""
head_A2=""
head_N=""
sep=""

headnew_pval = ""
headnew_freq = ""
headnew_bp = ""
headnew_chr = ""
headnew_rs = "rs"
headnew_beta=""
headnew_se=""
headnew_A1=""
headnew_A2=""
headnew_N=""

poshead_chro_inforef=0
poshead_bp_inforef=1
poshead_rs_inforef=2
poshead_a1_inforef=3
poshead_a2_inforef=4



//------------------------
// NOTE: h3abionet qc params
//------------------------

// Input options
Expand Down Expand Up @@ -286,34 +331,90 @@ manifest {
includeConfig 'conf/modules.config'

// Function to ensure that resource requirements don't go beyond
// a maximum limit
def check_max(obj, type) {
if (type == 'memory') {
try {
if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
// a maximum limit
def check_max(obj, type) {
if (type == 'memory')
{
try
{
if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
return params.max_memory as nextflow.util.MemoryUnit
else
else
return obj
} catch (all) {
println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'time') {
try {
if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
} catch (all) {
println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'time')
{
try
{
if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
return params.max_time as nextflow.util.Duration
else
else
return obj
} catch (all) {
println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'cpus') {
try {
return Math.min( obj, params.max_cpus as int )
} catch (all) {
println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
return obj
} catch (all) {
println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'cpus')
{
try
{
return Math.min( obj, params.max_cpus as int )
} catch (all) {
println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
return obj
}
}
}
name = 'nf-core/gwas'
author = 'Davi Josué Marcon (@Mxrcon), Abhinav Sharma (@abhi18av), Jean-Tristan Brandenburg (@jeantristanb), Lindsay Clark (@lvclark) and Scott Hazelhurst (@shaze).'
homePage = 'https://github.com/nf-core/gwas'
description = 'A pipeline for genome wide association studies.'
mainScript = 'main.nf'
nextflowVersion = '!>=21.10.3'
version = '1.0dev'
}

// Load modules.config for DSL2 module specific options
includeConfig 'conf/modules.config'

// Function to ensure that resource requirements don't go beyond
// a maximum limit
def check_max(obj, type) {
if (type == 'memory')
{
try
{
if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
return params.max_memory as nextflow.util.MemoryUnit
else
return obj
} catch (all) {
println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'time')
{
try
{
if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
return params.max_time as nextflow.util.Duration
else
return obj
} catch (all) {
println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj"
return obj
}
} else if (type == 'cpus')
{
try
{
return Math.min( obj, params.max_cpus as int )
} catch (all) {
println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
return obj
}
}
}
105 changes: 105 additions & 0 deletions subworkflows/local/formatdata.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env nextflow

import java.nio.file.Paths;
import sun.nio.fs.UnixPath;
import java.security.MessageDigest;


filescript=file(workflow.scriptFile)
projectdir="${filescript.getParent()}"
dummy_dir="${projectdir}/../../qc/input"


// Checks if the file exists
checker = { fn ->
if (fn.exists())
return fn;
else
error("\n\n------\nError in your config\nFile $fn does not exist\n\n---\n")
}

def helps = [ 'help' : 'help' ]
allowed_params = ['file_gwas', 'file_ref_gzip', "output_dir","output", "input_dir", "input_pat"]
allowed_header = ['head_pval', 'head_freq', 'head_bp', 'head_chr', 'head_rs', 'head_beta', 'head_se', 'head_A1', 'head_A2', 'sep']
allowed_headnewernew = ['headnew_pval', 'headnew_freq', 'headnew_bp', 'headnew_chr', 'headnew_rs', 'headnew_beta', 'headnew_se', 'headnew_A1', 'headnew_A2']
//chro_ps 0 --bp_ps 1 --rs_ps
allowed_posfilref=['poshead_chro_inforef', 'poshead_bp_inforef','poshead_rs_inforef']
allowed_params+=allowed_header
allowed_params+=allowed_headnewernew

if(params.file_gwas==""){
error('params.file_gwas: file contains gwas not found')
}

if(params.headnew_pval=="")params.headnew_pval=params.head_pval
if(params.headnew_freq=="")params.headnew_freq=params.head_freq
if(params.headnew_bp=="")params.headnew_bp=params.head_bp
if(params.headnew_chr=="")params.headnew_chr=params.head_chr
if(params.headnew_beta=="")params.headnew_beta=params.head_beta
if(params.headnew_se=="")params.headnew_se=params.head_se
if(params.headnew_A1=="")params.headnew_A1=params.head_A1
if(params.headnew_A2=="")params.headnew_A2=params.head_A2
//if(params.headnew_N=="")params.headnew_N=params.head_N


gwas_chrolist = Channel.fromPath(params.file_gwas)
gwas_chrolist_ext = Channel.fromPath(params.file_gwas)
gwas_chrolist_ext = Channel.fromPath(params.file_gwas)


getListeChro(
gwas_chrolist
).out.chrolist

chrolist2=Channel.create()
chrolist.flatMap { list_str -> list_str.readLines()[0].split() }.set { chrolist2 }

ExtractChroGWAS(
file(gwas) from gwas_chrolist_ext
each chro from chrolist2
}.out.gwas_format_chro

if(params.file_ref_gzip==""){
error('params.file_ref_gzip : file contains information for rs notnot found')
}

gwas_format_chro_rs=gwas_format_chro.combine(Channel.fromPath(params.file_ref_gzip))

ExtractRsIDChro(
gwas_format_chro_rs
).out.rsinfo_chro

if(params.input_dir!="" || params.input_pat!=''){
print("used plink file")
bed = Paths.get(params.input_dir,"${params.input_pat}.bed").toString().replaceFirst(/^az:/, "az:/").replaceFirst(/^s3:/, "s3:/")
bim = Paths.get(params.input_dir,"${params.input_pat}.bim").toString().replaceFirst(/^az:/, "az:/").replaceFirst(/^s3:/, "s3:/")
fam = Paths.get(params.input_dir,"${params.input_pat}.fam").toString().replaceFirst(/^az:/, "az:/").replaceFirst(/^s3:/, "s3:/")


}else{
bed=file('${dummy_dir}/00')
bim=file('${dummy_dir}/01')
fam=file('${dummy_dir}/02')
}

fileplk= Channel.create()
Channel
.from(file(bed),file(bim),file(fam))
.buffer(size:3)
.map { a -> [a[0], a[1], a[2]] }
.set { fileplk }

rsinfo_chroplk=rsinfo_chro.combine(fileplk)

process MergeRsGwasChro{
rsinfo_chroplk
).out.gwas_rsmerge

gwas_rsmerge_all=gwas_rsmerge.collect()

MergeAll(
gwas_rsmerge_all
}.out