From ca28b6a15093a64cce8d2abb9fa6f916df1e7e5b Mon Sep 17 00:00:00 2001 From: Lipastomies Date: Tue, 20 Apr 2021 15:29:44 +0300 Subject: [PATCH 1/3] add chromosome filtering regex --- wdl/gwas/saige.json | 1 + wdl/gwas/saige_sub.wdl | 2 ++ wdl/gwas/saige_tests.json | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/wdl/gwas/saige.json b/wdl/gwas/saige.json index 79f9a41..876ab95 100644 --- a/wdl/gwas/saige.json +++ b/wdl/gwas/saige.json @@ -19,6 +19,7 @@ "saige.test_combine.test.minmac": 5, "saige.test_combine.test.logP": true, "saige.test_combine.combine.logP": true, + "saige.test_combine.combine.chrom_regex": "chr[0-9]{1,2}", "saige.test_combine.combine.prefix": "finngen_R7_", "saige.test_combine.combine.chrcol": "#chrom", diff --git a/wdl/gwas/saige_sub.wdl b/wdl/gwas/saige_sub.wdl index 22170d3..2bbf6d8 100644 --- a/wdl/gwas/saige_sub.wdl +++ b/wdl/gwas/saige_sub.wdl @@ -80,6 +80,7 @@ task combine { String prefix Boolean logP String logPStr = if logP then "True" else "False" + String chrom_regex command <<< set -euxo pipefail @@ -91,6 +92,7 @@ task combine { |awk 'NR == 1; NR > 1 {print $0 | "sort -k 1,1V -k 2,2g"}' \ |awk 'NR==1 { for(i=1;i<=NF;i++) { h[$i]=i }; if (! "POS" in h) { print "ERROR: POS not in saige file header"; exit 1};print } NR>1{ $h["POS"]=sprintf("%d",$h["POS"]);print $0 }' \ |tr ' ' '\t' \ + |grep -E "(^${chrom_regex})|(^CHR)" |bgzip > ${prefix}${pheno}.saige.gz tabix -s1 -b2 -e2 -S1 ${prefix}${pheno}.saige.gz diff --git a/wdl/gwas/saige_tests.json b/wdl/gwas/saige_tests.json index 18396d4..cf5fa97 100644 --- a/wdl/gwas/saige_tests.json +++ b/wdl/gwas/saige_tests.json @@ -12,10 +12,11 @@ "saige.test_combine.test.minmac": 5, "saige.test_combine.test.logP": true, "saige.test_combine.combine.logP": true, + "saige.test_combine.combine.chrom_regex": "chr[0-9]{1,2}", "saige.test_combine.combine.prefix": "", "saige.test_combine.combine.chrcol": "#chrom", "saige.test_combine.combine.p_valcol": "pval", "saige.test_combine.combine.bp_col": "pos", - "saige.test_combine.combine.loglog_pval": 10, + "saige.test_combine.combine.loglog_pval": 10 } From c4f081df203301bfda9e3ef08a6bbf04927761d7 Mon Sep 17 00:00:00 2001 From: Lipastomies Date: Tue, 20 Apr 2021 15:39:27 +0300 Subject: [PATCH 2/3] fix chrom regex --- wdl/gwas/saige_tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/gwas/saige_tests.json b/wdl/gwas/saige_tests.json index cf5fa97..febcc0a 100644 --- a/wdl/gwas/saige_tests.json +++ b/wdl/gwas/saige_tests.json @@ -12,7 +12,7 @@ "saige.test_combine.test.minmac": 5, "saige.test_combine.test.logP": true, "saige.test_combine.combine.logP": true, - "saige.test_combine.combine.chrom_regex": "chr[0-9]{1,2}", + "saige.test_combine.combine.chrom_regex": "chr([0-9]{1,2})|(XYM)\s", "saige.test_combine.combine.prefix": "", "saige.test_combine.combine.chrcol": "#chrom", From 5f8084aed84b2404faff281b8dcdc23c52d4a926 Mon Sep 17 00:00:00 2001 From: Lipastomies Date: Tue, 20 Apr 2021 16:13:12 +0300 Subject: [PATCH 3/3] move regex to python script --- wdl/gwas/saige.json | 2 +- wdl/gwas/saige_sub.wdl | 8 +++++--- wdl/gwas/saige_tests.json | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/wdl/gwas/saige.json b/wdl/gwas/saige.json index 876ab95..3a83e0d 100644 --- a/wdl/gwas/saige.json +++ b/wdl/gwas/saige.json @@ -19,7 +19,7 @@ "saige.test_combine.test.minmac": 5, "saige.test_combine.test.logP": true, "saige.test_combine.combine.logP": true, - "saige.test_combine.combine.chrom_regex": "chr[0-9]{1,2}", + "saige.test_combine.combine.chrom_regex": "^chr(([0-9]{1,2})|([XYM]))$", "saige.test_combine.combine.prefix": "finngen_R7_", "saige.test_combine.combine.chrcol": "#chrom", diff --git a/wdl/gwas/saige_sub.wdl b/wdl/gwas/saige_sub.wdl index 2bbf6d8..1e2b746 100644 --- a/wdl/gwas/saige_sub.wdl +++ b/wdl/gwas/saige_sub.wdl @@ -92,13 +92,12 @@ task combine { |awk 'NR == 1; NR > 1 {print $0 | "sort -k 1,1V -k 2,2g"}' \ |awk 'NR==1 { for(i=1;i<=NF;i++) { h[$i]=i }; if (! "POS" in h) { print "ERROR: POS not in saige file header"; exit 1};print } NR>1{ $h["POS"]=sprintf("%d",$h["POS"]);print $0 }' \ |tr ' ' '\t' \ - |grep -E "(^${chrom_regex})|(^CHR)" |bgzip > ${prefix}${pheno}.saige.gz tabix -s1 -b2 -e2 -S1 ${prefix}${pheno}.saige.gz echo "`date` converting results to ${prefix}${pheno}.gz" python3 < ${prefix}${pheno}.gz - import math, gzip + import math, gzip, re from collections import OrderedDict from functools import reduce @@ -108,6 +107,8 @@ task combine { def red(obj, func_list): return "NA" if obj == "NA" else reduce(lambda o, func: func[0](o, *func[1]) if func[0] is not str.format else func[0](func[1], o), func_list, obj) + reg = re.compile("${chrom_regex}") + if "${traitType}" == "quantitative": mapping = OrderedDict([ ("#chrom", ("CHR", [(str.replace, ("chr", "")), (str.replace, ("X", "23")), (str.replace, ("Y", "24")), (str.replace, ("MT", "25")), (str.replace, ("M", "25"))])), @@ -148,7 +149,8 @@ task combine { print('\t'.join(mapping.keys())) for line in f: s = line.strip().split('\t') - print('\t'.join(str(red(s[header[v[0]]], v[1])) for v in mapping.values())) + if reg.match(s[0]):#match searches from the beginning of the string, which in this case is what we want + print('\t'.join(str(red(s[header[v[0]]], v[1])) for v in mapping.values())) EOF echo "`date` plotting qq and manha" qqplot.R --file ${prefix}${pheno}.gz --bp_col "${bp_col}" --pval_col "${p_valcol}" --chrcol "${chrcol}" --loglog_pval ${loglog_pval} diff --git a/wdl/gwas/saige_tests.json b/wdl/gwas/saige_tests.json index febcc0a..e1bdf82 100644 --- a/wdl/gwas/saige_tests.json +++ b/wdl/gwas/saige_tests.json @@ -12,7 +12,7 @@ "saige.test_combine.test.minmac": 5, "saige.test_combine.test.logP": true, "saige.test_combine.combine.logP": true, - "saige.test_combine.combine.chrom_regex": "chr([0-9]{1,2})|(XYM)\s", + "saige.test_combine.combine.chrom_regex": "^chr(([0-9]{1,2})|([XYM]))$", "saige.test_combine.combine.prefix": "", "saige.test_combine.combine.chrcol": "#chrom",