newly developed repo seperate from ncov-tools

simpsonlab · Oct 4, 2020 · 36f3608 · 36f3608
commit 36f3608
Show file tree

Hide file tree

Showing 39 changed files with 1,517 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,30 @@
+__pycache__/
+*.py[cod]
+*$py.class
+
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+sdist/
+share/python-wheels/
+*.egg-info/
+*.egg
+MANIFEST
+
+*.manifest
+*.spec
+*.swp
+
+pip-log.txt
+pip-delete-this-directory.txt
+
+.snakemake
+Snakefile
+
+tmp/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2020, Richard J. de Borja
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/README.md b/README.md
@@ -0,0 +1,92 @@
+# ncov_parser
+
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+The `ncov_parser` package provides a suite of tools to parse the files generated
+in the Nextflow workflow and provide a QC summary file.  The package requires
+several files including:
+* <sample>.variants.tsv/<sample>.pass.vcf
+* <sample>.per_base_coverage.bed
+* <sample>.primertrimmed.consensus.fa
+* alleles.tsv
+
+An optional metadata file with qPCR ct and collection date values can be
+included.
+
+In addition, `bedtools` should be run to generate a
+`<sample>.per_base_coverage.bed` file to generate mean and median depth of
+coverage statistics.
+
+
+## Installation
+After downloading the repository, the package can be installed using `pip`:
+```
+git clone https://github.com/jts/ncov-tools
+cd ncov-tools/parser
+pip install .
+```
+
+
+## Usage
+The library consists of several functions that can be imported.
+```
+import ncov.parser
+```
+Several classes are available representing the different files that can
+be processed.
+```
+ncov.parser.Alleles
+ncov.parser.Consensus
+ncov.parser.Meta
+ncov.parser.PerBaseCoverage
+ncov.parser.Variants
+ncov.parser.Vcf
+ncov.parser.primers
+```
+
+Similarly, wrapper scripts for creating a standard format output can be found in
+`ncov.parser.qc`
+```
+import ncov.parser.qc as qc
+qc.write_qc_summary_header()
+qc.write_qc_summary()
+```
+
+### Top levels scripts
+In the `bin` directory, several wrapper scripts exist to assist in generating
+QC metrics.
+
+To create sample level summary qc files, use the `get_qc.py` script:
+```
+get_qc.py --variants <sample>.variants.tsv or <sample>.pass.vcf
+--coverage <sample>.per_base_coverage.bed --meta <metadata>.tsv
+--consensus <sample>.primertrimmed.consensus.fa [--indel] --sample <samplename>
+--platform <illumina or oxford-nanopore> --run_name <run_name> --alleles alleles.tsv
+--indel
+```
+
+Note the `--indel` flag should only be present if indels will be used in the
+calculation of variants.
+
+Once this is complete, we can use the `collect_qc_summary.py` script to
+aggregate the sample level summary files into a single run tab-separate file.
+```
+collect_qc_summary.py --path <path to sample.summary.qc.tsv files>
+```
+
+To create an amplicon BED file from a primer scheme BED file:
+```
+primers_to_amplicons.py --primers <path to primer scheme BED file>
+--offset <number of bases to offset> --bed_type <full or no_primers or unique_amplicons>
+--output <full path to file to write BED data to>
+```
+
+## Credit and Acknowledgements
+Note that this tool has been used in conjunction with the [@jts `ncov-tools`](https://github.com/jts/ncov-tools)
+suite of tools. 
+
+BED file importing and amplicon site merging obtained from the ARTIC pipeline:
+`https://github.com/artic-network/fieldbioinformatics/blob/master/artic/vcftagprimersites.py`
+
+## License
+MIT
diff --git a/bin/collect_qc_summary.py b/bin/collect_qc_summary.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+'''
+A script for aggregating sample QC summary files into a single file.
+'''
+
+
+import argparse
+import sys
+from ncov.parser.qc import collect_qc_summary_data, write_qc_summary_header
+
+parser = argparse.ArgumentParser(description="Tool for aggregating sample QC \
+                                 data")
+parser.add_argument('-p', '--path',
+                    help='directory to search for <sample>.summary.qc.tsv \
+                         files')
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
+
+summary_data = collect_qc_summary_data(path=args.path)
+
+write_qc_summary_header()
+for summary_line in summary_data:
+    print(summary_line)
diff --git a/bin/get_qc.py b/bin/get_qc.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+'''
+A Python package for summarizing QC data from the ncov-tools pipeline.
+'''
+
+import argparse
+import sys
+import ncov.parser.qc as qc
+import ncov.parser
+
+parser = argparse.ArgumentParser(description="Tool for summarizing QC data")
+parser.add_argument('-c', '--consensus', help='<sample>.consensus.fasta file to process')
+parser.add_argument('-v', '--variants',
+                    help='<sample>.variants.tsv file to process')
+parser.add_argument('-e', '--coverage',
+                    help='<sample>.per_base_coverage.bed file to process')
+parser.add_argument('-i', '--indel', action='store_true',
+                    help='flag to determine whether to count indels')
+parser.add_argument('-m', '--meta', default=None,
+                    help='full path to the metadata file')
+parser.add_argument('-a', '--alleles',
+                    help='full path to the alleles.tsv file')
+parser.add_argument('-s', '--sample',
+                    help='name of sample being processed')
+parser.add_argument('-p', '--platform', default='illumina',
+                    help='sequencing platform used')
+parser.add_argument('-r', '--run_name',
+                    help='run name for sample')
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit('Invalid number of arguments')
+args = parser.parse_args()
+
+qc_line = dict()
+qc_line.update({'sample' : args.sample})
+
+try:
+    meta = ncov.parser.Meta(file=args.meta)
+    meta.import_metadata()
+    qc_line.update(meta.data[args.sample])
+except:
+    qc_line.update({'qpcr_ct' : 'NA', 'collection_date' : 'NA',
+                    'num_months' : 'NA', 'num_weeks' : 'NA'})
+
+if args.platform == 'illumina':
+    if str(args.variants).endswith('.variants.tsv'):
+        vars = ncov.parser.Variants(file=args.variants)
+        qc_line.update(vars.get_total_variants())
+    else:
+        sys.exit('Must be a valid variant.tsv file for the Illumina platform')
+elif args.platform == 'oxford-nanopore':
+    if str(args.variants).endswith('.vcf') or str(args.variants).endswith('.vcf.gz'):
+        vars = ncov.parser.Vcf(file=args.variants)
+        qc_line.update(vars.get_variant_counts())
+    else:
+        sys.exit('Must be a valid VCF file for the Oxford-Nanopore platform')
+
+
+alleles = ncov.parser.Alleles(file=args.alleles)
+qc_line.update(alleles.get_variant_counts(sample=args.sample))
+
+cons = ncov.parser.Consensus(file=args.consensus)
+qc_line.update(cons.count_iupac_in_fasta())
+qc_line.update(cons.get_genome_completeness())
+
+coverage = ncov.parser.PerBaseCoverage(file=args.coverage)
+qc_line.update(coverage.get_coverage_stats())
+
+# Produce warning flags
+qc_flags = list()
+if qc_line['genome_completeness'] < 0.5:
+    qc_flags.append("INCOMPLETE_GENOME")
+elif qc_line['genome_completeness'] < 0.9:
+    qc_flags.append("PARTIAL_GENOME")
+
+num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet']
+if num_indel_non_triplet > 0:
+    qc_flags.append("POSSIBLE_FRAMESHIFT_INDELS")
+
+if qc_line['num_consensus_iupac'] > 5:
+    qc_flags.append("EXCESS_AMBIGUITY")
+
+# Calculate number of variants per week, while accounting for incompleteness
+if qc_line['num_weeks'] != 'NA':
+
+    if qc_line['genome_completeness'] > 0.1:
+        scaled_variants = qc_line['num_variants_snvs'] / qc_line['genome_completeness']
+
+        # very conservative upper limit on the number of acceptable variants
+        # samples that fail this check should be manually reviewed incorporating other
+        # evidence (frameshift indels, not failed outright)
+        variant_threshold = qc_line['num_weeks'] * 0.75 + 15
+        if scaled_variants > variant_threshold:
+            qc_flags.append("EXCESS_VARIANTS")
+        qc_line['scaled_variants_snvs'] = "%.2f" % (scaled_variants)
+    else:
+        qc_line['scaled_variants_snvs'] = "NA"
+else:
+    qc_line['scaled_variants_snvs'] = "NA"
+
+qc_flag_str = "PASS"
+if len(qc_flags) > 0:
+    qc_flag_str = ",".join(qc_flags)
+
+qc_line.update({'qc_pass' : qc_flag_str})
+qc_line.update({'run_name' : args.run_name})
+
+qc.write_qc_summary_header()
+qc.write_qc_summary(summary=qc_line)
diff --git a/bin/primers_to_amplicons.py b/bin/primers_to_amplicons.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+'''
+Convert the nCoV primer scheme to a unique amplicon BED file.
+'''
+
+import sys
+import argparse
+import ncov.parser.primers as pr
+
+parser = argparse.ArgumentParser(description='Create amplicon BED file')
+parser.add_argument('-p', '--primers', help='Primer scheme in BED format')
+parser.add_argument('--offset', default=0, help='Primer offset for coordinates')
+parser.add_argument('-o', '--output', default='out.bed',
+                    help='filename to write BED to')
+parser.add_argument('--bed_type', default='unique_amplicons',
+                    help='type of BED to produce (e.g. full, no_primers, unique-amplicons')
+if len(sys.argv) <= 1:
+    parser.print_help(sys.stderr)
+    sys.exit('Invalid number of arguments')
+args = parser.parse_args()
+
+primers = pr.read_bed_file(args.primers)
+primer_pairs = pr.create_primer_pairs(primers=primers)
+amplicon_ranges = pr.create_amplicons(primer_pairs=primer_pairs,
+                                      offset=args.offset,
+                                      type=args.bed_type)
+
+with open(args.output, 'w') as file_o:
+    for line in amplicon_ranges:
+        file_o.write('\t'.join(line))
+        file_o.write('\n')
+file_o.close()
diff --git a/data/alleles.tsv b/data/alleles.tsv
@@ -0,0 +1,15 @@
+name	pos	ref_allele	alt_allele
+sampleA	241	C	T
+sampleA	1583	G	A
+sampleA	13335	C	T
+sampleA	23403	A	G
+sampleA	29183	A	G
+sampleB	19	C	N
+sampleB	241	C	T
+sampleB	2416	C	T
+sampleB	3037	C	T
+sampleB	13396	A	G
+sampleB	14408	C	T
+sampleB	21302	C	N
+sampleB	28377	C	T
+sampleB	29737	G	S
diff --git a/data/metadata.tsv b/data/metadata.tsv
@@ -0,0 +1,4 @@
+sample	ct	date
+sampleA	17.4	2020-03-02
+sampleB	18.4	2020-05-03
+sampleC	18.4	1905-04-01
diff --git a/data/sampleA.pass.vcf b/data/sampleA.pass.vcf
@@ -0,0 +1,23 @@
+##fileformat=VCFv4.2
+##nanopolish_window=MN908947.3:1-29902
+##INFO=<ID=TotalReads,Number=1,Type=Integer,Description="The number of event-space reads used to call the variant">
+##INFO=<ID=SupportFraction,Number=1,Type=Float,Description="The fraction of event-space reads that support the variant">
+##INFO=<ID=SupportFractionByStrand,Number=2,Type=Float,Description="Fraction of event-space reads that support the variant for each strand">
+##INFO=<ID=BaseCalledReadsWithVariant,Number=1,Type=Integer,Description="The number of base-space reads that support the variant">
+##INFO=<ID=BaseCalledFraction,Number=1,Type=Float,Description="The fraction of base-space reads that support the variant">
+##INFO=<ID=AlleleCount,Number=1,Type=Integer,Description="The inferred number of copies of the allele">
+##INFO=<ID=StrandSupport,Number=4,Type=Integer,Description="Number of reads supporting the REF and ALT allele, by strand">
+##INFO=<ID=StrandFisherTest,Number=1,Type=Integer,Description="Strand bias fisher test">
+##INFO=<ID=SOR,Number=1,Type=Float,Description="StrandOddsRatio test from GATK">
+##INFO=<ID=RefContext,Number=1,Type=String,Description="The reference sequence context surrounding the variant call">
+##INFO=<ID=Pool,Number=1,Type=String,Description="The pool name">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+MN908947.3	4	.	C	T	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	14	.	C	AT	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	31	.	C	T	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	41	.	C	CAT	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	51	.	C	T	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	62	.	GATC	T	3287.1	PASS	TotalReads=398;SupportFraction=0.957216;SupportFractionByStrand=0.983424,0.931271;BaseCalledReadsWithVariant=371;BaseCalledFraction=0.9275;AlleleCount=1;StrandSupport=3,14,195,186;StrandFisherTest=17;SOR=0.162774;RefContext=GGTTTCGTCCG;Pool=nCoV-2019_1	GT	1
+MN908947.3	104	.	C	T	3514.9	PASS	TotalReads=398;SupportFraction=0.966071;SupportFractionByStrand=0.974862,0.95728;BaseCalledReadsWithVariant=390;BaseCalledFraction=0.975;AlleleCount=1;StrandSupport=5,9,194,190;StrandFisherTest=2;SOR=0.36185;RefContext=TGACACCTTCA;Pool=nCoV-2019_2	GT	1
+MN908947.3	120	.	G	A	3239.3	PASS	TotalReads=397;SupportFraction=0.961074;SupportFractionByStrand=0.947821,0.974394;BaseCalledReadsWithVariant=380;BaseCalledFraction=0.952381;AlleleCount=1;StrandSupport=10,5,189,193;StrandFisherTest=5;SOR=0.285429;RefContext=AGAACGCAGTG;Pool=nCoV-2019_1	GT	1
diff --git a/data/sampleA.pass.vcf.gz b/data/sampleA.pass.vcf.gz
diff --git a/data/sampleA.per_base_coverage.bed b/data/sampleA.per_base_coverage.bed
@@ -0,0 +1,8 @@
+reference_name	start	end	amplicon_id	pool	strand	position	depth
+ref	54	385	1	amp_1	+	1	636
+ref	54	385	1	amp_1	+	2	658
+ref	54	385	1	amp_1	+	3	672
+ref	54	385	1	amp_1	+	4	682
+ref	54	385	1	amp_1	+	5	688
+ref	54	385	1	amp_1	+	6	691
+ref	54	385	1	amp_1	+	7	729
diff --git a/data/sampleA.qc.csv b/data/sampleA.qc.csv
@@ -0,0 +1,2 @@
+sample_name,pct_N_bases,pct_covered_bases,longest_no_N_run,fasta,bam,qc_pass
+sampleA,31.29,68.01,2436,sampleA.consensus.fa,sampleA.mapped.primertrimmed.sorted.bam,FALSE
diff --git a/data/sampleA.summary.qc.tsv b/data/sampleA.summary.qc.tsv
@@ -0,0 +1,2 @@
+sample_name	pct_n_bases	pct_covered_bases	total_variants	total_snv	total_indel	total_n	total_iupac	mean_depth	median_depth	ct	qc_pass
+sampleA	31.29	68.01	10	9	1	2	4	679.4285714285714	682	17.4	FALSE
diff --git a/data/sampleA.variants.tsv b/data/sampleA.variants.tsv
@@ -0,0 +1,11 @@
+REGION	POS	REF	ALT	REF_DP	REF_RV	REF_QUAL	ALT_DP	ALT_RV	ALT_QUAL	ALT_FREQ	TOTAL_DP	PVAL	PASS	GFF_FEATURE	REF_CODON	REF_AA	ALT_CODON	ALT_AA
+region1	366	C	T	202	53	54	233	65	57	0.535632	435	1.5367e-117	TRUE	NA	NA	NA	NA	NA
+region1	403	T	A	320	99	54	376	104	58	0.54023	696	7.76958e-186	TRUE	NA	NA	NA	NA	NA
+region2	441	G	T	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region2	431	G	+TTG	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region2	441	G	W	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region3	441	G	R	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region3	441	G	D	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region3	441	G	R	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region3	441	G	N	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
+region3	441	G	N	630	142	61	238	71	61	0.274194	868	1.89053e-115	TRUE	NA	NA	NA	NA	NA
diff --git a/data/sampleB.summary.qc.tsv b/data/sampleB.summary.qc.tsv
@@ -0,0 +1,2 @@
+sample_name	pct_n_bases	pct_covered_bases	total_variants	total_snv	total_indel	total_n	total_iupac	mean_depth	median_depth	ct	qc_pass
+sampleB	31.29	68.01	10	9	1	2	4	679.4285714285714	682	17.4	FALSE
diff --git a/data/sampleC.summary.qc.tsv b/data/sampleC.summary.qc.tsv
@@ -0,0 +1,2 @@
+sample_name	pct_n_bases	pct_covered_bases	total_variants	total_snv	total_indel	total_n	total_iupac	mean_depth	median_depth	ct	qc_pass
+sampleC	31.29	68.01	10	9	1	2	4	679.4285714285714	682	17.4	FALSE
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sample_name,pct_N_bases,pct_covered_bases,longest_no_N_run,fasta,bam,qc_pass
		sampleA,31.29,68.01,2436,sampleA.consensus.fa,sampleA.mapped.primertrimmed.sorted.bam,FALSE
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sample_name pct_n_bases pct_covered_bases total_variants total_snv total_indel total_n total_iupac mean_depth median_depth ct qc_pass
		sampleA 31.29 68.01 10 9 1 2 4 679.4285714285714 682 17.4 FALSE
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sample_name pct_n_bases pct_covered_bases total_variants total_snv total_indel total_n total_iupac mean_depth median_depth ct qc_pass
		sampleB 31.29 68.01 10 9 1 2 4 679.4285714285714 682 17.4 FALSE
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sample_name pct_n_bases pct_covered_bases total_variants total_snv total_indel total_n total_iupac mean_depth median_depth ct qc_pass
		sampleC 31.29 68.01 10 9 1 2 4 679.4285714285714 682 17.4 FALSE