diff --git a/README.md b/README.md index 3fdd5e0..ec638d2 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,10 @@ be processed. ``` ncov.parser.Alleles ncov.parser.Consensus +ncov.parser.Lineage ncov.parser.Meta ncov.parser.PerBaseCoverage +ncov.parser.Snpeff ncov.parser.Variants ncov.parser.Vcf ncov.parser.primers @@ -62,7 +64,7 @@ get_qc.py --variants .variants.tsv or .pass.vcf --coverage .per_base_coverage.bed --meta .tsv --consensus .primertrimmed.consensus.fa [--indel] --sample --platform --run_name --alleles alleles.tsv ---indel +--indel --lineage --aa_table ``` Note the `--indel` flag should only be present if indels will be used in the diff --git a/bin/get_qc.py b/bin/get_qc.py index 44fb97b..f12f6f4 100644 --- a/bin/get_qc.py +++ b/bin/get_qc.py @@ -29,6 +29,12 @@ help='sequencing platform used') parser.add_argument('-r', '--run_name', help='run name for sample') +parser.add_argument('-l', '--lineage', + help='full path to the Pangolin lineage report') +parser.add_argument('-t', '--aa_table', + help='full path to the _aa_table.tsv file') +parser.add_argument('-u', '--mutations', + help='full path to the _ncov_watch_variants.tsv file') if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit('Invalid number of arguments') @@ -69,6 +75,31 @@ coverage = ncov.parser.PerBaseCoverage(file=args.coverage) qc_line.update(coverage.get_coverage_stats()) +# Add the lineage from the Pangolin report +try: + lineage = ncov.parser.Lineage(file=args.lineage) + lineage.create_lineage_dictionary() + qc_line.update({"lineage" : lineage.lineage_dict[args.sample]}) +except: + qc_line.update({"lineage" : "none"}) + +# Add the watch list mutations +try: + watchlist = ncov.parser.WatchList(file=args.mutations) + qc_line.update({"mutations" : watchlist.get_mutation_string(sample=args.sample)}) +except: + qc_line.update({"mutations" : "none"}) + +# Get a list of consequences from the SNPEff variant annotations +frameshift_indels = False +try: + annotations = ncov.parser.Snpeff(file=args.aa_table) + annotations.get_list_of_consequences() + if annotations.has_frameshift(): + frameshift_indels = True +except: + pass + # Produce warning flags qc_flags = list() if qc_line['genome_completeness'] < 0.5: @@ -76,8 +107,9 @@ elif qc_line['genome_completeness'] < 0.9: qc_flags.append("PARTIAL_GENOME") -num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet'] -if num_indel_non_triplet > 0: +#num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet'] +#if num_indel_non_triplet > 0: +if frameshift_indels: qc_flags.append("POSSIBLE_FRAMESHIFT_INDELS") if qc_line['num_consensus_iupac'] > 5: diff --git a/data/sampleA_aa_table.tsv b/data/sampleA_aa_table.tsv new file mode 100644 index 0000000..1e9671c --- /dev/null +++ b/data/sampleA_aa_table.tsv @@ -0,0 +1,7 @@ +sample chr pos ref alt Consequence gene protein aa +sampleA MN908947.3 241 C T upstream_gene_variant orf1ab NA +sampleA MN908947.3 3037 C T synonymous_variant orf1ab F924F orf1ab-F924F +sampleA MN908947.3 14408 C T missense_variant orf1ab P4715L orf1ab-P4715L +sampleA MN908947.3 21625 A G synonymous_variant S R21R S-R21R +sampleA MN908947.3 28883 G C missense_variant N G204R N-G204R +sampleA MN908947.3 29580 CT C frameshift_variant ORF10 P10fs ORF10-P10fs diff --git a/data/sampleB_aa_table.tsv b/data/sampleB_aa_table.tsv new file mode 100644 index 0000000..cdae380 --- /dev/null +++ b/data/sampleB_aa_table.tsv @@ -0,0 +1 @@ +sample chr pos ref alt Consequence gene protein aa diff --git a/data/sample_lineages.csv b/data/sample_lineages.csv new file mode 100644 index 0000000..2fe4f10 --- /dev/null +++ b/data/sample_lineages.csv @@ -0,0 +1,4 @@ +taxon,lineage,probability,pangoLEARN_version,status,note +sampleA/ARTIC/nanopolish,B.1.1.43,1.0,2021-01-06,passed_qc, +sampleB/ARTIC/nanopolish,B.1.36,1.0,2021-01-06,passed_qc, +sampleC/ARTIC/nanopolish,B.1.1.7,1.0,2021-01-06,passed_qc, diff --git a/data/testrun_ncov_watch_variants.tsv b/data/testrun_ncov_watch_variants.tsv new file mode 100644 index 0000000..efb1dd5 --- /dev/null +++ b/data/testrun_ncov_watch_variants.tsv @@ -0,0 +1,15 @@ +sample mutation contig position reference alt +sampleA.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A +sampleA.variants.tsv S:del144 MN908947.3 21990 TTTA T +sampleA.variants.tsv S:A570D MN908947.3 23271 C A +sampleA.variants.tsv S:P681H MN908947.3 23604 C A +sampleA.variants.tsv S:T716I MN908947.3 23709 C T +sampleA.variants.tsv S:S982A MN908947.3 24506 T G +sampleB.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A +sampleB.variants.tsv S:del144 MN908947.3 21990 TTTA T +sampleB.variants.tsv S:N501Y MN908947.3 23063 A T +sampleB.variants.tsv S:A570D MN908947.3 23271 C A +sampleB.variants.tsv S:P681H MN908947.3 23604 C A +sampleB.variants.tsv S:T716I MN908947.3 23709 C T +sampleB.variants.tsv S:S982A MN908947.3 24506 T G +sampleB.variants.tsv S:D1118H MN908947.3 24914 G C diff --git a/ncov/parser/Lineage.py b/ncov/parser/Lineage.py new file mode 100644 index 0000000..0740626 --- /dev/null +++ b/ncov/parser/Lineage.py @@ -0,0 +1,50 @@ +import csv +import re + +class Lineage(): + """ + A class for handling Pangolin lineage reports. + """ + + def __init__(self, file, delimiter=','): + self.file = file + self.delimiter = delimiter + + + def get_sample_name(self, row): + """ + The sample name is obtained from the 'taxon' field. Note that ONT + consensus FASTA files are generated as "/ARTIC/nanopolish" + whereas Illumina runs (iVar) use "Consensus_" + """ + sample_name = row['taxon'] + sample_name = re.sub('^Consensus_', '', sample_name) # added by ivar + sample_name = re.sub('.primertrimmed.consensus_threshold_0.75_quality_20', '', sample_name) # added by ivar + sample_name = re.sub('_MN908947.3', '', sample_name) # added by pangolin + sample_name = re.sub('/ARTIC/nanopolish', '', sample_name) # added by ARTIC + sample_name = re.sub('/ARTIC/medaka', '', sample_name) # added by ARTIC + return sample_name + + + def get_lineage(self, row): + """ + Return the lineage value + """ + return row['lineage'] + + + def create_lineage_dictionary(self): + """ + Create a dictionary containing sample names as key and their lineage as + the value + """ + lineage_dict = dict() + with open(self.file, 'r') as fh: + reader = csv.DictReader(fh, delimiter=self.delimiter) + for row in reader: + sample = self.get_sample_name(row=row) + lineage = self.get_lineage(row=row) + lineage_dict[sample] = lineage + self.lineage_dict = lineage_dict + return lineage_dict + diff --git a/ncov/parser/Meta.py b/ncov/parser/Meta.py index 4682832..f4d920a 100644 --- a/ncov/parser/Meta.py +++ b/ncov/parser/Meta.py @@ -64,7 +64,7 @@ def import_metadata(self, sample_id='sample', ct_id='ct', date_id='date'): self.data = data return data except: - print("Invalid metadata file") + pass def get_meta_for_sample(self, sample): diff --git a/ncov/parser/Snpeff.py b/ncov/parser/Snpeff.py new file mode 100644 index 0000000..1040813 --- /dev/null +++ b/ncov/parser/Snpeff.py @@ -0,0 +1,46 @@ +""" + +""" + +import os +import sys +import csv +import re + +class Snpeff(): + """ + A class for processing annotated variants from SNPEff. + """ + + def __init__(self, file, delimiter="\t"): + """ + Initilize the object + """ + annotations = list() + try: + with open(file, 'r') as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + for row in reader: + annotations.append(row) + self.annotations = annotations + except: + pass + + + def get_list_of_consequences(self): + """ + + """ + consequences = list() + for var in self.annotations: + consequences.append(var['Consequence']) + self.consequences = consequences + + + def has_frameshift(self): + """ + Determine whether the annotated variant is frameshift + """ + return 'frameshift_variant' in self.consequences + + diff --git a/ncov/parser/WatchList.py b/ncov/parser/WatchList.py new file mode 100644 index 0000000..391b4ed --- /dev/null +++ b/ncov/parser/WatchList.py @@ -0,0 +1,55 @@ +""" +WatchList module +""" + +import csv +import os +import sys +import re + + +class WatchList(): + """ + A class for handling the watch list report generated in the ncov-tools + pipeline. + """ + + watch_list = dict() + + def __init__(self, file, delimiter='\t'): + """ + Initialize the WatchList object and construct the watch_list dictionary + attribute from the provided file. + """ + watch = dict() + samplename = str() + with open(file, 'r') as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + for row in reader: + if row['sample'].endswith('.variants.tsv'): + samplename = re.sub('.variants.tsv', '', row['sample']) + elif row['sample'].endswith('.pass.vcf.gz'): + samplename = re.sub('.pass.vcf.gz', '', row['sample']) + if samplename in watch: + watch[samplename].append(row) + else: + watch[samplename] = [row] + self.watch_list = watch + + + def get_mutation_string(self, sample, delimiter=','): + """ + Returns a comma separated string of the watch list mutations. + """ + mutations = list() + for samplename in self.watch_list: + if sample == samplename: + for item in self.watch_list[samplename]: + mutations.append(item['mutation']) + else: + continue + if len(mutations) == 0: + return 'none' + else: + return delimiter.join(mutations) + diff --git a/ncov/parser/__init__.py b/ncov/parser/__init__.py index 03f58fa..4abb2b5 100644 --- a/ncov/parser/__init__.py +++ b/ncov/parser/__init__.py @@ -7,3 +7,7 @@ from .Vcf import * from .NegativeControl import * from .primers import * +from .Lineage import * +from .Snpeff import * +from .WatchList import * + diff --git a/ncov/parser/qc.py b/ncov/parser/qc.py index a473e03..38a3523 100644 --- a/ncov/parser/qc.py +++ b/ncov/parser/qc.py @@ -26,6 +26,8 @@ def write_qc_summary(summary): * scaled_variants_snvs * genome_completeness * qc_pass + * lineage + * mutations Arguments: * summary: a dictionary containing the sample QC details @@ -49,7 +51,9 @@ def write_qc_summary(summary): str(summary['num_weeks']), str(summary['scaled_variants_snvs']), str(summary['genome_completeness']), - str(summary['qc_pass'])]) + str(summary['qc_pass']), + str(summary['lineage']), + str(summary['mutations'])]) print(summary_line) @@ -68,7 +72,9 @@ def write_qc_summary_header(header=['sample', 'num_weeks', 'scaled_variants_snvs', 'genome_completeness', - 'qc_pass']): + 'qc_pass', + 'lineage', + 'watch_mutations']): ''' Write the header for the QC summary data diff --git a/setup.py b/setup.py index 187e666..2a9933e 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( name="ncov_parser", - version="0.6.2", + version="0.6.3", author="Richard J. de Borja", author_email="richard.deborja@oicr.on.ca", description="A nCoV package for parsing analysis files", diff --git a/tests/test_Lineage.py b/tests/test_Lineage.py new file mode 100644 index 0000000..13f63bf --- /dev/null +++ b/tests/test_Lineage.py @@ -0,0 +1,25 @@ +''' +Suite of tests for the Consensus module +''' + +import unittest +import ncov.parser + +test_lineage = ncov.parser.Lineage(file='data/sample_lineages.csv') + + +class LineageTest(unittest.TestCase): + def test_create_lineage_dictionary(self): + lineage_dict = test_lineage.create_lineage_dictionary() + self.assertEqual(lineage_dict['sampleA'], 'B.1.1.43') + self.assertEqual(lineage_dict['sampleB'], 'B.1.36') + self.assertEqual(lineage_dict['sampleC'], 'B.1.1.7') + def test_get_sample_name(self): + sample_row = {'taxon' : 'sampleA/ARTIC/nanopolish', + 'lineage' : 'B.1.1.43', + 'probability' : 1.0, + 'pangoLEARN_version' : '2021-01-06', + 'passed_qc' : 'passed_qc'} + expected_sample_name = 'sampleA' + sample_name = test_lineage.get_sample_name(row=sample_row) + self.assertEqual(sample_name, expected_sample_name) diff --git a/tests/test_Snpeff.py b/tests/test_Snpeff.py new file mode 100644 index 0000000..0ffb593 --- /dev/null +++ b/tests/test_Snpeff.py @@ -0,0 +1,14 @@ +""" +Suite of tests for the Snpeff module +""" + +import unittest +import ncov.parser + +test_snpeff = ncov.parser.Snpeff(file='data/sampleA_aa_table.tsv') +test_snpeff.get_list_of_consequences() + +class SnpeffTest(unittest.TestCase): + def test_has_frameshift(self): + self.assertTrue(test_snpeff.has_frameshift()) + diff --git a/tests/test_WatchList.py b/tests/test_WatchList.py new file mode 100644 index 0000000..498df76 --- /dev/null +++ b/tests/test_WatchList.py @@ -0,0 +1,14 @@ +""" +Suite of tests for the WatchList module +""" + +import unittest +import ncov.parser + +test_watch = ncov.parser.WatchList(file='data/testrun_ncov_watch_variants.tsv') +expected_mutation_string = 'S:del69-70,S:del144,S:A570D,S:P681H,S:T716I,S:S982A' + +class WatchListTest(unittest.TestCase): + def test_get_mutation_string(self): + self.assertEqual(test_watch.get_mutation_string(sample='sampleA'), + expected_mutation_string)