Merge pull request #4 from simpsonlab/add_pangolin_lineage

Add pangolin lineage
simpsonlab · Jan 25, 2021 · 63142a9 · 63142a9
2 parents ba88807 + 65ca008
commit 63142a9
Show file tree

Hide file tree

Showing 16 changed files with 282 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -37,8 +37,10 @@ be processed.
 ```
 ncov.parser.Alleles
 ncov.parser.Consensus
+ncov.parser.Lineage
 ncov.parser.Meta
 ncov.parser.PerBaseCoverage
+ncov.parser.Snpeff
 ncov.parser.Variants
 ncov.parser.Vcf
 ncov.parser.primers
@@ -62,7 +64,7 @@ get_qc.py --variants <sample>.variants.tsv or <sample>.pass.vcf
 --coverage <sample>.per_base_coverage.bed --meta <metadata>.tsv
 --consensus <sample>.primertrimmed.consensus.fa [--indel] --sample <samplename>
 --platform <illumina or oxford-nanopore> --run_name <run_name> --alleles alleles.tsv
---indel
+--indel --lineage <Pangolin lineage report> --aa_table <SNPEff annotation table>
 ```
 
 Note the `--indel` flag should only be present if indels will be used in the

diff --git a/bin/get_qc.py b/bin/get_qc.py
@@ -29,6 +29,12 @@
                     help='sequencing platform used')
 parser.add_argument('-r', '--run_name',
                     help='run name for sample')
+parser.add_argument('-l', '--lineage',
+                    help='full path to the Pangolin lineage report')
+parser.add_argument('-t', '--aa_table',
+                    help='full path to the <sample>_aa_table.tsv file')
+parser.add_argument('-u', '--mutations',
+                    help='full path to the <run>_ncov_watch_variants.tsv file')
 if len(sys.argv) == 1:
     parser.print_help(sys.stderr)
     sys.exit('Invalid number of arguments')
@@ -69,15 +75,41 @@
 coverage = ncov.parser.PerBaseCoverage(file=args.coverage)
 qc_line.update(coverage.get_coverage_stats())
 
+# Add the lineage from the Pangolin report
+try:
+    lineage = ncov.parser.Lineage(file=args.lineage)
+    lineage.create_lineage_dictionary()
+    qc_line.update({"lineage" : lineage.lineage_dict[args.sample]})
+except:
+    qc_line.update({"lineage" : "none"})
+
+# Add the watch list mutations
+try:
+    watchlist = ncov.parser.WatchList(file=args.mutations)
+    qc_line.update({"mutations" : watchlist.get_mutation_string(sample=args.sample)})
+except:
+    qc_line.update({"mutations" : "none"})
+
+# Get a list of consequences from the SNPEff variant annotations
+frameshift_indels = False
+try:
+    annotations = ncov.parser.Snpeff(file=args.aa_table)
+    annotations.get_list_of_consequences()
+    if annotations.has_frameshift():
+        frameshift_indels = True
+except:
+    pass
+
 # Produce warning flags
 qc_flags = list()
 if qc_line['genome_completeness'] < 0.5:
     qc_flags.append("INCOMPLETE_GENOME")
 elif qc_line['genome_completeness'] < 0.9:
     qc_flags.append("PARTIAL_GENOME")
 
-num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet']
-if num_indel_non_triplet > 0:
+#num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet']
+#if num_indel_non_triplet > 0:
+if frameshift_indels:
     qc_flags.append("POSSIBLE_FRAMESHIFT_INDELS")
 
 if qc_line['num_consensus_iupac'] > 5:

diff --git a/data/sampleA_aa_table.tsv b/data/sampleA_aa_table.tsv
@@ -0,0 +1,7 @@
+sample	chr	pos	ref	alt	Consequence	gene	protein	aa
+sampleA	MN908947.3	241	C	T	upstream_gene_variant	orf1ab		NA
+sampleA	MN908947.3	3037	C	T	synonymous_variant	orf1ab	F924F	orf1ab-F924F
+sampleA	MN908947.3	14408	C	T	missense_variant	orf1ab	P4715L	orf1ab-P4715L
+sampleA	MN908947.3	21625	A	G	synonymous_variant	S	R21R	S-R21R
+sampleA	MN908947.3	28883	G	C	missense_variant	N	G204R	N-G204R
+sampleA	MN908947.3	29580	CT	C	frameshift_variant	ORF10	P10fs	ORF10-P10fs
diff --git a/data/sampleB_aa_table.tsv b/data/sampleB_aa_table.tsv
@@ -0,0 +1 @@
+sample	chr	pos	ref	alt	Consequence	gene	protein	aa
diff --git a/data/sample_lineages.csv b/data/sample_lineages.csv
@@ -0,0 +1,4 @@
+taxon,lineage,probability,pangoLEARN_version,status,note
+sampleA/ARTIC/nanopolish,B.1.1.43,1.0,2021-01-06,passed_qc,
+sampleB/ARTIC/nanopolish,B.1.36,1.0,2021-01-06,passed_qc,
+sampleC/ARTIC/nanopolish,B.1.1.7,1.0,2021-01-06,passed_qc,
diff --git a/data/testrun_ncov_watch_variants.tsv b/data/testrun_ncov_watch_variants.tsv
@@ -0,0 +1,15 @@
+sample	mutation	contig	position	reference	alt
+sampleA.variants.tsv	S:del69-70	MN908947.3	21764	ATACATG	A
+sampleA.variants.tsv	S:del144	MN908947.3	21990	TTTA	T
+sampleA.variants.tsv	S:A570D	MN908947.3	23271	C	A
+sampleA.variants.tsv	S:P681H	MN908947.3	23604	C	A
+sampleA.variants.tsv	S:T716I	MN908947.3	23709	C	T
+sampleA.variants.tsv	S:S982A	MN908947.3	24506	T	G
+sampleB.variants.tsv	S:del69-70	MN908947.3	21764	ATACATG	A
+sampleB.variants.tsv	S:del144	MN908947.3	21990	TTTA	T
+sampleB.variants.tsv	S:N501Y	MN908947.3	23063	A	T
+sampleB.variants.tsv	S:A570D	MN908947.3	23271	C	A
+sampleB.variants.tsv	S:P681H	MN908947.3	23604	C	A
+sampleB.variants.tsv	S:T716I	MN908947.3	23709	C	T
+sampleB.variants.tsv	S:S982A	MN908947.3	24506	T	G
+sampleB.variants.tsv	S:D1118H	MN908947.3	24914	G	C
diff --git a/ncov/parser/Lineage.py b/ncov/parser/Lineage.py
@@ -0,0 +1,50 @@
+import csv
+import re
+
+class Lineage():
+    """
+    A class for handling Pangolin lineage reports.
+    """
+
+    def __init__(self, file, delimiter=','):
+        self.file = file
+        self.delimiter = delimiter
+
+
+    def get_sample_name(self, row):
+        """
+        The sample name is obtained from the 'taxon' field.  Note that ONT
+        consensus FASTA files are generated as "<sample>/ARTIC/nanopolish"
+        whereas Illumina runs (iVar) use "Consensus_<sample>"
+        """
+        sample_name = row['taxon']
+        sample_name = re.sub('^Consensus_', '', sample_name) # added by ivar
+        sample_name = re.sub('.primertrimmed.consensus_threshold_0.75_quality_20', '', sample_name) # added by ivar
+        sample_name = re.sub('_MN908947.3', '', sample_name) # added by pangolin
+        sample_name = re.sub('/ARTIC/nanopolish', '', sample_name) # added by ARTIC
+        sample_name = re.sub('/ARTIC/medaka', '', sample_name) # added by ARTIC
+        return sample_name
+
+
+    def get_lineage(self, row):
+        """
+        Return the lineage value
+        """
+        return row['lineage']
+
+
+    def create_lineage_dictionary(self):
+        """
+        Create a dictionary containing sample names as key and their lineage as
+        the value
+        """
+        lineage_dict = dict()
+        with open(self.file, 'r') as fh:
+            reader = csv.DictReader(fh, delimiter=self.delimiter)
+            for row in reader:
+                sample = self.get_sample_name(row=row)
+                lineage = self.get_lineage(row=row)
+                lineage_dict[sample] = lineage
+        self.lineage_dict = lineage_dict
+        return lineage_dict
+
diff --git a/ncov/parser/Meta.py b/ncov/parser/Meta.py
@@ -64,7 +64,7 @@ def import_metadata(self, sample_id='sample', ct_id='ct', date_id='date'):
                 self.data = data
                 return data
         except:
-            print("Invalid metadata file")
+            pass
 
 
     def get_meta_for_sample(self, sample):

diff --git a/ncov/parser/Snpeff.py b/ncov/parser/Snpeff.py
@@ -0,0 +1,46 @@
+"""
+
+"""
+
+import os
+import sys
+import csv
+import re
+
+class Snpeff():
+    """
+    A class for processing annotated variants from SNPEff.
+    """
+
+    def __init__(self, file, delimiter="\t"):
+        """
+        Initilize the object
+        """
+        annotations = list()
+        try:
+            with open(file, 'r') as fh:
+                reader = csv.DictReader(fh, delimiter=delimiter)
+                for row in reader:
+                    annotations.append(row)
+            self.annotations = annotations
+        except:
+            pass
+
+
+    def get_list_of_consequences(self):
+        """
+
+        """
+        consequences = list()
+        for var in self.annotations:
+            consequences.append(var['Consequence'])
+        self.consequences = consequences
+
+
+    def has_frameshift(self):
+        """
+        Determine whether the annotated variant is frameshift
+        """
+        return 'frameshift_variant' in self.consequences
+
+
diff --git a/ncov/parser/WatchList.py b/ncov/parser/WatchList.py
@@ -0,0 +1,55 @@
+"""
+WatchList module
+"""
+
+import csv
+import os
+import sys
+import re
+
+
+class WatchList():
+    """
+    A class for handling the watch list report generated in the ncov-tools
+    pipeline.
+    """
+
+    watch_list = dict()
+
+    def __init__(self, file, delimiter='\t'):
+        """
+        Initialize the WatchList object and construct the watch_list dictionary
+        attribute from the provided file.
+        """
+        watch = dict()
+        samplename = str()
+        with open(file, 'r') as fh:
+            reader = csv.DictReader(fh, delimiter=delimiter)
+            for row in reader:
+                if row['sample'].endswith('.variants.tsv'):
+                    samplename = re.sub('.variants.tsv', '', row['sample'])
+                elif row['sample'].endswith('.pass.vcf.gz'):
+                    samplename = re.sub('.pass.vcf.gz', '', row['sample'])
+                if samplename in watch:
+                    watch[samplename].append(row)
+                else:
+                    watch[samplename] = [row]
+        self.watch_list = watch
+
+
+    def get_mutation_string(self, sample, delimiter=','):
+        """
+        Returns a comma separated string of the watch list mutations.
+        """
+        mutations = list()
+        for samplename in self.watch_list:
+            if sample == samplename:
+                for item in self.watch_list[samplename]:
+                    mutations.append(item['mutation'])
+            else:
+                continue
+        if len(mutations) == 0:
+            return 'none'
+        else:
+            return delimiter.join(mutations)
+
diff --git a/ncov/parser/__init__.py b/ncov/parser/__init__.py
@@ -7,3 +7,7 @@
 from .Vcf import *
 from .NegativeControl import *
 from .primers import *
+from .Lineage import *
+from .Snpeff import *
+from .WatchList import *
+
diff --git a/ncov/parser/qc.py b/ncov/parser/qc.py
@@ -26,6 +26,8 @@ def write_qc_summary(summary):
         * scaled_variants_snvs
         * genome_completeness
         * qc_pass
+        * lineage
+        * mutations
 
     Arguments:
         * summary: a dictionary containing the sample QC details
@@ -49,7 +51,9 @@ def write_qc_summary(summary):
         str(summary['num_weeks']),
         str(summary['scaled_variants_snvs']),
         str(summary['genome_completeness']),
-        str(summary['qc_pass'])])
+        str(summary['qc_pass']),
+        str(summary['lineage']),
+        str(summary['mutations'])])
     print(summary_line)
 
 
@@ -68,7 +72,9 @@ def write_qc_summary_header(header=['sample',
                                     'num_weeks',
                                     'scaled_variants_snvs',
                                     'genome_completeness',
-                                    'qc_pass']):
+                                    'qc_pass',
+                                    'lineage',
+                                    'watch_mutations']):
     '''
     Write the header for the QC summary data
 

diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setuptools.setup(
     name="ncov_parser",
-    version="0.6.2",
+    version="0.6.3",
     author="Richard J. de Borja",
     author_email="[email protected]",
     description="A nCoV package for parsing analysis files",

diff --git a/tests/test_Lineage.py b/tests/test_Lineage.py
@@ -0,0 +1,25 @@
+'''
+Suite of tests for the Consensus module
+'''
+
+import unittest
+import ncov.parser
+
+test_lineage = ncov.parser.Lineage(file='data/sample_lineages.csv')
+
+
+class LineageTest(unittest.TestCase):
+    def test_create_lineage_dictionary(self):
+        lineage_dict = test_lineage.create_lineage_dictionary()
+        self.assertEqual(lineage_dict['sampleA'], 'B.1.1.43')
+        self.assertEqual(lineage_dict['sampleB'], 'B.1.36')
+        self.assertEqual(lineage_dict['sampleC'], 'B.1.1.7')
+    def test_get_sample_name(self):
+        sample_row = {'taxon' : 'sampleA/ARTIC/nanopolish',
+                      'lineage' : 'B.1.1.43',
+                      'probability' : 1.0,
+                      'pangoLEARN_version' : '2021-01-06',
+                      'passed_qc' : 'passed_qc'}
+        expected_sample_name = 'sampleA'
+        sample_name = test_lineage.get_sample_name(row=sample_row)
+        self.assertEqual(sample_name, expected_sample_name)
diff --git a/tests/test_Snpeff.py b/tests/test_Snpeff.py
@@ -0,0 +1,14 @@
+"""
+Suite of tests for the Snpeff module
+"""
+
+import unittest
+import ncov.parser
+
+test_snpeff = ncov.parser.Snpeff(file='data/sampleA_aa_table.tsv')
+test_snpeff.get_list_of_consequences()
+
+class SnpeffTest(unittest.TestCase):
+    def test_has_frameshift(self):
+        self.assertTrue(test_snpeff.has_frameshift())
+
diff --git a/tests/test_WatchList.py b/tests/test_WatchList.py
@@ -0,0 +1,14 @@
+"""
+Suite of tests for the WatchList module
+"""
+
+import unittest
+import ncov.parser
+
+test_watch = ncov.parser.WatchList(file='data/testrun_ncov_watch_variants.tsv')
+expected_mutation_string = 'S:del69-70,S:del144,S:A570D,S:P681H,S:T716I,S:S982A'
+
+class WatchListTest(unittest.TestCase):
+    def test_get_mutation_string(self):
+        self.assertEqual(test_watch.get_mutation_string(sample='sampleA'),
+                         expected_mutation_string)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		sample chr pos ref alt Consequence gene protein aa