Skip to content

Commit

Permalink
Merge pull request #4 from simpsonlab/add_pangolin_lineage
Browse files Browse the repository at this point in the history
Add pangolin lineage
  • Loading branch information
rdeborja authored Jan 25, 2021
2 parents ba88807 + 65ca008 commit 63142a9
Show file tree
Hide file tree
Showing 16 changed files with 282 additions and 7 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ be processed.
```
ncov.parser.Alleles
ncov.parser.Consensus
ncov.parser.Lineage
ncov.parser.Meta
ncov.parser.PerBaseCoverage
ncov.parser.Snpeff
ncov.parser.Variants
ncov.parser.Vcf
ncov.parser.primers
Expand All @@ -62,7 +64,7 @@ get_qc.py --variants <sample>.variants.tsv or <sample>.pass.vcf
--coverage <sample>.per_base_coverage.bed --meta <metadata>.tsv
--consensus <sample>.primertrimmed.consensus.fa [--indel] --sample <samplename>
--platform <illumina or oxford-nanopore> --run_name <run_name> --alleles alleles.tsv
--indel
--indel --lineage <Pangolin lineage report> --aa_table <SNPEff annotation table>
```

Note the `--indel` flag should only be present if indels will be used in the
Expand Down
36 changes: 34 additions & 2 deletions bin/get_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@
help='sequencing platform used')
parser.add_argument('-r', '--run_name',
help='run name for sample')
parser.add_argument('-l', '--lineage',
help='full path to the Pangolin lineage report')
parser.add_argument('-t', '--aa_table',
help='full path to the <sample>_aa_table.tsv file')
parser.add_argument('-u', '--mutations',
help='full path to the <run>_ncov_watch_variants.tsv file')
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit('Invalid number of arguments')
Expand Down Expand Up @@ -69,15 +75,41 @@
coverage = ncov.parser.PerBaseCoverage(file=args.coverage)
qc_line.update(coverage.get_coverage_stats())

# Add the lineage from the Pangolin report
try:
lineage = ncov.parser.Lineage(file=args.lineage)
lineage.create_lineage_dictionary()
qc_line.update({"lineage" : lineage.lineage_dict[args.sample]})
except:
qc_line.update({"lineage" : "none"})

# Add the watch list mutations
try:
watchlist = ncov.parser.WatchList(file=args.mutations)
qc_line.update({"mutations" : watchlist.get_mutation_string(sample=args.sample)})
except:
qc_line.update({"mutations" : "none"})

# Get a list of consequences from the SNPEff variant annotations
frameshift_indels = False
try:
annotations = ncov.parser.Snpeff(file=args.aa_table)
annotations.get_list_of_consequences()
if annotations.has_frameshift():
frameshift_indels = True
except:
pass

# Produce warning flags
qc_flags = list()
if qc_line['genome_completeness'] < 0.5:
qc_flags.append("INCOMPLETE_GENOME")
elif qc_line['genome_completeness'] < 0.9:
qc_flags.append("PARTIAL_GENOME")

num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet']
if num_indel_non_triplet > 0:
#num_indel_non_triplet = qc_line['num_variants_indel'] - qc_line['num_variants_indel_triplet']
#if num_indel_non_triplet > 0:
if frameshift_indels:
qc_flags.append("POSSIBLE_FRAMESHIFT_INDELS")

if qc_line['num_consensus_iupac'] > 5:
Expand Down
7 changes: 7 additions & 0 deletions data/sampleA_aa_table.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
sample chr pos ref alt Consequence gene protein aa
sampleA MN908947.3 241 C T upstream_gene_variant orf1ab NA
sampleA MN908947.3 3037 C T synonymous_variant orf1ab F924F orf1ab-F924F
sampleA MN908947.3 14408 C T missense_variant orf1ab P4715L orf1ab-P4715L
sampleA MN908947.3 21625 A G synonymous_variant S R21R S-R21R
sampleA MN908947.3 28883 G C missense_variant N G204R N-G204R
sampleA MN908947.3 29580 CT C frameshift_variant ORF10 P10fs ORF10-P10fs
1 change: 1 addition & 0 deletions data/sampleB_aa_table.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sample chr pos ref alt Consequence gene protein aa
4 changes: 4 additions & 0 deletions data/sample_lineages.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
taxon,lineage,probability,pangoLEARN_version,status,note
sampleA/ARTIC/nanopolish,B.1.1.43,1.0,2021-01-06,passed_qc,
sampleB/ARTIC/nanopolish,B.1.36,1.0,2021-01-06,passed_qc,
sampleC/ARTIC/nanopolish,B.1.1.7,1.0,2021-01-06,passed_qc,
15 changes: 15 additions & 0 deletions data/testrun_ncov_watch_variants.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
sample mutation contig position reference alt
sampleA.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A
sampleA.variants.tsv S:del144 MN908947.3 21990 TTTA T
sampleA.variants.tsv S:A570D MN908947.3 23271 C A
sampleA.variants.tsv S:P681H MN908947.3 23604 C A
sampleA.variants.tsv S:T716I MN908947.3 23709 C T
sampleA.variants.tsv S:S982A MN908947.3 24506 T G
sampleB.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A
sampleB.variants.tsv S:del144 MN908947.3 21990 TTTA T
sampleB.variants.tsv S:N501Y MN908947.3 23063 A T
sampleB.variants.tsv S:A570D MN908947.3 23271 C A
sampleB.variants.tsv S:P681H MN908947.3 23604 C A
sampleB.variants.tsv S:T716I MN908947.3 23709 C T
sampleB.variants.tsv S:S982A MN908947.3 24506 T G
sampleB.variants.tsv S:D1118H MN908947.3 24914 G C
50 changes: 50 additions & 0 deletions ncov/parser/Lineage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import csv
import re

class Lineage():
"""
A class for handling Pangolin lineage reports.
"""

def __init__(self, file, delimiter=','):
self.file = file
self.delimiter = delimiter


def get_sample_name(self, row):
"""
The sample name is obtained from the 'taxon' field. Note that ONT
consensus FASTA files are generated as "<sample>/ARTIC/nanopolish"
whereas Illumina runs (iVar) use "Consensus_<sample>"
"""
sample_name = row['taxon']
sample_name = re.sub('^Consensus_', '', sample_name) # added by ivar
sample_name = re.sub('.primertrimmed.consensus_threshold_0.75_quality_20', '', sample_name) # added by ivar
sample_name = re.sub('_MN908947.3', '', sample_name) # added by pangolin
sample_name = re.sub('/ARTIC/nanopolish', '', sample_name) # added by ARTIC
sample_name = re.sub('/ARTIC/medaka', '', sample_name) # added by ARTIC
return sample_name


def get_lineage(self, row):
"""
Return the lineage value
"""
return row['lineage']


def create_lineage_dictionary(self):
"""
Create a dictionary containing sample names as key and their lineage as
the value
"""
lineage_dict = dict()
with open(self.file, 'r') as fh:
reader = csv.DictReader(fh, delimiter=self.delimiter)
for row in reader:
sample = self.get_sample_name(row=row)
lineage = self.get_lineage(row=row)
lineage_dict[sample] = lineage
self.lineage_dict = lineage_dict
return lineage_dict

2 changes: 1 addition & 1 deletion ncov/parser/Meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def import_metadata(self, sample_id='sample', ct_id='ct', date_id='date'):
self.data = data
return data
except:
print("Invalid metadata file")
pass


def get_meta_for_sample(self, sample):
Expand Down
46 changes: 46 additions & 0 deletions ncov/parser/Snpeff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
"""

import os
import sys
import csv
import re

class Snpeff():
"""
A class for processing annotated variants from SNPEff.
"""

def __init__(self, file, delimiter="\t"):
"""
Initilize the object
"""
annotations = list()
try:
with open(file, 'r') as fh:
reader = csv.DictReader(fh, delimiter=delimiter)
for row in reader:
annotations.append(row)
self.annotations = annotations
except:
pass


def get_list_of_consequences(self):
"""
"""
consequences = list()
for var in self.annotations:
consequences.append(var['Consequence'])
self.consequences = consequences


def has_frameshift(self):
"""
Determine whether the annotated variant is frameshift
"""
return 'frameshift_variant' in self.consequences


55 changes: 55 additions & 0 deletions ncov/parser/WatchList.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
WatchList module
"""

import csv
import os
import sys
import re


class WatchList():
"""
A class for handling the watch list report generated in the ncov-tools
pipeline.
"""

watch_list = dict()

def __init__(self, file, delimiter='\t'):
"""
Initialize the WatchList object and construct the watch_list dictionary
attribute from the provided file.
"""
watch = dict()
samplename = str()
with open(file, 'r') as fh:
reader = csv.DictReader(fh, delimiter=delimiter)
for row in reader:
if row['sample'].endswith('.variants.tsv'):
samplename = re.sub('.variants.tsv', '', row['sample'])
elif row['sample'].endswith('.pass.vcf.gz'):
samplename = re.sub('.pass.vcf.gz', '', row['sample'])
if samplename in watch:
watch[samplename].append(row)
else:
watch[samplename] = [row]
self.watch_list = watch


def get_mutation_string(self, sample, delimiter=','):
"""
Returns a comma separated string of the watch list mutations.
"""
mutations = list()
for samplename in self.watch_list:
if sample == samplename:
for item in self.watch_list[samplename]:
mutations.append(item['mutation'])
else:
continue
if len(mutations) == 0:
return 'none'
else:
return delimiter.join(mutations)

4 changes: 4 additions & 0 deletions ncov/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@
from .Vcf import *
from .NegativeControl import *
from .primers import *
from .Lineage import *
from .Snpeff import *
from .WatchList import *

10 changes: 8 additions & 2 deletions ncov/parser/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def write_qc_summary(summary):
* scaled_variants_snvs
* genome_completeness
* qc_pass
* lineage
* mutations
Arguments:
* summary: a dictionary containing the sample QC details
Expand All @@ -49,7 +51,9 @@ def write_qc_summary(summary):
str(summary['num_weeks']),
str(summary['scaled_variants_snvs']),
str(summary['genome_completeness']),
str(summary['qc_pass'])])
str(summary['qc_pass']),
str(summary['lineage']),
str(summary['mutations'])])
print(summary_line)


Expand All @@ -68,7 +72,9 @@ def write_qc_summary_header(header=['sample',
'num_weeks',
'scaled_variants_snvs',
'genome_completeness',
'qc_pass']):
'qc_pass',
'lineage',
'watch_mutations']):
'''
Write the header for the QC summary data
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setuptools.setup(
name="ncov_parser",
version="0.6.2",
version="0.6.3",
author="Richard J. de Borja",
author_email="[email protected]",
description="A nCoV package for parsing analysis files",
Expand Down
25 changes: 25 additions & 0 deletions tests/test_Lineage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
'''
Suite of tests for the Consensus module
'''

import unittest
import ncov.parser

test_lineage = ncov.parser.Lineage(file='data/sample_lineages.csv')


class LineageTest(unittest.TestCase):
def test_create_lineage_dictionary(self):
lineage_dict = test_lineage.create_lineage_dictionary()
self.assertEqual(lineage_dict['sampleA'], 'B.1.1.43')
self.assertEqual(lineage_dict['sampleB'], 'B.1.36')
self.assertEqual(lineage_dict['sampleC'], 'B.1.1.7')
def test_get_sample_name(self):
sample_row = {'taxon' : 'sampleA/ARTIC/nanopolish',
'lineage' : 'B.1.1.43',
'probability' : 1.0,
'pangoLEARN_version' : '2021-01-06',
'passed_qc' : 'passed_qc'}
expected_sample_name = 'sampleA'
sample_name = test_lineage.get_sample_name(row=sample_row)
self.assertEqual(sample_name, expected_sample_name)
14 changes: 14 additions & 0 deletions tests/test_Snpeff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Suite of tests for the Snpeff module
"""

import unittest
import ncov.parser

test_snpeff = ncov.parser.Snpeff(file='data/sampleA_aa_table.tsv')
test_snpeff.get_list_of_consequences()

class SnpeffTest(unittest.TestCase):
def test_has_frameshift(self):
self.assertTrue(test_snpeff.has_frameshift())

14 changes: 14 additions & 0 deletions tests/test_WatchList.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Suite of tests for the WatchList module
"""

import unittest
import ncov.parser

test_watch = ncov.parser.WatchList(file='data/testrun_ncov_watch_variants.tsv')
expected_mutation_string = 'S:del69-70,S:del144,S:A570D,S:P681H,S:T716I,S:S982A'

class WatchListTest(unittest.TestCase):
def test_get_mutation_string(self):
self.assertEqual(test_watch.get_mutation_string(sample='sampleA'),
expected_mutation_string)

0 comments on commit 63142a9

Please sign in to comment.