-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from simpsonlab/add_pangolin_lineage
Add pangolin lineage
- Loading branch information
Showing
16 changed files
with
282 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
sample chr pos ref alt Consequence gene protein aa | ||
sampleA MN908947.3 241 C T upstream_gene_variant orf1ab NA | ||
sampleA MN908947.3 3037 C T synonymous_variant orf1ab F924F orf1ab-F924F | ||
sampleA MN908947.3 14408 C T missense_variant orf1ab P4715L orf1ab-P4715L | ||
sampleA MN908947.3 21625 A G synonymous_variant S R21R S-R21R | ||
sampleA MN908947.3 28883 G C missense_variant N G204R N-G204R | ||
sampleA MN908947.3 29580 CT C frameshift_variant ORF10 P10fs ORF10-P10fs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
sample chr pos ref alt Consequence gene protein aa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
taxon,lineage,probability,pangoLEARN_version,status,note | ||
sampleA/ARTIC/nanopolish,B.1.1.43,1.0,2021-01-06,passed_qc, | ||
sampleB/ARTIC/nanopolish,B.1.36,1.0,2021-01-06,passed_qc, | ||
sampleC/ARTIC/nanopolish,B.1.1.7,1.0,2021-01-06,passed_qc, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
sample mutation contig position reference alt | ||
sampleA.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A | ||
sampleA.variants.tsv S:del144 MN908947.3 21990 TTTA T | ||
sampleA.variants.tsv S:A570D MN908947.3 23271 C A | ||
sampleA.variants.tsv S:P681H MN908947.3 23604 C A | ||
sampleA.variants.tsv S:T716I MN908947.3 23709 C T | ||
sampleA.variants.tsv S:S982A MN908947.3 24506 T G | ||
sampleB.variants.tsv S:del69-70 MN908947.3 21764 ATACATG A | ||
sampleB.variants.tsv S:del144 MN908947.3 21990 TTTA T | ||
sampleB.variants.tsv S:N501Y MN908947.3 23063 A T | ||
sampleB.variants.tsv S:A570D MN908947.3 23271 C A | ||
sampleB.variants.tsv S:P681H MN908947.3 23604 C A | ||
sampleB.variants.tsv S:T716I MN908947.3 23709 C T | ||
sampleB.variants.tsv S:S982A MN908947.3 24506 T G | ||
sampleB.variants.tsv S:D1118H MN908947.3 24914 G C |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import csv | ||
import re | ||
|
||
class Lineage():
    """
    A class for handling Pangolin lineage reports.

    The report is a delimited text file with a header row that contains at
    least the 'taxon' and 'lineage' columns.
    """

    # Literal decorations that the upstream tools append to the sample name
    # in the 'taxon' column.  They are removed in this order and are treated
    # as plain text, so a '.' only ever matches a '.'.
    _TAXON_DECORATIONS = (
        '.primertrimmed.consensus_threshold_0.75_quality_20',  # added by ivar
        '_MN908947.3',  # added by pangolin
        '/ARTIC/nanopolish',  # added by ARTIC
        '/ARTIC/medaka',  # added by ARTIC
    )

    def __init__(self, file, delimiter=','):
        """
        Store the path to the lineage report and its field delimiter.  The
        file is not opened until create_lineage_dictionary() is called.
        """
        self.file = file
        self.delimiter = delimiter

    def get_sample_name(self, row):
        """
        The sample name is obtained from the 'taxon' field.  Note that ONT
        consensus FASTA files are generated as "<sample>/ARTIC/nanopolish"
        whereas Illumina runs (iVar) use "Consensus_<sample>"

        row is a mapping with a 'taxon' key; a KeyError is raised when the
        key is missing.  Returns the taxon with the known decorations
        removed.
        """
        # The 'Consensus_' prefix (added by ivar) is only removed at the
        # start of the name.
        sample_name = re.sub(r'^Consensus_', '', row['taxon'])
        for decoration in self._TAXON_DECORATIONS:
            sample_name = sample_name.replace(decoration, '')
        return sample_name

    def get_lineage(self, row):
        """
        Return the lineage value
        """
        return row['lineage']

    def create_lineage_dictionary(self):
        """
        Create a dictionary containing sample names as key and their lineage as
        the value

        The dictionary is also stored on the object as 'lineage_dict'.  When
        two rows resolve to the same sample name the last row wins.
        """
        # newline='' is what the csv module asks for when it is handed a
        # file object.
        with open(self.file, 'r', newline='') as fh:
            reader = csv.DictReader(fh, delimiter=self.delimiter)
            lineage_dict = {
                self.get_sample_name(row=row): self.get_lineage(row=row)
                for row in reader
            }
        self.lineage_dict = lineage_dict
        return lineage_dict
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
""" | ||
""" | ||
|
||
import os | ||
import sys | ||
import csv | ||
import re | ||
|
||
class Snpeff():
    """
    A class for processing annotated variants from SNPEff.
    """

    def __init__(self, file, delimiter="\t"):
        """
        Initialize the object by reading every row of the annotation table
        into the 'annotations' list (one dictionary per row, keyed by the
        header row).

        Reading is best effort: when the file cannot be opened or parsed,
        'annotations' is left as an empty list instead of raising.
        """
        # Define the attribute up front so the other methods keep working
        # even when the file could not be read.
        self.annotations = list()
        try:
            with open(file, 'r') as fh:
                reader = csv.DictReader(fh, delimiter=delimiter)
                # Assigned only once the whole file has been read, so a
                # failure part-way through never leaves a partial table.
                self.annotations = list(reader)
        except (OSError, UnicodeError, csv.Error):
            self.annotations = list()

    def get_list_of_consequences(self):
        """
        Collect the 'Consequence' value of every annotated variant, store
        the list on the object as 'consequences' and return it.

        A KeyError is raised when a row has no 'Consequence' column.
        """
        self.consequences = [var['Consequence'] for var in self.annotations]
        return self.consequences

    def has_frameshift(self):
        """
        Determine whether any annotated variant is a frameshift, i.e.
        whether 'frameshift_variant' is among the consequences.
        """
        # Build the list on demand so callers are not required to call
        # get_list_of_consequences() first.
        if not hasattr(self, 'consequences'):
            self.get_list_of_consequences()
        return 'frameshift_variant' in self.consequences
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
""" | ||
WatchList module | ||
""" | ||
|
||
import csv | ||
import os | ||
import sys | ||
import re | ||
|
||
|
||
class WatchList():
    """
    A class for handling the watch list report generated in the ncov-tools
    pipeline.
    """

    # Class-level default; every instance replaces it with its own
    # dictionary in __init__.
    watch_list = dict()

    # File name suffixes that may follow the sample name in the 'sample'
    # column of the report.
    _SAMPLE_SUFFIXES = ('.variants.tsv', '.pass.vcf.gz')

    def __init__(self, file, delimiter='\t'):
        """
        Initialize the WatchList object and construct the watch_list dictionary
        attribute from the provided file.

        watch_list maps each sample name to the list of report rows (one
        dictionary per row) that belong to that sample, in file order.
        """
        watch = dict()
        # newline='' is what the csv module asks for when it is handed a
        # file object.
        with open(file, 'r', newline='') as fh:
            reader = csv.DictReader(fh, delimiter=delimiter)
            for row in reader:
                samplename = self._strip_suffix(row['sample'])
                watch.setdefault(samplename, []).append(row)
        self.watch_list = watch

    @classmethod
    def _strip_suffix(cls, sample):
        """
        Return the sample name with a trailing known file suffix removed.
        """
        for suffix in cls._SAMPLE_SUFFIXES:
            if sample.endswith(suffix):
                return sample[:-len(suffix)]
        # No known suffix: use the value as is rather than filing the row
        # under whichever sample happened to come before it.
        return sample

    def get_mutation_string(self, sample, delimiter=','):
        """
        Returns a comma separated string of the watch list mutations.

        The separator can be changed with 'delimiter'.  The string 'none' is
        returned when the sample has no watch list entries.
        """
        mutations = [item['mutation'] for item in self.watch_list.get(sample, [])]
        if not mutations:
            return 'none'
        return delimiter.join(mutations)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
|
||
setuptools.setup( | ||
name="ncov_parser", | ||
version="0.6.2", | ||
version="0.6.3", | ||
author="Richard J. de Borja", | ||
author_email="[email protected]", | ||
description="A nCoV package for parsing analysis files", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
''' | ||
Suite of tests for the Lineage module | ||
''' | ||
|
||
import unittest | ||
import ncov.parser | ||
|
||
# Shared fixture: the lineage report used by every test in this module.
test_lineage = ncov.parser.Lineage(file='data/sample_lineages.csv')


class LineageTest(unittest.TestCase):
    """
    Tests for the Lineage parser.
    """

    def test_create_lineage_dictionary(self):
        """
        Each sample in the report is mapped to its lineage.
        """
        lineage_dict = test_lineage.create_lineage_dictionary()
        self.assertEqual(lineage_dict['sampleA'], 'B.1.1.43')
        self.assertEqual(lineage_dict['sampleB'], 'B.1.36')
        self.assertEqual(lineage_dict['sampleC'], 'B.1.1.7')

    def test_get_sample_name(self):
        """
        The ARTIC suffix is removed from the taxon to give the sample name.
        """
        # Keys follow the report header (taxon, lineage, probability,
        # pangoLEARN_version, status, note); values are strings, as
        # csv.DictReader would produce them.
        sample_row = {'taxon': 'sampleA/ARTIC/nanopolish',
                      'lineage': 'B.1.1.43',
                      'probability': '1.0',
                      'pangoLEARN_version': '2021-01-06',
                      'status': 'passed_qc',
                      'note': ''}
        expected_sample_name = 'sampleA'
        sample_name = test_lineage.get_sample_name(row=sample_row)
        self.assertEqual(sample_name, expected_sample_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
""" | ||
Suite of tests for the Snpeff module | ||
""" | ||
|
||
import unittest | ||
import ncov.parser | ||
|
||
# Shared fixture: load the annotation table once and build its list of
# consequences before any test runs.
test_snpeff = ncov.parser.Snpeff(file='data/sampleA_aa_table.tsv')
test_snpeff.get_list_of_consequences()


class SnpeffTest(unittest.TestCase):
    """
    Tests for the Snpeff parser.
    """

    def test_has_frameshift(self):
        """
        The fixture table is expected to report a frameshift variant.
        """
        frameshift_found = test_snpeff.has_frameshift()
        self.assertTrue(frameshift_found)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
""" | ||
Suite of tests for the WatchList module | ||
""" | ||
|
||
import unittest | ||
import ncov.parser | ||
|
||
# Shared fixture: the watch list report and the mutation string expected
# for sampleA.
test_watch = ncov.parser.WatchList(file='data/testrun_ncov_watch_variants.tsv')
expected_mutation_string = 'S:del69-70,S:del144,S:A570D,S:P681H,S:T716I,S:S982A'


class WatchListTest(unittest.TestCase):
    """
    Tests for the WatchList parser.
    """

    def test_get_mutation_string(self):
        """
        The mutations of sampleA are joined with commas in report order.
        """
        mutation_string = test_watch.get_mutation_string(sample='sampleA')
        self.assertEqual(mutation_string, expected_mutation_string)