From 933e948037f56532d28a4e056964bb63ef380ea9 Mon Sep 17 00:00:00 2001 From: Himanshu Joshi Date: Thu, 25 Nov 2021 12:44:48 +1100 Subject: [PATCH] Updated SpliceAI toolset to output raw scores in VCF --- .gitignore | 1 + README.md | 12 ++++++++++-- setup.py | 2 +- spliceai/__main__.py | 8 ++++++-- spliceai/utils.py | 12 ++++++++++-- tests/test_delta_score.py | 16 ++++++++-------- 6 files changed, 36 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 0134b26..2133d28 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.vscode *.pyc *.egg *__pycache__/ diff --git a/README.md b/README.md index 57f6b8f..cba31e5 100644 --- a/README.md +++ b/README.md @@ -62,15 +62,23 @@ Details of SpliceAI INFO field: | DP_AL | Delta position (acceptor loss) | | DP_DG | Delta position (donor gain) | | DP_DL | Delta position (donor loss) | +| DS_AG_REF | Reference score (acceptor gain) | +| DS_AG_ALT | Variant score (acceptor gain) | +| DS_AL_REF | Reference score (acceptor loss) | +| DS_AL_ALT | Variant score (acceptor loss) | +| DS_DG_REF | Reference score (donor gain) | +| DS_DG_ALT | Variant score (donor gain) | +| DS_DL_REF | Reference score (donor loss) | +| DS_DL_ALT | Variant score (donor loss) | Delta score of a variant, defined as the maximum of (DS_AG, DS_AL, DS_DG, DS_DL), ranges from 0 to 1 and can be interpreted as the probability of the variant being splice-altering. In the paper, a detailed characterization is provided for 0.2 (high recall), 0.5 (recommended), and 0.8 (high precision) cutoffs. Delta position conveys information about the location where splicing changes relative to the variant position (positive values are downstream of the variant, negative values are upstream). ### Examples -A sample input file and the corresponding output file can be found at `examples/input.vcf` and `examples/output.vcf` respectively. The output `T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31` for the variant `19:38958362 C>T` can be interpreted as follows: +A sample input file and the corresponding output file can be found at `examples/input.vcf` and `examples/output.vcf` respectively. The output `T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31|0.00|0.00|0.00|0.00|0.08|0.99|0.08|0.00` for the variant `19:38958362 C>T` can be interpreted as follows: * The probability that the position 19:38958360 (=38958362-2) is used as a splice donor increases by 0.91. * The probability that the position 19:38958331 (=38958362-31) is used as a splice donor decreases by 0.08. -Similarly, the output `CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29` for the variant `2:179415988 C>CA` has the following interpretation: +Similarly, the output `CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29|0.00|0.07|1.00|0.00|0.00|0.00|0.00|0.00` for the variant `2:179415988 C>CA` has the following interpretation: * The probability that the position 2:179415981 (=179415988-7) is used as a splice acceptor increases by 0.07. * The probability that the position 2:179415987 (=179415988-1) is used as a splice acceptor decreases by 1.00. diff --git a/setup.py b/setup.py index b5ccf74..59d74cf 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ description='SpliceAI: A deep learning-based tool to identify splice variants', long_description=io.open('README.md', encoding='utf-8').read(), long_description_content_type='text/markdown', - version='1.3.1', + version='1.3.2', author='Kishore Jaganathan', author_email='kishorejaganathan@gmail.com', license='GPLv3', diff --git a/spliceai/__main__.py b/spliceai/__main__.py index 25329cb..c9e359a 100644 --- a/spliceai/__main__.py +++ b/spliceai/__main__.py @@ -57,8 +57,12 @@ def main(): header = vcf.header header.add_line('##INFO=') + 'acceptor gain (AG), acceptor loss (AL), donor gain (DG), donor loss (DL), ' + ' acceptor gain reference score (DS_AG_REF), acceptor gain variant score (DS_AG_ALT), ' + ' acceptor loss reference score (DS_AL_REF), acceptor loss variant score (DS_AL_ALT), ' + ' donor gain reference score (DS_DG_REF), donor gain variant score (DS_DG_ALT), ' + ' donor loss reference score (DS_DL_REF), donor loss variant score (DS_DL_ALT).' + 'Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL|DS_AG_REF|DS_AG_ALT|DS_AL_REF|DS_AL_ALT|DS_DG_REF|DS_DG_ALT|DS_DL_REF|DS_DL_ALT">') try: output = pysam.VariantFile(args.O, mode='w', header=header) diff --git a/spliceai/utils.py b/spliceai/utils.py index 991afd1..dc653ac 100644 --- a/spliceai/utils.py +++ b/spliceai/utils.py @@ -187,7 +187,7 @@ def get_delta_scores(record, ann, dist_var, mask): mask_pd = np.logical_and((idx_pd-cov//2 == dist_ann[2]), mask) mask_nd = np.logical_and((idx_nd-cov//2 != dist_ann[2]), mask) - delta_scores.append("{}|{}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{}|{}|{}|{}".format( + delta_scores.append("{}|{}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{}|{}|{}|{}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}".format( record.alts[j], genes[i], (y[1, idx_pa, 1]-y[0, idx_pa, 1])*(1-mask_pa), @@ -197,7 +197,15 @@ def get_delta_scores(record, ann, dist_var, mask): idx_pa-cov//2, idx_na-cov//2, idx_pd-cov//2, - idx_nd-cov//2)) + idx_nd-cov//2, + y[0, idx_pa, 1], + y[1, idx_pa, 1], + y[0, idx_na, 1], + y[1, idx_na, 1], + y[0, idx_pd, 2], + y[1, idx_pd, 2], + y[0, idx_nd, 2], + y[1, idx_nd, 2])) return delta_scores diff --git a/tests/test_delta_score.py b/tests/test_delta_score.py index 4630095..43ce89b 100755 --- a/tests/test_delta_score.py +++ b/tests/test_delta_score.py @@ -21,26 +21,26 @@ def test_get_delta_score_acceptor(self): record = Record('10', 94077, 'A', ['C']) scores = get_delta_scores(record, self.ann, 500, 0) - self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']) + self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193|0.52|0.67|0.98|0.71|0.00|0.00|0.13|0.08']) scores = get_delta_scores(record, self.ann_without_prefix, 500, 0) - self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']) + self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193|0.52|0.67|0.98|0.71|0.00|0.00|0.13|0.08']) record = Record('chr10', 94077, 'A', ['C']) scores = get_delta_scores(record, self.ann, 500, 0) - self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']) + self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193|0.52|0.67|0.98|0.71|0.00|0.00|0.13|0.08']) scores = get_delta_scores(record, self.ann_without_prefix, 500, 0) - self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193']) + self.assertEqual(scores, ['C|TUBB8|0.15|0.27|0.00|0.05|89|-23|-267|193|0.52|0.67|0.98|0.71|0.00|0.00|0.13|0.08']) def test_get_delta_score_donor(self): record = Record('10', 94555, 'C', ['T']) scores = get_delta_scores(record, self.ann, 500, 0) - self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']) + self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0|0.00|0.01|0.97|0.79|0.56|0.71|0.99|0.36']) scores = get_delta_scores(record, self.ann_without_prefix, 500, 0) - self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']) + self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0|0.00|0.01|0.97|0.79|0.56|0.71|0.99|0.36']) record = Record('chr10', 94555, 'C', ['T']) scores = get_delta_scores(record, self.ann, 500, 0) - self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']) + self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0|0.00|0.01|0.97|0.79|0.56|0.71|0.99|0.36']) scores = get_delta_scores(record, self.ann_without_prefix, 500, 0) - self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0']) + self.assertEqual(scores, ['T|TUBB8|0.01|0.18|0.15|0.62|-2|110|-190|0|0.00|0.01|0.97|0.79|0.56|0.71|0.99|0.36'])