From 6af65325bb8e6ae8e2e2905f8580a342a011c9e7 Mon Sep 17 00:00:00 2001 From: Sai Chen Date: Mon, 10 Dec 2018 11:11:05 -0800 Subject: [PATCH] GT-683 udpate v2.1 from dev GT-683 formatting and build fix GT-683 bring back cmake changes in public repo GT-683 further trim line-end spaces GT-683 change line break to unix style --- README.md | 53 +-- RELEASES.md | 11 + doc/filter-scheme.md | 32 +- doc/genotyping-parameters.md | 2 +- external/graph-tools.tar.gz | Bin share/schema/output_schema.json | 420 +++++++++--------- .../paragraph/long-del/chr4_graph_typing.bam | Bin 11739 -> 0 bytes .../long-del/chr4_graph_typing.fa.fai | 1 - .../long-del/chr4_graph_typing.manifest | 3 - ...le.json => chrX_graph_typing.2sample.json} | 18 +- .../paragraph/long-del/chrX_graph_typing.bam | Bin 0 -> 11750 bytes .../long-del/chrX_graph_typing.bam.bai | Bin 0 -> 288 bytes ...4_graph_typing.fa => chrX_graph_typing.fa} | 2 +- .../long-del/chrX_graph_typing.fa.fai | 1 + .../long-del/chrX_graph_typing.manifest | 3 + share/test-data/paragraph/long-del/param.json | 10 +- .../pg-het-ins/genotypes_expected.json | 64 +-- .../include/genotyping/BreakpointGenotyper.hh | 37 +- .../include/genotyping/CombinedGenotype.hh | 8 +- src/c++/include/genotyping/Genotype.hh | 17 +- .../genotyping/GenotypingParameters.hh | 17 +- src/c++/include/genotyping/GraphGenotyper.hh | 7 + src/c++/include/genotyping/SampleInfo.hh | 5 +- .../lib/genotyping/BreakpointGenotyper.cpp | 81 +++- src/c++/lib/genotyping/CombinedGenotype.cpp | 74 ++- src/c++/lib/genotyping/Genotype.cpp | 50 ++- .../lib/genotyping/GenotypingParameters.cpp | 34 +- .../genotyping/GraphBreakpointGenotyper.cpp | 14 +- src/c++/lib/genotyping/GraphGenotyper.cpp | 8 + src/c++/lib/genotyping/GraphGenotyperImpl.hh | 5 + src/c++/lib/genotyping/SampleInfo.cpp | 75 +++- src/c++/lib/grm/GraphInput.cpp | 1 + src/c++/lib/grmpy/CountAndGenotype.cpp | 18 +- .../lib/paragraph/GraphSummaryStatistics.cpp | 31 +- src/c++/main/grmpy.cpp | 8 +- src/c++/main/paragraph.cpp | 8 +- src/c++/test-blackbox/test_grm.cpp | 6 +- src/c++/test/test_breakpoint_genotyper.cpp | 33 +- src/c++/test/test_combined_genotype.cpp | 24 +- src/cmake/GetBoost.cmake | 1 - src/cmake/GetGraphTools.cmake | 2 +- .../test-helpers/valgrind-test.sh | 6 +- src/make/validation/SimulateReads.mk | 0 src/python/bin/multigrmpy.py | 19 +- src/python/bin/paragraph-to-csv.py | 155 ------- src/python/lib/grm/vcfgraph/vcfupdate.py | 201 +++++---- src/python/test/test_multigrmpy.py | 38 +- src/sh/valgrind-test.sh | 6 +- src/sh/validation/simulate-reads.sh.in | 0 49 files changed, 888 insertions(+), 721 deletions(-) mode change 100755 => 100644 README.md mode change 100644 => 100755 external/graph-tools.tar.gz delete mode 100644 share/test-data/paragraph/long-del/chr4_graph_typing.bam delete mode 100644 share/test-data/paragraph/long-del/chr4_graph_typing.fa.fai delete mode 100644 share/test-data/paragraph/long-del/chr4_graph_typing.manifest rename share/test-data/paragraph/long-del/{chr4_graph_typing.2sample.json => chrX_graph_typing.2sample.json} (86%) create mode 100644 share/test-data/paragraph/long-del/chrX_graph_typing.bam create mode 100644 share/test-data/paragraph/long-del/chrX_graph_typing.bam.bai rename share/test-data/paragraph/long-del/{chr4_graph_typing.fa => chrX_graph_typing.fa} (99%) create mode 100644 share/test-data/paragraph/long-del/chrX_graph_typing.fa.fai create mode 100644 share/test-data/paragraph/long-del/chrX_graph_typing.manifest mode change 100755 => 100644 src/cmake/GetBoost.cmake mode change 100755 => 100644 src/cmake/GetGraphTools.cmake mode change 100644 => 100755 src/make/validation/SimulateReads.mk delete mode 100644 src/python/bin/paragraph-to-csv.py mode change 100644 => 100755 src/sh/validation/simulate-reads.sh.in diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 8825f1d..6c982d5 --- a/README.md +++ b/README.md @@ -156,26 +156,6 @@ to give the following genotypes for each event: | swap2 | chrB | S1/S1 (homalt) | | swap1 | chrC | REF/S1 (heterozygous) | -We can extract these genotypes from the output file using Python script at bin/paragraph-to-csv.py - -``` -bin/paragraph-to-csv.py /tmp/paragraph-test/genotype.json.gz --genotype-only -``` - -The output will be: - -``` -#FORMAT=GT -#ID SWAPS -chrA:1500-1509 REF/REF -chrB:1500-1509 S1/S1 -chrC:1500-1699 REF/S1 -``` - -The output JSON file contains more information which can be used to link -events back to the original VCF file, genotype likelihoods, and also to get the -genotypes of the individual breakpoints. - If the input is a VCF file, then the output folder will contain an updated VCF file which allows us to quickly compare the original genotypes from the input VCF, and those obtained by grmpy: @@ -189,6 +169,8 @@ bcftools query -f '[%GT\t%OLD_GT]\n' /tmp/paragraph-test/genotypes.vcf.gz 0/1 0/1 ``` +Note if the input VCF doesn't contain any sample information, you won't be able to get **OLD_GT** field in this output VCF. + We can see that the genotypes match in our use case, as expected. In [doc/multiple-samples.md](doc/multiple-samples.md), we show how ParaGRAPH can be run on multiple samples with snakemake. @@ -233,11 +215,10 @@ The complete list of requrements can be found in [requirements.txt](requirements [http://www.boost.org](http://www.boost.org) and is available under the Boost license: [http://www.boost.org/users/license.html](http://www.boost.org/users/license.html). - You may use your system Boost version, on Ubuntu, you can install the required versions - of Boost as follows: +You may use your system Boost version, on Ubuntu, you can install the required versions of Boost as follows: ```bash sudo apt install libboost-dev libboost-iostreams-dev libboost-program-options-dev \ - libboost-math-dev libboost-system-dev libboost-filesystem-dev + libboost-math-dev libboost-system-dev libboost-filesystem-dev ``` Paragraph includes a copy of Boost 1.61 which can be built automatically during the @@ -400,12 +381,23 @@ Alternatively, candidate SV events can be specified as vcf. chr1 161 test-del TC T . . . ``` -* **samples.txt**: Manifest that specifies some test BAM files. Required columns: ID, path, depth, read length. Optional column: sex. Tab delimited. +* **samples.txt**: Manifest that specifies some test BAM files. Tab delimited. + +Required columns: ID, path, depth, read length. + +Optional column: + +- depth sd: Specify standard deviation for genome depth. Used for the normal test of breakpoint read depth. Default is sqrt(5*depth). + +- depth variance: Square of depth sd. + +- sex: Affects chrX and chrY genotyping. Allow "male", "female" and "unknown". If not specified, all samples will be treated as female. + ``` - id path depth read length sex - sample1 sample1.bam 1 50 male - sample2 sample2.bam 1 50 female - sample3 sample2.bam 1 50 unknown + id path depth read length depth sd sex + sample1 sample1.bam 1 50 20 male + sample2 sample2.bam 1 50 20 female + sample3 sample2.bam 1 50 20 unknown ``` * **dummy.fa** a short dummy reference which only contains `chr1` @@ -518,8 +510,7 @@ We also have the paths induced by the edge labels (this was added by `vcf2paragr Each node, edge, and path has reads associated with it. We provide read counts for forward and reverse strands (`:READS`, `:FWD`, `:REV`) and fragment counts (these counts are corrected -for the same reads possibly originating from the same sequence fragment in the case of -paired-end sequencing data). +for the same reads possibly originating from the same sequence fragment in the case of paired-end sequencing data). ```javascript "read_counts_by_edge": { @@ -631,7 +622,7 @@ It is extracted and re-organized from [an expected output](share/test-data/multi * In [doc/graph-models.md](doc/graph-models.md) we describe the graph and genotyping models we implement. - + * [Doc/graphs-ashg-2017.pdf](doc/graphs-ashg-2017.pdf) contains the poster about this method we showed at [ASHG 2017](http://www.ashg.org/2017meeting/) diff --git a/RELEASES.md b/RELEASES.md index 8202554..1834d33 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -1,5 +1,16 @@ # Paragraph Release Notes / Change Log +# Version 2.1 + +| Date Y-m-d | Ticket | Description | +|------------|---------|----------------------------------------------------------------------| +| 2018-12-06 | GT-675 | Fix filters and alignment stats. Change depth test threshold on lower end | +| 2018-11-08 | GT-660 | Optimize GQ for variant genotypes | +| 2018-11-02 | GT-656 | Improvement for simple SV genotyping | +| 2018-07-19 | GT-501 | Breakpoint depth test based on normal distribution | +| 2018-07-16 | GT-539 | VCF now output genotypes for all samples in manifest and input VCF | +| 2018-06-28 | GT-527 | --graph-sequence-matching yes fails with boost 1.63 | + # Version 2.0 | Date Y-m-d | Ticket | Description | diff --git a/doc/filter-scheme.md b/doc/filter-scheme.md index 3bc2fd6..4f8c1a1 100644 --- a/doc/filter-scheme.md +++ b/doc/filter-scheme.md @@ -1,29 +1,33 @@ # Filters used in genotyper output -## Variant level filters +## Breakpoint level filters -* **PASS** +* **GQ** -Variant PASS all filters +Low genotype quality for this breakpoint -* **CONFLICT** +* **NO_READS** -Variant has genotype conflicts in one or more breakpoints +No reads in this breakpoint + +* **BP_DEPTH** -* **EXIST_BAD_BP** +Total number of reads on this breakpoint (from all alleles) fail the coverage test -Varaint has one or more breakpoint that fails breakpoint-level filter +## Variant level filters -* **ALL_BAD_BP** +* **PASS** -All breakpoints in this variant fail breakpoint-level filter +Variant PASS all filters -* **MISSING** +* **CONFLICT** -Variant has one or more breakpoints with no spanning read +Variant has genotype conflicts in one or more breakpoints -## Breakpoint level filters +* **BP_NO_GT** + +Exist one or more breakpoint with missing genotypes -* **DEPTH** +* **NO_VALID_GT** -Total number of reads on this breakpoint (from all alleles) fail the coverage test \ No newline at end of file +All breakpoints have missing genotypes \ No newline at end of file diff --git a/doc/genotyping-parameters.md b/doc/genotyping-parameters.md index 3a9c657..a378dc9 100644 --- a/doc/genotyping-parameters.md +++ b/doc/genotyping-parameters.md @@ -21,7 +21,7 @@ Below we show all allowed parameter fields: // such as 0.00001 for a more conservative callset which only // includes genotypes for calls which have read counts that are // close to the median read depth in the BAM file. - "coverage_test_cutoff": -1.0, + "coverage_test_cutoff": 0.0001, // Allele names in graph(s). // If other alleles were observed in graph, they will be excluded from analysis. diff --git a/external/graph-tools.tar.gz b/external/graph-tools.tar.gz old mode 100644 new mode 100755 diff --git a/share/schema/output_schema.json b/share/schema/output_schema.json index 6bbe1cc..dbd1771 100755 --- a/share/schema/output_schema.json +++ b/share/schema/output_schema.json @@ -1,211 +1,211 @@ -{ - "$schema": "http://json-schema.org/draft-04/hyper-schema#", - "description": "JSON schema for paragraph / pam outputs", - "type": "object", - "required": ["model_name", "nodes", "sequencenames", "target_regions", "sample_id", "bam", "reference", "alignments", "node_coverage", "read_counts_by_node"], - "dependencies": { - "paths": ["edges", "read_counts_by_path"], - "read_counts_by_path": ["edges", "paths"], - "variants_by_path": ["paths"] - }, - "additionalProperties": false, - "properties": { - "model_name": {"type": "string"}, - "sample_id": {"type": "string"}, - "bam": {"type": "string"}, - "reference": {"type": "string"}, - "alignments": { - "type": "array", - "items": { - "$ref": "#/definitions/alignment", - "minItems": 1, - "uniqueItems": true - } - }, - "edges": { - "type": "array", - "items": { - "$ref": "common_definitions.json#/definitions/edge", - "$ref": "common_definitions.json#/definitions/min_1_unique_list" - } - }, - "nodes": { - "type": "array", - "items": { - "$ref": "common_definitions.json#/definitions/node", - "$ref": "common_definitions.json#/definitions/min_1_unique_list" - } - }, - "paths": { - "type": "array", - "items": { - "$ref": "common_definitions.json#/definitions/path", - "$ref": "common_definitions.json#/definitions/min_1_unique_list" - } - }, - "sequencenames": { - "type": "array", - "items": { - "type": "string", - "$ref": "common_definitions.json#/definitions/min_1_unique_list" - } - }, - "target_regions": { - "type": "array", - "items": { - "type": "string", - "pattern": "^.+:[0-9]+-[0-9]+$", - "$ref": "common_definitions.json#/definitions/min_1_unique_list" - } - }, - "node_coverage": { - "type": "object", - "properties": { - "ref": { "$ref": "#/definitions/coverage_list" }, - "other": { "$ref": "#/definitions/coverage_list" }, - "ref_fwd": { "$ref": "#/definitions/coverage_list" }, - "ref_rev": { "$ref": "#/definitions/coverage_list" }, - "other_fwd": { "$ref": "#/definitions/coverage_list" }, - "other_rev": { "$ref": "#/definitions/coverage_list" } - }, - "comment": "The number of elements in each coverage list should match the node length, there should be one node_coverage instance per node" - }, - "read_counts_by_node": { - "required": ["total"], - "$ref": "#/definitions/read_counts", - "additionalProperties": { - "$ref": "common_definitions.json#/definitions/positive_int" - }, - "comment": "This dictionary should contain the total, forward, and reverse coverage each node" - }, - "read_counts_by_path": { - "type": "object", - "additionalProperties": { - "type": "object", - "required": ["total"], - "$ref": "#/definitions/read_counts", - "additionalProperties": false - } - }, - "variants_by_node": { - "type": "object", - "additionalProperties": { - "type": "array", - "items": { "$ref": "#/definitions/variant_description" } - } - }, - "variants_by_path": { - "type": "object", - "additionalProperties": { - "type": "array", - "items": { "$ref": "#/definitions/variant_description" } - } - } - }, - "definitions": { - "alignment": { - "type": "object", - "properties": { - "bases": { - "type": "string", - "pattern": "^[ACGTN]+$" - }, - "fragmentId": { - "type": "string", - "comment": "Is there any benefit to making this a key and the rest of alignment an associated object?" - }, - "isFirstMate": {"type": "boolean"}, - "isMapped": {"type": "boolean"}, - "isMateMapped": {"type": "boolean"}, - "isMateReverseStrand": {"type": "boolean", "comment": "What about isReverseStrand?"}, - "mapq": { - "$ref": "common_definitions.json#/definitions/positive_int", - "maximum": 60 - }, - "matePos": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "pos": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "quals": { - "type": "string", - "comment": "We'd want to check to make sure that the lengths of quals and bases match. We may also want to limit the character set allowed in quals with pattern" - }, - "graphMappingStatus": { - "type": "string", - "enum": ["MAPPED", "UNMAPPED"], - "comment": "Why isn't this a boolean?" - }, - "graphAlignmentScore": {"type": "integer"}, - "isGraphAlignmentUnique": {"type": "boolean"}, - "graphCigar": {"type": "string"}, - "graphMapq": { - "$ref": "common_definitions.json#/definitions/positive_int", - "maximum": 60 - }, - "graphPos": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "graphNodesTraversed": { - "type": "array", - "items": { - "type": "string", - "$ref": "common_definitions.json#/definitions/min_1_unique_list", - "comment": "In JSON schema 5, we should be able to relate this back to node IDs" - } - }, - "graphSequencesSupported": { - "type": "array", - "items": { - "type": "string", - "$ref": "common_definitions.json#/definitions/min_1_unique_list", - "comment": "As with graphNodesTraversed, we should be able to relate this back to sequence IDs in JSON schema 5" - } - } - }, - "required": ["bases", "fragmentId", "isFirstMate", "isMapped", "isMateMapped", "isMateReverseStrand", "mapq", "matePos", "pos", "quals"], - "dependencies": { - "graphMappingStatus": ["graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], - "graphAlignmentScore": ["graphMappingStatus", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], - "isGraphAlignmentUnique": ["graphMappingStatus", "graphAlignmentScore", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], - "graphCigar": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], - "graphMapq": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], - "graphPos": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphNodesTraversed", "graphSequencesSupported"], - "graphNodesTraversed": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphSequencesSupported"], - "graphSequencesSupported": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed"] - }, - "additionalProperties": false - }, - "coverage_list": { - "type": "array", - "items": { - "type": "integer", - "minimum": 0, - "minItems": 1 - } - }, - "read_counts": { - "type": "object", - "properties": { - "total": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "total:FWD": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "total:REV": { "$ref": "common_definitions.json#/definitions/positive_int" } - } - }, - "variant_description": { - "type": "object", - "properties": { - "adaBackward": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "adrBackward": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "adrForward": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "adaForward": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "alt": { "type": "string", "pattern": "^ACGT*$" }, - "end": { "$ref": "common_definitions.json#/definitions/positive_int"}, - "leftmost": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "rightmost": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "start": { "$ref": "common_definitions.json#/definitions/positive_int" }, - "wadaBackward": { "$ref": "common_definitions.json#/definitions/positive_num" }, - "wadrBackward": { "$ref": "common_definitions.json#/definitions/positive_num" }, - "wadaForward": { "$ref": "common_definitions.json#/definitions/positive_num" }, - "wadrForward": { "$ref": "common_definitions.json#/definitions/positive_num" } - }, - "required": ["alt", "end", "start"], - "comment": "adaBackward / adrBackward / adaForward / etc. are not required, but at least one of them should be specified" - } - } +{ + "$schema": "http://json-schema.org/draft-04/hyper-schema#", + "description": "JSON schema for paragraph / pam outputs", + "type": "object", + "required": ["model_name", "nodes", "sequencenames", "target_regions", "sample_id", "bam", "reference", "alignments", "node_coverage", "read_counts_by_node"], + "dependencies": { + "paths": ["edges", "read_counts_by_path"], + "read_counts_by_path": ["edges", "paths"], + "variants_by_path": ["paths"] + }, + "additionalProperties": false, + "properties": { + "model_name": {"type": "string"}, + "sample_id": {"type": "string"}, + "bam": {"type": "string"}, + "reference": {"type": "string"}, + "alignments": { + "type": "array", + "items": { + "$ref": "#/definitions/alignment", + "minItems": 1, + "uniqueItems": true + } + }, + "edges": { + "type": "array", + "items": { + "$ref": "common_definitions.json#/definitions/edge", + "$ref": "common_definitions.json#/definitions/min_1_unique_list" + } + }, + "nodes": { + "type": "array", + "items": { + "$ref": "common_definitions.json#/definitions/node", + "$ref": "common_definitions.json#/definitions/min_1_unique_list" + } + }, + "paths": { + "type": "array", + "items": { + "$ref": "common_definitions.json#/definitions/path", + "$ref": "common_definitions.json#/definitions/min_1_unique_list" + } + }, + "sequencenames": { + "type": "array", + "items": { + "type": "string", + "$ref": "common_definitions.json#/definitions/min_1_unique_list" + } + }, + "target_regions": { + "type": "array", + "items": { + "type": "string", + "pattern": "^.+:[0-9]+-[0-9]+$", + "$ref": "common_definitions.json#/definitions/min_1_unique_list" + } + }, + "node_coverage": { + "type": "object", + "properties": { + "ref": { "$ref": "#/definitions/coverage_list" }, + "other": { "$ref": "#/definitions/coverage_list" }, + "ref_fwd": { "$ref": "#/definitions/coverage_list" }, + "ref_rev": { "$ref": "#/definitions/coverage_list" }, + "other_fwd": { "$ref": "#/definitions/coverage_list" }, + "other_rev": { "$ref": "#/definitions/coverage_list" } + }, + "comment": "The number of elements in each coverage list should match the node length, there should be one node_coverage instance per node" + }, + "read_counts_by_node": { + "required": ["total"], + "$ref": "#/definitions/read_counts", + "additionalProperties": { + "$ref": "common_definitions.json#/definitions/positive_int" + }, + "comment": "This dictionary should contain the total, forward, and reverse coverage each node" + }, + "read_counts_by_path": { + "type": "object", + "additionalProperties": { + "type": "object", + "required": ["total"], + "$ref": "#/definitions/read_counts", + "additionalProperties": false + } + }, + "variants_by_node": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { "$ref": "#/definitions/variant_description" } + } + }, + "variants_by_path": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { "$ref": "#/definitions/variant_description" } + } + } + }, + "definitions": { + "alignment": { + "type": "object", + "properties": { + "bases": { + "type": "string", + "pattern": "^[ACGTN]+$" + }, + "fragmentId": { + "type": "string", + "comment": "Is there any benefit to making this a key and the rest of alignment an associated object?" + }, + "isFirstMate": {"type": "boolean"}, + "isMapped": {"type": "boolean"}, + "isMateMapped": {"type": "boolean"}, + "isMateReverseStrand": {"type": "boolean", "comment": "What about isReverseStrand?"}, + "mapq": { + "$ref": "common_definitions.json#/definitions/positive_int", + "maximum": 60 + }, + "matePos": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "pos": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "quals": { + "type": "string", + "comment": "We'd want to check to make sure that the lengths of quals and bases match. We may also want to limit the character set allowed in quals with pattern" + }, + "graphMappingStatus": { + "type": "string", + "enum": ["MAPPED", "UNMAPPED"], + "comment": "Why isn't this a boolean?" + }, + "graphAlignmentScore": {"type": "integer"}, + "isGraphAlignmentUnique": {"type": "boolean"}, + "graphCigar": {"type": "string"}, + "graphMapq": { + "$ref": "common_definitions.json#/definitions/positive_int", + "maximum": 60 + }, + "graphPos": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "graphNodesTraversed": { + "type": "array", + "items": { + "type": "string", + "$ref": "common_definitions.json#/definitions/min_1_unique_list", + "comment": "In JSON schema 5, we should be able to relate this back to node IDs" + } + }, + "graphSequencesSupported": { + "type": "array", + "items": { + "type": "string", + "$ref": "common_definitions.json#/definitions/min_1_unique_list", + "comment": "As with graphNodesTraversed, we should be able to relate this back to sequence IDs in JSON schema 5" + } + } + }, + "required": ["bases", "fragmentId", "isFirstMate", "isMapped", "isMateMapped", "isMateReverseStrand", "mapq", "matePos", "pos", "quals"], + "dependencies": { + "graphMappingStatus": ["graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], + "graphAlignmentScore": ["graphMappingStatus", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], + "isGraphAlignmentUnique": ["graphMappingStatus", "graphAlignmentScore", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], + "graphCigar": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphMapq", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], + "graphMapq": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphPos", "graphNodesTraversed", "graphSequencesSupported"], + "graphPos": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphNodesTraversed", "graphSequencesSupported"], + "graphNodesTraversed": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphSequencesSupported"], + "graphSequencesSupported": ["graphMappingStatus", "graphAlignmentScore", "isGraphAlignmentUnique", "graphCigar", "graphMapq", "graphPos", "graphNodesTraversed"] + }, + "additionalProperties": false + }, + "coverage_list": { + "type": "array", + "items": { + "type": "integer", + "minimum": 0, + "minItems": 1 + } + }, + "read_counts": { + "type": "object", + "properties": { + "total": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "total:FWD": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "total:REV": { "$ref": "common_definitions.json#/definitions/positive_int" } + } + }, + "variant_description": { + "type": "object", + "properties": { + "adaBackward": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "adrBackward": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "adrForward": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "adaForward": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "alt": { "type": "string", "pattern": "^ACGT*$" }, + "end": { "$ref": "common_definitions.json#/definitions/positive_int"}, + "leftmost": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "rightmost": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "start": { "$ref": "common_definitions.json#/definitions/positive_int" }, + "wadaBackward": { "$ref": "common_definitions.json#/definitions/positive_num" }, + "wadrBackward": { "$ref": "common_definitions.json#/definitions/positive_num" }, + "wadaForward": { "$ref": "common_definitions.json#/definitions/positive_num" }, + "wadrForward": { "$ref": "common_definitions.json#/definitions/positive_num" } + }, + "required": ["alt", "end", "start"], + "comment": "adaBackward / adrBackward / adaForward / etc. are not required, but at least one of them should be specified" + } + } } \ No newline at end of file diff --git a/share/test-data/paragraph/long-del/chr4_graph_typing.bam b/share/test-data/paragraph/long-del/chr4_graph_typing.bam deleted file mode 100644 index 443955827a36458b60223d1303b88d06f9b04897..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11739 zcmZX4Wmr_-_dVSW%Fx}N!hq64qaa-ZLrE!}N(uYJcR%~=wbx$j9Hv-2EVTcgM;tUoB@8rc++UfRvFs+#Li*m9=D64y3fjxF zQwws59CyiDB;U9>UW z^j{-1vyalAv}Niyk%-gwX~{Ayjmx2SpRfUI(77*S2BPOv)S6wAl?Y4q2we~=d zk10Udr)a2IAKsM?Rth>wbj=kk!|9V6>5jCKZ| zs(7k!LzY*I(rp`L(qLc<5YOq$TUzt{w17g)Gnd$Gp&@jvQjMeR?E1f-&{&&v)ojBJ z(c6^s0XOj8=WRPf5+clHD_!GX*RiMw$2R=)lWznZhFFmLN%s$Q-QHu*A5 zMWBaZ=0--54A%=$>X3_x{1$04p<=Z6DTE?;VAvnl7n&aVJ>SKjf>WsggbXVv+tdkf zVMUYu+#vcS*s?F3z?g&ovvs|L)C<;B3Zzgh+{zth#q6HCwb@?~z5bQ0-oFVVEc>7?%v1*S{dZ11SC^aN1+wD~!0z8IoD7{`XCGSo7e z{)wvHd_v=xKF29-KfkeH%+#d-BU;4~8*a8aSX}PBsP@%n1+V+g$M*Y;A03VN!eyPJ z-jOT`cMs7a?Y__Z2sL2$N936y%_-bSVIW$*8wE8Ghu&o#KADWfp^t{IGAf@JHMr3~3mVLp3B}h*R8LBBZ zf9h$&UEvJzWY4 zV|a${L;(|iJ!sFrzNOkuIrLG!)8VDrD$DPPPdgH}En@d$k7;X?2J44&DOfQ*OWIi8 z_{6qJTh>PME1K4>irutTcAdM&rAI+)B3nIL0j2rs_^u31ww|Ev4!?(YVh$|2{w4|c zg9Ih7mpwoSy;t&PdNW0^jCGwh8=}vi2nvk?(|>%%D8L&2x05s1rnFP}!6h@_@Ae0Y zVARTGcqOwW7tV;^ASxpCB{wJ(XU+P25;weW87%7eQT&xGFd*po#4q^dFX}pEWv=Cl zH1`!lpP%8c^}idxul<-tbmlwLfFGMkx-_CJtNUuEtc~_h9_Rcg_Nln;?r_6#5tXrA zm9Z(`$p^_p$#sZuRUemI>a;|Tb|3agOJ79?L3Pzqp};^Dt3)sv;mq7DF0aE z$3$mZEvgMMjEpCMuxQAOl9e*I>3`Bc9Cz`bRGI%AB^uXoY32@m7nWP2-*ESm(WBY# zcuCJkb8zQCJSV6$qts3|e)G+`9@2Fg4y;DXE;Z9&)Xa*+d#4NSCb?bvmsr1@btS6( zEq&`{${H>N%MsXmFQU}`p6H88sn5r0DV4_(I}w2;mN#?9D#=+=0=F4o3;t}oP)@&o z-a2Dh^m&qXmrU8gJ=UjWU`EF#%jjaJxlfS%WUPN39Ap4~L$XR3%eUPuSjES-z8GTp zO?#u+y((BZHD{ILzZbQF=v-kcroAPmyG zYURvX+Cr|0=}SlYragMsnWs2{JqeD*OR0j1g6}DQE_pg$3cJ>rG{UNVD@6NmKSs7? zu1bd_ei%y(hKBxLOlw|6n9j>g%>p;_nhgvjExKTAY7BN2e4gWnZL1m~msMqbD|3e` zbLD@R*q3(-X@mpZ_N`k)wU|hzRn002G~1niD?A48dU$`MExC$vj`BY4yzoh4uSM40 zovECD>xx4g3d9Ze_vDz^Ks*tLsnyVY@M)A#Emm&Cc&{nhP>0;@!JJ0I(2I9qX9bSr z@M99gFrBfy``frwtEBWFpB~`9W;NAM8(O;C6Pee-YhBp0I)fg@g{nEyWN$9teW>Bc z4MN&?Co0!p{i}6l&pdb>?&?m{KfMdSN5wj;a|JT}vti5JC_R4o3)d)Q&wM=frNd7e z+W1gg&5@P;c=L;nk2052F8D*R>q^7MJE34d9A8!nZgrNT(tDvf%(d4Fq#35Sa4uOVXO)B> zC^S6+>|zBWtw`x;&m;Zj>`}lVIHoWMN`7uiG$NzzP z?DJ{Y)x**8!{;A&%PuB!WdFMCa_tyS5;=DL=c?y2_f}D70{fqDxAkgw=A1A3#;;?i z4F!Bw1KM<}o>%w&PD?E~jdYF*gtJ@4u(Zfb=XaCl&?2U@ zROhJxfS#{;YB}4%tOZ_fsqJ4+rmF075xlSLU2&%0nvRfer-pk~v*C&1EoZt+qq4P@ z#*17#C`**&s3R@MeC`{ky-auW?Hq!}$8UFoWDd>=>x{Y2McptV904oC!f9~8K=k2* zSo@r2%=QZlxSa)@xZigSnj;fPL%0bCAkWEVL<*v&S1Tm`^lj|>j|!YcfGWBhzvS% zp#K^S8V;a0|H7JuCkt#fJqrxq#|Wsh_kv)?8^d)TX_}yC9S|dL8ab7vvE{E}&UIwV z75W36e}X}W@k;Ut{+B^j;A%jWVL^;_`y)i9rwn%p5q+#Lt7b@m*;nShLbtHj)qjo8 zNnQpOi6JLu_gZ4Q%KSrTlvnI40XG~0L-an3(nU)HXZ>-lw5E~fks>4ZzmZ zBpE_tWv&%J0stAzCk{J`=g5#;^$HmdB!SI6(-s|ggN&AkODZnUdGMd6~JKY9Nw#>=rDuqxY}F2s=;cLE}4wGzw75h*g}S(LU% z_uarEs}c=vw7VKg1a_&^d-e?k)?pCt);azhWv#DiBuaj!4ZFw&mTkKf`kn#c@E&0& z1~~#a^aJIez>Lk1q$O%*;preQrb~FX+o9j->0{0FiPDiW@AIAol()@FH-l(<9n?k{ z6ZjkrS*?VR^a*GoTdFYoh#NVn0CnbW7ZD#c_{Q$MC ze3Rlbaz-yq$RhkiX(rPsCy3sB%i0zAt!`FoJn{X4xUA}fl+}vDF+JSSxk8X@Qh58@ z0^JVAQ|d~St82hp@x`ImGY@`5Rz%V6Z1!H7PVuFqdY>>o`F^!h%>oB?-r0ME__=Wy zh%_vVl)l(sJ7XSWN)M|;vIg7}Um|Sk#vw`LH8iyo61D*jYpteUxRJWcdv(k;hP&?K zFmN_z^y*}Y+@32b!nft+j=ySS@n@=e&kB|O(sRmE60!967-*9Rq7RCR}D0h1OS=#%%ZC=oe5-lkQz=T#|^?lT+` z5S<-U2b=)zO|(Zaa#@>s6Fz}5DxOkaoh!@I)xN~YDLRE!bfl`|UH75{Y|b6~`|N8< zyS2rq^SJ(7w^}RjSA#jG)g`jN1xa8r<5k=$nG_lRgi2Z zW1g*z&L}rXyuOJ|eOm{o!+3tG<3dWnw{lKVWApiBUdRMXgLTR4Z10w?1(sNSadQlr%ZTa&KZ1T)UnN|{Wd|wXMJS&85))neAHyQaL3_C&B!|05Qq^5+%e}UhX`i+ zHrGu`dzxtAHEwRGm>00d5*siMLy!}}mTLJ_#6RX1PaMX1J>$l{CmfiK$_(=d%YwPV z*;~8UG$LtTz@0cRi8me$FEK#sO<<#)c+R@4x|Hw?$LE6F`F3pVwpi!S9x{997vAyk2LV7?C)*P zHRT&KR%Ygn&$q#qzx&zbdEA$vb#Uz!+-L#E{Y3vTn zD;l0_y(+{U7O+oO1-sEWoUZ|n(fd@rdR~RcR#jMsEa=5mz2it8ya+*ort12rwrPh- zER-{lcnoV~ZGoJckj)As9*$(S>W!`*Qznk#jtpB>fpQ6E*(j;_2-J^u~GgYq$AKFCExNQ z4MX23VX680y)p}LL~C|z5hPj99=gr)H))bbafP8gke7nh-mibRR(Z@S1A_5-pfV-7 z2bLNHFy22?GQTr1Y9fD5;{^OB;$C7IxbXwic~vd`H5hK1eWd0(+rJbwhawl&a@)X7 z&@gemrecNuc@%!}WsVV!Vb$raxHLS=lmzSBe~peA?JOwXVs7xq8)5 zIZL&8@jhclz|*uq7XodWM5;yV9?m9aRfG;hFA<1I7XJEKGXUJS78+5znep-*g*AZ+ z-a(l}>gEijvC>v&$YG1-mtq-u)#`UgNV3@`$pFSJP&%PQ2}=nn{Ow|vBeob-f|?q|>q3Z!XB%}^FBEuh*x&g) zlJX2Jp2Re5+W9t$ni^YHsSfs0_8xdzwL+S?4u3Ht>SGKf0*(%_d;Ca-Zuy?3nLy>q z()TUeF3vm{pu^nBYK=eEkx5Jc^?^_15sPIPV`Ojd%PQztBcgGpjH;3L2}=x`@mwEY z>NQb4XQN%5ISojv@aa3~S2?d0_1;`sXfBT0LHI!L%1+Kgt8d({QOJ{IM(WRcqpJU6 zQ2w`d)b2!cKe}Rik(OK^^w=n!)5#s|{D1NZ3Q2-u?Z&+Wkq4DMadwiQQ*io`D)R}jdSiYj&>B) z)c06RPE$O3B?ZUdX5D2@5aoc8|B(>`^TpTp*{)vUifI#3r6z4J-5DS#r+{l8;GOV* zPH8ktTl_|a(%m4Q;VklG_<2L0r2#<^?-Ho}(}k!L!=LqO8E>IuX5l?z=02q^B+}ws zO@ToZ+grijwtVbdpeZ=#S1jxz;Fu?D=FNwhuh73gLSay#o6y_I_um}t3)%L&drZ^o z7R{#`wW&VdGmp|*TrE;7=a9aYyuyZ?j$rm@XuZ0m)8J$>+PjdM5{cqsnw0jsO=OAD zobGL8;G!<_%rsvhxJ$*?-|xa-?NIpZe|LQBH2=4#)(;3ERds(JAG0G@ef*5Dz5xFwt^xrlBp4uc*1FL7TV z;{xCY2}L$JF*GCIwR#7u3ax0y`_AW%!gEV+s6#dMN1L}@e(YoI+X)$A&C#xgKVWmN zF7^Sqf;h#x3;ha5c1aisYaSy0-r`c3$MZMt2^3^>?(X+p2bVvWGX6s8u3OxXm}q0m)a%Az=2L8g-pl- z3j+^@>_Ft+UV0Qhy%T3~X{^-fW^qu_4-Ma;22C{jC58CGGhj7F;RnCy4&{oXc3w?< zZThxpNEAB!kzrj>hNpa_h{Mn+D#vSVXE7ZEAX6CoyBnVJU4Pp#F|zI8dZfti&O!nB z;%0djD*?>331t@d82d&gr3Y zIs-c-%wX(=y`}?rRne!g1^u`qKp6&nNAK%}7P_VYSlv!MYg=EzJdXDCuVzuwmP~k0 z8F@6|C&4>*|LM;r!+#!szMyk z&D!ndLin;(NR*{!s0d5$^RbOzyKCD$EtddabY z$t^J&UO}WHhBWZ?gI%e8Cb!@2RMFB=Z$@wEDLwc0sxG|{4_;(m2VQ@C*cBMakr+0z z>DK>^p8MA#3AHt>X8%~hmAvERi{&{Nkmg$e!`^BL<7&-$t=|E+0o-R?rDLaGFKb2N z1wrpvn051g*;5F0(G%wBA&s)_2|GU7^%Wyg{n4NiL&Vcyf$51}+g35bi#3W3(%c+T zSh$gX`P=3{frW*FA3brM^1|eUAI5}8yAH+`txgS;?5U&SxA{{buLU)9AJ?=;sivBq zQy^KLUr23t$IEc|={jQHVd`M%f#O6Tzx{M!hZXfOm|+38quK7c+b|$W`@1MIpCON- zfH0i&SU5<%VBnI^(>0|rMM$TYUT70L$eK&t4^L!;ja-%Pi)~cMZ`?YHi1{e>>SLt7 zyn#;%9~fwz{m)4>BUa@~?O7M=f?5|$_Hmi)w0nRz&z%#)G_)F`~dIc#Oj zqUqsn2;;wNv7T%7DHMMd>{OMf+_j%*`L)2OJfpLl~w2?5sR65Hs^T%JpF z)>L$%i;{OWPksw93e7kJ>|TF>Kr3`ps3slvcRWL@yPY$fH!byA@`ew@DRFQF6L~Q} z)yQA{e#Cvo41lLk{~jioCI_R9y)kcka#qD86n`7@30A>w4scZivGif7QsDY0{wx%R zRpl|m7VupPuM@E*FKqG?KQ)F~9Bs}&iscHYC03Al`|kj9D{~D;X-~&!8z`qJ>4fw2NTJprT#yC^yF_ZCA*|_~jPbJUP|NaIZ!<(RL%Re>vv3eQA`l~UFd2s0lIz1{?a2P@5DCQr4Z0YqvxXCbx?n4$c z&it(^Rkt#n<)d|HH;Y7lw&PzG*i_{3fz6EcD1E9R=Z$ijaeqJYN9*qUX+Wh7UqgOb{EdWr{v8hu9h|b26#O1;@;EIjjXx7rOJca?&V)T5^Ami{b$xBW0 zmFE12am`%2Mbf3QP8UMy+dg$@nd7V1FGL( zQ4(+8Kwb54yklmV7XfmJ07WU|DE3d?wvl>wehrg!+u1pk{G&4UCqi7O;f|rf3is!)Em3 zetY@Q4Wf&(r8Ol^>ZZ!%$HW>x`{K&v^9XOpf06S(zLVjNDTbwV42dP{u}?jm9}?w`yNhaFV!Y zC+7Y;vvq>5&g&xTf)v_EEK?Dq5dy*v!lqtwaagKL2O{Z|y?m51)+XKt41rwNj{ z4U8NUUjPp$c5{oO;xMjdqBuTbw`y&p_hisoJ8wEU=a1?=4Z-cJd+1~)zWlcBhi_+S z=%0*?Kj@HeUs2>GlBHWlikdlTs$-(yC$%*RQ#dB+^{zDiWCe%~(Y*shZN{$LZL5i( z&?TlHgrGssuc%K_TV=OT$l^wZ_b?eH%jNdZ$gK0r)4D6d5H*ZAz+5HZ(fCf;9)seu zUp3<|iM-6!frj6q7wb*OaSlfaa zd!h9}svW>=Vv8_v4kxm1W;LdzSA#^{_ogSj?EPe^Y%b|e(%$kg|cOmXCcNc5{$(p;i5xI%j?M0KL4I!wHQ|>SO z%JdjIGIQM;)wjd6O5eTrcWqp|oMXzPRICU^I+NdN*OWg4eihon{NXCTb%(0O*Bw2cv(*)w^6>?8Afyz@u$iiSwCQll_ z{gGUxTm7j!12i%FjIkg~hH}csmc8n2G1B2ZQb(sgWJX4bP$8sgNPCqwg|19@j)?o1 z(j9XAC)Vi%|Fl$s7X+bN5G_FIIwFB|mPdFOA){!5wKx+LY2x;-X-^Oi&yx@$GEDYx zsg}2-V@$#5>w7IX_(ecp4oL$TqP7K%0mil%CDtU}v~3zAWr&x7#Aok7oEh~nk>&?o z5LCfTNf(|KzMyf3KS<@sG1*k5b?FuRZq+?|1G=lWg3%P}M( z#??F<>GZ&@B3^&C_K5x~=gLdr;s2g`*b~p@1Xj^|KSPv~#_DKW&s7W7Ngf;-NWS{$ z;A8L5a*f;IOWCZ(r=~LfV4Xjp%55CCKt}1^5?+;}{OuJpR?$K~D=iatO#Te5h?81M zWMq_NnyN&(DHeeTnEzt*;d8@}z~gMsj?$~&H{R(_n{ucv7xZKMe(d<3@h;mMaJKfR zuG6~njBP~B6Jh}e-nhK`Dg0XwA#8}#{PxV5y6WGQ!MuQHc9t1xui1SPqYv=j?3ZHA z`!9!}@@0~aL$}K1A#brZorNI>0_)~!S34P3zGb35t(sXU?--s=Lvf?!&H6fzfie^- z)+jxWnoDGi0nv&BctIxx1E++FIA{ONz6gNq0>;&g9xsYlmJppZxjDh78|qloK7o23 z=+$l-(a0iw$QgkMa_JS)&F|8%w5d)xQrU%tyXZsi8w35xR#2$rjp z>?F*pnVBN3U*r*<((aP{;VSq%uMxAF0_2^)6te z6WOTj4c0Y5D0>BA=2}4aj0Iw=`n05l=XPw(qj&u7WdWG(3>H5J({z;-9M~Wg1Pkcn z4{^`EWs( z1mBC^UN-=))$K>>b=H;FlcA%jxo3C5KGK> zXdw7?M_due$>)}KZ1z(QRp)e!BFGhYX|4*-EG(F7ClQm0gu}-%$ZWJ!1Nk;Hw(}1~ z37Bt55ExqNqm7E?u|$3N+S0Q-pX_4+S-99IOCmgJeEXb|-(D6cEiys>ZMtRQrsC#zSBtp0lswHpFXxDW`pv)-8Fe9O# zU3-J&xYW8z#BV?V14|21AR|jBP!_^l+f8kK&%Anc^2Mla63{&Dz|RKZW&A_(q5G$jKW|Zr!%k9%|xmkc?m<%r{Ty5JneyYsW%TtS`C3ey(T7Jry z(Dh42%s2BB#|}yMmU(sJc*)AoY+MxB>jTpi5Z_v;cssaI2k2a4?&M9A&0u}rTcQKEQyXie@Vb{y z^z5f#f8D3|7LhBuIZH?m9|?6QT{B2DYUA~3+RBbi@av=oY$H7@W6Z-gzB+D-Z_p0_ zn864UKOYkE`1+_(%1I~EB+pUx4?3$xFSs5=V(LPLfhvX(Fj~*NG!(B zo6_%s>{W21%9#3~^5YRvDjNb^#%7kl6zQ>vAe2k@C;qkszGzh@DchCJ4Fm9)wan(Z=r` zSk{m%FM%-KBZ>|5v~x+{S20vrBOxf!FkQ)C(rsMdc5THEmJIM6u|$g=;i;ApNH zNUj%sK_3IPPuuBRT6|^CZ|b=gE8HPhPutgLh-Ip1^Q7tZBNb-sOLiGZizDSMoKwex zUz=GqMn~4mRYIoEgrCJ%lEV=@Up!xHH5&W5R+2kqI7;KehNYj#UWkm+-XR)*{KV*d zdnZc;_FCA)F-Nq6WhKXjyG7=O7xok1Z!%CPo=!)n=(@J#l^sT;V8+wlq@@Z-5c*du zZ5XJPdore=t8@~bC_CA)IdN~NK#kadM`#RrJ>FjZ0qjEbgl%v3v-W`zs43aDf9;&Z zXp%N9WhJCTA~sW4r(U?^!F`sHUd5$>46pYx34sXhUl zA*Od;d8!^d#2!$kjqD29{QkFftwuBZdE_4-My zey)N8>(R`B9#C2tNtUNW2l#6;h{q3-@ES^c!kpcvY8?Kla=(7S2Q0V*7EDeX9!l~l zd+recX=hq?=i z9G8vLBO$e1Z-H)HW1Kn6rxgb*p4Al>QvZDVD}yl@@_M~fjO2(gpK4F$du1ZQjGjl3 zHvV!2?d^sJVmZNYg&6n{9j)r`lgCE~*uGs1kdy5*O9Udwgug=2YgH0=;6hd0F4W9h>B-eaMwscgNICj?L~v5`LRQ17?*%7v2Wh)Enic%f4pbXJ}E# zIYVzG{X5cO?-yh((^TobpPEnmXQYOilv%M`=YwJrZVIlpM5ejecaLgwK7oe0oF1bD z)U!L>#Fz`C{0#&ihuu|TIV{!Cz_Jo|y=*(%QVX8)n&tDqr+ZcS8f`d3U$iP;7@e&h zXBP$G4As<5@yRsBbehEDXJ(~YT6OI0Q`ihUkRx8{FfLzg<`j1(H1^(W&13A$Qz)mn zq4k+7j>xe=Eo5x~zLY7aI0`i>?|-p7r}orv@{qDXR|%k9VP*RpmWG11rWgLdNDfRz z5sZqB2O(hUb<<&!!H;h$gq&~qDALVBPA#yCS8DsdXzgc*yd%x_NIDg+BV5P1lIOH;vKDZKmMzahb2`=~K6!b07AUb1(!_eI40>4?4ID*tJBxpFanV z&YHeas2>+m9$ClpE=^$ilQ(LcU2mGlT`paqEP*_x*!lB4{1Hv8#49}J#k0-USk1qS zBduv3Up$sl8m20di~!T9lNZ1HHoL(G%b2Ki4Z}1f)NcqB+o_h9_Uv9YfAR=!(>X1&kn>`1talk@7B1 z%?p5Cqtg0wngwvSTQ96Y09u^xCgiDlSSWv6>(ym#YN6XWG+TgMkfQuh-bL&#ni!)-9bX#KVCny z&?JNAj~7Fk^f6`j=pn!^c?)5dT`0@I1i&A?PH39^#isqVldpejPc)tjdC zgtA48uWB>2+F$oh)aNdkGgIpTcAFFHP7O^5685ojYq=juNe@M4`-1|y+sK@k};x}AY}9!y^A0-NJ5lR zqr@nqMTlOX-~02u`^Q~(opaaSXPy1+Z}0t`3w}#YLGs^qMMw!JE7sp)@{*N@7H@81w%`jWsy zG`49$u30_jd%5_G@mD#|DFa2|Ey+HXq{Ja(8b64Wst{-mtHiV*kF?7OiMX2QxZF2S z-kR%F3i45mWdi2>0-u$at3&F&IvNA_oNc!(g6*o>7e&${Ycy5RyPf#@>G~CefCt0^e8EyUn%Z_*% zscY%C7QEDM3-x)P3dOvLM_cz&9~n?o-%K;7%@28-BRK#B^s>&%w+ejWfq#m5uBNvC z5YU1pR=K%1e|?ypQZZSvDYK9c|Qe48URKuS@f`zI%- zZt(>PZ=;d6Q?xm0t42QH1lz;jN*s`b@Ra`Q9Dcn-!Ad){9Gss_rRhIZI%?zlJ9>sU zL#dU_51Qn@w#*0+EH1L4Qm^_b2oq>6{vNW{@vB;*T!@1*Gu`J1D5@$ef?*QZ_VAJx zZRt-l&V%1zngy1J`bcIiXBH|5-Iw8rJouU4EYB!aMDjG17DA1{Eb!jf^U3f2boY93 zn&p7VK~)WSt@J8IJmZ&Tx)+kooAU3tb8miWS!!qS!!(vanbkh7<_+>tI1~Te{#eA+ za}Qei^X7Wjce#g0J&e!9Z^`3(#In}tJLnA7W`L~;1{p-U(2RTVHKO*N?hxiL@GoOF z(~&y;349cq1I;5DFId}be(LVr&UH`U%x@?dvhpk-i`R6!fwf-gFDmnx)_x8DMcs84 z;PmMvhzsaHby%xr{6~>qxSMX9b2H#+jI)q{>8C%3LnD}XHkQsUU%3lKsI_V%M7Llt z%a4*~nO$j|Jrox3qYL39EGdi<_uAqKYTHH|*Ak$?%(qk|zHMDb*=UNgslAt5PH>> zwzRL-IMSZT-rcFHIz(-bqzm~O)J?+v+Jtp2bvr=g%s>>lTAJUUnEprFsgOU2|4nP7 zJi;_uNYw$%mb^T>{DNP;Zd=kEO-s`Qsq)H-pKJbm;R&`cBbPW#!356&pY1ZN^!oHeU+ z|7wxf+i5uT$ybffGvef=;JLz@Nb}f48q^)#hcb*MJgufLOt*(UgGV$c!{cNU>JP1v zQ2Up8)u#1lO58q8K|3?X0ebz!t-HBlC7C78iisqL2;%DNLG!9B>HDa zttUS_4lZ^y!F$ow%*&tpS@A|oVR9u_pF-5zp3=S2ED3l%_CWKh*cl=*bLa8Ij%G^s z1Buhj*98k}o-AXA>@DMVh2bN->x>$%$hQH-U&jsL*%k-mO}&y#e~0>(5MgGB$Mkcw zZ^hP{BrC=EmZl@^-s{iQ|JL;U%idL1j(|#JQy!HJW$3M4;6m(D-MaV8WwkOdB`SnS z^E7n=`LI96RIkH+2w^*O($PC@x$N~?YDU8%!$5h-m=eHAQW9v$y<~DCLW(t9Sa1qG zbb~15tl$URX~T?q=%=o$;N_$m5 zX~0wkmdo^=K96n9nv;)A@*YZxKu66_r#H>vtR@vk@zCWDO=f2IY&$W0+92n0G2h|s z);ZnC!^+a$UlZG!6J?7t{IkT5Kty!lq&=y^Wm9f>E_|D{n+VtoSAMkC%ahxv zuE6C)F14_?6@b-I2#}=iO;Qd+Kdth-TUms={S>6qR_Au1amULnGc=($EL&RZs3)`9 zpTd#SQ6a?(nwO$;met@D*yBfB>-qU@8vP+4+dvG|u!Y(=d7_f*9hVz^hn@D79o8$d z_O8>DW*Jlw1O|b(U!kfGl9x{OGmuX*kmS3$^@z10JFHqO1pDAcp8<-ve~C8Uw|H~; zk>U6Pwg5azNeM4FeQk#gnHDqW9qf#+`)RgUCAk8$9Skm9o<=37L3Ce>t&)FcA<_NO zMjK`PSMM((rON*wKXx~2&1tWnE3?~Vn=)^jkn>|WrkfB@^oz7GEJ`{NURar+BeJI@ zwDU^!10zo7$%f1y3KRzmArUCDiG0rX#*Ozcanb$9&aGZk~HQd|gMS30{U@egd(d~qI^Q;n0*FcdF z{mUT9UO)*H4*Yk%;~BPb?>BPC!sHULlPI#7DESqy^{HyYV-Wl*p7Ol*r~cZ!+~@nb z3W<5I9#ECy+tzaczBCf9X)Hy)XQU!mNsk&|7Kwk|u~dsFRC%L(N6lA1TEHj5XJ8DF zfieLV#5zuGsbJ`*Hmb#Iswib<(RtUHqd91gJi^i}Xan=4==nznln z$vVAZac!T_hVEzU<$3=t+sTiF0Ec&=98LW}WhmHKD|wX4C1iR-_CfVM_v=;49D1xr zLb>VAk+&dS;4_3s_yHpyr|=D8=D%=<3i{~{5mc3VnhFimnb}BGQycZvhVdwp98poL zqN@nm?+Ft&{FbHRUw)oek5|uZISq#4f?lxx_W?KS3#INL2BpwjC6xychLbFx#&r<@ z@$&ITvm1h=&u=#2-H{G}#;R~?X*GOfM z-7OrDbO83+vQ@cvx4Hk*YBGR*kt>x9sL-C1$@>YWU+|2Is_LZ$yzF~&ZCNZj! zu-Vu0Lcz%o0s0zG=_gKtwT@%PZtr{EX)tSEq^HBY1++tzHHPo?9?>^H=5&LBEJ{Xe z6{S(o3=gu1bIuhX@1-1@?HloG?vnfZ4FPTuHe{X$njwex5EaNgzOheFk3lDdNFJi+QHT@x-MNgGRTD)bOQo$8XFfq!%-; zRTiN?{DIF0Uc|LbIyILz`Nyc8!ZP|-B6khoVUdoe*6(B@?w*Gf0WA$^{;)2lWia(eNZ%kmAmeSY_>33Q zAf}a&VSD3uO&)o-?&+4*aypDd^XThxS%mj0yOT;xiTC*5GTy4>IlZ+>*PY3WeF4O_ z5brdOH;uq@>q!Y2@w+~;kRhT|n4RowO+QUyV;+^7hu^)WYwq%p^Wb`M^kB~t3D*0Z zy{6k}m(f9@-TJa-JJr-JZ9Ve`S5>3kl(S;hw-zu&IWZ+FnWoBEgZ$zjbjyY!xowz< zAPU1BI|t7&$@MR9xj=~kYK0kdDaxj{bikxE3hAh&>-G^`Mgll^>*OaM8Z$uLxDEoZ z17nAZns;aw2S3yqm_HA20c@ZX2hW&aaZYWpT|>{zQLE5V_HW0fth@CE^Cvu4zxj^C z=-!n1jpxBu65Ab7sg2YCq2kIo8E6}M|FLuTu|;2yVsGi8bx}MF-FPUok?G6Wxp$T9 zD9*q`NQ;Ebl!OXhPp6h}8I0}6E|riEcLSjyiyAW28WVwu4zn?t#>@F&*qAs3Yokde zyL`2@(fjC22P6-Y3NiTSca8#lvjcsHwQ&l!qEqPr0%o}CpI{KC2=ws2?l+_r}YI<_X0Tl`(!x#2M?_G5+Wg@8#(>b z*etZECd(KZO{9@)rAlG{DFwQ6(3~pjI`V!*?y8)?TdnRm2XHP25KK2wz{aY1GC+g| z$`ArAcl4BD`d2XWXblXAuo*l?RANjuXS=xyCw3*fgAN~eFvS5p$}J_|Nv8}@1zZ7X zO=RrUP?EitU;wOhu8p1b8>Vvr& zRMI9XA)ycvCo&|LTx_e6NuUP(W^N4?)I+V5TZmlGTMdLpH7e_-20{kVGhNA9<&zM2 zmCp0d8ZH|lZZBKJ?@|+>SU}Z#jW_#*~?QjB~O3nkQd^xXxlQb&HeV8%9H|jl4H8@j!0m zsX~&hr68u;WLI>B|F4?>m~!Iuobi76gc%}6Gd6q3*C_M|@r6rM zo)x$#MsOyXb|PH|MPO(ujGN}}4;gSDF|9F!EL%C?H`|V#e*KKR?Ye}S00Z(P` z#DZ1mRo0Qeg}#Q3dqb2BRStR3UrqySKK^aN4Wx|s_sux>J$YjH_szGr2_x`n-?{k* zXQF?H-TaonJ$jRW4^aYk$ZcRbb#YPWZ`_}Zsy6z4@@VS=UN;b93~wzUP-KPUM-N~{ zb_RA-#<1ymL!bzPgKdMDHNB_{SRm0`Rd_nk0g%da4{3j_KD@Qxx$yGwnKg}!s|?CL zRIjiAlpXYT0{V;!^XTKn%MTB&^#wHt@9w{-zaF}Lz=jC^FD2EI7b=f5v{?XP{!l%a z(x%rUZ>(@`GW$*-%*|01ho$lv3~RX?Z}mnaVxDpSx|QBCD|%S|(aUMZLJBjnE1}$J zaDMgg!OZKD0@mXC(hu(F;$OgL|CCzu78wt=6rt|yH$Bl5? z&Z1B+mflBxPIDZVn=<#_>%}&`CPRZ`o?ybCRZ}>XubwIxNNTUiw0>ec6k{o!2xVytjPCyT$5HaBga9V?6}`O2 zL6k@{p|#LIf-&IZCc8IW_lJ8-`%g;{pZ?(hSu&-gP}|CZ7#*TR+eb72z~-A(O-&G_ zDi%!5!*x9pFeBc4mGJsA>_WEsB|qDy>37TO&@{L=)o(I~s+y5{N<`Hyf2LB6ub!bp zr?x_tQlYK9pC*Xh`%=>DL)_R0zh$#_*}k&a8U+5ee10HAF{zTj7~)i5<{hk6ZCN{I zyTwkM_VsH>m2uf!b`D{-hD!E}(SdvKhoNXMs_=L8@58Z&UY}Ere5PHL-QuEHKz5rf zo&(zn%JhG~6MMR>Kjm%sC>uTh;Y>D6TnVEh{T!B-msV@fuIKL1$%w3Wesr&uhoJUu zU%Vu34;l+G2#JY|LhK*wvG?T>?auVF%rro`sWG|I`(c=(e$8%k4Cec+cqPJ2E_%@P z?(f8qTLa`+Fat6}J8j(5S)|}KUtspGbocRR^Ac@Uwu$LK z?*FV`Sj;4_3F1Aqwo0rA9$l#VJqRQ}*z_e@}_nV07xwzEAyR*X0WIn$5BTD@MDe z-9aj3e&_5;q3N&xt3+XULwJ@MNC~`kcmY&ZxtF*I@0$wY-&tSlXI_Z_uY4YKJz|y? z{FfS-!X)HG0Ggc7fy)NIb`Dg1WCks$qs(G!AA~}OPUAv%9?;|5W*)uSS5os}csI3~PUHIRnPb zrd_htfQkk!k8H*@h0;fzsOE&J6=-9p_b}Nd9XMV!Ut6-#J)+vjS>#~3B$%0xQ^FcV z3&2pWlFj>?6_`~1i)G#B>UH>U0BbYZ zjly`C$7%`0Yf5T~O=Fujo?W{nZ5XI@y6YObn^H)4VyU%~Xxx)N5;`|ThQiVLa)_sm zTRImMPznP4ZIrnBa@@AoeIMDFb&|?3bvBK>53z{K{0CS+_C}%IAE&a85Q#+JsH!fH zOs*9>8TX$I~DJ7Fc$zWB4AALc$dM!bK>&M`3Fh);+8$SQZR8mOJ48y`QMAgIz zy81O#nON>3nT9m*?|UJhy-Vg$u>)Eav`-avXV4SHj3}qmMpeP0hr(ZbncebY;^Un2 z?xdEM6u=HkUodP4r~fpzrt}*}9>5(cN|LL>$!5zrSp@{{TE}$64ES=p?&X9SyQKl# zziBWT&u+NqBi{Ro%RtMM(X7bBnw2kj`clG%XAK%BStxg7-BPsdx8IZ5XHZ8$p6FXI zYVgmR-v>bXt)%vED!ID8cN`0l@lyHl>64#(XV_Hz4jH!vP3xN#*9RlG1 z@OSHEJ|m6UXqs7>4a#jFPjGjrSo>Fou6Rb0b-4Fo z*l9)|cR%FOBImUySrAcuAV(YxaYvu-+TlnxaO z0w5_@KU4Y0$C?Xnn+Kfl$hXvo4;|qKMi3gNgjIQu$g1#Y&jYS0b3A|{1$C}UgWh#< zsPKMQH9tJ%E`wEvwY--g?ID-EO^}G(t-U>_W?x>aZ1|*O;I4iHk+sB05e-=QCe2P2 zYm}kTMGqg+q|G957a6Os3tvO{e(oHlPvrM_XD=r|(-r#CC0=vQ-&1jOpKG7>=)2DF zc~hlMu4aaw#2b749C(EHMY;F}k|cIS0vHegv$rMRO}D$Xja)^seDS+p=K)0q4RmCf z@P92V((_3ONumJu0fLQ>(`|I50(W24DM2Qe3t5c0M-elWIRKR4U$_My!WR;7K#&<; zw-mnM5~K>c0uqn;*N=H7M)c`vX`Js4wy-;`{9B)9|88ln4>blR*m>+pF#0~@wZ7o< ztXK&`XrbKr5&AN@{O(ANg~!hIaQNntSgcLO!z#jY#J#C z#pU+ZG27-0k^OCmJF;FMJ`OEPU$>9+4!<6u5~UmE55ff+#-9c+i_5Z;VCSb&46a=@7x_{5K z_wFwNOcP!!3jH|~{=$?m`ITsPXZ(&)KEhOMzK3YZ1;83NvprRBX4BaWx~Zf|DbTCXyuFmwjCAwmQbm^czI0E#@do3xhRVx*ZVY?ok);IFe-4Fik1u$Az zB%!4UR*`L$`piZOUAYkCkop;6_zUfF!4sF~JnMj(L^URBL2SAQb}|yZsf}Zhz0B-* zc#YLr%zmgmJewOm%kOFbqsw1S- zR=XAj^Z!LDlY3o1-I5iJxJ>~I)Wu@fr)z-Njc)0UmQ%{>8m}Kb>~Icgb<-lEVHdl% z7=81W4MQimu7wi8Yb9deQ>Y0a%XulRd3W^W@72~P4SUtg|9}4BdJCT$QF+^p4Y$_V zNE~N%nHyyG=>4PYOkntxl{-BxartFyHb!EU#(1P50^qi*C2W~6#mM5{99@~J@%}Or z@D9%M*6rLVpIWzy2@|XF)a|$+VMR->%!-0$4=#T2eO@7SA=GzhGq0!H@~w{ zhJ_vZqhG!eTfVW5IkZ>kwOfQ-uw6P362R%PQr~%_H)tW=v69-eWOY)~=HJ(f8vBh0 za?MM4FT&DEEM%rB`<2ZY&1`~kGbmto&b&Eg)5RzEQ_3)Fk7vp3V8D!d6EHT*rUJjg zY7!*jU0wIh8S!-wLO^6xigHOAIOfPkA?2i>$vlVQgvk*T52EEa4*+A3ycF%LO}*Bl z#9&&*_Qk%}EY!u@hlR^zR&%M0LfU13HFh{w?giHN?^ace?F|HA*=ky?`Mrm~uVeH3 zou+?t#uW~1x~dhdP1XY6+?%XamFaq)DRZnUoa_U|USNY?IcZA@b7K7|Fq4Sor&IlG zmHR+Y%kYT_uQ>cEZI0sL9}d&N(GgqnevWYLPHA~E!~zQYh*M%0RvPl7 zdeybbu58IqTuFsn$t^}J4uxBHGx7QS*wR$bm&pr!sg)m~n44F0m&qUGek{vw`Rc&h zhmY@R;PCkby%w~80UsA+xuPe+=m}24FN{3THfqCC;%#N`BYEiwqLWP|F=#6cK#m02 ze(6s)Qdf25LzUA^(GOoHu>0YCoJ)atYI&O7V;+c_d0ebfu>YMUyk>VxiUTXu2UF`8 zD=k{X{nC{?$+uj+Kj_{P{zpE6Ic)WJYADL4BtUOnc7__WxACOAC9Tvqr@+xvHj%F9 zm>LDr&oLU}X*IYdH}0d06F2(i)k_#*w{zxm8Fj7P=i-JCa&F2CmHTE>V7xjMho#NY zbVFISoJ_9wmTnh1Y+IDN><^`Sf4_;Z772%Z6}yz!Wq)uHGFttXu|52jnvZf`z##nP zR}`*$lrh%CKUHPxf4DX)pn+@6Cs2M&5YJF- ziML`W!m%>R7V7;K3@lKZn!V$h^dd*dGz^f2o_vWud!5~(JdV=E$q-xECu@zlIbkbi zTtEf)F%-xLl9gl>o#X6$cloy2+5>8Wj6(<5weoc7}8iqh2r0~Oe#EVyDUz9v(d z^r=ma545lj$Xi6ARfK%dmM@o|TDoscnG|<~F!co8kp3gsA-X4O@wIv)#}~q!ctl1} z40~plr1wsdsVXNn2Lm8?+Im1UJalKt=aq$9a*q*KBFuRS zz?Aw<$+-V@pMD@3jAg8OMp47vx0cw{FMktQWr=R-^7zm-4e7#Nyo6-yIR`~UkybT_;-~+1|q=57@UR}_%w==Y{@ONpX zsK0W8bTt)rhh;PAqaiNc$m7EwU+tkX)gfejj!7C-YLMu?#`H=6!6Fh#{)NbJ4Pzw8 zc<3T~e(krJ6T@5Q@mTck$bazUh(d#T{VK2D-~`ub`JGb{gShR620EQ>N>&?W<|Cz* zX;4@E9inG`HbA*8hR3K)CTx{o=SF;x6QBEB#`z7n*P_lxHlB__|8jIwTC{Ze#4m52 zwXH9{26zxBOfJClA)uyUHW6>B#qOY|HDrXq!D4pxx{r}Bjs8qJ{`BwhERd~l!bfUC z*%AQ+{qB}*hke)n2R{oEO-}6{lA-gc@#RU?p>;;NR)E4(yp`|JeMYSQ^*QFqW_+R} z!(E4%ZAn(z1mmuvD5Ky5_QG^)BmScd+!;@+jQbQge{bVH6ZTbiP+vFjmBn-J9@6f!wRzV!nF{#kC@uwZHT zn(Ytr2{OFgb6L_udvEtP*bD70*oi?@Nfw)X50v;=|K)zFI&j-GkJ&YIIF}l-WZCT> z`!qg^Y-mJDmj|6gr2pJm1s#mxf@UUDL_p0*ZUoN%rsyh;?>@Hcd@ z0vsDsQIIJ=4I= zEuT0ICwTY>ew={1QX%cTX^)RBQ!Iucxib@1HGkCnP?Dc-pt|R8*R2u zeBiaa;FDHX8?R8)5_IS)yt7ss^ebT*PqvIF53)-;3jD+*H!iP>J7cQhE?NV1*=ViC zzy-&ic_`^!ku2dONC~7bM=~yM7o)6d_QDuPcAq-@((A+gYVWz;hRfSU1n!txa?}Tb z5Ody|I_L3_qp{w2iz9LZc?{YoA*~EPn_V^n2Nq&ofDfd0( zwF8y>D_8tC@UJZG$J0*A5O7aRX5|1up<^nZb7(}Q<}&ZW5|Ym-S|Xke@y?KEkzo|bE+IbVs3d8Dr=^~v_Y|0ZOmJ{|!UO58r( z8#1=jwsO4mnvN{O3{yXT*~7_j5*_d)Sk^t<3E$X)r$QM zx@)Km z{!%cjk@-tqIh_+4wN{GT?OFM$&D7FqIaMr&DZzY$K99?UMC{dXb2YyB!|O5v=Xaz5 zu8wZVg@!Le2U@~F1GNf~d(7}vfjS0@foH#p6hYOw&n%iOZvE1&L%$#&*-LpCeEuMO z$nFn#Sq+N@<^K-ZsEHrFW#cV9=oZ&k)#tuKC*pb_mo}Q0E^`W13$r&r$4B?7t^;tz zkB71tGe$C)E|N{ie1a4%p)GWST zFchX=(v`aO3_Gz4i}-NvEi0H%y--iCQc4*xE)VbxiGoHLkAk$`6d?D=;P%CQ&jx=K zAn9f<_^ zXl+FSU`Frppn&u>;=VOja|CJEzjy}8^7{}|;0&s5eJGhDJ?h1WSgC4wb7%6VqP@{2 YIR5A^kkG|7`lK7=yXpZ1_nm37n0Z^J~^ceqP-?DFo5J47*NI6Koo;zKm@8@xGchr4 +>chrX ATTCTTGCTCTGGCACTTAAAAGAACAAAGCCAGTAATTGCCAGAGTAGAACAGACAGAGCACTAACGATGGGACTGCAAAAAGTACTTGGGCTAGCTAGCTCAAGAAGGTGAAAGACAGGTTACACATGCTAGCCCTTGAGCTAGGAGAAACAGCTATTATCCTAAAGCTGAGGAGAACCCAATTTAGCTGTAAGACTAGGGATGAGGGAAAAAAACACAACAACAATAAATCCTATCTCTATAGTACCTGGTCTTTACCCAGTTAGAAAGCAGTAGAATGAGGAACagagagaaatagagaaacagagacagggagagagaaacaaagagagagagaggggagggagggagagagGTAGTTCCAGTTTATTATGTCAATTATGCACTAAAAATGGGGTtgtcatggtctaaatgtatttcctaaaacttatatgttgaagtttaatcaccaatgtgttagtattaagagctgaggcatttaggaggtgattaactcatgaggacagagcctcatgagatcagggtctttaaaaaggcttgagggagtgattggctctcttctgccctcctgccatgtgaggacacagcgtttcccttctgatgcagcactcaaggtgccattttaaaagtggagagaacagcccttatcagacactgaatctgctggcatcttattcttggactcccagcatctagaactgtgagaacatagcttcctattgtttacaaattacccagtctcaggttttttttgttgtagcagcatgaatagactaagacaAAATCTTACTTTACCTTTTGACCAAATTAACATGCATGGACTGGGAAAGTAGATACATAACCTGTTTTCTTGTATCTGATTTGTCTTACTTCTACCTTTTTGCACCCACTTGATTCTTTTCTATCTTGTTGTATGCTGCTTCCTTTATTTGCCTATCTTAGTGTTATTTTCTTCATTAGGGTCTGAGCCAAGCACTGTCCTGTATGCTGGAGACTCTACCTAACTGGACATACTTATTGCCTTTCAAGAGCTTGAAAGGAAGATCAAAACCAAGGCTGGGTCAGGATATTTATTCTTAGCAAGTCATACATTTTATATTGAGAAGAATTTTATAATTTGAAGTATGTTCATCATTCCAGTGGAATTTTGATAAGAATTTCACCCAACCCATGCAACTGTGTCAAACAGAAAAATCCTGATGGAAAATCATGGCTCCATGGGGGAGTTTGCTCTGGTGGAAATACATCTGTGTAGGAGTTCCTGTTAAAAGAAGACACATCACTACATTTTACCTGTCAGGGATTTTTTTTAAGTCTACATATGCAAAGGCAAAGTTGTCACTAGCGTGTAAACACAGAATCTCTGGGAAGGATTTGCAATATGACATCTTTCCATGTCTGCTGTATTTGATTCGAGATAGCAAAAATGCAAGTGGTTCCCCAGAGACTACCTATACAGGCAGGTTTGGAGCTAGTGTTTGTATCTTGCAGGGATTGTGAAATAAAGCGTACCCATGTTTAACTCTCAACACCTCTTGCCAGGCTGAGACAAAATGGTGAAACAAGGTGGTCAGCCTGGAgttgtgaaaagagggtgggatttggattcagcaagatgtgagtttgaattccagctctatcactTTTAGTTGGGGAGTCTCTACAATACACTTTGATTAGGGTAGTGATACCTACTGAGTTGCAGTGATTAATGGAAACACTGCATAAAATAATCTTATTAGAATGACTAGAATTTAAGAAGCTGTTacacacacacacacacacacaATAAATATGACGTTAGTTTGACAATACTGACTACACTACACATAAAAATAAAGTTCAAATCTGTGGTTTAAGGTTATTCTTATGACATTAAGACCATTTGTGATTTAGAAAATCGTATCTAATACACGTGCTTATGTAAGGGAGACATACACTCCCCCCAAaatagctaaaatgtatggaacacataatatgtgccaggtcctgttcaaaaaaatttacatgcatgattctcaaccatttatactaaatccaattattagtccaatattatcttcactctgcagctttggtaactaaggcccagtaaggttgggtaatttgcccaaggttacacaaatactaagtgatgtagccaaatagatgtaggcagtctggttctagagcctgtgaagttacttatcgtaccatactACCTAGATTTTCCCTAAAAAGTCAACAATTATTCTTAGAGCAGAGGGGAGCCTCATATAGTATCAATTTAATTAAGAAATATCACCCTTTCCTTGAGGAGTTTTAAGTTGAATAAAAAATAATGGATATAATTCTAGTTACTGTGTGGTTAATTGCTAACACATTGCAGCACAACCTATGTATGTTCTGGATTATACAGGTGAAAGAATAACAGGGCAAAATTTGGAATGGCCAAGAACAGTCTTcattcatctgtccttctatccatccataaatccattcatccactaatccatccatccattcatttgtcaatccatccatccatccatctacctatccTTTCTGATTGCAGTAAAGGGATGCagagcaaaatcttacagttaggtagacctaaattctagctcctgttttatcacttactagctgaatggtcttgagaaattaaggtaacctgattgagtctcctaaaatggaaaaataattaaaatatacttaatggtagtgttatgagaattacgtttatagacttctaggcacagtatctggtacaaaattaacactaaataaatgatggttATCAATATTAACATTAAAGCTTTTATGTCAGGTACCAAATTACAgttatgttctgaagatataaagattaataagatatatttcttgccctctaaaagattccagaagtttagtagggACTTTACACAAGTGTGATAAATGTGGAATGGTTAGTTCTACCATCGTTTAGGGATAAGGGAATCaagaatactaatagtagttggctgggcatggtggctcactcctgtaatcccagcattctggggggccaaagtgggagggtctcaggaacccaggaggttgagactgcagtgagccatgattgtacctttacactccagcctgggtgccagagtgagaccctgtctctaaaaaTTTTTTTAAAAGAAGAATGTAatttgttgagattttctgatgttcagatgctgagctaagtagttgtagcacttacttactatctcatttaattccaatgagaactattattacctctattttacaaaagaggaaagtgaggcttaaagaaatgcagtttgttgtttggagtcatgcatttaagaaagggaagggccgagaccttaacccaaatAAttttttttttttttttttttttagacggagtcttgctcagtcacccaggctggagttcagtggcaccatctctgctcactgcaagctctgtctctgcctcccgggttcacaccattctcctgcctcagcctcccgagtagctgggactacaggcacctgccaccatgcctggctaattttttgtatttttattagagacggggtttcaccgtgttagccaggatggtctcgacctcctgacctcgtgatcctcccacctcggcctcccgaagtgctgggattacaggcttgagccactgcacccggccAACCCAAGTTATTCTTAATGTGGAAAAACTAAGAAGACTGCCTCTAAGCTACATTTTAAAAGATTACCACCCATTTTTAAGTAGATAGATATTGAGGTGTATTAAAGACACAGGAAGCAGTAAGGAAAGTGACAAAAATAAATCAAACTATGTGGCCCAGTTAGAAAATATGAGGTGattcagtgttcattcaaaaattgtttattgagggtccattacatgtcagggatctttcctaggtactgaaggtgcatggctgtaaaacatagactaaaattcctgccaacgtggaccttattatccattctctgggagggagacacccaaagtaaaattaaaaacacatgattaagaagttcattataaagtaatatatacaacatataataaagtataaaaacctatgttctaagtgctattgagaaaaaaaaagaagcagagaagacatcaaagtaggttgaaagtagggagattttctggttttaaatggatagtcagggaaggcttagccaagaagatgatatgcggacatataagctctgaagcgctgagagtgagccatttgggatatctagcagaaaagcttgccaggtacaggaggtagcatttgcaaagacccaggaaggaaaaatgactggcatgttggaggaacagacaggaggtcaTGGATTGAAatatatatatatatatatatatatatatatatatatatatatatatatatacacacacacacacacacacacacacacacacgtgtgtgtgtgtgtgtgtgtATTAGCAGAACATagaaaaagaaactagcaatgcaagagaggaaaaggagaattctagagccaaatctttgagcagtggaaggggtagggatctagctcacaaatATGGTTGATGTATGAAACTTGTGCATGTCTGGTGGGGGTAGGGGTGAAGGTGGGAGTGTTTGAGGTCAAATAGGGCAATTCAGCCAgagattaaattagaggtgtataggaaccaaatcttgaaggctcttccattacaggcatagatgtttggactttgtgctagtaaaaatgaggatctttaagggttttaggtacaggaatgaaatattccaataagtttcaccaggaAAGTGAAACTTCTGCTGAGCTTTGAGGCATTTATGGGATTTGAATACATGAGAGGTAAAAGAAAGAACAATGTAGGCAAGAAGGTCAACAGTAATAATGACCCAGAATTGGAATAGTTTTGGCAGATGGGGAACCAGTTGAAGATGTGTGCTGGCAGACACCTCTGGGGGAAAGAAAGGAAAATATGAGTTATATCTAAAAggctctggaatcaatctgcttaccaggtgcacactctatcacatactatctctgtgagcttgagcatgatccctaatttcttgctatctaatcacttgatataataaagataataatagtaatttcctcataaagaaaatgaaattaaatcatatggataaaatattacctgtatgcctgacacacattgttctcactaaatgttagttgttattCTATATAGCATCTTGGGAACCTGGAGAGGATGTAAGAGATTGGTTCCAGATGTATGGAATGTACTAGAAAGAAACATGATTGAAAGCACTATGATTTCAGGAGGCTGATTATATTATGTAAATCTAAATATTGATCTCTAATATTCAAATTCATCATTTTAAAGGTCAAGGCTTATAACAGAATTCATCTAGACTAACCCTGGAGATAATATAAGAATCAAAATAGCTCTTAAAGTTCAACAGTTAAGCAGTTGAGCTGTGGAAATAGGATATTTATGAGTAGCTTACACACACACACACACAAACACATATATATTGTAAATAAACACAATGGGGATCTCCATACTGCTACCAAATTTTCATATTTTATAAGCCACAATGTGCTAGAAACAAAATTTGGTGCTTCAGAAGAATGTTTGATTAATGATCATTTCTTTGTAATTAGTTATTTGAATGGACAGCAACATCAACATATGGAAGAATTCAGAAAAAAAGAACACAGCATCGTACTTCAGGAGTCTTTACATTTTCTAGAAGGCCAAATTATAAATAGAGTCTTATCTCACCTATCATGAGAGCAGATTGCTACTTATGTATCCATTTATAATTTATCTGAAAATTCCATTTTTTAAAGTTTCTGTATATTCAATGTATCAAACTGGATAGAGAGTTCTTTTTTCAAGTTTTCCAGTTTGGTGAatagatagatagatagatagatagatagatagatagatagttagatagacagacagacagatacagatatagataTAGatgtgtatatatatgtgtgtatgtgtgtgtataCAACATATTGCTTTTCCTTGTTAAATAGCTTGTTAAATTCAGTTCTGTTTCATGCGTTAAAGATTACAGCAGGTAGGTTTCTAAAAAGTCATCATAGATTCTACACTGTTTGGGAATACATAGCTAATATCTGCAACTAAGGTGAACGTGAGAAAAATCACTTATTTGGTTTTGCAACTACAGATCTAACTCTCAAGCATTAAGATATTTCGTCAAATTGATGTTGAAAGGTATCTAGCAAATTCATGAAATCTCATCCTTTAATCCAAAATTACTACAAGGAATATGAACTAAAAAAGCCAAAAGAGGCTATCACATAATAATTAGATACTAGATCAGGCAGTTTTTGTGTTCAGTTCTCAAAATACTGAATATCTCAAAATAGTGAGGAGTGAATACAAGAGTGTTAATCAAGTTCACAAGTAAAATTTTATTCCCAATTTGACAGGTAGGTGGGTAAATATGAAACTTAGCTGTGCCTCATATGGTTGTGTTGATAGGATTTAATTTAAGTGACTGGATGCCTACTAGATTATTGGGCAGAAAATATAATTTAAACATAATTTGGCCATTAATTAGTCTTAATGGATTGGATAAAATGGCTCAGTGAACAGATTAAAAAAAATTTTCTAATGGTAAATAATCCTGTAAGAATATCTTTagcctgagcaacacagcaagaccttgtctctacaaaaaatgtaaaaaattatctggctgtggtggtgtgcacttgtagtcctagctacttgggaggctgaggcaggagaatcacttgagcctgggagttgagggttgcagtgagttatcatcatgccacttcatgccagcctgggtaatacataatctgtctcaacaaaagaaTATCCTTAAAATACAAAGataaaatataaaaaaaattttaGTGAGATTTTAGAAAGTAGAGAGATAGAGTAGACAATTTAATACAACTTTCAACTAAAAAGGTCTGAGAGACTTACAGAAGTTTGTGTTGTAGAAATCTAGTGTTTGGTTTTCTCAGCAGATATTTTCTTTTCTTTTGGTAATAGCAGTTGGGGTTTCCTTTGGGAACCGATTCCATTTCATATTGTCATGTTATTTGATATCAATAAGAAGAGCCAAAAAATTTAAGGCCAAACTACAAAGGTTCATGGCCAGGTTCTGGCCCAGTGTAATGATCAACAAGTCATATAATCTTCATAAAGGTTGGGTTTCTCATTTTCATtttatcttgtttatttatttatttatttatttTGTgatggagtttcactcttgttgctcatgctggagtgcaatggcatgatttctgttcactgcaacctctgcctcccaggttcaagcaattctcatgcctcagcctcccgagtaactgggattacaggtgctcctccatacccagcaaattttttgtatttttagtagagatggggtttcaccatattggccaggctggtctcaaactcctggcctcaagtgattcacccacctcggcctcccaaggtattgggattacaggtgtgagccaccacacccagccTAGGGTTTCTCATTTTTAGAATAGAAGCAACATTAGCACTATGTTAGTAAAATGCCACCACAAATACTTTGCAGAATTACTGTAACAATTGCAATAGAGAAAGGATTGCTACATCGATATTAAGAACACCTAGGGGGTGCTTGCTAGTGTAAACCAACATGACTAACCACAATAGGTGAGtgtattagttcattttcaccctgctgataaagacatacccaagactggctaatttacaaggaaaaagaggtgtaaaggactcacagttccatgtggctggggaggcctcacaaacatggcggaaggtgaaaggcaaaaggcacaacttacatggcagcaggcaagagagagaatgagagccaagaaaaaggggtttccccttataaaaccatcagatctagtgagacatattcactaccatgagaacagtatgcgggaaaccatacccatgattcagttatcttccaccaggtccctcccacaatacatgggaattatgggagctacaattcaagagatttgggtggggacatagctaaatcatatcaGTGAGCTTCATTACAAGTTTTAAAAAATGAAGGCAACAACAGGTGAACTGTATGGTATGGGCATGATAACTTAATAGAGTGTTTTAAAATATGATGGCAGTTTATTTTCATCAATGGTTGGTCTTTTCATTATCAACATGAGAATAACAAATGTATCTTATATAAAATGAAGGATCAGGTTGATGTCTACTTGACATTCCCCATGAACTACACTGGTATTAGGGAGACATTAGATACTCCCAGATATACTGGCTTTGGCTTCCTGGGAATGGAGTGTAGCTTGGTGACAGCTGGGCAGAGAGCCTCAGGCATGCTCCCCATTCCTGTGCAGAGGGCACAGCCATGGAAAGCATTTCTTGGACTTATTTATACCATCAGTACCCTAATGACAGAAGAAATACAGCTTATTTCATATTCATTTTTAAAAGTAACCTTAAACATCAGGAACACAAAAGAAGTGGTAGAGAATTCGGAGGTAATAGGGGAACAAAAAGCATTTAGTGTGATAATTTAGAGGGTGTGTGCAAACTTCATGCAGATTTAAATTTCCCAACAGCTTGCAAATAGGTTACTCAAGTATTCAAAAGCAGAAATTTCTAGAGGTGCATTAGTCGTAAAGGCCATGGAATGTAGCCTTTTATGTTTATTGTTTTCATTTACATTCTGTCCAACCCTGAGTCTTGTAGTCTAAAAGGGTGTGACATGTGTGTTCAATTGCATCCAAGAATGTCTTCTGAAACTGAGTCCCTCCACTGTAGAATAAaaacagcctgggttcaaatagcagactcacttcccagtaaagctggcccattgacttgggccatttacttgtattttctgagcctcagtttcttcatctgtaaactgaggggctaataatgtacttacactatagtggtaaagataacatgggaaaatatatgtcaaatgcataggtccaggggtatagtgcatttgattaagagcacaacaaatAATTACAATTATCATTGTTACCCTTTTTGTGAAtttttttctttttttttttttgagacagagtctcgctctgtcacccaggctggagtgcagtggtgcgatctctgctcactgcaagctccgcctcccaggttcaccctgttgtcttgcctcagcctcccgagtagctgggactacaggcgcctgccaccacgcccagctaattttttttgtatttttagtagaggcggggtttcaccgtgttaccaggatggtcttgatctcctgacctcgtgattggcccgcctcagcctcccaaagtgctgggattacaggcgtgagccaccgtgcctggccTGGCCATTGTTACCCTTTTTGATACAGTTGTGAGGAGTGAAAAATACTCACGTTCAAAAAGGTACCTTTCAAGGCAGAGTCTCCCTCCTAAGAGATAAGGAATTTCAGAAACACACCAGGAATCAAGGTGGATAGAGAGTTTCTCACCTCCTGAAAATTACAGTCTAAACTTTTGACTTCTTTTACCCCAGAGTATCTTAAGTTTGGGAGGAAATGTTTGTGTTTTCTCTAAAAGAAAAGCACAGAAATATCTTCTGCCTAAATTATTATTATACAACTTGTTTTTTCTTCTTTTGAAACCTCTGTGGCAGATAAAACTTGTATGTTAAAAAGGAATTTAAAAAGATAACACTTGCAAAATTTTGTGAAACCTAATATAATCTTCTCCAAAGTGTAACGCATTAGTTTTCCTTAAAGT diff --git a/share/test-data/paragraph/long-del/chrX_graph_typing.fa.fai b/share/test-data/paragraph/long-del/chrX_graph_typing.fa.fai new file mode 100644 index 0000000..076dd55 --- /dev/null +++ b/share/test-data/paragraph/long-del/chrX_graph_typing.fa.fai @@ -0,0 +1 @@ +chrX 9816 6 9816 9817 diff --git a/share/test-data/paragraph/long-del/chrX_graph_typing.manifest b/share/test-data/paragraph/long-del/chrX_graph_typing.manifest new file mode 100644 index 0000000..4de8544 --- /dev/null +++ b/share/test-data/paragraph/long-del/chrX_graph_typing.manifest @@ -0,0 +1,3 @@ +#id path depth read length depth sd sex +SAMPLE1 share/test-data/paragraph/long-del/chrX_graph_typing.bam 44.2 150 20 male +SAMPLE2 share/test-data/paragraph/long-del/chrX_graph_typing.bam 44.2 150 20 female diff --git a/share/test-data/paragraph/long-del/param.json b/share/test-data/paragraph/long-del/param.json index bdbc701..b39a458 100644 --- a/share/test-data/paragraph/long-del/param.json +++ b/share/test-data/paragraph/long-del/param.json @@ -1,5 +1,11 @@ { + "ploidy": 2, "min_overlap_bases": 16, + "min_pass_gq": 20, + "coverage_test_cutoff": [ + 0.01, + 0.0001 + ], "allele_names": [ "REF", "ALT" @@ -17,8 +23,8 @@ "0/1": 0.3, "1/1": 0.2 }, + "reference_allele": "REF", "reference_allele_error_rate": 0.05, "other_allele_error_rate": 0.05, - "other_genotype_fraction": 0.33, - "ploidy": 2 + "other_genotype_fraction": 0.33 } \ No newline at end of file diff --git a/share/test-data/paragraph/pg-het-ins/genotypes_expected.json b/share/test-data/paragraph/pg-het-ins/genotypes_expected.json index 6cf0707..d648021 100644 --- a/share/test-data/paragraph/pg-het-ins/genotypes_expected.json +++ b/share/test-data/paragraph/pg-het-ins/genotypes_expected.json @@ -120,12 +120,13 @@ "REF/chr1:939570-1:1": -11.425, "chr1:939570-1:1/chr1:939570-1:1": -34.357 }, + "GQ": 73, "GT": "REF/chr1:939570-1:1", "allele_fractions": { "REF": 0.54545, "chr1:939570-1:1": 0.45455 }, - "coverage_test_pvalue": 0.00013463, + "coverage_test_pvalue": 0.075848, "num_reads": 22 } }, @@ -146,34 +147,44 @@ "REF/chr1:939570-1:1": -9.087, "chr1:939570-1:1/chr1:939570-1:1": -29.247 }, + "GQ": 87, "GT": "REF/chr1:939570-1:1", "allele_fractions": { "REF": 0.46154, "chr1:939570-1:1": 0.53846 }, - "coverage_test_pvalue": 0.001772, + "coverage_test_pvalue": 0.11888, "num_reads": 26 } } }, "edges": { + "chr1:939571-939570:CCCTGGAGGACC_ref-chr1:939571-939720": { + "clip_rate": 0.0, + "contig_length": 162, + "gap_rate": 0.0, + "match_base_depth": 6.4691, + "mismatch_rate": 0.0075758, + "num_fwd_reads": 7, + "num_rev_reads": 9 + }, "ref-chr1:939420-939570_chr1:939571-939570:CCCTGGAGGACC": { "clip_rate": 0.0, "contig_length": 163, "gap_rate": 0.0, - "match_base_depth": 5.5215, - "mismatch_rate": 0.0077178, + "match_base_depth": 9.362, + "mismatch_rate": 0.0039164, "num_fwd_reads": 7, "num_rev_reads": 8 }, - "source_ref-chr1:939420-939570": { - "clip_rate": 0.0048808, - "contig_length": 152, + "ref-chr1:939420-939570_ref-chr1:939571-939720": { + "clip_rate": 0.0, + "contig_length": 301, "gap_rate": 0.0, - "match_base_depth": 22.862, - "mismatch_rate": 0.0022969, - "num_fwd_reads": 14, - "num_rev_reads": 14 + "match_base_depth": 5.9834, + "mismatch_rate": 0.00055494, + "num_fwd_reads": 7, + "num_rev_reads": 5 } }, "gt": { @@ -182,12 +193,15 @@ "REF/chr1:939570-1:1": -9.087, "chr1:939570-1:1/chr1:939570-1:1": -29.247 }, + "GQ": 73, "GT": "REF/chr1:939570-1:1", "allele_fractions": { "REF": 0.5, "chr1:939570-1:1": 0.5 }, - "filter": "PASS", + "filters": [ + "PASS" + ], "num_reads": 48 }, "mean_graph": null, @@ -197,31 +211,31 @@ "multi_read": 0, "nodes": { "chr1:939571-939570:CCCTGGAGGACC": { - "clip_rate": 0.0, + "clip_rate": 0.010417, "contig_length": 12, "gap_rate": 0.0, - "match_base_depth": 60.0, - "mismatch_rate": 0.0096286, + "match_base_depth": 16.0, + "mismatch_rate": 0.0, "num_fwd_reads": 7, - "num_rev_reads": 8 + "num_rev_reads": 9 }, "ref-chr1:939420-939570": { - "clip_rate": 0.00080906, + "clip_rate": 0.010522, "contig_length": 151, "gap_rate": 0.0, - "match_base_depth": 8.1788, - "mismatch_rate": 0.00080906, + "match_base_depth": 15.682, + "mismatch_rate": 0.003367, "num_fwd_reads": 14, "num_rev_reads": 14 }, - "source": { - "clip_rate": 0.0, - "contig_length": 1, + "ref-chr1:939571-939720": { + "clip_rate": 0.0057173, + "contig_length": 150, "gap_rate": 0.0, - "match_base_depth": 2519.0, - "mismatch_rate": 0.0039541, + "match_base_depth": 12.76, + "mismatch_rate": 0.0051975, "num_fwd_reads": 15, - "num_rev_reads": 15 + "num_rev_reads": 14 } }, "paired_read": 0, diff --git a/src/c++/include/genotyping/BreakpointGenotyper.hh b/src/c++/include/genotyping/BreakpointGenotyper.hh index 8616029..52af380 100644 --- a/src/c++/include/genotyping/BreakpointGenotyper.hh +++ b/src/c++/include/genotyping/BreakpointGenotyper.hh @@ -44,6 +44,24 @@ namespace genotyping { + +/** + * structure for passing parameter into breakpoint genotyper + */ +struct BreakpointGenotyperParameter +{ + BreakpointGenotyperParameter(double _read_depth, int32_t _read_length, double _depth_sd, bool _use_poisson_depth) + : read_depth(_read_depth) + , read_length(_read_length) + , depth_sd(_depth_sd) + , use_poisson_depth(_use_poisson_depth){}; + + double read_depth; + int32_t read_length; + double depth_sd; + bool use_poisson_depth; +}; + class BreakpointGenotyper { public: @@ -55,11 +73,13 @@ public: /** * public function to do breakpoint genotyping from read counts on edges - * @param read_depth expected / mean read depth - * @param read_length read length - * @param read counts for each allele + * @param read_depth Expected / mean read depth + * @param read_length Read length + * @param read counts Number of reads for each allele + * @param depth variance Variance of depth across genome. If non-positive, Poission test will be used */ - Genotype genotype(double read_depth, int32_t read_length, const std::vector& read_counts_per_allele) const; + Genotype + genotype(const BreakpointGenotyperParameter& param, const std::vector& read_counts_per_allele) const; private: /** @@ -75,8 +95,15 @@ private: /** * cutoff for coverage test p value + * 1. lower end + * 2. upper end + */ + std::pair coverage_test_cutoff_; + + /** + * minimum GQ for a PASS event */ - double coverage_test_cutoff_; + int min_pass_gq_; /** * genotyping specific stats diff --git a/src/c++/include/genotyping/CombinedGenotype.hh b/src/c++/include/genotyping/CombinedGenotype.hh index 65dbe5d..1eafcce 100644 --- a/src/c++/include/genotyping/CombinedGenotype.hh +++ b/src/c++/include/genotyping/CombinedGenotype.hh @@ -55,10 +55,12 @@ namespace genotyping * @param genotyper Breakpoint genotyper for additional genotyping * @param depth Depth of this sample * @param read_length Read length of this sample + * @param depth_sd Depth standard deviation * @return final GT */ Genotype combinedGenotype( - GenotypeSet const& genotypes, const BreakpointGenotyper* p_genotyper = NULL, double depth = 0, int read_length = 0); + GenotypeSet const& genotypes, const BreakpointGenotyperParameter* b_param = NULL, + const BreakpointGenotyper* p_genotyper = NULL); /** * Count number of unique genotypes in the genotype set @@ -84,6 +86,6 @@ Genotype reportConsensusGenotypes(GenotypeSet const& genotypes, bool pass_only); * @return Final GT from total counts */ Genotype genotypeByTotalCounts( - GenotypeSet const& genotypes, bool use_pass_only, const BreakpointGenotyper* p_genotyper, double depth, - int read_length); + GenotypeSet const& genotypes, bool use_pass_only, const BreakpointGenotyper* p_genotyper, + const BreakpointGenotyperParameter* b_param); }; diff --git a/src/c++/include/genotyping/Genotype.hh b/src/c++/include/genotyping/Genotype.hh index 4987119..373d32b 100644 --- a/src/c++/include/genotyping/Genotype.hh +++ b/src/c++/include/genotyping/Genotype.hh @@ -34,6 +34,7 @@ #pragma once +#include #include #include "json/json.h" @@ -72,6 +73,12 @@ struct Genotype std::vector gl_name; std::vector gl; + /** + * Genotype quality + * -10log10(1 - gl/sum(gl)) + */ + int gq = -1; + /** * fractions of edge count */ @@ -90,7 +97,7 @@ struct Genotype /** * filter description (include PASS) */ - std::string filter; + std::set filters; /** * relabel index-based genotypes with given new labels @@ -106,8 +113,12 @@ struct Genotype /** * output string of most likely genotype */ - std::string toString() const; - std::string toString(std::vector const& allele_names) const; + std::string toString(const std::vector* p_allele_names = NULL) const; + + /** + * output string of filter + */ + std::string filterString() const; /** * get major information as a single string (simplified) diff --git a/src/c++/include/genotyping/GenotypingParameters.hh b/src/c++/include/genotyping/GenotypingParameters.hh index a31e141..8c429a7 100644 --- a/src/c++/include/genotyping/GenotypingParameters.hh +++ b/src/c++/include/genotyping/GenotypingParameters.hh @@ -66,7 +66,9 @@ public: unsigned int numAlleles() const { return num_alleles; } - double coverageTestCutoff() const { return coverage_test_cutoff; } + std::pair coverageTestCutoff() const { return coverage_test_cutoff; } + + int minPassGQ() const { return min_pass_gq; } unsigned int minOverlapBases() const { return min_overlap_bases; } @@ -82,6 +84,8 @@ public: const std::vector& possibleGenotypes() const { return possible_genotypes; }; + bool usePoissonDepth() const { return use_poisson_depth; } + private: /** * Set the vector of all possible unphased genotypes from stored alleles and ploidy. @@ -117,9 +121,14 @@ private: unsigned int num_alleles; /** - * cutoff for coverage test p value + * cutoff for coverage test p value. lower end, upper end. + */ + std::pair coverage_test_cutoff; + + /** + * min GQ for a PASS event */ - double coverage_test_cutoff; + int min_pass_gq; /** * allele names from graph input @@ -164,5 +173,7 @@ private: double other_het_haplotype_fraction; double other_genotype_fraction; + + bool use_poisson_depth; }; }; \ No newline at end of file diff --git a/src/c++/include/genotyping/GraphGenotyper.hh b/src/c++/include/genotyping/GraphGenotyper.hh index 251ec94..9629ef8 100644 --- a/src/c++/include/genotyping/GraphGenotyper.hh +++ b/src/c++/include/genotyping/GraphGenotyper.hh @@ -143,6 +143,13 @@ protected: */ std::pair const& getDepthAndReadlength(size_t sample_index) const; + /** + * Get depth standard deviation for a sample + * @param sample_index + * @return std deviation of average depth. if not available, return 0 + */ + double getDepthSD(size_t sample_index) const; + /** * Get sex integer for a sample */ diff --git a/src/c++/include/genotyping/SampleInfo.hh b/src/c++/include/genotyping/SampleInfo.hh index 36c3578..e94674c 100644 --- a/src/c++/include/genotyping/SampleInfo.hh +++ b/src/c++/include/genotyping/SampleInfo.hh @@ -67,7 +67,9 @@ public: unsigned int read_length() const { return read_length_; } void set_read_length(unsigned int read_length) { read_length_ = read_length; } double autosome_depth() const { return autosome_depth_; } - void set_autosome_depth(double autosome_depth) { autosome_depth_ = autosome_depth; } + void set_autosome_depth(double autosome_depth); + double depth_sd() const { return depth_sd_; } + void set_depth_sd(double depth_sd) { depth_sd_ = depth_sd; } /** * Getters / setters for sex information @@ -93,6 +95,7 @@ private: std::string index_filename_; unsigned int read_length_ = 0; double autosome_depth_ = 0.0; + double depth_sd_ = 0.0; Sex sex_ = UNKNOWN; Json::Value alignment_data_ = Json::nullValue; }; diff --git a/src/c++/lib/genotyping/BreakpointGenotyper.cpp b/src/c++/lib/genotyping/BreakpointGenotyper.cpp index e60d30f..0b2088d 100644 --- a/src/c++/lib/genotyping/BreakpointGenotyper.cpp +++ b/src/c++/lib/genotyping/BreakpointGenotyper.cpp @@ -27,12 +27,15 @@ #include "genotyping/BreakpointGenotyper.hh" #include "common/Error.hh" #include +#include #include #include #include #include +#include #include +using boost::math::normal_distribution; using boost::math::poisson_distribution; using std::map; using std::vector; @@ -47,6 +50,7 @@ BreakpointGenotyper::BreakpointGenotyper(std::unique_ptr c : n_alleles_(param->numAlleles()) , ploidy_(param->ploidy()) , coverage_test_cutoff_(param->coverageTestCutoff()) + , min_pass_gq_(param->minPassGQ()) , min_overlap_bases_(param->minOverlapBases()) , possible_genotypes(param->possibleGenotypes()) { @@ -86,22 +90,25 @@ BreakpointGenotyper::BreakpointGenotyper(std::unique_ptr c } }; -Genotype BreakpointGenotyper::genotype(double read_depth, int32_t read_length, const vector& read_counts) const +Genotype BreakpointGenotyper::genotype( + const BreakpointGenotyperParameter& param, const std::vector& read_counts_per_allele) const { - if (read_counts.size() != n_alleles_) + if (read_counts_per_allele.size() != n_alleles_) { - error("Error: number of read counts and alleles mismatches. %i != %i.", (int)read_counts.size(), n_alleles_); + error( + "Error: number of read counts and alleles mismatches. %i != %i.", (int)read_counts_per_allele.size(), + n_alleles_); } Genotype result; // compute adjusted depth - const double multiplier = (read_length - min_overlap_bases_) / (double)read_length; + const double multiplier = (param.read_length - min_overlap_bases_) / (double)param.read_length; assert(multiplier > 0); - const double lambda = read_depth * multiplier; - const int32_t total_num_reads = std::accumulate(read_counts.begin(), read_counts.end(), 0); + const double lambda = param.read_depth * multiplier; + const int32_t total_num_reads = std::accumulate(read_counts_per_allele.begin(), read_counts_per_allele.end(), 0); if (total_num_reads == 0) { - result.filter = "NO_READS"; + result.filters.insert("NO_READS"); return result; } result.num_reads = total_num_reads; @@ -111,7 +118,7 @@ Genotype BreakpointGenotyper::genotype(double read_depth, int32_t read_length, c for (const auto& igt : possible_genotypes) { - const double gl = genotypeLikelihood(lambda, igt, read_counts); + const double gl = genotypeLikelihood(lambda, igt, read_counts_per_allele); result.gl_name.push_back(igt); result.gl.push_back(gl); @@ -123,26 +130,70 @@ Genotype BreakpointGenotyper::genotype(double read_depth, int32_t read_length, c } } + // compute GQ and set filter + double sum_gl = 0; + for (auto l : result.gl) + { + sum_gl += exp(l); + } + double pr_gt_error = (double)1.0 - exp(best_gl) / sum_gl; + if (pr_gt_error == 0) + { + result.gq = 100; + } + else + { + double gq_log10 = log10(pr_gt_error); + if (gq_log10 < -10) + { + result.gq = 100; + } + else + { + result.gq = -10 * gq_log10; + } + } + if (result.gq < min_pass_gq_) + { + result.filters.insert("GQ"); + } + // compute allele fractions result.allele_fractions.resize(n_alleles_, 0.0); for (unsigned int al = 0; al < n_alleles_; ++al) { - result.allele_fractions[al] = ((double)read_counts[al]) / total_num_reads; + result.allele_fractions[al] = ((double)read_counts_per_allele[al]) / total_num_reads; } // compute coverage test p value - const poisson_distribution<> coverage_distribution(lambda); - double coverage_test_pvalue = cdf(coverage_distribution, total_num_reads); + double coverage_test_pvalue; + if (param.use_poisson_depth) // use poisson test for depth (more stringent) + { + const poisson_distribution<> poisson_coverage_distribution(lambda); + coverage_test_pvalue = cdf(poisson_coverage_distribution, total_num_reads); + } + else // use normal test for depth (default) + { + const normal_distribution<> normal_coverage_distribution(lambda, param.depth_sd); + coverage_test_pvalue = cdf(normal_coverage_distribution, total_num_reads); + } + if (coverage_test_pvalue > 0.5) { coverage_test_pvalue = 1 - coverage_test_pvalue; + if (coverage_test_pvalue < coverage_test_cutoff_.first) + { + result.filters.insert("BP_DEPTH"); + } } - result.coverage_test_pvalue = coverage_test_pvalue; - - if (coverage_test_pvalue < coverage_test_cutoff_) + else { - result.filter = "DEPTH"; + if (coverage_test_pvalue < coverage_test_cutoff_.second) + { + result.filters.insert("BP_DEPTH"); + } } + result.coverage_test_pvalue = coverage_test_pvalue; return result; } diff --git a/src/c++/lib/genotyping/CombinedGenotype.cpp b/src/c++/lib/genotyping/CombinedGenotype.cpp index ce6d6d9..eca7dcf 100644 --- a/src/c++/lib/genotyping/CombinedGenotype.cpp +++ b/src/c++/lib/genotyping/CombinedGenotype.cpp @@ -34,6 +34,7 @@ */ #include "genotyping/CombinedGenotype.hh" +#include #include #include #include @@ -49,8 +50,8 @@ using std::vector; namespace genotyping { -Genotype -combinedGenotype(GenotypeSet const& genotypes, const BreakpointGenotyper* p_genotyper, double depth, int read_length) +Genotype combinedGenotype( + GenotypeSet const& genotypes, const BreakpointGenotyperParameter* b_param, const BreakpointGenotyper* p_genotyper) { Genotype result; @@ -60,7 +61,7 @@ combinedGenotype(GenotypeSet const& genotypes, const BreakpointGenotyper* p_geno auto num_fail_genotypes = static_cast(countUniqGenotypes(genotypes, false)); if (num_fail_genotypes == 0) { - result.filter = "MISSING"; + result.filters.insert("NO_VALID_GT"); } else if (num_fail_genotypes == 1) { @@ -68,7 +69,7 @@ combinedGenotype(GenotypeSet const& genotypes, const BreakpointGenotyper* p_geno } else { - result = genotypeByTotalCounts(genotypes, false, p_genotyper, depth, read_length); + result = genotypeByTotalCounts(genotypes, false, p_genotyper, b_param); } } else if (num_pass_genotypes == 1) @@ -77,12 +78,12 @@ combinedGenotype(GenotypeSet const& genotypes, const BreakpointGenotyper* p_geno } else { - result = genotypeByTotalCounts(genotypes, true, p_genotyper, depth, read_length); + result = genotypeByTotalCounts(genotypes, true, p_genotyper, b_param); } - if (result.filter.empty()) + if (result.filters.empty()) { - result.filter = "PASS"; + result.filters.insert("PASS"); } return result; @@ -90,7 +91,7 @@ combinedGenotype(GenotypeSet const& genotypes, const BreakpointGenotyper* p_geno size_t countUniqGenotypes(GenotypeSet const& genotypes, bool pass_only) { - std::set votes_for; + std::set voted_gts; for (auto& bp : genotypes) { if (bp.gt.empty()) @@ -100,7 +101,7 @@ size_t countUniqGenotypes(GenotypeSet const& genotypes, bool pass_only) if (pass_only) { - if (!bp.filter.empty()) + if (!bp.filters.empty()) { continue; } @@ -108,21 +109,15 @@ size_t countUniqGenotypes(GenotypeSet const& genotypes, bool pass_only) Genotype sorted_bp = bp; std::sort(sorted_bp.gt.begin(), sorted_bp.gt.end()); - votes_for.insert(sorted_bp.gt); + voted_gts.insert(sorted_bp.gt); } - return votes_for.size(); + return voted_gts.size(); } Genotype reportConsensusGenotypes(GenotypeSet const& genotypes, bool pass_only) { Genotype result; - std::set filters; - - if (!pass_only) - { - filters.insert("ALL_BAD_BP"); - } struct GLInfo { @@ -140,33 +135,34 @@ Genotype reportConsensusGenotypes(GenotypeSet const& genotypes, bool pass_only) result.num_reads = 0; result.allele_fractions.clear(); + std::vector gqs; for (auto& bp : genotypes) { - if (bp.gt.empty()) { - filters.insert("MISSING"); + result.filters.insert("BP_NO_GT"); continue; } - if (pass_only) + if (!bp.filters.empty()) { - if (!bp.filter.empty()) - { - filters.insert("EXIST_BAD_BP"); - continue; - } + result.filters.insert(bp.filters.begin(), bp.filters.end()); + continue; } if (result.gt.empty()) { // sort genotype -- this will need to handle phasing at some point - Genotype sorted_bp = bp; - std::sort(sorted_bp.gt.begin(), sorted_bp.gt.end()); - result.gt = sorted_bp.gt; + GenotypeVector sorted_bp = bp.gt; + std::sort(sorted_bp.begin(), sorted_bp.end()); + result.gt = sorted_bp; } result.num_reads += bp.num_reads; + if (!result.gt.empty()) + { + gqs.emplace_back(bp.gq); + } if (bp.allele_fractions.size() > result.allele_fractions.size()) { result.allele_fractions.resize(bp.allele_fractions.size(), 0); @@ -212,23 +208,19 @@ Genotype reportConsensusGenotypes(GenotypeSet const& genotypes, bool pass_only) result.gl_name.push_back(gl.second.gt); } - result.filter = boost::algorithm::join(filters, ";"); + result.gq = gqs.empty() ? 0 : *std::min_element(gqs.begin(), gqs.end()); return result; } Genotype genotypeByTotalCounts( - GenotypeSet const& genotypes, bool use_pass_only, const BreakpointGenotyper* p_genotyper, double depth, - int read_length) + GenotypeSet const& genotypes, bool use_pass_only, const BreakpointGenotyper* p_genotyper, + const BreakpointGenotyperParameter* b_param) { - assert(p_genotyper != nullptr && depth > 0 && read_length > 0); + assert(p_genotyper != nullptr && b_param->read_depth > 0 && b_param->read_length > 0); std::set filters; filters.insert("CONFLICT"); - if (!use_pass_only) - { - filters.insert("ALL_BAD_BP"); - } // get total count vector std::vector sum_counts; @@ -237,16 +229,16 @@ Genotype genotypeByTotalCounts( { if (use_pass_only) { - if (!bp.filter.empty()) + if (!bp.filters.empty()) { - filters.insert("EXIST_BAD_BP"); + filters.insert(bp.filters.begin(), bp.filters.end()); continue; } } if (bp.num_reads == 0) { - filters.insert("MISSING"); + filters.insert("BP_NO_GT"); continue; } @@ -273,9 +265,9 @@ Genotype genotypeByTotalCounts( s = static_cast(std::round((double)s / num_bp)); } const auto input_counts = sum_counts; - const auto sum_gt = p_genotyper->genotype(depth, read_length, input_counts); + const auto sum_gt = p_genotyper->genotype(*b_param, input_counts); Genotype result = sum_gt; - result.filter = boost::algorithm::join(filters, ";"); + result.filters = filters; return result; } } diff --git a/src/c++/lib/genotyping/Genotype.cpp b/src/c++/lib/genotyping/Genotype.cpp index 990f464..67554c4 100644 --- a/src/c++/lib/genotyping/Genotype.cpp +++ b/src/c++/lib/genotyping/Genotype.cpp @@ -66,38 +66,35 @@ GenotypeVector genotypeVectorFromString(std::string& gt_str) return gv; } -string Genotype::toString() const +string Genotype::toString(const vector* p_allele_names) const { string gt_str; if (gt.empty()) { - gt_str = "."; // TODO: change this for haploid + gt_str = "."; } else { - auto str = static_cast(std::to_string); - gt_str = join(gt | transformed(str), "/"); + if (p_allele_names == NULL) + { + auto al = static_cast(std::to_string); + gt_str = join(gt | transformed(al), "/"); + } + else + { + auto al = [p_allele_names](int g) { return (*p_allele_names)[g]; }; + gt_str = join(gt | transformed(al), "/"); + } } return gt_str; } -string Genotype::toString(vector const& allele_names) const +std::string Genotype::filterString() const { - string gt_str; - if (gt.empty()) - { - gt_str = "./."; - } - else - { - auto al = [&allele_names](int g) { return allele_names[g]; }; - gt_str = join(gt | transformed(al), "/"); - } - return gt_str; + string filter_string = boost::algorithm::join(filters, ";"); + return filter_string; } -Genotype::operator std::string() const { return toString(); } - /** * relabel genotypes */ @@ -139,7 +136,7 @@ void Genotype::relabel(std::vector const& new_labels) Json::Value Genotype::toJson(vector const& allele_names) const { Json::Value json_result; - json_result["GT"] = toString(allele_names); + json_result["GT"] = toString(&allele_names); if (!gl.empty()) { @@ -155,6 +152,11 @@ Json::Value Genotype::toJson(vector const& allele_names) const } } + if (gq != -1) + { + json_result["GQ"] = gq; + } + if (!allele_fractions.empty()) { json_result["allele_fractions"] = Json::objectValue; @@ -167,9 +169,13 @@ Json::Value Genotype::toJson(vector const& allele_names) const json_result["allele_fractions"][allele_name] = allele_fraction; } } - if (!filter.empty()) + if (!filters.empty()) { - json_result["filter"] = filter; + json_result["filters"] = Json::arrayValue; + for (auto f : filters) + { + json_result["filters"].append(f); + } } if (!gt.empty()) { @@ -181,4 +187,6 @@ Json::Value Genotype::toJson(vector const& allele_names) const } return json_result; } + +Genotype::operator std::string() const { return toString(); } } diff --git a/src/c++/lib/genotyping/GenotypingParameters.cpp b/src/c++/lib/genotyping/GenotypingParameters.cpp index 1e33d02..8ce35f2 100644 --- a/src/c++/lib/genotyping/GenotypingParameters.cpp +++ b/src/c++/lib/genotyping/GenotypingParameters.cpp @@ -44,7 +44,8 @@ namespace genotyping GenotypingParameters::GenotypingParameters(const vector& _allele_names, unsigned int ploidy) : ploidy_(ploidy) , num_alleles(static_cast(_allele_names.size())) - , coverage_test_cutoff(-1.0) + , coverage_test_cutoff(std::make_pair(0.02, 0.0001)) + , min_pass_gq(20) , allele_names(_allele_names) , min_overlap_bases(16) , reference_allele("REF") @@ -52,6 +53,7 @@ GenotypingParameters::GenotypingParameters(const vector& _allele_names, , other_allele_error_rate(0.05) , other_het_haplotype_fraction(0.5) , other_genotype_fraction(1) + , use_poisson_depth(false) { setPossibleGenotypes(); } @@ -97,10 +99,6 @@ void GenotypingParameters::setFromJson(Json::Value& param_json) { min_overlap_bases = field.asUInt(); } - if (key == "coverage_test_cutoff") - { - coverage_test_cutoff = field.asDouble(); - } else if (key == "reference_allele") { reference_allele = field.asString(); @@ -131,6 +129,16 @@ void GenotypingParameters::setFromJson(Json::Value& param_json) } } + if (param_json.isMember("coverage_test_cutoff")) + { + if (param_json["coverage_test_cutoff"].size() != 2) + { + error("Error: coverage_test_cutoff needs to be a list of 2 values: lower end, upper end."); + } + coverage_test_cutoff.first = param_json["coverage_test_cutoff"][0].asDouble(); + coverage_test_cutoff.first = param_json["coverage_test_cutoff"][1].asDouble(); + } + if (param_json.isMember("allele_error_rates") || (param_json.isMember("het_haplotype_fractions") && !uniform_het_haplotype_fraction) || param_json.isMember("genotype_fractions")) @@ -165,6 +173,22 @@ void GenotypingParameters::setFromJson(Json::Value& param_json) } } } + + if (param_json.isMember("use_poisson_depth")) + { + if (param_json["use_poisson_depth"] == "true") + { + use_poisson_depth = true; + } + else if (param_json["use_poisson_depth"] == "false") + { + use_poisson_depth = false; + } + else + { + error("In genotyping parameter JSON use_poisson_depth only allows true or false."); + } + } } vector GenotypingParameters::alleleNameConversionIndex(Json::Value& param_json) diff --git a/src/c++/lib/genotyping/GraphBreakpointGenotyper.cpp b/src/c++/lib/genotyping/GraphBreakpointGenotyper.cpp index 3e0b8b7..e06ae7e 100644 --- a/src/c++/lib/genotyping/GraphBreakpointGenotyper.cpp +++ b/src/c++/lib/genotyping/GraphBreakpointGenotyper.cpp @@ -72,15 +72,18 @@ void GraphBreakpointGenotyper::runGenotyping() } auto sample_ploidy = getSamplePloidy(sample_index); double expected_depth = depth_readlength.first * ((double)sample_ploidy / female_ploidy_); + double depth_sd = getDepthSD(sample_index); + const BreakpointGenotyperParameter b_param( + expected_depth, depth_readlength.second, depth_sd, p_genotype_parameter->usePoissonDepth()); if (getSamplePloidy(sample_index) == male_ploidy_) { - const auto gt = male_genotyper.genotype(expected_depth, depth_readlength.second, counts); + const auto gt = male_genotyper.genotype(b_param, counts); setGenotype(samplename, breakpointname, gt); } else // treat unknown as female { - const auto gt = genotyper.genotype(expected_depth, depth_readlength.second, counts); + const auto gt = genotyper.genotype(b_param, counts); setGenotype(samplename, breakpointname, gt); } ++sample_index; @@ -97,9 +100,10 @@ void GraphBreakpointGenotyper::runGenotyping() all_breakpoint_gts.add(allelenames, getGenotype(samplename, breakpointname)); } auto const& depth_readlength = getDepthAndReadlength(sample_index); - setGenotype( - samplename, "", - combinedGenotype(all_breakpoint_gts, &genotyper, depth_readlength.first, depth_readlength.second)); + auto depth_sd = getDepthSD(sample_index); + const BreakpointGenotyperParameter b_param( + depth_readlength.first, depth_readlength.second, depth_sd, p_genotype_parameter->usePoissonDepth()); + setGenotype(samplename, "", combinedGenotype(all_breakpoint_gts, &b_param, &genotyper)); ++sample_index; } } diff --git a/src/c++/lib/genotyping/GraphGenotyper.cpp b/src/c++/lib/genotyping/GraphGenotyper.cpp index 325d8a2..6e803ee 100644 --- a/src/c++/lib/genotyping/GraphGenotyper.cpp +++ b/src/c++/lib/genotyping/GraphGenotyper.cpp @@ -122,6 +122,7 @@ void GraphGenotyper::addAlignment(SampleInfo const& sampleinfo) breakpoint.second.addCounts(alignment); } _impl->depths.emplace_back(depth, read_length); + _impl->depth_sds.emplace_back(sampleinfo.depth_sd()); _impl->sexes.emplace_back(sampleinfo.sex()); // extract extra information and check we have the same event @@ -413,6 +414,13 @@ std::pair const& GraphGenotyper::getDepthAndReadlength(size_t sampl return _impl->depths[sample_index]; } +/** + * Get depth standard deviation for a sample + * @param sample_index + * @return std deviation of average depth. if not available, return 0 + */ +double GraphGenotyper::getDepthSD(size_t sample_index) const { return _impl->depth_sds[sample_index]; }; + /** * Get sex integer for a sample */ diff --git a/src/c++/lib/genotyping/GraphGenotyperImpl.hh b/src/c++/lib/genotyping/GraphGenotyperImpl.hh index b096072..7d3b6d1 100644 --- a/src/c++/lib/genotyping/GraphGenotyperImpl.hh +++ b/src/c++/lib/genotyping/GraphGenotyperImpl.hh @@ -47,6 +47,11 @@ struct GraphGenotyper::GraphGenotyperImpl */ std::vector> depths; + /** + * depth variance per sample + */ + std::vector depth_sds; + /** * sex information per sample */ diff --git a/src/c++/lib/genotyping/SampleInfo.cpp b/src/c++/lib/genotyping/SampleInfo.cpp index 50caba9..322b0a3 100644 --- a/src/c++/lib/genotyping/SampleInfo.cpp +++ b/src/c++/lib/genotyping/SampleInfo.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -45,17 +46,15 @@ using std::vector; namespace genotyping { -/** - * Load manifest file. A manifest contains - * - * * Sample names - * * Sample BAM/CRAM file locations - * * Depth estimates, or location of idxdepth output - * * (optionally) location of alignment JSON - * - * @param filename file name of manifest - * @return list of sample info records - */ +void SampleInfo::set_autosome_depth(double autosome_depth) +{ + autosome_depth_ = autosome_depth; + if (depth_sd_ == 0) + { + depth_sd_ = sqrt(autosome_depth_ * 5); + } +} + void SampleInfo::set_sex(std::string sex_string) { std::transform(sex_string.begin(), sex_string.end(), sex_string.begin(), ::tolower); @@ -77,6 +76,17 @@ void SampleInfo::set_sex(std::string sex_string) } } +/** + * Load manifest file. A manifest contains + * + * * Sample names + * * Sample BAM/CRAM file locations + * * Depth estimates, or location of idxdepth output + * * (optionally) location of alignment JSON + * + * @param filename file name of manifest + * @return list of sample info records + */ Samples loadManifest(const std::string& filename) { std::ifstream manifest_file(filename.c_str(), std::ifstream::in); @@ -101,9 +111,9 @@ Samples loadManifest(const std::string& filename) if (header.empty()) { common::stringutil::split(line, header, "\t,"); - static const set legal_header_columns = { - "id", "path", "index_path", "paragraph", "idxdepth", "depth", "read length", "sex", - }; + static const set legal_header_columns + = { "id", "path", "index_path", "paragraph", "idxdepth", + "depth", "read length", "sex", "depth variance", "depth sd" }; size_t j = 0; for (auto& h : header) { @@ -218,6 +228,43 @@ Samples loadManifest(const std::string& filename) sid.set_autosome_depth(depth); sid.set_read_length(static_cast(read_length)); + if (header_map.count("depth sd") != 0u) + { + double depth_sd = 0; + try + { + depth_sd = stod(tokens[header_map["depth sd"]]); + } + catch (const std::exception&) + { + } + if (depth_sd <= 0) + { + error("Depth sd is not positive in sample %s", sid.sample_name().c_str()); + } + sid.set_depth_sd(depth_sd); + } + else + { + if (header_map.count("depth variance") != 0u) + { + double depth_variance = 0; + try + { + depth_variance = stod(tokens[header_map["depth variance"]]); + } + catch (const std::exception&) + { + } + if (depth_variance <= 0) + { + error("Depth variance is not positive in sample %s", sid.sample_name().c_str()); + } + double depth_sd = sqrt(depth_variance); + sid.set_depth_sd(depth_sd); + } + } + if (header_map.count("sex") != 0u) { string sex_string = tokens[header_map["sex"]]; diff --git a/src/c++/lib/grm/GraphInput.cpp b/src/c++/lib/grm/GraphInput.cpp index c504af7..0a22f65 100644 --- a/src/c++/lib/grm/GraphInput.cpp +++ b/src/c++/lib/grm/GraphInput.cpp @@ -163,6 +163,7 @@ Graph graphFromJson(Json::Value const& in, string const& reference, bool store_r } } } + return result; } diff --git a/src/c++/lib/grmpy/CountAndGenotype.cpp b/src/c++/lib/grmpy/CountAndGenotype.cpp index 9bca665..13bb01b 100644 --- a/src/c++/lib/grmpy/CountAndGenotype.cpp +++ b/src/c++/lib/grmpy/CountAndGenotype.cpp @@ -38,6 +38,8 @@ #include "graphcore/Graph.hh" #include "grm/GraphInput.hh" +#include + namespace grmpy { @@ -60,16 +62,20 @@ Json::Value countAndGenotype( unsigned int male_ploidy = 2; unsigned int female_ploidy = 2; - if (root.isMember("chrom")) + for (auto t_region : root["target_regions"]) { - if (root["chrom"] == "chrX" || root["chrom"] == "X") + std::stringstream ss(t_region.asString()); + std::string chrom; + getline(ss, chrom, ':'); + if (chrom == "chrX" || chrom == "X") { male_ploidy = 1; } - } - else - { - LOG()->info("Cannot find chrom in graph. Assume the graph comes from chr1~22"); + else if (chrom == "chrY" || chrom == "Y") + { + male_ploidy = 1; + female_ploidy = 1; + } } genotyping::GraphBreakpointGenotyper graph_genotyper(male_ploidy, female_ploidy); diff --git a/src/c++/lib/paragraph/GraphSummaryStatistics.cpp b/src/c++/lib/paragraph/GraphSummaryStatistics.cpp index a5750b3..e61635b 100644 --- a/src/c++/lib/paragraph/GraphSummaryStatistics.cpp +++ b/src/c++/lib/paragraph/GraphSummaryStatistics.cpp @@ -111,35 +111,38 @@ void summarizeAlignments(Graph const& graph, common::ReadBuffer const& reads, Js GraphAlignment graph_alignment = graphtools::decodeGraphAlignment(read->graph_pos(), read->graph_cigar(), &graph); - string previous_node_name; - - for (NodeId node_id = 0; node_id != graph_alignment.size(); ++node_id) + NodeId pred_node_id; + for (size_t alignment_index = 0; alignment_index != graph_alignment.size(); ++alignment_index) { - const bool is_source_or_sink = has_source_or_sink && (node_id == 0 || node_id == graph.numNodes() - 1); + NodeId current_node_id = graph_alignment.getNodeIdByIndex(alignment_index); + const bool is_source_or_sink + = has_source_or_sink && (current_node_id == 0 || current_node_id == graph.numNodes() - 1); // node statistics - const auto& node_name = graph.nodeName(node_id); + const auto& node_name = graph.nodeName(current_node_id); if (gstats["nodes"].find(node_name) == gstats["nodes"].end()) { - gstats["nodes"][node_name] = AlignmentStatistics(graph.nodeSeq(node_id).size()); + gstats["nodes"][node_name] = AlignmentStatistics(graph.nodeSeq(current_node_id).size()); } gstats["nodes"][node_name].addNodeMapping( - graph_alignment[node_id], read->is_graph_reverse_strand(), !is_source_or_sink); + graph_alignment[alignment_index], read->is_graph_reverse_strand(), !is_source_or_sink); - // edge stats - if (node_id > 0) + // edge statistics + if (alignment_index > 0) { - const string edge_name = previous_node_name + "_" + node_name; + const string edge_name = graph.nodeName(pred_node_id) + "_" + node_name; if (gstats["edges"].find(edge_name) == gstats["edges"].end()) { - const size_t edge_length = graph.nodeSeq(node_id - 1).size() + graph.nodeSeq(node_id).size(); + const size_t edge_length + = graph.nodeSeq(pred_node_id).size() + graph.nodeSeq(current_node_id).size(); gstats["edges"][edge_name] = AlignmentStatistics(edge_length); } gstats["edges"][edge_name].addEdgeMapping( - graph_alignment[node_id - 1], graph_alignment[node_id], read->is_graph_reverse_strand(), - has_source_or_sink && (node_id - 1 == 0), is_source_or_sink); + graph_alignment[alignment_index - 1], graph_alignment[alignment_index], + read->is_graph_reverse_strand(), has_source_or_sink && (current_node_id - 1 == 0), + is_source_or_sink); } - previous_node_name = node_name; + pred_node_id = current_node_id; } for (auto& allele : read->graph_sequences_supported()) diff --git a/src/c++/main/grmpy.cpp b/src/c++/main/grmpy.cpp index c639342..3f80983 100644 --- a/src/c++/main/grmpy.cpp +++ b/src/c++/main/grmpy.cpp @@ -116,16 +116,16 @@ Options::Options() ("bad-align-frac", po::value(&bad_align_frac)->default_value(bad_align_frac), "Fraction of read that needs to be mapped in order for it to be used.") ("path-sequence-matching", - po::value(&path_sequence_matching)->default_value(path_sequence_matching)->implicit_value(true), + po::value(&path_sequence_matching)->default_value(path_sequence_matching), "Enables alignment to paths") ("graph-sequence-matching", - po::value(&graph_sequence_matching)->default_value(graph_sequence_matching)->implicit_value(true), + po::value(&graph_sequence_matching)->default_value(graph_sequence_matching), "Enables smith waterman graph alignment") ("klib-sequence-matching", - po::value(&klib_sequence_matching)->default_value(klib_sequence_matching)->implicit_value(true), + po::value(&klib_sequence_matching)->default_value(klib_sequence_matching), "Use klib smith-waterman aligner.") ("kmer-sequence-matching", - po::value(&kmer_sequence_matching)->default_value(kmer_sequence_matching)->implicit_value(true), + po::value(&kmer_sequence_matching)->default_value(kmer_sequence_matching), "Use kmer aligner.") ("bad-align-uniq-kmer-len", po::value(&bad_align_uniq_kmer_len)->default_value(bad_align_uniq_kmer_len), "Kmer length for uniqueness check during read filtering.") diff --git a/src/c++/main/paragraph.cpp b/src/c++/main/paragraph.cpp index 72ad536..2452380 100644 --- a/src/c++/main/paragraph.cpp +++ b/src/c++/main/paragraph.cpp @@ -105,16 +105,16 @@ Options::Options() "Comma-separated list of target regions, e.g. chr1:1-20,chr2:2-40. " "This overrides the target regions in the graph spec.") ("path-sequence-matching", - po::value(&path_sequence_matching)->default_value(path_sequence_matching)->implicit_value(true), + po::value(&path_sequence_matching)->default_value(path_sequence_matching), "Enable path seeding aligner") ("graph-sequence-matching", - po::value(&graph_sequence_matching)->default_value(graph_sequence_matching)->implicit_value(true), + po::value(&graph_sequence_matching)->default_value(graph_sequence_matching), "Enables smith waterman graph alignment") ("klib-sequence-matching", - po::value(&klib_sequence_matching)->default_value(klib_sequence_matching)->implicit_value(true), + po::value(&klib_sequence_matching)->default_value(klib_sequence_matching), "Use klib smith-waterman aligner.") ("kmer-sequence-matching", - po::value(&kmer_sequence_matching)->default_value(kmer_sequence_matching)->implicit_value(true), + po::value(&kmer_sequence_matching)->default_value(kmer_sequence_matching), "Use kmer aligner.") ("validate-alignments", po::value(&validate_alignments)->default_value(validate_alignments)->implicit_value(true), "Use information in the input bam read names to collect statistics about the accuracy of alignments. " diff --git a/src/c++/test-blackbox/test_grm.cpp b/src/c++/test-blackbox/test_grm.cpp index db5e8aa..b098257 100644 --- a/src/c++/test-blackbox/test_grm.cpp +++ b/src/c++/test-blackbox/test_grm.cpp @@ -40,11 +40,11 @@ TEST(Grmpy, GenotypesSingleSwap) { std::string graph_path - = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chr4_graph_typing.2sample.json"; + = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chrX_graph_typing.2sample.json"; std::string reference_path - = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chr4_graph_typing.fa"; + = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chrX_graph_typing.fa"; std::string manifest_path - = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chr4_graph_typing.manifest"; + = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/chrX_graph_typing.manifest"; std::string genotype_parameter_path = g_testenv->getBasePath() + "/../share/test-data/paragraph/long-del/param.json"; grmpy::Parameters parameters; diff --git a/src/c++/test/test_breakpoint_genotyper.cpp b/src/c++/test/test_breakpoint_genotyper.cpp index 8a63da9..e1ae7bd 100644 --- a/src/c++/test/test_breakpoint_genotyper.cpp +++ b/src/c++/test/test_breakpoint_genotyper.cpp @@ -27,6 +27,7 @@ #include "genotyping/BreakpointGenotyper.hh" #include "genotyping/GenotypingParameters.hh" #include "gmock/gmock.h" +#include #include #include @@ -36,37 +37,45 @@ using namespace genotyping; TEST(BreakpointGenotyper, ThrowsWhenWrongNumberOfReadcounts) { - const double read_depth = 40.0; - const int32_t read_length = 100; + double read_depth = 40.0; + int32_t read_length = 100; + double depth_sd = sqrt(read_depth * 5); const vector alleles = { "REF", "ALT" }; auto param = std::unique_ptr(new GenotypingParameters(alleles, 2)); BreakpointGenotyper genotyper(param); - ASSERT_ANY_THROW(genotyper.genotype(read_depth, read_length, {})); - ASSERT_ANY_THROW(genotyper.genotype(read_depth, read_length, { 10 })); + const BreakpointGenotyperParameter b_param(read_depth, read_length, depth_sd, false); + ASSERT_ANY_THROW(genotyper.genotype(b_param, {})); + ASSERT_ANY_THROW(genotyper.genotype(b_param, { 10 })); } TEST(BreakpointGenotyper, GenotypesWellCoveredBreakpoints) { - const double read_depth = 40.0; - const int32_t read_length = 100; const vector alleles = { "REF", "ALT" }; auto param = std::unique_ptr(new GenotypingParameters(alleles, 2)); BreakpointGenotyper genotyper(param); - EXPECT_EQ("0/0", (string)genotyper.genotype(read_depth, read_length, { 20, 0 })); - EXPECT_EQ("0/1", (string)genotyper.genotype(read_depth, read_length, { 20, 20 })); - EXPECT_EQ("1/1", (string)genotyper.genotype(read_depth, read_length, { 0, 20 })); + double read_depth = 40.0; + int32_t read_length = 100; + double depth_sd = 20; + const BreakpointGenotyperParameter b_param(read_depth, read_length, depth_sd, false); + + EXPECT_EQ("0/0", (string)genotyper.genotype(b_param, { 20, 0 })); + EXPECT_EQ("0/1", (string)genotyper.genotype(b_param, { 20, 20 })); + EXPECT_EQ("1/1", (string)genotyper.genotype(b_param, { 0, 20 })); // test chrX auto haploid_param = std::unique_ptr(new GenotypingParameters(alleles, 1)); BreakpointGenotyper haploid_genotyper(haploid_param); - EXPECT_EQ("1", (string)haploid_genotyper.genotype(read_depth, read_length, { 0, 20 })); + EXPECT_EQ("1", (string)haploid_genotyper.genotype(b_param, { 0, 20 })); + + EXPECT_FLOAT_EQ(0.24825223, (float)genotyper.genotype(b_param, { 0, 20 }).coverage_test_pvalue); - EXPECT_FLOAT_EQ(0.0080560343, (float)genotyper.genotype(read_depth, read_length, { 0, 20 }).coverage_test_pvalue); + const BreakpointGenotyperParameter b_poisson_param(read_depth, read_length, depth_sd, true); + EXPECT_FLOAT_EQ(0.0080560343, (float)genotyper.genotype(b_poisson_param, { 0, 20 }).coverage_test_pvalue); const vector alleles2 = { "REF", "ALT1", "ALT2", "ALT3", "ALT4" }; auto param2 = std::unique_ptr(new GenotypingParameters(alleles2, 2)); BreakpointGenotyper genotyper_q(param2); - EXPECT_EQ("1/3", (string)genotyper_q.genotype(read_depth, read_length, { 1, 20, 2, 20, 2 })); + EXPECT_EQ("1/3", (string)genotyper_q.genotype(b_param, { 1, 20, 2, 20, 2 })); } \ No newline at end of file diff --git a/src/c++/test/test_combined_genotype.cpp b/src/c++/test/test_combined_genotype.cpp index 74f3eb6..b7c7ba2 100644 --- a/src/c++/test/test_combined_genotype.cpp +++ b/src/c++/test/test_combined_genotype.cpp @@ -56,7 +56,7 @@ TEST(CombinedGenotype, SimplePass) const Genotype combined_genotype = combinedGenotype(gs); EXPECT_EQ("1/1", combined_genotype.toString()); - EXPECT_EQ("ALT/ALT", combined_genotype.toString(alleles)); + EXPECT_EQ("ALT/ALT", combined_genotype.toString(&alleles)); } TEST(CombinedGenotype, GenotypeUnphasedMatch) @@ -67,11 +67,13 @@ TEST(CombinedGenotype, GenotypeUnphasedMatch) gt1.gt = { 0, 1 }; gt1.gl_name = { { 0, 0 }, { 0, 1 }, { 1, 1 } }; gt1.gl = { -10, -0.1, -10 }; + gt1.gq = 20; Genotype gt2; gt2.gt = { 1, 0 }; gt2.gl_name = { { 1, 0 }, { 1, 1 }, { 0, 0 } }; gt2.gl = { -0.1, -10, -10 }; + gt2.gq = 30; GenotypeSet gs; gs.add(alleles, gt1); @@ -80,7 +82,8 @@ TEST(CombinedGenotype, GenotypeUnphasedMatch) const Genotype combined_genotype = combinedGenotype(gs); EXPECT_EQ("0/1", combined_genotype.toString()); - EXPECT_EQ("PASS", combined_genotype.filter); + EXPECT_EQ("PASS", combined_genotype.filterString()); + EXPECT_EQ(20, combined_genotype.gq); } TEST(CombinedGenotype, GenotypeConflictNoConsensus) @@ -104,12 +107,15 @@ TEST(CombinedGenotype, GenotypeConflictNoConsensus) auto param = std::unique_ptr(new GenotypingParameters(alleles, 2)); BreakpointGenotyper genotyper(param); - const double read_depth = 10.0; - const int32_t read_length = 100; - const Genotype combined_genotype = combinedGenotype(gs, &genotyper, read_depth, read_length); + double read_depth = 10.0; + int32_t read_length = 100; + double depth_sd = 50; + const BreakpointGenotyperParameter b_param(read_depth, read_length, depth_sd, false); + const Genotype combined_genotype = combinedGenotype(gs, &b_param, &genotyper); EXPECT_EQ("0/1", combined_genotype.toString()); - EXPECT_EQ("CONFLICT", combined_genotype.filter); + EXPECT_EQ("CONFLICT", combined_genotype.filterString()); + EXPECT_EQ(8, combined_genotype.gq); } TEST(CombinedGenotype, GenotypeMissing) @@ -121,7 +127,8 @@ TEST(CombinedGenotype, GenotypeMissing) Genotype gt2; gt2.gt = { 0, 1 }; gt2.gl_name = { { 0, 1 }, { 1, 1 }, { 0, 0 } }; - gt2.gl = { 0.1, -10, -10 }; + gt2.gl = { -1, -10, -10 }; + gt2.gq = 36; GenotypeSet gs; gs.add(alleles, gt1); @@ -130,5 +137,6 @@ TEST(CombinedGenotype, GenotypeMissing) const Genotype combined_genotype = combinedGenotype(gs); EXPECT_EQ("0/1", combined_genotype.toString()); - EXPECT_EQ("MISSING", combined_genotype.filter); + EXPECT_EQ("BP_NO_GT", combined_genotype.filterString()); + EXPECT_EQ(36, combined_genotype.gq); } diff --git a/src/cmake/GetBoost.cmake b/src/cmake/GetBoost.cmake old mode 100755 new mode 100644 index 214b6b6..1e6d1da --- a/src/cmake/GetBoost.cmake +++ b/src/cmake/GetBoost.cmake @@ -70,4 +70,3 @@ find_package(Boost 1.58 COMPONENTS iostreams program_options filesystem system R # boost sometimes generates warnings; we won't patch them so let's disable them using SYSTEM include_directories(SYSTEM ${Boost_INCLUDE_DIR}) link_directories(${Boost_LIBRARY_DIRS}) - diff --git a/src/cmake/GetGraphTools.cmake b/src/cmake/GetGraphTools.cmake old mode 100755 new mode 100644 index d47b6a4..5b92acd --- a/src/cmake/GetGraphTools.cmake +++ b/src/cmake/GetGraphTools.cmake @@ -70,4 +70,4 @@ install(DIRECTORY ${GRAPHTOOLS_SOURCE_DIR}/include/graphalign DESTINATION includ install(DIRECTORY ${GRAPHTOOLS_SOURCE_DIR}/include/graphutils DESTINATION include/) install(FILES $ DESTINATION lib/) -################################################################## +################################################################## \ No newline at end of file diff --git a/src/docker-testing/test-helpers/valgrind-test.sh b/src/docker-testing/test-helpers/valgrind-test.sh index cbc1eb4..343d783 100644 --- a/src/docker-testing/test-helpers/valgrind-test.sh +++ b/src/docker-testing/test-helpers/valgrind-test.sh @@ -42,9 +42,9 @@ valgrind --leak-check=full --xml=yes \ --xml-file=${WORKSPACE}/valgrind3.xml \ --suppressions=${PARAGRAPH_SOURCE}/src/sh/valgrind-suppressions.supp \ ${PARAGRAPH}/bin/grmpy \ - -r ${PARAGRAPH}/share/test-data/paragraph/long-del/chr4_graph_typing.fa \ - -g ${PARAGRAPH}/share/test-data/paragraph/long-del/chr4_graph_typing.2sample.json \ - -m ${PARAGRAPH}/share/test-data/paragraph/long-del/chr4_graph_typing.manifest \ + -r ${PARAGRAPH}/share/test-data/paragraph/long-del/chrX_graph_typing.fa \ + -g ${PARAGRAPH}/share/test-data/paragraph/long-del/chrX_graph_typing.2sample.json \ + -m ${PARAGRAPH}/share/test-data/paragraph/long-del/chrX_graph_typing.manifest \ -o ${WORKSPACE}/vg_test.json python ${PARAGRAPH_SOURCE}/src/sh/valgrind-check.py ${WORKSPACE}/valgrind3.xml diff --git a/src/make/validation/SimulateReads.mk b/src/make/validation/SimulateReads.mk old mode 100644 new mode 100755 diff --git a/src/python/bin/multigrmpy.py b/src/python/bin/multigrmpy.py index e2fab64..8e3d3e4 100644 --- a/src/python/bin/multigrmpy.py +++ b/src/python/bin/multigrmpy.py @@ -251,9 +251,10 @@ def run(args): logging.root.removeHandler(handler) logging.basicConfig(filename=args.logfile, format='%(asctime)s %(levelname)-8s %(message)s', level=loglevel) - # check format of manifest + # manifest sanity check with open(args.manifest) as manifest_file: - headers = {"id": False, "path": False, "idxdepth": False, "depth": False, "read length": False, "sex": False} + headers = {"id": False, "path": False, "idxdepth": False, "depth": False, + "read length": False, "sex": False, "depth variance": False} for line in manifest_file: if line.startswith("#"): line = line[1:] @@ -335,13 +336,25 @@ def run(args): raise try: + sample_names = [] + with open(args.manifest) as manifest_file: + id_index = -1 + for line in manifest_file: + line = line.rstrip() + if line.startswith('#'): + line = line[1:] + f = line.split('\t') + if id_index == -1: + id_index = f.index("id") + continue + sample_names.append(f[id_index]) if args.input.endswith("vcf") or args.input.endswith("vcf.gz"): grmpyOutput = vcfupdate.read_grmpy(result_json_path) result_vcf_path = os.path.join(args.output, "genotypes.vcf.gz") vcf_input_path = os.path.join(args.output, "variants.vcf.gz") if not os.path.exists(vcf_input_path) or not os.path.isfile(vcf_input_path): vcf_input_path = args.input - vcfupdate.update_vcf_from_grmpy(vcf_input_path, grmpyOutput, result_vcf_path) + vcfupdate.update_vcf_from_grmpy(vcf_input_path, grmpyOutput, result_vcf_path, sample_names) except Exception: # pylint: disable=W0703 traceback.print_exc(file=LoggingWriter(logging.ERROR)) raise diff --git a/src/python/bin/paragraph-to-csv.py b/src/python/bin/paragraph-to-csv.py deleted file mode 100644 index 3ae8a56..0000000 --- a/src/python/bin/paragraph-to-csv.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 - -# -# Copyright (c) 2017 Illumina, Inc. -# All rights reserved. -# -# This file is distributed under the simplified BSD license. -# The full text can be found here (and in the LICENSE file in the root folder of -# this distribution): -# -# https://github.com/Illumina/licenses/blob/master/Simplified-BSD-License.txt -# -# January 2018 -# -# Extract variant genotypes from paragraph JSON and output as csv format -# -# Usage: -# python3 paragraph-to-csv.py [options] -# -# For usage instructions run with option --help -# -# Author: -# -# Sai Chen -# - -# noinspection PyUnresolvedReferences - -import argparse -import json -import gzip -import tempfile -import os - - -def main(): - parser = make_argument_parser() - args = parser.parse_args() - run(args) - - -def make_argument_parser(): - parser = argparse.ArgumentParser("paragraph-to-vcf.py") - parser.add_argument("input", nargs="+") - parser.add_argument("-o", "--output", type=str, dest="output", - help="Output csv file name. If not specified, will be printed to standard output.", ) - parser.add_argument("--genotype-only", dest="genotype_only", default=False, - action="store_true", help="Only output genotypes for each variant.") - return parser - - -def run(args): - """ - Run converter - """ - if args.input[0].endswith(".gz"): - paragraph_json_file = gzip.open(args.input[0], 'rt') - else: - paragraph_json_file = open(args.input[0], 'rt') - paragraph_json = json.load(paragraph_json_file) - paragraph_json_file.close() - - if not paragraph_json: - raise Exception("Fail to load paragraph JSON.") - - if args.genotype_only: - annotation = "#FORMAT=GT" - else: - annotation = "#FORMAT=GT:FILTER:DP" - - samples = [] - header = "#ID" - for event in paragraph_json: - if "samples" not in event: - continue - if "population" in event: - header += "\tHWE\tCallRate\tPassRate\tMinorAF" - for sample in event["samples"]: - header += "\t" + sample - samples.append(sample) - break - if not samples: - raise Exception("No valid results in paragraph JSON") - - output_lines = [] - for event in paragraph_json: - if "graphinfo" not in event: - output_lines.append("ERROR") - continue - line = "" - if "ID" in event["graphinfo"]: - line = event["graphinfo"]["ID"] - if not line: - line = "." - - if "population" in event: - pop_json = event["population"] - pop_stat = [] - for k in ["hwe", "call_rate"]: - if k in pop_json: - pop_stat.append(pop_json[k]) - else: - pop_stat.append("NA") - # calculate pass rate - num_pass = 0 - num_samples = 0 - for sample in samples: - num_samples += 1 - sample_stat = event["samples"][sample]["gt"]["GT"] - if "filter" in sample_stat: - if sample_stat["filter"] == "PASS": - num_pass += 1 - if num_samples: - pop_stat.append(num_pass / num_samples) - else: - pop_stat.append("NA") - # AF - if "allele_frequencies" in pop_json: - if pop_json["allele_frequencies"]: - pop_stat.append(min(pop_json["allele_frequencies"])) - else: - pop_stat.append("NA") - line += "\t" + '\t'.join(map(str, pop_stat)) - - for sample in samples: - gt_json = event["samples"][sample]["gt"] - sample_stat = [gt_json["GT"]] - if not args.genotype_only: - for k in ["filter", "num_reads"]: - if k in gt_json: - sample_stat.append(gt_json[k]) - else: - sample_stat.append(".") - line += "\t" + ':'.join(map(str, sample_stat)) - output_lines.append(line) - - if args.output: - output_file = open(args.output, 'wt') - else: - output_file = tempfile.NamedTemporaryFile(mode="wt", suffix=".csv", delete=False) - - output_file.write(annotation + "\n") - output_file.write(header + "\n") - for line in output_lines: - output_file.write(line + "\n") - output_file.close() - - if not args.output: - os.system("cat " + output_file.name) - os.system("rm -f " + output_file.name) - - -if __name__ == '__main__': - main() diff --git a/src/python/lib/grm/vcfgraph/vcfupdate.py b/src/python/lib/grm/vcfgraph/vcfupdate.py index 218ba9a..0411d08 100644 --- a/src/python/lib/grm/vcfgraph/vcfupdate.py +++ b/src/python/lib/grm/vcfgraph/vcfupdate.py @@ -89,34 +89,58 @@ def read_grmpy(grmpyJsonName): return result -def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename): - variantFile = pysam.VariantFile(inVcfFilename) +def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename, sample_names=None): + inVariantFile = pysam.VariantFile(inVcfFilename) # pylint: disable=no-member - header = variantFile.header - header.add_line('##FORMAT=') + header = inVariantFile.header.copy() + + vcf_samples = list(header.samples) + if vcf_samples: + header.add_line('##FORMAT=') + + if sample_names is None: # get sample names from vcf + sample_names = vcf_samples + if not sample_names: + logging.error( + "Sample names not assigned and input VCF doesn't have sample column. Paragraph VCF output will not contain any genotyping information!") + sample_names = set(sample_names) + sample_names.update(set(vcf_samples)) + added_samples = sample_names.difference(set(vcf_samples)) + + sample_names = list(sample_names) + added_samples = list(added_samples) + for s in added_samples: + header.add_sample(s) + + if 'GT' not in header.formats: + header.add_line('##FORMAT=') if 'FT' not in header.formats: header.add_line('##FORMAT=') if 'DP' not in header.formats: header.add_line('##FORMAT=') if 'AD' not in header.formats: - header.add_line('##FORMAT=') + header.add_line( + '##FORMAT=') if 'ADF' not in header.formats: - header.add_line('##FORMAT=') + header.add_line( + '##FORMAT=') if 'ADR' not in header.formats: - header.add_line('##FORMAT=') + header.add_line( + '##FORMAT=') if 'PL' not in header.formats: header.add_line('##FORMAT=') if 'GRMPY_ID' not in header.info: header.add_line('##INFO=') - header.add_line('##FILTER=') - header.add_line('##FILTER=') + header.add_line('##FILTER=') + header.add_line('##FILTER=') header.add_line('##FILTER=') - header.add_line('##FILTER=') + header.add_line('##FILTER=') header.add_line('##FILTER=') - header.add_line('##FILTER=') + header.add_line( + '##FILTER=') header.add_line('##FILTER=') header.add_line('##FILTER=') # pylint: enable=no-member @@ -125,7 +149,7 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename): matched = 0 unmatched = 0 multimatched = 0 - for record in variantFile: + for raw_record in inVariantFile: logging.debug("VCF to graph matching statistics: " f"{matched} matched, {unmatched} unmatched, {multimatched} multiple matches.") # reset varIdCounts for every record. This @@ -134,18 +158,22 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename): # start at the same position but come in different records. This could be fixed # by collapsing these into single VCF records as alleles before running # through paragraph. + record = header.new_record(contig=raw_record.chrom, start=raw_record.start, stop=raw_record.stop, + alleles=raw_record.alleles, id=raw_record.id, qual=raw_record.qual, filter=raw_record.filter, info=raw_record.info) + varIdCounts = defaultdict(int) varId = VCFGraph.generate_variant_id(record, varIdCounts) alleleIds = [aId for aId, _ in VCFGraph.generate_allele_ids(record, varId)] try: - grmpyRecords = [grmpyOutput["by_id"][record.info["GRMPY_ID"]]] + grmpyRecords = [grmpyOutput["by_id"][raw_record.info["GRMPY_ID"]]] except KeyError: grmpyRecords = [] if not grmpyRecords: # match by sequence name if we don't have GRMPY_ID - grmpyRecords = [grmpyOutput["by_sequencename"][aId] for aId in alleleIds if aId in grmpyOutput["by_sequencename"]] + grmpyRecords = [grmpyOutput["by_sequencename"][aId] + for aId in alleleIds if aId in grmpyOutput["by_sequencename"]] records = [] for recordList in grmpyRecords: @@ -180,77 +208,96 @@ def update_vcf_from_grmpy(inVcfFilename, grmpyOutput, outVcfFilename): for ii, aId in enumerate(alleleIds): alleleMap[aId] = ii - for sample in record.samples: - old_gt = "/".join(sorted([str(val) if val is not None else "." for val in record.samples[sample]["GT"]])) - record.samples[sample]["OLD_GT"] = old_gt + for sample in sample_names: + if vcf_samples: + if sample in vcf_samples: + for k, v in raw_record.samples[sample].items(): + record.samples[sample][k] = v + old_gt = "/".join(sorted([str(val) + if val is not None else "." for val in raw_record.samples[sample]["GT"]])) + record.samples[sample]["OLD_GT"] = old_gt + else: + record.samples[sample]["OLD_GT"] = [None] record.samples[sample]["GT"] = [None] record.samples[sample]["DP"] = None record.samples[sample]["FT"] = "." record.samples[sample]["AD"] = [None] * (1 + len(record.alts)) record.samples[sample]["ADF"] = [None] * (1 + len(record.alts)) record.samples[sample]["ADR"] = [None] * (1 + len(record.alts)) - try: - gt = grmpyRecord["samples"][sample]["gt"] - filters = set() - for f in gt["filter"].split(";"): - filters.add(f) - gt_to_set = sorted([alleleMap[g] if g in alleleMap else -1 for g in gt["GT"].split("/")]) - gt_to_set = [g if g >= 0 else None for g in gt_to_set] - if None in gt_to_set: - # this happens when we cannot match an allele - filters.add("UNMATCHED") - else: - record.samples[sample]["GT"] = gt_to_set - record.samples[sample]["FT"] = [",".join(list(filters))] + if sample in grmpyRecord["samples"]: try: - record.samples[sample]["DP"] = gt["num_reads"] + set_record_for_sample(record, sample, grmpyRecord, alleleMap) except KeyError: - record.samples[sample]["DP"] = 0 - - ad = grmpyRecord["samples"][sample]["alleles"] - ads_to_set = [0] * (1 + len(record.alts)) - adfs_to_set = [0] * (1 + len(record.alts)) - adrs_to_set = [0] * (1 + len(record.alts)) - for a in ad.keys(): - ads_to_set[alleleMap[a]] = ad[a]['num_fwd_reads'] + ad[a]['num_rev_reads'] - adfs_to_set[alleleMap[a]] = ad[a]['num_fwd_reads'] - adrs_to_set[alleleMap[a]] = ad[a]['num_rev_reads'] - record.samples[sample]["AD"] = ads_to_set - record.samples[sample]["ADF"] = adfs_to_set - record.samples[sample]["ADR"] = adrs_to_set - - ploidy = len(gt_to_set) - allelecount = len(record.alts) - - gtlist = makePLGenotypes(ploidy, allelecount) - gtlist_map = {} - for ii, g in enumerate(gtlist): - gtlist_map[str(g)] = ii - pls_to_set = [0] * len(gtlist) - min_pl = None - for name, ll in gt["GL"].items(): - alleles = sorted([alleleMap[a] for a in name.split("/")]) - try: - phred_l = round(-10 * ll) - except TypeError: - phred_l = None - except OverflowError: - phred_l = 32768 - phred_l = min(phred_l, 32768) - - if min_pl is None or phred_l < min_pl: - min_pl = phred_l - - if str(alleles) in gtlist_map: - pls_to_set[gtlist_map[str(alleles)]] = phred_l - - # normalize, see e.g. https://software.broadinstitute.org/gatk/documentation/article?id=5913 - pls_to_set = [pl - min_pl for pl in pls_to_set] - record.samples[sample]["PL"] = pls_to_set - - except KeyError: - continue + logging.warning("VCF key error for sample " + str(sample) + " at " + + '_'.join(map(str, [record.chrom, record.pos, record.stop]))) + continue bcf_out.write(record) - logging.info(f"VCF to graph matching statistics: {matched} matched, {unmatched} unmatched, {multimatched} multiple matches.") + logging.info( + f"VCF to graph matching statistics: {matched} matched, {unmatched} unmatched, {multimatched} multiple matches.") + + +def set_record_for_sample(record, sample, grmpyRecord, alleleMap): + """ + set vcf record for the given sample + """ + gt = grmpyRecord["samples"][sample]["gt"] + filters = set() + for f in gt["filters"]: + filters.add(f) + gt_to_set = sorted([alleleMap[g] if g in alleleMap else -1 for g in gt["GT"].split("/")]) + gt_to_set = [g if g >= 0 else None for g in gt_to_set] + if None in gt_to_set: + # this happens when we cannot match an allele + filters.add("UNMATCHED") + else: + record.samples[sample]["GT"] = gt_to_set + record.samples[sample]["FT"] = [",".join(list(filters))] + try: + record.samples[sample]["DP"] = gt["num_reads"] + except KeyError: + record.samples[sample]["DP"] = 0 + + ad = grmpyRecord["samples"][sample]["alleles"] + ads_to_set = [0] * (1 + len(record.alts)) + adfs_to_set = [0] * (1 + len(record.alts)) + adrs_to_set = [0] * (1 + len(record.alts)) + for a in ad.keys(): + ads_to_set[alleleMap[a]] = ad[a]['num_fwd_reads'] + ad[a]['num_rev_reads'] + adfs_to_set[alleleMap[a]] = ad[a]['num_fwd_reads'] + adrs_to_set[alleleMap[a]] = ad[a]['num_rev_reads'] + record.samples[sample]["AD"] = ads_to_set + record.samples[sample]["ADF"] = adfs_to_set + record.samples[sample]["ADR"] = adrs_to_set + + ploidy = len(gt_to_set) + allelecount = len(record.alts) + + gtlist = makePLGenotypes(ploidy, allelecount) + gtlist_map = {} + for ii, g in enumerate(gtlist): + gtlist_map[str(g)] = ii + pls_to_set = [0] * len(gtlist) + min_pl = None + if "GL" not in gt: # missing genotype + return + for name, ll in gt["GL"].items(): + alleles = sorted([alleleMap[a] for a in name.split("/")]) + try: + phred_l = round(-10 * ll) + except TypeError: + phred_l = None + except OverflowError: + phred_l = 32768 + phred_l = min(phred_l, 32768) + + if min_pl is None or phred_l < min_pl: + min_pl = phred_l + + if str(alleles) in gtlist_map: + pls_to_set[gtlist_map[str(alleles)]] = phred_l + + # normalize, see e.g. https://software.broadinstitute.org/gatk/documentation/article?id=5913 + pls_to_set = [pl - min_pl for pl in pls_to_set] + record.samples[sample]["PL"] = pls_to_set diff --git a/src/python/test/test_multigrmpy.py b/src/python/test/test_multigrmpy.py index 7809f98..1f0f55b 100644 --- a/src/python/test/test_multigrmpy.py +++ b/src/python/test/test_multigrmpy.py @@ -27,7 +27,6 @@ if "GRMPY_INSTALL" in os.environ: GRMPY_INSTALL = os.environ["GRMPY_INSTALL"] - hg38_locations = [ "/Users/pkrusche/workspace/human_genome/hg38.fa", "/illumina/sync/igenomes/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa", @@ -55,19 +54,26 @@ def setUp(self): self.manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "round-trip-genotyping", "samples.txt") self.swaps_input_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "swaps.vcf") - self.swaps_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "expected-genotypes.json") - self.swaps_expected_genotypes_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "expected-genotypes.vcf") - self.swaps_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "expected-variants.json") + self.swaps_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", + "expected-genotypes.json") + self.swaps_expected_genotypes_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", + "expected-genotypes.vcf") + self.swaps_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", + "expected-variants.json") self.swaps_reference = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "swaps.fa") self.swaps_manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "genotyping_test_2", "samples.txt") - self.hg38_input_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", "pg-het-ins.vcf") - self.hg38_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", "genotypes_expected.json") - self.hg38_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", "variants_expected.json") + self.hg38_input_vcf = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", + "pg-het-ins.vcf") + self.hg38_expected_genotypes_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", + "genotypes_expected.json") + self.hg38_expected_variants_json = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", + "variants_expected.json") self.hg38_reference = HG38 self.hg38_manifest = os.path.join(GRMPY_ROOT, "share", "test-data", "paragraph", "pg-het-ins", "manifest.txt") - @unittest.skipIf(not GRMPY_INSTALL, "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") + @unittest.skipIf(not GRMPY_INSTALL, + "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") def test_multigrmpy(self): import multigrmpy @@ -77,7 +83,8 @@ def test_multigrmpy(self): "manifest": self.manifest, "reference": self.reference, "output": output_dir, - "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join(GRMPY_INSTALL, "bin", "grmpy"), + "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join( + GRMPY_INSTALL, "bin", "grmpy"), "verbose": False, "quiet": True, "logfile": None, @@ -112,7 +119,8 @@ def test_multigrmpy(self): self.assertEqual(item["samples"]["sample1"]["gt"]["GT"], "./.") self.assertEqual(item["samples"]["sample2"]["gt"]["GT"], "S1/S1") - @unittest.skipIf(not GRMPY_INSTALL, "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") + @unittest.skipIf(not GRMPY_INSTALL, + "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") def test_multigrmpy_expected_genotypes(self): import multigrmpy @@ -122,7 +130,8 @@ def test_multigrmpy_expected_genotypes(self): "manifest": self.swaps_manifest, "reference": self.swaps_reference, "output": output_dir, - "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join(GRMPY_INSTALL, "bin", "grmpy"), + "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join( + GRMPY_INSTALL, "bin", "grmpy"), "verbose": False, "quiet": True, "logfile": None, @@ -230,8 +239,8 @@ def test_multigrmpy_expected_genotypes(self): raise Exception("Swaps test converted variants don't match! If this is expected and new behavior, " "cp test_swaps_variants.json %s" % self.swaps_expected_variants_json) - - @unittest.skipIf(not GRMPY_INSTALL, "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") + @unittest.skipIf(not GRMPY_INSTALL, + "No compiled/install path specified, please set the GRMPY_INSTALL variable to point to an installation.") @unittest.skipIf(not HG38, "No hg38 reference fasta file was found.") def test_multigrmpy_pg_het_ins(self): import multigrmpy @@ -242,7 +251,8 @@ def test_multigrmpy_pg_het_ins(self): "manifest": self.hg38_manifest, "reference": self.hg38_reference, "output": output_dir, - "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join(GRMPY_INSTALL, "bin", "grmpy"), + "grmpy": os.path.join(os.path.dirname(__file__), "module-wrapper.sh") + " " + os.path.join( + GRMPY_INSTALL, "bin", "grmpy"), "verbose": False, "quiet": True, "logfile": None, diff --git a/src/sh/valgrind-test.sh b/src/sh/valgrind-test.sh index 1ce217c..5bd3638 100644 --- a/src/sh/valgrind-test.sh +++ b/src/sh/valgrind-test.sh @@ -31,9 +31,9 @@ valgrind --leak-check=full --xml=yes \ --xml-file=${WORKSPACE}/valgrind3.xml \ --suppressions=${WORKSPACE}/src/sh/valgrind-suppressions.supp \ ${WORKSPACE}/install/bin/grmpy \ - -r ${WORKSPACE}/share/test-data/paragraph/long-del/chr4_graph_typing.fa \ - -g ${WORKSPACE}/share/test-data/paragraph/long-del/chr4_graph_typing.2sample.json\ - -m ${WORKSPACE}/share/test-data/paragraph/long-del/chr4_graph_typing.manifest \ + -r ${WORKSPACE}/share/test-data/paragraph/long-del/chrX_graph_typing.fa \ + -g ${WORKSPACE}/share/test-data/paragraph/long-del/chrX_graph_typing.2sample.json\ + -m ${WORKSPACE}/share/test-data/paragraph/long-del/chrX_graph_typing.manifest \ -o t${WORKSPACE}/vg_test.json python ${WORKSPACE}/src/sh/valgrind-check.py ${WORKSPACE}/valgrind3.xml diff --git a/src/sh/validation/simulate-reads.sh.in b/src/sh/validation/simulate-reads.sh.in old mode 100644 new mode 100755