diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..3adbe05 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1352 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317b9a89c1868f5ea6ff1d9539a69f45dffc21ce321ac1fd1160dfa48c8e2140" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bindgen" +version = "0.69.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "042e2e131c066e496ea7880ef6cfeec415a9adc79fc882a65979394f8840bf7c" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] 
+name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets", +] + +[[package]] +name = "clang-sys" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac495e00dcec98c83465d5ad66c5c4fabd652fd6686e7c6269b117e729a6f17b" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c77ed9a32a62e6ca27175d00d29d05ca32e396ea1eb5fb01d8256b669cec7663" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "deranged" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "hashbrown" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "home" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "iana-time-zone" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8326b86b6cff230b97d0d312a6c40a60726df3332e721f72a1b035f451663b20" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi", + "rustix", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "js-sys" +version = "0.3.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54c0c35952f67de54bb584e9fd912b3023117cbafc0a77d8f3dee1fb5f572fe8" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" + 
+[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "matrixmultiply" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + 
"num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "noodles" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ba860dd272b95bc6758e9cf1ec9d13d159c928726b020432a88bd897ca75bbb" +dependencies = [ + "noodles-bam", + "noodles-bed", + "noodles-bgzf", + "noodles-core", + "noodles-fasta", + "noodles-sam", + "noodles-vcf", +] + +[[package]] +name = "noodles-bam" +version = "0.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27efd2e9691c3cf6c894559089562f2dceebfd1f09fe1bc59e85235e16b77e1e" +dependencies = [ + "bit-vec", + "byteorder", + "bytes", + "indexmap", + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-sam", +] + +[[package]] +name = "noodles-bed" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deba180b7b94c524307da91d5bc983e0ccb9a08c4e24384cc8f93e0210af254a" +dependencies = [ + "noodles-core", +] + +[[package]] +name = "noodles-bgzf" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d578e5a173cbfac77295db4188c959966ce24a3364e009d363170d1ed44066a" +dependencies = [ + "byteorder", + "bytes", + "crossbeam-channel", + "flate2", +] + +[[package]] +name = "noodles-core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fbe3192fe33acacabaedd387657f39b0fc606f1996d546db0dfe14703b843a" + +[[package]] +name = "noodles-csi" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0531175d5473e6057c1724c1242b19bfc42dba644fe275b4df89c5b8d31a782" +dependencies = [ + "bit-vec", + "byteorder", + "indexmap", + 
"noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-fasta" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "310dcfb61e8e2cafb65d9da4b329a98a390f2b570c17599a7f4639328cfb3e2c" +dependencies = [ + "bytes", + "memchr", + "noodles-bgzf", + "noodles-core", +] + +[[package]] +name = "noodles-sam" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "045d64736a3772d6edf5e05c167398808a45ce2c802627c7ba542820f3a48b64" +dependencies = [ + "bitflags", + "indexmap", + "lexical-core", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + +[[package]] +name = "noodles-tabix" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415e319f97784c110a85756a8747bf26e9a18bf321b113d00984ca1af7a6fef9" +dependencies = [ + "bit-vec", + "byteorder", + "indexmap", + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + +[[package]] +name = "noodles-vcf" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82d34757c2d465c34dab54582b475ad53fd3ffb5b12943b7166876235db8a297" +dependencies = [ + "indexmap", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-tabix", + "percent-encoding", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "percent-encoding" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" 
+dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.190" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "shlex" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "2.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6093bad37da69aab9d123a8091e4be0aa4a03e4d601ec641c327398315f62b64" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "time" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5" +dependencies = [ + "deranged", + "itoa", + "libc", + "num_threads", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" +dependencies = [ + "time-core", +] + +[[package]] +name = "trgt-denovo" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "clap", + "csv", + "env_logger", + "itertools", + "log", + "ndarray", + "noodles", + "once_cell", + "rand", + "rayon", + "serde", + "threadpool", + "vergen", + "wfa2-sys", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "vergen" +version = "8.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e7dc29b3c54a2ea67ef4f953d5ec0c4085035c0ae2d325be1c0d2144bd9f16" +dependencies = [ + "anyhow", + "rustversion", + "time", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", 
+ "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" + +[[package]] +name = "wfa2-sys" +version = "0.1.0" +source = "git+https://github.com/ctsa/rust-wfa2.git?rev=6e1d1dd#6e1d1dd70d1f76ab35206ce72d32f08075ae02e6" +dependencies = [ + "bindgen", + "cmake", +] + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.51.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1f8cf84f35d2db49a46868f947758c7a1138116f7fac3bc844f43ade1292e64" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..8353c58 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "trgt-denovo" +version = "0.1.0" +authors = ["Tom Mokveld", "Egor Dolzhenko"] +description = """ +trgt-denovo is a CLI tool for targeted de novo tandem repeat calling from long-read HiFI sequencing data. 
+""" +edition = "2021" +build = "build.rs" + +[build-dependencies] +vergen = { version = "8.0.0", features = ["git", "gitcl"] } + +[[bin]] +bench = false +path = "src/main.rs" +name = "trgt-denovo" + +[dependencies] +serde = { version = "1.0.150", features = ["derive"] } +wfa2-sys = { version = "*", git = "https://github.com/ctsa/rust-wfa2.git", rev = "6e1d1dd" } +clap = { version = "4.0.18", features = ["suggestions", "derive"] } +itertools = "*" +log = "*" +env_logger = "*" +rand = "0.8.5" +threadpool = "*" +chrono = "*" +csv = "*" +ndarray = "*" +# flate2 = { version = "1.0.20", default-features = false, features = ["zlib-ng"] } # Options: zlib-ng, zlib, zlib-ng-compat +noodles = { version = "*", features = ["vcf", "fasta", "core", "bam", "sam", "bgzf", "bed"] } +once_cell = "*" +rayon = "*" +anyhow = "*" \ No newline at end of file diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json new file mode 100644 index 0000000..a134b50 --- /dev/null +++ b/LICENSE-THIRDPARTY.json @@ -0,0 +1,1343 @@ +[ + { + "name": "adler", + "version": "1.0.2", + "authors": "Jonas Schievink ", + "repository": "https://github.com/jonas-schievink/adler.git", + "license": "0BSD OR Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple clean-room implementation of the Adler-32 checksum" + }, + { + "name": "aho-corasick", + "version": "1.1.2", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/aho-corasick", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Fast multiple substring searching." 
+ }, + { + "name": "android-tzdata", + "version": "0.1.1", + "authors": "RumovZ", + "repository": "https://github.com/RumovZ/android-tzdata", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for the Android-specific tzdata file" + }, + { + "name": "android_system_properties", + "version": "0.1.5", + "authors": "Nicolas Silva ", + "repository": "https://github.com/nical/android_system_properties", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Minimal Android system properties wrapper" + }, + { + "name": "anstream", + "version": "0.6.4", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple cross platform library for writing colored text to a terminal." + }, + { + "name": "anstyle", + "version": "1.0.4", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "ANSI text styling" + }, + { + "name": "anstyle-parse", + "version": "0.2.2", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parse ANSI Style Escapes" + }, + { + "name": "anstyle-query", + "version": "1.0.0", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Look up colored console capabilities" + }, + { + "name": "anstyle-wincon", + "version": "3.0.1", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle.git", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Styling legacy Windows terminals" + }, + { + "name": "anyhow", + "version": "1.0.75", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/anyhow", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": 
"Flexible concrete Error type built on std::error::Error" + }, + { + "name": "autocfg", + "version": "1.1.0", + "authors": "Josh Stone ", + "repository": "https://github.com/cuviper/autocfg", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Automatic cfg for Rust compiler features" + }, + { + "name": "bindgen", + "version": "0.69.0", + "authors": "Jyun-Yan You |Emilio Cobos Álvarez |Nick Fitzgerald |The Servo project developers", + "repository": "https://github.com/rust-lang/rust-bindgen", + "license": "BSD-3-Clause", + "license_file": null, + "description": "Automatically generates Rust FFI bindings to C and C++ libraries." + }, + { + "name": "bit-vec", + "version": "0.6.3", + "authors": "Alexis Beingessner ", + "repository": "https://github.com/contain-rs/bit-vec", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A vector of bits" + }, + { + "name": "bitflags", + "version": "2.4.1", + "authors": "The Rust Project Developers", + "repository": "https://github.com/bitflags/bitflags", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro to generate structures which behave like bitflags." + }, + { + "name": "bumpalo", + "version": "3.14.0", + "authors": "Nick Fitzgerald ", + "repository": "https://github.com/fitzgen/bumpalo", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A fast bump allocation arena for Rust." + }, + { + "name": "byteorder", + "version": "1.5.0", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/byteorder", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Library for reading/writing numbers in big-endian and little-endian." 
+ }, + { + "name": "bytes", + "version": "1.5.0", + "authors": "Carl Lerche |Sean McArthur ", + "repository": "https://github.com/tokio-rs/bytes", + "license": "MIT", + "license_file": null, + "description": "Types and traits for working with bytes" + }, + { + "name": "cc", + "version": "1.0.83", + "authors": "Alex Crichton ", + "repository": "https://github.com/rust-lang/cc-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A build-time dependency for Cargo build scripts to assist in invoking the native C compiler to compile native C code into a static archive to be linked into Rust code." + }, + { + "name": "cexpr", + "version": "0.6.0", + "authors": "Jethro Beekman ", + "repository": "https://github.com/jethrogb/rust-cexpr", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A C expression parser and evaluator" + }, + { + "name": "cfg-if", + "version": "1.0.0", + "authors": "Alex Crichton ", + "repository": "https://github.com/alexcrichton/cfg-if", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro to ergonomically define an item depending on a large number of #[cfg] parameters. Structured like an if-else chain, the first matching branch is the item that gets emitted." + }, + { + "name": "chrono", + "version": "0.4.31", + "authors": null, + "repository": "https://github.com/chronotope/chrono", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Date and time library for Rust" + }, + { + "name": "clang-sys", + "version": "1.6.1", + "authors": "Kyle Mayes ", + "repository": "https://github.com/KyleMayes/clang-sys", + "license": "Apache-2.0", + "license_file": null, + "description": "Rust bindings for libclang." 
+ }, + { + "name": "clap", + "version": "4.4.7", + "authors": null, + "repository": "https://github.com/clap-rs/clap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple to use, efficient, and full-featured Command Line Argument Parser" + }, + { + "name": "clap_builder", + "version": "4.4.7", + "authors": null, + "repository": "https://github.com/clap-rs/clap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A simple to use, efficient, and full-featured Command Line Argument Parser" + }, + { + "name": "clap_derive", + "version": "4.4.7", + "authors": null, + "repository": "https://github.com/clap-rs/clap/tree/master/clap_derive", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parse command line argument by defining a struct, derive crate." + }, + { + "name": "clap_lex", + "version": "0.6.0", + "authors": null, + "repository": "https://github.com/clap-rs/clap/tree/master/clap_lex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Minimal, flexible command line parser" + }, + { + "name": "cmake", + "version": "0.1.50", + "authors": "Alex Crichton ", + "repository": "https://github.com/rust-lang/cmake-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A build dependency for running `cmake` to build a native library" + }, + { + "name": "colorchoice", + "version": "1.0.0", + "authors": null, + "repository": "https://github.com/rust-cli/anstyle", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Global override of color control" + }, + { + "name": "core-foundation-sys", + "version": "0.8.4", + "authors": "The Servo Project Developers", + "repository": "https://github.com/servo/core-foundation-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings to Core Foundation for macOS" + }, + { + "name": "crc32fast", + "version": "1.3.2", + "authors": "Sam Rijs |Alex Crichton 
", + "repository": "https://github.com/srijs/rust-crc32fast", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fast, SIMD-accelerated CRC32 (IEEE) checksum computation" + }, + { + "name": "crossbeam-channel", + "version": "0.5.8", + "authors": null, + "repository": "https://github.com/crossbeam-rs/crossbeam", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Multi-producer multi-consumer channels for message passing" + }, + { + "name": "crossbeam-deque", + "version": "0.8.3", + "authors": null, + "repository": "https://github.com/crossbeam-rs/crossbeam", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Concurrent work-stealing deque" + }, + { + "name": "crossbeam-epoch", + "version": "0.9.15", + "authors": null, + "repository": "https://github.com/crossbeam-rs/crossbeam", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Epoch-based garbage collection" + }, + { + "name": "crossbeam-utils", + "version": "0.8.16", + "authors": null, + "repository": "https://github.com/crossbeam-rs/crossbeam", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Utilities for concurrent programming" + }, + { + "name": "csv", + "version": "1.3.0", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/rust-csv", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Fast CSV parsing with support for serde." + }, + { + "name": "csv-core", + "version": "0.1.11", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/rust-csv", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Bare bones CSV parsing with no_std support." 
+ }, + { + "name": "deranged", + "version": "0.3.9", + "authors": "Jacob Pratt ", + "repository": "https://github.com/jhpratt/deranged", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Ranged integers" + }, + { + "name": "either", + "version": "1.9.0", + "authors": "bluss", + "repository": "https://github.com/bluss/either", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The enum `Either` with variants `Left` and `Right` is a general purpose sum type with two cases." + }, + { + "name": "env_logger", + "version": "0.10.0", + "authors": null, + "repository": "https://github.com/rust-cli/env_logger/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A logging implementation for `log` which is configured via an environment variable." + }, + { + "name": "equivalent", + "version": "1.0.1", + "authors": null, + "repository": "https://github.com/cuviper/equivalent", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Traits for key comparison in maps." + }, + { + "name": "errno", + "version": "0.3.5", + "authors": "Chris Wong ", + "repository": "https://github.com/lambda-fairy/rust-errno", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Cross-platform interface to the `errno` variable." + }, + { + "name": "flate2", + "version": "1.0.28", + "authors": "Alex Crichton |Josh Triplett ", + "repository": "https://github.com/rust-lang/flate2-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "DEFLATE compression and decompression exposed as Read/BufRead/Write streams. Supports miniz_oxide and multiple zlib implementations. Supports zlib, gzip, and raw deflate streams." 
+ }, + { + "name": "getrandom", + "version": "0.2.10", + "authors": "The Rand Project Developers", + "repository": "https://github.com/rust-random/getrandom", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A small cross-platform library for retrieving random data from system source" + }, + { + "name": "glob", + "version": "0.3.1", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-lang/glob", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Support for matching file paths against Unix shell style patterns." + }, + { + "name": "hashbrown", + "version": "0.14.2", + "authors": "Amanieu d'Antras ", + "repository": "https://github.com/rust-lang/hashbrown", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A Rust port of Google's SwissTable hash map" + }, + { + "name": "heck", + "version": "0.4.1", + "authors": "Without Boats ", + "repository": "https://github.com/withoutboats/heck", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "heck is a case conversion library." + }, + { + "name": "hermit-abi", + "version": "0.3.3", + "authors": "Stefan Lankes", + "repository": "https://github.com/hermitcore/hermit-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Hermit system calls definitions." + }, + { + "name": "home", + "version": "0.5.5", + "authors": "Brian Anderson ", + "repository": "https://github.com/rust-lang/cargo", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Shared definitions of home directories." 
+ }, + { + "name": "humantime", + "version": "2.1.0", + "authors": "Paul Colomiets ", + "repository": "https://github.com/tailhook/humantime", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A parser and formatter for std::time::{Duration, SystemTime}" + }, + { + "name": "iana-time-zone", + "version": "0.1.58", + "authors": "Andrew Straw |René Kijewski |Ryan Lopopolo ", + "repository": "https://github.com/strawlab/iana-time-zone", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "get the IANA time zone for the current system" + }, + { + "name": "iana-time-zone-haiku", + "version": "0.1.2", + "authors": "René Kijewski ", + "repository": "https://github.com/strawlab/iana-time-zone", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "iana-time-zone support crate for Haiku OS" + }, + { + "name": "indexmap", + "version": "2.1.0", + "authors": null, + "repository": "https://github.com/bluss/indexmap", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A hash table with consistent order and fast iteration." + }, + { + "name": "is-terminal", + "version": "0.4.9", + "authors": "softprops |Dan Gohman ", + "repository": "https://github.com/sunfishcode/is-terminal", + "license": "MIT", + "license_file": null, + "description": "Test whether a given stream is a terminal" + }, + { + "name": "itertools", + "version": "0.11.0", + "authors": "bluss", + "repository": "https://github.com/rust-itertools/itertools", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Extra iterator adaptors, iterator methods, free functions, and macros." 
+ }, + { + "name": "itoa", + "version": "1.0.9", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/itoa", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fast integer primitive to string conversion" + }, + { + "name": "js-sys", + "version": "0.3.65", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/js-sys", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Bindings for all JS global objects and functions in all JS environments like Node.js and browsers, built on `#[wasm_bindgen]` using the `wasm-bindgen` crate." + }, + { + "name": "lazy_static", + "version": "1.4.0", + "authors": "Marvin Löbel ", + "repository": "https://github.com/rust-lang-nursery/lazy-static.rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A macro for declaring lazily evaluated statics in Rust." + }, + { + "name": "lazycell", + "version": "1.3.0", + "authors": "Alex Crichton |Nikita Pekin ", + "repository": "https://github.com/indiv0/lazycell", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A library providing a lazily filled Cell struct" + }, + { + "name": "lexical-core", + "version": "0.8.5", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Lexical, to- and from-string conversion routines." + }, + { + "name": "lexical-parse-float", + "version": "0.8.5", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Efficient parsing of floats from strings." 
+ }, + { + "name": "lexical-parse-integer", + "version": "0.8.6", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Efficient parsing of integers from strings." + }, + { + "name": "lexical-util", + "version": "0.8.5", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Shared utilities for lexical creates." + }, + { + "name": "lexical-write-float", + "version": "0.8.5", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Efficient formatting of floats to strings." + }, + { + "name": "lexical-write-integer", + "version": "0.8.5", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/rust-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Efficient formatting of integers to strings." + }, + { + "name": "libc", + "version": "0.2.149", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-lang/libc", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Raw FFI bindings to platform libraries like libc." + }, + { + "name": "libloading", + "version": "0.7.4", + "authors": "Simonas Kazlauskas ", + "repository": "https://github.com/nagisa/rust_libloading/", + "license": "ISC", + "license_file": null, + "description": "Bindings around the platform's dynamic library loading primitives with greatly improved memory safety." 
+ }, + { + "name": "linux-raw-sys", + "version": "0.4.10", + "authors": "Dan Gohman ", + "repository": "https://github.com/sunfishcode/linux-raw-sys", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Generated bindings for Linux's userspace API" + }, + { + "name": "log", + "version": "0.4.20", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-lang/log", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A lightweight logging facade for Rust" + }, + { + "name": "matrixmultiply", + "version": "0.3.8", + "authors": "bluss|R. Janis Goldschmidt", + "repository": "https://github.com/bluss/matrixmultiply/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "General matrix multiplication for f32 and f64 matrices. Operates on matrices with general layout (they can use arbitrary row and column stride). Detects and uses AVX or SSE2 on x86 platforms transparently for higher performance. Uses a microkernel strategy, so that the implementation is easy to parallelize and optimize. Supports multithreading." + }, + { + "name": "memchr", + "version": "2.6.4", + "authors": "Andrew Gallant |bluss", + "repository": "https://github.com/BurntSushi/memchr", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "Provides extremely fast (uses SIMD on x86_64, aarch64 and wasm32) routines for 1, 2 or 3 byte search and single substring search." + }, + { + "name": "memoffset", + "version": "0.9.0", + "authors": "Gilad Naaman ", + "repository": "https://github.com/Gilnaa/memoffset", + "license": "MIT", + "license_file": null, + "description": "offset_of functionality for Rust structs." 
+ }, + { + "name": "minimal-lexical", + "version": "0.2.1", + "authors": "Alex Huszagh ", + "repository": "https://github.com/Alexhuszagh/minimal-lexical", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Fast float parsing conversion routines." + }, + { + "name": "miniz_oxide", + "version": "0.7.1", + "authors": "Frommi |oyvindln ", + "repository": "https://github.com/Frommi/miniz_oxide/tree/master/miniz_oxide", + "license": "Apache-2.0 OR MIT OR Zlib", + "license_file": null, + "description": "DEFLATE compression and decompression library rewritten in Rust based on miniz" + }, + { + "name": "ndarray", + "version": "0.15.6", + "authors": "Ulrik Sverdrup \"bluss\"|Jim Turner", + "repository": "https://github.com/rust-ndarray/ndarray", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "An n-dimensional array for general elements and for numerics. Lightweight array views and slicing; views support chunking and splitting." + }, + { + "name": "nom", + "version": "7.1.3", + "authors": "contact@geoffroycouprie.com", + "repository": "https://github.com/Geal/nom", + "license": "MIT", + "license_file": null, + "description": "A byte-oriented, zero-copy, parser combinators library" + }, + { + "name": "noodles", + "version": "0.56.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Bioinformatics I/O libraries" + }, + { + "name": "noodles-bam", + "version": "0.49.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Binary Alignment/Map (BAM) format reader and writer" + }, + { + "name": "noodles-bed", + "version": "0.10.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "BED (Browser Extensible Data) reader" + }, + { + "name": 
"noodles-bgzf", + "version": "0.25.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Blocked gzip format (BGZF) reader and writer" + }, + { + "name": "noodles-core", + "version": "0.12.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Shared utilities when working with noodles" + }, + { + "name": "noodles-csi", + "version": "0.26.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Coordinate-sorted index (CSI) format reader and writer" + }, + { + "name": "noodles-fasta", + "version": "0.30.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "FASTA format reader and writer" + }, + { + "name": "noodles-sam", + "version": "0.46.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Sequence Alignment/Map (SAM) format reader and writer" + }, + { + "name": "noodles-tabix", + "version": "0.32.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Tabix (TBI) format reader and writer" + }, + { + "name": "noodles-vcf", + "version": "0.44.0", + "authors": "Michael Macias ", + "repository": "https://github.com/zaeleus/noodles", + "license": "MIT", + "license_file": null, + "description": "Variant Call Format (VCF) reader and writer" + }, + { + "name": "num-complex", + "version": "0.4.4", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-num/num-complex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Complex numbers 
implementation for Rust" + }, + { + "name": "num-integer", + "version": "0.1.45", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-num/num-integer", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Integer traits and functions" + }, + { + "name": "num-traits", + "version": "0.2.17", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-num/num-traits", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Numeric traits for generic mathematics" + }, + { + "name": "num_cpus", + "version": "1.16.0", + "authors": "Sean McArthur ", + "repository": "https://github.com/seanmonstar/num_cpus", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Get the number of CPUs on a machine." + }, + { + "name": "num_threads", + "version": "0.1.6", + "authors": "Jacob Pratt ", + "repository": "https://github.com/jhpratt/num_threads", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A minimal library that determines the number of running threads for the current process." + }, + { + "name": "once_cell", + "version": "1.18.0", + "authors": "Aleksey Kladov ", + "repository": "https://github.com/matklad/once_cell", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Single assignment cells and lazy values." + }, + { + "name": "peeking_take_while", + "version": "0.1.2", + "authors": "Nick Fitzgerald ", + "repository": "https://github.com/fitzgen/peeking_take_while", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Like `Iterator::take_while`, but calls the predicate on a peeked value. This allows you to use `Iterator::by_ref` and `Iterator::take_while` together, and still get the first value for which the `take_while` predicate returned false after dropping the `by_ref`." 
+ }, + { + "name": "percent-encoding", + "version": "2.3.0", + "authors": "The rust-url developers", + "repository": "https://github.com/servo/rust-url/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Percent encoding and decoding" + }, + { + "name": "powerfmt", + "version": "0.2.0", + "authors": "Jacob Pratt ", + "repository": "https://github.com/jhpratt/powerfmt", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "`powerfmt` is a library that provides utilities for formatting values. This crate makes it significantly easier to support filling to a minimum width with alignment, avoid heap allocation, and avoid repetitive calculations." + }, + { + "name": "ppv-lite86", + "version": "0.2.17", + "authors": "The CryptoCorrosion Contributors", + "repository": "https://github.com/cryptocorrosion/cryptocorrosion", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Implementation of the crypto-simd API for x86" + }, + { + "name": "prettyplease", + "version": "0.2.15", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/prettyplease", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A minimal `syn` syntax tree pretty-printer" + }, + { + "name": "proc-macro2", + "version": "1.0.69", + "authors": "David Tolnay |Alex Crichton ", + "repository": "https://github.com/dtolnay/proc-macro2", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A substitute implementation of the compiler's `proc_macro` API to decouple token-based libraries from the procedural macro use case." 
+ }, + { + "name": "quote", + "version": "1.0.33", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/quote", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Quasi-quoting macro quote!(...)" + }, + { + "name": "rand", + "version": "0.8.5", + "authors": "The Rand Project Developers|The Rust Project Developers", + "repository": "https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Random number generators and other randomness functionality." + }, + { + "name": "rand_chacha", + "version": "0.3.1", + "authors": "The Rand Project Developers|The Rust Project Developers|The CryptoCorrosion Contributors", + "repository": "https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "ChaCha random number generator" + }, + { + "name": "rand_core", + "version": "0.6.4", + "authors": "The Rand Project Developers|The Rust Project Developers", + "repository": "https://github.com/rust-random/rand", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Core random number generator traits and tools for implementation." + }, + { + "name": "rawpointer", + "version": "0.2.1", + "authors": "bluss", + "repository": "https://github.com/bluss/rawpointer/", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Extra methods for raw pointers and `NonNull`. For example `.post_inc()` and `.pre_dec()` (c.f. `ptr++` and `--ptr`), `offset` and `add` for `NonNull`, and the function `ptrdistance`." 
+ }, + { + "name": "rayon", + "version": "1.8.0", + "authors": "Niko Matsakis |Josh Stone ", + "repository": "https://github.com/rayon-rs/rayon", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Simple work-stealing parallelism for Rust" + }, + { + "name": "rayon-core", + "version": "1.12.0", + "authors": "Niko Matsakis |Josh Stone ", + "repository": "https://github.com/rayon-rs/rayon", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Core APIs for Rayon" + }, + { + "name": "regex", + "version": "1.10.2", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "https://github.com/rust-lang/regex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "An implementation of regular expressions for Rust. This implementation uses finite automata and guarantees linear time matching on all inputs." + }, + { + "name": "regex-automata", + "version": "0.4.3", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "https://github.com/rust-lang/regex/tree/master/regex-automata", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Automata construction and matching using regular expressions." + }, + { + "name": "regex-syntax", + "version": "0.8.2", + "authors": "The Rust Project Developers|Andrew Gallant ", + "repository": "https://github.com/rust-lang/regex/tree/master/regex-syntax", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A regular expression parser." 
+ }, + { + "name": "rustc-hash", + "version": "1.1.0", + "authors": "The Rust Project Developers", + "repository": "https://github.com/rust-lang-nursery/rustc-hash", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "speed, non-cryptographic hash used in rustc" + }, + { + "name": "rustix", + "version": "0.38.21", + "authors": "Dan Gohman |Jakub Konka ", + "repository": "https://github.com/bytecodealliance/rustix", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Safe Rust bindings to POSIX/Unix/Linux/Winsock2-like syscalls" + }, + { + "name": "rustversion", + "version": "1.0.14", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/rustversion", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Conditional compilation according to rustc compiler version" + }, + { + "name": "ryu", + "version": "1.0.15", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/ryu", + "license": "Apache-2.0 OR BSL-1.0", + "license_file": null, + "description": "Fast floating point to string conversion" + }, + { + "name": "scopeguard", + "version": "1.2.0", + "authors": "bluss", + "repository": "https://github.com/bluss/scopeguard", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A RAII scope guard that will run a given closure when it goes out of scope, even if the code between panics (assuming unwinding panic). Defines the macros `defer!`, `defer_on_unwind!`, `defer_on_success!` as shorthands for guards with one of the implemented strategies." 
+ }, + { + "name": "serde", + "version": "1.0.190", + "authors": "Erick Tryzelaar |David Tolnay ", + "repository": "https://github.com/serde-rs/serde", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A generic serialization/deserialization framework" + }, + { + "name": "serde_derive", + "version": "1.0.190", + "authors": "Erick Tryzelaar |David Tolnay ", + "repository": "https://github.com/serde-rs/serde", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Macros 1.1 implementation of #[derive(Serialize, Deserialize)]" + }, + { + "name": "shlex", + "version": "1.2.0", + "authors": "comex |Fenhl ", + "repository": "https://github.com/comex/rust-shlex", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Split a string into shell words, like Python's shlex." + }, + { + "name": "static_assertions", + "version": "1.1.0", + "authors": "Nikolai Vazquez", + "repository": "https://github.com/nvzqz/static-assertions-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Compile-time assertions to ensure that invariants are met." + }, + { + "name": "strsim", + "version": "0.10.0", + "authors": "Danny Guo ", + "repository": "https://github.com/dguo/strsim-rs", + "license": "MIT", + "license_file": null, + "description": "Implementations of string similarity metrics. Includes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice." 
+ }, + { + "name": "syn", + "version": "2.0.38", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/syn", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for Rust source code" + }, + { + "name": "termcolor", + "version": "1.3.0", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/termcolor", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "A simple cross platform library for writing colored text to a terminal." + }, + { + "name": "threadpool", + "version": "1.8.1", + "authors": "The Rust Project Developers|Corey Farwell |Stefan Schindler ", + "repository": "https://github.com/rust-threadpool/rust-threadpool", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "A thread pool for running a number of jobs on a fixed set of worker threads." + }, + { + "name": "time", + "version": "0.3.30", + "authors": "Jacob Pratt |Time contributors", + "repository": "https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Date and time library. Fully interoperable with the standard library. Mostly compatible with #![no_std]." + }, + { + "name": "time-core", + "version": "0.1.2", + "authors": "Jacob Pratt |Time contributors", + "repository": "https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "This crate is an implementation detail and should not be relied upon directly." + }, + { + "name": "time-macros", + "version": "0.2.15", + "authors": "Jacob Pratt |Time contributors", + "repository": "https://github.com/time-rs/time", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Procedural macros for the time crate. This crate is an implementation detail and should not be relied upon directly." 
+ }, + { + "name": "trgt-denovo", + "version": "0.1.0", + "authors": "Tom Mokveld|Egor Dolzhenko", + "repository": null, + "license": null, + "license_file": null, + "description": "trgt-denovo is a CLI tool for targeted de novo tandem repeat calling from long-read HiFI sequencing data." + }, + { + "name": "unicode-ident", + "version": "1.0.12", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/unicode-ident", + "license": "(MIT OR Apache-2.0) AND Unicode-DFS-2016", + "license_file": null, + "description": "Determine whether characters have the XID_Start or XID_Continue properties according to Unicode Standard Annex #31" + }, + { + "name": "utf8parse", + "version": "0.2.1", + "authors": "Joe Wilm |Christian Duerr ", + "repository": "https://github.com/alacritty/vte", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Table-driven UTF-8 parser" + }, + { + "name": "vergen", + "version": "8.2.5", + "authors": "Jason Ozias ", + "repository": "https://github.com/rustyhorde/vergen", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Generate 'cargo:rustc-env' instructions via 'build.rs' for use in your code via the 'env!' macro" + }, + { + "name": "wasi", + "version": "0.11.0+wasi-snapshot-preview1", + "authors": "The Cranelift Project Developers", + "repository": "https://github.com/bytecodealliance/wasi", + "license": "Apache-2.0 OR Apache-2.0 WITH LLVM-exception OR MIT", + "license_file": null, + "description": "Experimental WASI API bindings for Rust" + }, + { + "name": "wasm-bindgen", + "version": "0.2.88", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Easy support for interacting between JS and Rust." 
+ }, + { + "name": "wasm-bindgen-backend", + "version": "0.2.88", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/backend", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Backend code generation of the wasm-bindgen tool" + }, + { + "name": "wasm-bindgen-macro", + "version": "0.2.88", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/macro", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Definition of the `#[wasm_bindgen]` attribute, an internal dependency" + }, + { + "name": "wasm-bindgen-macro-support", + "version": "0.2.88", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/macro-support", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "The part of the implementation of the `#[wasm_bindgen]` attribute that is not in the shared backend crate" + }, + { + "name": "wasm-bindgen-shared", + "version": "0.2.88", + "authors": "The wasm-bindgen Developers", + "repository": "https://github.com/rustwasm/wasm-bindgen/tree/master/crates/shared", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Shared support between wasm-bindgen and wasm-bindgen cli, an internal dependency." + }, + { + "name": "wfa2-sys", + "version": "0.1.0", + "authors": null, + "repository": null, + "license": null, + "license_file": null, + "description": null + }, + { + "name": "which", + "version": "4.4.2", + "authors": "Harry Fei ", + "repository": "https://github.com/harryfei/which-rs.git", + "license": "MIT", + "license_file": null, + "description": "A Rust equivalent of Unix command \"which\". Locate installed executable in cross platforms." 
+ }, + { + "name": "winapi", + "version": "0.3.9", + "authors": "Peter Atashian ", + "repository": "https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Raw FFI bindings for all of Windows API." + }, + { + "name": "winapi-i686-pc-windows-gnu", + "version": "0.4.0", + "authors": "Peter Atashian ", + "repository": "https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libraries for the i686-pc-windows-gnu target. Please don't use this crate directly, depend on winapi instead." + }, + { + "name": "winapi-util", + "version": "0.1.6", + "authors": "Andrew Gallant ", + "repository": "https://github.com/BurntSushi/winapi-util", + "license": "MIT OR Unlicense", + "license_file": null, + "description": "A dumping ground for high level safe wrappers over winapi." + }, + { + "name": "winapi-x86_64-pc-windows-gnu", + "version": "0.4.0", + "authors": "Peter Atashian ", + "repository": "https://github.com/retep998/winapi-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libraries for the x86_64-pc-windows-gnu target. Please don't use this crate directly, depend on winapi instead." 
+ }, + { + "name": "windows-core", + "version": "0.51.1", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust for Windows" + }, + { + "name": "windows-sys", + "version": "0.48.0", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Rust for Windows" + }, + { + "name": "windows-targets", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import libs for Windows" + }, + { + "name": "windows_aarch64_gnullvm", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_aarch64_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_gnu", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_i686_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_gnu", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": 
"windows_x86_64_gnullvm", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + }, + { + "name": "windows_x86_64_msvc", + "version": "0.48.5", + "authors": "Microsoft", + "repository": "https://github.com/microsoft/windows-rs", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Import lib for Windows" + } +] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..5b0993a --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,15 @@ +# Pacific Biosciences Software License Agreement +1. **Introduction and Acceptance.** This Software License Agreement (this “**Agreement**”) is a legal agreement between you (either an individual or an entity) and Pacific Biosciences of California, Inc. (“**PacBio**”) regarding the use of the PacBio software accompanying this Agreement, which includes documentation provided in “online” or electronic form (together, the “**Software**”). PACBIO PROVIDES THE SOFTWARE SOLELY ON THE TERMS AND CONDITIONS SET FORTH IN THIS AGREEMENT AND ON THE CONDITION THAT YOU ACCEPT AND COMPLY WITH THEM. BY DOWNLOADING, DISTRIBUTING, MODIFYING OR OTHERWISE USING THE SOFTWARE, YOU (A) ACCEPT THIS AGREEMENT AND AGREE THAT YOU ARE LEGALLY BOUND BY ITS TERMS; AND (B) REPRESENT AND WARRANT THAT: (I) YOU ARE OF LEGAL AGE TO ENTER INTO A BINDING AGREEMENT; AND (II) IF YOU REPRESENT A CORPORATION, GOVERNMENTAL ORGANIZATION OR OTHER LEGAL ENTITY, YOU HAVE THE RIGHT, POWER AND AUTHORITY TO ENTER INTO THIS AGREEMENT ON BEHALF OF SUCH ENTITY AND BIND SUCH ENTITY TO THESE TERMS. IF YOU DO NOT AGREE TO THE TERMS OF THIS AGREEMENT, PACBIO WILL NOT AND DOES NOT LICENSE THE SOFTWARE TO YOU AND YOU MUST NOT DOWNLOAD, INSTALL OR OTHERWISE USE THE SOFTWARE OR DOCUMENTATION. +2. 
**Grant of License.** Subject to your compliance with the restrictions set forth in this Agreement, PacBio hereby grants to you a non-exclusive, non-transferable license during the Term to install, copy, use, distribute in binary form only, and host the Software. If you received the Software from PacBio in source code format, you may also modify and/or compile the Software. +3. **License Restrictions.** You may not remove or destroy any copyright notices or other proprietary markings. You may only use the Software to process or analyze data generated on a PacBio instrument or otherwise provided to you by PacBio. Any use, modification, translation, or compilation of the Software not expressly authorized in Section 2 is prohibited. You may not use, modify, host, or distribute the Software so that any part of the Software becomes subject to any license that requires, as a condition of use, modification, hosting, or distribution, that (a) the Software, in whole or in part, be disclosed or distributed in source code form or (b) any third party have the right to modify the Software, in whole or in part. +4. **Ownership.** The license granted to you in Section 2 is not a transfer or sale of PacBio’s ownership rights in or to the Software. Except for the license granted in Section 2, PacBio retains all right, title and interest (including all intellectual property rights) in and to the Software. The Software is protected by applicable intellectual property laws, including United States copyright laws and international treaties. +5. **Third Party Materials.** The Software may include software, content, data or other materials, including related documentation and open source software, that are owned by one or more third parties and that are subject to separate license terms (“**Third-Party Licenses**”). A list of all materials, if any, can be found in the documentation for the Software. 
You acknowledge and agree that such third party materials subject to Third-Party Licenses are not licensed to you pursuant to the provisions of this Agreement and that this Agreement shall not be construed to grant any such right and/or license. You shall have only such rights and/or licenses, if any, to use such third party materials as set forth in the applicable Third-Party Licenses. +6. **Feedback.** If you provide any feedback to PacBio concerning the functionality and performance of the Software, including identifying potential errors and improvements (“**Feedback**”), such Feedback shall be owned by PacBio. You hereby assign to PacBio all right, title, and interest in and to the Feedback, and PacBio is free to use the Feedback without any payment or restriction. +7. **Confidentiality.** You must hold in the strictest confidence the Software and any related materials or information including, but not limited to, any Feedback, technical data, research, product plans, or know-how provided by PacBio to you, directly or indirectly in writing, orally or by inspection of tangible objects (“**Confidential Information**”). You will not disclose any Confidential Information to third parties, including any of your employees who do not have a need to know such information, and you will take reasonable measures to protect the secrecy of, and to avoid disclosure and unauthorized use of, the Confidential Information. You will immediately notify the PacBio in the event of any unauthorized or suspected use or disclosure of the Confidential Information. To protect the Confidential Information contained in the Software, you may not reverse engineer, decompile, or disassemble the Software, except to the extent the foregoing restriction is expressly prohibited by applicable law. +8. 
**Termination.** This Agreement will terminate upon the earlier of: (a) your failure to comply with any term of this Agreement; or (b) return, destruction, or deletion of all copies of the Software in your possession. PacBio’s rights and your obligations will survive the termination of this Agreement. The “**Term**” means the period beginning on when this Agreement becomes effective until the termination of this Agreement. Upon termination of this Agreement for any reason, you will delete from all of your computer libraries or storage devices or otherwise destroy all copies of the Software and derivatives thereof. +9. **NO OTHER WARRANTIES.** THE SOFTWARE IS PROVIDED ON AN “AS IS” BASIS. YOU ASSUME ALL RESPONSIBILITIES FOR SELECTION OF THE SOFTWARE TO ACHIEVE YOUR INTENDED RESULTS, AND FOR THE INSTALLATION OF, USE OF, AND RESULTS OBTAINED FROM THE SOFTWARE. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, PACBIO DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY, QUALITY, ACCURACY, TITLE, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE WITH RESPECT TO THE SOFTWARE AND THE ACCOMPANYING WRITTEN MATERIALS. THERE IS NO WARRANTY AGAINST INTERFERENCE WITH THE ENJOYMENT OF THE SOFTWARE OR AGAINST INFRINGEMENT. THERE IS NO WARRANTY THAT THE SOFTWARE OR PACBIO’S EFFORTS WILL FULFILL ANY OF YOUR PARTICULAR PURPOSES OR NEEDS. +10. **LIMITATION OF LIABILITY.** UNDER NO CIRCUMSTANCES WILL PACBIO BE LIABLE FOR ANY CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, LOSS OF DATA OR OTHER SUCH PECUNIARY LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE, EVEN IF PACBIO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL PACBIO’S AGGREGATE LIABILITY FOR DAMAGES ARISING OUT OF THIS AGREEMENT EXCEED $5. 
THE FOREGOING EXCLUSIONS AND LIMITATIONS OF LIABILITY AND DAMAGES WILL NOT APPLY TO CONSEQUENTIAL DAMAGES FOR PERSONAL INJURY. +11. **Indemnification.** You will indemnify, hold harmless, and defend PacBio (including all of its officers, employees, directors, subsidiaries, representatives, affiliates, and agents) and PacBio’s suppliers from and against any damages (including attorney’s fees and expenses), claims, and lawsuits that arise or result from your use of the Software. +12. **Trademarks.** Certain of the product and PacBio names used in this Agreement, the Software may constitute trademarks of PacBio or third parties. You are not authorized to use any such trademarks. +13. **Export Restrictions.** YOU UNDERSTAND AND AGREE THAT THE SOFTWARE IS SUBJECT TO UNITED STATES AND OTHER APPLICABLE EXPORT-RELATED LAWS AND REGULATIONS AND THAT YOU MAY NOT EXPORT, RE-EXPORT OR TRANSFER THE SOFTWARE OR ANY DIRECT PRODUCT OF THE SOFTWARE EXCEPT AS PERMITTED UNDER THOSE LAWS. WITHOUT LIMITING THE FOREGOING, EXPORT, RE-EXPORT, OR TRANSFER OF THE SOFTWARE TO CUBA, IRAN, NORTH KOREA, SYRIA, RUSSIA, BELARUS, AND THE REGIONS OF CRIMEA, LNR, AND DNR OF UKRAINE IS PROHIBITED. +14. **General.** This Agreement is governed by the laws of the State of California, without reference to its conflict of laws principles. This Agreement is the entire agreement between you and PacBio and supersedes any other communications with respect to the Software. If any provision of this Agreement is held invalid or unenforceable, the remainder of this Agreement will continue in full force and effect. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9b4a7a2 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# TRGT-denovo: *de novo* tandem repeat caller + +TRGT-denovo is a companion tool of [TRGT (Tandem Repeat Genotyper)](https://github.com/PacificBiosciences/trgt) that does targeted *de novo* calling of tandem repeat mutations from PacBio HiFi data in trios. It uses the output generated by TRGT to do so. + +Authors: [Tom Mokveld](https://github.com/tmokveld), [Egor Dolzhenko](https://github.com/egor-dolzhenko) + +## Early version warning + +Please note that TRGT-denovo is still in early development and is subject to significant changes that can affect anything from input / output file formats to program behavior. + +## Availability + +* [Latest release with binary](https://github.com/PacificBiosciences/trgt-denovo/releases/latest) + +To build TRGT-denovo you need a working C compiler. It was tested on Linux with Clang 13.0.0 & GCC 11.3.0 and on Mac OSX (M1) with Clang 15.0.7 & GCC 14.0.0. + +## Documentation + +* [Example run](docs/example.md) +* [Output](docs/output.md) +* [Command-line interface](docs/cli.md) + +## Need help? +If you notice any missing features, bugs, or need assistance with analyzing the output of TRGT-denovo, +please don't hesitate to open a GitHub issue. + +## Support information +TRGT-denovo is a pre-release software intended for research use only and not for use in diagnostic procedures. +While efforts have been made to ensure that TRGT-denovo lives up to the quality that PacBio strives for, we make no warranty regarding this software. + +As TRGT-denovo is not covered by any service level agreement or the like, please do not contact a PacBio Field Applications Scientist or PacBio Customer Service for assistance with any TRGT-denovo release. +Please report all issues through GitHub instead. +We make no warranty that any such issue will be addressed, to any extent or within any time frame. 
+ +### DISCLAIMER +THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES. \ No newline at end of file diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..2e62fcd --- /dev/null +++ b/build.rs @@ -0,0 +1,21 @@ +use std::error::Error; +use vergen::EmitBuilder; + +fn main() -> Result<(), Box> { + match EmitBuilder::builder() + .fail_on_error() + .custom_build_rs(".") // override such that it will re-run whenever we have a file change in this folder + .all_git() + .git_describe(true, false, Some("ThisPatternShouldNotMatchAnythingEver")) + .emit() + { + Ok(_) => { + println!("cargo:rerun-if-changed=Cargo.toml"); + println!("cargo:rerun-if-changed=src"); + } + Err(_e) => { + println!("cargo:rustc-env=VERGEN_GIT_DESCRIBE=unknown"); + } + } + Ok(()) +} diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..62c0582 --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,19 @@ +## TRGT-denovo command-line options + +### Command: trio +Basic: +- `-r, --reference ` Path to the FASTA file containing the reference genome. The same reference genome as was used for read alignment. 
+- `-b, --bed ` Path to the BED file with reference coordinates of tandem repeats +- `-m, --mother ` Common (path) prefix of spanning reads BAM file and variant call VCF file of mother +- `-f, --father ` Common (path) prefix of spanning reads BAM file and variant call VCF file of father +- `-c, --child ` Common (path) prefix of spanning reads BAM file and variant call VCF file of child +- `-o, --out ` Output csv path +- `--trid ` Optionally a tandem repeat ID may be supplied, if so, only this site will be tested, default = None +- `-@ ` Number of threads, the number of sites that are processed in parallel, default = 1 +- `-h, --help` Print help +- `-V, --version` Print version + +Advanced: +- `--flank-len ` Number of flanking nucleotides added to a target region during realignment, default = 50 +- `--no-clip-aln` Score alignments without stripping the flanks +- `--parental-quantile ` Quantile of alignment scores to determine the parental threshold, default is strict and takes only the top scoring alignment, default = 1.0 \ No newline at end of file diff --git a/docs/example.md b/docs/example.md new file mode 100644 index 0000000..dc5a77a --- /dev/null +++ b/docs/example.md @@ -0,0 +1,104 @@ +# Introduction + +A brief overview of the steps necessary to perform *de novo* tandem repeat mutation calling from TRGT output using TRGT-denovo + +## Prerequisites + +- [TRGT binary](https://github.com/PacificBiosciences/trgt/releases/latest) +- [TRGT-denovo binary](https://github.com/PacificBiosciences/trgt-denovo/releases/latest) +- Aligned HiFi data of a family trio (father, mother, child) +- The reference genome used for the alignments +- A BED file with repeat expansion definitions (always same or a subset of those with TRGT) + +## Calling *de novo* tandem repeats + +Given the following data: + +- reference genome `reference.fasta` +- aligned sequencing data of the family (father, mother, and son respectively) `sample_F.bam`, `sample_M.bam`, `sample_S.bam`, +- repeat 
definition file `repeat.bed`. + +### Data pre-processing + +Prior to *de novo* calling all data must first be genotyped by TRGT: + +``` +./trgt --genome reference.fasta \ + --repeats repeat.bed \ + --reads sample_F.bam \ + --output-prefix sample_F \ + --karyotype XY +``` + +``` +./trgt --genome reference.fasta \ + --repeats repeat.bed \ + --reads sample_M.bam \ + --output-prefix sample_M \ + --karyotype XX +``` + +``` +./trgt --genome reference.fasta \ + --repeats repeat.bed \ + --reads sample_S.bam \ + --output-prefix sample_S \ + --karyotype XY +``` + +TRGT outputs the genotyped repeat sites in a VCF file stored in `prefix.vcf.gz` and the spanning reads that were used to genotype each site (that fully span the repeat sequences) stored in `prefix.spanning.bam`. TRGT-denovo requires sorted BAM and VCF data, hence you will need to sort and index the output VCF and BAM files. For each family member this involves: + +#### VCF sorting +``` +bcftools sort -Ob -o sample_F.sorted.vcf.gz sample_F.vcf.gz +bcftools index sample_F.sorted.vcf.gz +``` + +#### BAM sorting +``` +samtools sort -o sample_F.spanning.sorted.bam sample_F.spanning.bam +samtools index sample_F.spanning.sorted.bam +``` + +Such that you end up with `sample_F.sorted.vcf.gz`, `sample_F.spanning.sorted.bam`, `sample_M.sorted.vcf.gz`, `sample_M.spanning.sorted.bam`, `sample_S.sorted.vcf.gz`, `sample_S.spanning.sorted.bam` (and their associated `.bam.bai` and `.vcf.gz.csi` indices). + +### Running TRGT-denovo + +With all preprocessing completed it we can call *de novo* repeat expansion mutations using TRGT-denovo from the sample data. 
Note that family members are supplied by their common prefix of `spanning.sorted.bam` and `sorted.vcf.gz`, i.e., `sample_F`, `sample_M`, and `sample_S` and path if not running TRGT-denovo in the same directory as the data: + +``` +./TRGT-denovo trio --reference reference.fasta \ + --bed repeat.bed \ + --father sample_F \ + --mother sample_M \ + --child sample_S \ + --out out.csv +``` + +## Example + +Below output of HG002 is shown for two candidate *de novo* tandem repeat mutation sites: +``` +trid genotype denovo_coverage allele_coverage allele_ratio child_coverage child_ratio mean_diff_father mean_diff_mother father_dropout_prob mother_dropout_prob allele_origin denovo_status per_allele_reads_father per_allele_reads_mother per_allele_reads_child index father_motif_counts mother_motif_counts child_motif_counts maxlh +chr1_47268728_47268830_ATAA 1 19 37 0.5135 37 0.5135 6.7368 6.7368 0.0000 0.0000 M:1 Y:= 43 26 37 0 25 25 25 0.7152 +chr1_7862944_7863157_TATTG 1 0 21 0.0000 37 0.0000 0.0000 19.2000 0.0000 0.0000 F:2 X 18,17 16,19 21,16 0 27,29 27,63 29,60 0.9820 +chr1_7862944_7863157_TATTG 2 16 16 1.0000 37 0.4324 171.8750 22.8750 0.0000 0.0000 M:2 Y:- 18,17 16,19 21,16 1 27,29 27,63 29,60 1.0000 +``` + +### Site 1 + +The first site is homozygous in the child, hence only one call is made. It has a *de novo* coverage (DNC) of 19, i.e., there are 19 reads that support a candidate *de novo* allele relative to the parental read alignments. The DNC should always be put into context of the total coverage, to ascertain that: + +1. There is sufficient coverage +2. The ratio between the two is close to 0.5 + +The DNC at this site is high and the ratio makes it likely that this is a confident call. However, the score difference with respect to either parents is low, such that the expected event size is small. Additionally, the score difference is equivalent in both parents such that parental origin may not be assessed. 
Generally the parent with the smallest score difference is the one from which the allele is inherited: + +![Example site 1](figures/example_site_1.png) + +### Site 2 + +The second site is heterozygous in the child, hence both child alleles are tested. The second allele is a potential *de novo* call. The score difference with respect to the maternal alleles is the smallest. + +![Example site 2](figures/example_site_2.png) + diff --git a/docs/figures/example_site_1.png b/docs/figures/example_site_1.png new file mode 100644 index 0000000..8eb4532 Binary files /dev/null and b/docs/figures/example_site_1.png differ diff --git a/docs/figures/example_site_2.png b/docs/figures/example_site_2.png new file mode 100644 index 0000000..325bd79 Binary files /dev/null and b/docs/figures/example_site_2.png differ diff --git a/docs/output.md b/docs/output.md new file mode 100644 index 0000000..5f19cbf --- /dev/null +++ b/docs/output.md @@ -0,0 +1,27 @@ +# Interpreting TRGT-denovo output + +TRGT-denovo scores and then outputs target sites in a tab-separated format, with 21 columns: + +- `trid` ID of the tandem repeat, encoded as in the BED file +- `genotype` Genotype ID of the child for a specific allele, corresponds to the TRGT genotype ID +- `denovo_coverage` Number of child reads supporting a de novo allele compared to parental data. +- `allele_coverage` Number of child reads mapped to this specific allele +- `allele_ratio` Ratio of de novo coverage to allele coverage. +- `child_coverage` Total number of child reads at this site +- `child_ratio` Ratio of de novo coverage to total coverage at this site +- `mean_diff_father` Score difference between de novo and paternal reads; lower values indicate greater similarity +- `mean_diff_mother` Score difference between de novo and maternal reads; lower values indicate greater similarity +- `father_dropout_prob` Dropout rate for reads coming from the father +- `mother_dropout_prob` Dropout rate for reads coming from the mother. 
+- `allele_origin` Inferred origin of the allele based on alignment; possible values: `{F:{1,2,?}, M:{1,2,?}, ?}` +- `denovo_status` Indicates if the allele is de novo, only if `allele_origin` is defined; possible values: `{?, Y:{+, -, =}}` +- `per_allele_reads_father` Number of reads partitioned per allele in the father (allele1, allele2) +- `per_allele_reads_mother` Number of reads partitioned per allele in the mother (allele1, allele2) +- `per_allele_reads_child` Number of reads partitioned per allele in the child (allele1, allele2) +- `index` Index of this allele in the TRGT VCF, used for linking to `child_motif_counts` +- `father_motif_counts` TRGT VCF motif counts for this locus in the father +- `mother_motif_counts` TRGT VCF motif counts for this locus in the mother +- `child_motif_counts` TRGT VCF motif counts for this locus in the child +- `maxlh` Maximum likelihood score of child allele data relative to parental alleles (unstable) + +TRGT-denovo performs calling based on the genotyping and spanning reads generated by TRGT and will test and output at most two alleles in the event of a heterozygous site. 
\ No newline at end of file diff --git a/src/aligner.rs b/src/aligner.rs new file mode 100644 index 0000000..878d7e9 --- /dev/null +++ b/src/aligner.rs @@ -0,0 +1,1038 @@ +use crate::wfa2; +use serde::{Deserialize, Serialize}; +use std::ptr; + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum MemoryModel { + MemoryHigh, + MemoryMed, + MemoryLow, + MemoryUltraLow, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum AlignmentScope { + Score, + Alignment, +} + +impl From for AlignmentScope { + fn from(value: u32) -> Self { + match value { + wfa2::alignment_scope_t_compute_score => AlignmentScope::Score, + wfa2::alignment_scope_t_compute_alignment => AlignmentScope::Alignment, + _ => panic!("Unknown alignment scope: {}", value), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq)] +pub enum Heuristic { + None, + WFadaptive(i32, i32, i32), + WFmash(i32, i32, i32), + XDrop(i32, i32), + ZDrop(i32, i32), + BandedStatic(i32, i32), + BandedAdaptive(i32, i32, i32), +} + +pub enum DistanceMetric { + Indel, + Edit, + GapLinear, + GapAffine, + GapAffine2p, +} + +impl From for DistanceMetric { + fn from(value: u32) -> Self { + match value { + wfa2::distance_metric_t_indel => DistanceMetric::Indel, + wfa2::distance_metric_t_edit => DistanceMetric::Edit, + wfa2::distance_metric_t_gap_linear => DistanceMetric::GapLinear, + wfa2::distance_metric_t_gap_affine => DistanceMetric::GapAffine, + wfa2::distance_metric_t_gap_affine_2p => DistanceMetric::GapAffine2p, + _ => panic!("Unknown distance metric: {}", value), + } + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum AlignmentStatus { + // OK Status (>=0) + StatusAlgCompleted = wfa2::WF_STATUS_ALG_COMPLETED as isize, + StatusAlgPartial = wfa2::WF_STATUS_ALG_PARTIAL as isize, + // FAILED Status (<0) + StatusMaxStepsReached = wfa2::WF_STATUS_MAX_STEPS_REACHED as isize, + StatusOOM = wfa2::WF_STATUS_OOM as isize, + StatusUnattainable = 
wfa2::WF_STATUS_UNATTAINABLE as isize, +} + +impl From for AlignmentStatus { + fn from(value: i32) -> Self { + match value { + x if x == wfa2::WF_STATUS_ALG_COMPLETED as i32 => AlignmentStatus::StatusAlgCompleted, + x if x == wfa2::WF_STATUS_ALG_PARTIAL as i32 => AlignmentStatus::StatusAlgPartial, + wfa2::WF_STATUS_MAX_STEPS_REACHED => AlignmentStatus::StatusMaxStepsReached, + wfa2::WF_STATUS_OOM => AlignmentStatus::StatusOOM, + wfa2::WF_STATUS_UNATTAINABLE => AlignmentStatus::StatusUnattainable, + _ => panic!("Unknown alignment status: {}", value), + } + } +} + +#[derive(Debug, Copy, Clone)] +struct WFAttributes { + inner: wfa2::wavefront_aligner_attr_t, +} + +impl WFAttributes { + fn default() -> Self { + Self { + inner: unsafe { wfa2::wavefront_aligner_attr_default }, + } + } + + fn memory_model(mut self, memory_model: MemoryModel) -> Self { + let memory_mode = match memory_model { + MemoryModel::MemoryHigh => wfa2::wavefront_memory_t_wavefront_memory_high, + MemoryModel::MemoryMed => wfa2::wavefront_memory_t_wavefront_memory_med, + MemoryModel::MemoryLow => wfa2::wavefront_memory_t_wavefront_memory_low, + MemoryModel::MemoryUltraLow => wfa2::wavefront_memory_t_wavefront_memory_ultralow, + }; + self.inner.memory_mode = memory_mode; + self + } + + fn alignment_scope(mut self, alignment_scope: AlignmentScope) -> Self { + let alignment_scope = match alignment_scope { + AlignmentScope::Score => wfa2::alignment_scope_t_compute_score, + AlignmentScope::Alignment => wfa2::alignment_scope_t_compute_alignment, + }; + self.inner.alignment_scope = alignment_scope; + self + } + + fn indel_penalties(mut self) -> Self { + self.inner.distance_metric = wfa2::distance_metric_t_indel; + self + } + + fn edit_penalties(mut self) -> Self { + self.inner.distance_metric = wfa2::distance_metric_t_edit; + self + } + + fn linear_penalties(mut self, match_: i32, mismatch: i32, indel: i32) -> Self { + self.inner.distance_metric = wfa2::distance_metric_t_gap_linear; + 
self.inner.linear_penalties.match_ = match_; // (Penalty representation usually M <= 0) + self.inner.linear_penalties.mismatch = mismatch; // (Penalty representation usually X > 0) + self.inner.linear_penalties.indel = indel; // (Penalty representation usually I > 0) + self + } + + fn affine_penalties( + mut self, + match_: i32, + mismatch: i32, + gap_opening: i32, + gap_extension: i32, + ) -> Self { + self.inner.distance_metric = wfa2::distance_metric_t_gap_affine; + self.inner.affine_penalties.match_ = match_; // (Penalty representation usually M <= 0) + self.inner.affine_penalties.mismatch = mismatch; // (Penalty representation usually X > 0) + self.inner.affine_penalties.gap_opening = gap_opening; // (Penalty representation usually O > 0) + self.inner.affine_penalties.gap_extension = gap_extension; // (Penalty representation usually E > 0) + self + } + + fn affine2p_penalties( + mut self, + match_: i32, + mismatch: i32, + gap_opening1: i32, + gap_extension1: i32, + gap_opening2: i32, + gap_extension2: i32, + ) -> Self { + self.inner.distance_metric = wfa2::distance_metric_t_gap_affine_2p; + self.inner.affine2p_penalties.match_ = match_; // (Penalty representation usually M <= 0) + self.inner.affine2p_penalties.mismatch = mismatch; // (Penalty representation usually X > 0) + // Usually concave Q1 + E1 < Q2 + E2 and E1 > E2. 
+ self.inner.affine2p_penalties.gap_opening1 = gap_opening1; // (Penalty representation usually O1 > 0) + self.inner.affine2p_penalties.gap_extension1 = gap_extension1; // (Penalty representation usually E1 > 0) + self.inner.affine2p_penalties.gap_opening2 = gap_opening2; // (Penalty representation usually O2 > 0) + self.inner.affine2p_penalties.gap_extension2 = gap_extension2; // (Penalty representation usually E2 > 0) + self + } +} + +pub struct WFAligner { + attributes: WFAttributes, + inner: *mut wfa2::wavefront_aligner_t, +} + +impl WFAligner { + pub fn new(alignment_scope: AlignmentScope, memory_model: MemoryModel) -> Self { + let attributes = WFAttributes::default() + .memory_model(memory_model) + .alignment_scope(alignment_scope); + Self { + attributes, + inner: ptr::null_mut(), + } + } +} + +impl Drop for WFAligner { + fn drop(&mut self) { + unsafe { + if !self.inner.is_null() { + wfa2::wavefront_aligner_delete(self.inner); + } + } + } +} + +impl WFAligner { + pub fn align_end_to_end(&mut self, pattern: &[u8], text: &[u8]) -> AlignmentStatus { + let status = unsafe { + wfa2::wavefront_aligner_set_alignment_end_to_end(self.inner); + wfa2::wavefront_align( + self.inner, + pattern.as_ptr() as *const i8, + pattern.len() as i32, + text.as_ptr() as *const i8, + text.len() as i32, + ) + }; + AlignmentStatus::from(status) + } + + pub fn align_ends_free( + &mut self, + pattern: &[u8], + pattern_begin_free: i32, + pattern_end_free: i32, + text: &[u8], + text_begin_free: i32, + text_end_free: i32, + ) -> AlignmentStatus { + let status = unsafe { + wfa2::wavefront_aligner_set_alignment_free_ends( + self.inner, + pattern_begin_free, + pattern_end_free, + text_begin_free, + text_end_free, + ); + wfa2::wavefront_align( + self.inner, + pattern.as_ptr() as *const i8, + pattern.len() as i32, + text.as_ptr() as *const i8, + text.len() as i32, + ) + }; + AlignmentStatus::from(status) + } + + pub fn score(&self) -> i32 { + unsafe { *(*self.inner).cigar }.score + } + + pub fn 
cigar_score_gap_affine2p_direct(&mut self) -> i32 { + unsafe { + wfa2::cigar_score_gap_affine2p( + (*self.inner).cigar, + &mut self.attributes.inner.affine2p_penalties, + ) + } + } + + fn cigar_score_gap_affine2p_get_operations_score( + operation: char, + op_length: i32, + penalties: &wfa2::affine2p_penalties_t, + ) -> i32 { + match operation { + 'M' => op_length * penalties.match_, + 'X' => op_length * penalties.mismatch, + 'D' | 'I' => { + let score1 = penalties.gap_opening1 + penalties.gap_extension1 * op_length; + let score2 = penalties.gap_opening2 + penalties.gap_extension2 * op_length; + std::cmp::min(score1, score2) + } + _ => panic!("Invalid operation: {}", operation), + } + } + + fn cigar_score_gap_affine2_clipped(&self, flank_len: usize) -> i32 { + let cigar = unsafe { (*self.inner).cigar.as_ref() }.unwrap(); + let begin_offset = cigar.begin_offset as isize + flank_len as isize; + let end_offset = cigar.end_offset as isize - flank_len as isize; + + let operations: Vec = + unsafe { std::slice::from_raw_parts(cigar.operations, cigar.max_operations as usize) } + .iter() + .map(|&op| op as u8 as char) + .collect(); + + let mut score = 0; + let mut op_length = 0; + let mut last_op: Option = None; + for i in begin_offset..end_offset { + let cur_op = operations[i as usize]; + if last_op.is_some() && cur_op != last_op.unwrap() { + score -= Self::cigar_score_gap_affine2p_get_operations_score( + last_op.unwrap(), + op_length, + &self.attributes.inner.affine2p_penalties, + ); + op_length = 0; + } + last_op = Some(cur_op); + op_length += 1; + } + if let Some(op) = last_op { + score -= Self::cigar_score_gap_affine2p_get_operations_score( + op, + op_length, + &self.attributes.inner.affine2p_penalties, + ); + } + score + } + + pub fn cigar_score_clipped(&self, flank_len: usize) -> i32 { + if AlignmentScope::from(self.attributes.inner.alignment_scope) == AlignmentScope::Score { + panic!("Cannot clip when AlignmentScope is Score"); + } + + match 
DistanceMetric::from(self.attributes.inner.distance_metric) { + DistanceMetric::Indel | DistanceMetric::Edit => { + unimplemented!("Indel/Linear distance metric not implemented") + } + DistanceMetric::GapLinear => { + unimplemented!("GapLinear distance metric not implemented") + } + DistanceMetric::GapAffine => { + unimplemented!("GapAffine distance metric not implemented") + } + DistanceMetric::GapAffine2p => self.cigar_score_gap_affine2_clipped(flank_len), + } + } + + pub fn cigar_score(&mut self) -> i32 { + if AlignmentScope::from(self.attributes.inner.alignment_scope) == AlignmentScope::Score { + panic!("Cannot clip when AlignmentScope is Score"); + } + + unsafe { + match DistanceMetric::from(self.attributes.inner.distance_metric) { + DistanceMetric::Indel | DistanceMetric::Edit => { + wfa2::cigar_score_edit((*self.inner).cigar) + } + DistanceMetric::GapLinear => wfa2::cigar_score_gap_linear( + (*self.inner).cigar, + &mut self.attributes.inner.linear_penalties, + ), + DistanceMetric::GapAffine => wfa2::cigar_score_gap_affine( + (*self.inner).cigar, + &mut self.attributes.inner.affine_penalties, + ), + DistanceMetric::GapAffine2p => wfa2::cigar_score_gap_affine2p( + (*self.inner).cigar, + &mut self.attributes.inner.affine2p_penalties, + ), + } + } + } + + pub fn cigar(&self, flank_len: Option) -> String { + let offset = flank_len.unwrap_or(0); + let mut cstr = String::new(); + + let cigar = unsafe { (*self.inner).cigar.as_ref() }.unwrap(); + let begin_offset = cigar.begin_offset as usize + offset; + let end_offset = cigar.end_offset as usize - offset; + + if begin_offset >= end_offset { + return cstr; + } + + let operations = + unsafe { std::slice::from_raw_parts(cigar.operations, cigar.max_operations as usize) }; + let mut last_op = operations[begin_offset]; + let mut last_op_length = 1; + + for i in 1..(end_offset - begin_offset) { + let cur_op = operations[begin_offset + i]; + if cur_op == last_op { + last_op_length += 1; + } else { + 
cstr.push_str(&format!("{}", last_op_length)); + cstr.push(last_op as u8 as char); + last_op = cur_op; + last_op_length = 1; + } + } + cstr.push_str(&format!("{}", last_op_length)); + cstr.push(last_op as u8 as char); + cstr + } + + pub fn matching( + &self, + pattern: &[u8], + text: &[u8], + flank_len: Option, + ) -> (String, String, String) { + let offset = flank_len.unwrap_or(0); + + let mut pattern_iter = pattern.iter().peekable(); + let mut text_iter = text.iter().peekable(); + + if offset > 0 { + text_iter.nth(offset - 1); + pattern_iter.nth(offset - 1); + } + + let mut pattern_alg = String::new(); + let mut ops_alg = String::new(); + let mut text_alg = String::new(); + + let cigar = unsafe { (*self.inner).cigar.as_ref() }.unwrap(); + let operations = + unsafe { std::slice::from_raw_parts(cigar.operations, cigar.max_operations as usize) }; + + let begin_offset = cigar.begin_offset as isize + offset as isize; + let end_offset = cigar.end_offset as isize - offset as isize; + + for i in begin_offset..end_offset { + match operations[i as usize] as u8 as char { + 'M' => { + if pattern_iter.peek() != text_iter.peek() { + ops_alg.push('X'); + } else { + ops_alg.push('|'); + } + pattern_alg.push(*pattern_iter.next().unwrap() as char); + text_alg.push(*text_iter.next().unwrap() as char); + } + 'X' => { + if pattern_iter.peek() != text_iter.peek() { + ops_alg.push(' '); + } else { + ops_alg.push('X'); + } + pattern_alg.push(*pattern_iter.next().unwrap() as char); + text_alg.push(*text_iter.next().unwrap() as char); + } + 'I' => { + pattern_alg.push('-'); + ops_alg.push(' '); + text_alg.push(*text_iter.next().unwrap() as char); + } + 'D' => { + pattern_alg.push(*pattern_iter.next().unwrap() as char); + ops_alg.push(' '); + text_alg.push('-'); + } + _ => panic!("Unknown cigar operation"), + } + } + (pattern_alg, ops_alg, text_alg) + } + + // TODO: Update this and other functions using cigar.operations to use cigar_buffer instead. 
+ pub fn find_alignment_span(&self) -> ((usize, usize), (usize, usize)) { + let mut pattern_start: usize = 0; + let mut pattern_end: usize = 0; + let mut text_start: usize = 0; + let mut text_end: usize = 0; + + let mut pattern_index: usize = 0; + let mut text_index: usize = 0; + let mut is_span_started: bool = false; + + let cigar = unsafe { (*self.inner).cigar.as_ref() }.unwrap(); + let operations = + unsafe { std::slice::from_raw_parts(cigar.operations, cigar.max_operations as usize) }; + + for i in cigar.begin_offset..cigar.end_offset { + match operations[i as usize] as u8 as char { + 'I' => text_index += 1, + 'D' => pattern_index += 1, + 'M' => { + if !is_span_started { + pattern_start = pattern_index; + text_start = text_index; + is_span_started = true; + } + pattern_index += 1; + text_index += 1; + pattern_end = pattern_index; + text_end = text_index; + } + 'X' => { + pattern_index += 1; + text_index += 1; + } + _ => panic!("Unexpected operation"), + } + } + ((pattern_start, pattern_end), (text_start, text_end)) + } + + pub fn set_heuristic(&mut self, heuristic: Heuristic) { + unsafe { + match heuristic { + Heuristic::None => wfa2::wavefront_aligner_set_heuristic_none(self.inner), + Heuristic::WFadaptive( + min_wavefront_length, + max_distance_threshold, + steps_between_cutoffs, + ) => wfa2::wavefront_aligner_set_heuristic_wfadaptive( + self.inner, + min_wavefront_length, + max_distance_threshold, + steps_between_cutoffs, + ), + Heuristic::WFmash( + min_wavefront_length, + max_distance_threshold, + steps_between_cutoffs, + ) => wfa2::wavefront_aligner_set_heuristic_wfmash( + self.inner, + min_wavefront_length, + max_distance_threshold, + steps_between_cutoffs, + ), + Heuristic::XDrop(xdrop, steps_between_cutoffs) => { + wfa2::wavefront_aligner_set_heuristic_xdrop( + self.inner, + xdrop, + steps_between_cutoffs, + ) + } + Heuristic::ZDrop(zdrop, steps_between_cutoffs) => { + wfa2::wavefront_aligner_set_heuristic_zdrop( + self.inner, + zdrop, + 
steps_between_cutoffs, + ) + } + Heuristic::BandedStatic(band_min_k, band_max_k) => { + wfa2::wavefront_aligner_set_heuristic_banded_static( + self.inner, band_min_k, band_max_k, + ) + } + Heuristic::BandedAdaptive(band_min_k, band_max_k, steps_between_cutoffs) => { + wfa2::wavefront_aligner_set_heuristic_banded_adaptive( + self.inner, + band_min_k, + band_max_k, + steps_between_cutoffs, + ) + } + } + } + } +} + +/// Indel Aligner (a.k.a Longest Common Subsequence - LCS) +pub struct WFAlignerIndel; + +impl WFAlignerIndel { + pub fn new(alignment_scope: AlignmentScope, memory_model: MemoryModel) -> WFAligner { + let mut aligner = WFAligner::new(alignment_scope, memory_model); + aligner.attributes = aligner.attributes.indel_penalties(); + unsafe { + aligner.inner = wfa2::wavefront_aligner_new(&mut aligner.attributes.inner); + } + aligner + } +} + +/// Edit Aligner (a.k.a Levenshtein) +pub struct WFAlignerEdit; + +impl WFAlignerEdit { + pub fn new(alignment_scope: AlignmentScope, memory_model: MemoryModel) -> WFAligner { + let mut aligner = WFAligner::new(alignment_scope, memory_model); + aligner.attributes = aligner.attributes.edit_penalties(); + unsafe { + aligner.inner = wfa2::wavefront_aligner_new(&mut aligner.attributes.inner); + } + aligner + } +} + +/// Gap-Linear Aligner (a.k.a Needleman-Wunsch) +pub struct WFAlignerGapLinear; + +impl WFAlignerGapLinear { + pub fn new_with_match( + match_: i32, + mismatch: i32, + indel: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { + let mut aligner = WFAligner::new(alignment_scope, memory_model); + aligner.attributes = WFAttributes::default() + .linear_penalties(match_, mismatch, indel) + .alignment_scope(alignment_scope) + .memory_model(memory_model); + unsafe { + aligner.inner = wfa2::wavefront_aligner_new(&mut aligner.attributes.inner); + } + aligner + } + + pub fn new( + mismatch: i32, + indel: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { 
+ Self::new_with_match(0, mismatch, indel, alignment_scope, memory_model) + } +} + +/// Gap-Affine Aligner (a.k.a Smith-Waterman-Gotoh) +pub struct WFAlignerGapAffine; + +impl WFAlignerGapAffine { + pub fn new_with_match( + match_: i32, + mismatch: i32, + gap_opening: i32, + gap_extension: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { + let mut aligner = WFAligner::new(alignment_scope, memory_model); + aligner.attributes = WFAttributes::default() + .affine_penalties(match_, mismatch, gap_opening, gap_extension) + .alignment_scope(alignment_scope) + .memory_model(memory_model); + unsafe { + aligner.inner = wfa2::wavefront_aligner_new(&mut aligner.attributes.inner); + } + aligner + } + + pub fn new( + mismatch: i32, + gap_opening: i32, + gap_extension: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { + Self::new_with_match( + 0, + mismatch, + gap_opening, + gap_extension, + alignment_scope, + memory_model, + ) + } +} + +/// Gap-Affine Dual-Cost Aligner (a.k.a. 
concave 2-pieces) +pub struct WFAlignerGapAffine2Pieces; + +impl WFAlignerGapAffine2Pieces { + pub fn new_with_match( + match_: i32, + mismatch: i32, + gap_opening1: i32, + gap_extension1: i32, + gap_opening2: i32, + gap_extension2: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { + let mut aligner = WFAligner::new(alignment_scope, memory_model); + aligner.attributes = WFAttributes::default() + .affine2p_penalties( + match_, + mismatch, + gap_opening1, + gap_extension1, + gap_opening2, + gap_extension2, + ) + .alignment_scope(alignment_scope) + .memory_model(memory_model); + unsafe { + aligner.inner = wfa2::wavefront_aligner_new(&mut aligner.attributes.inner); + } + aligner + } + + pub fn new( + mismatch: i32, + gap_opening1: i32, + gap_extension1: i32, + gap_opening2: i32, + gap_extension2: i32, + alignment_scope: AlignmentScope, + memory_model: MemoryModel, + ) -> WFAligner { + Self::new_with_match( + 0, + mismatch, + gap_opening1, + gap_extension1, + gap_opening2, + gap_extension2, + alignment_scope, + memory_model, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const PATTERN: &[u8] = b"AGCTAGTGTCAATGGCTACTTTTCAGGTCCT"; + const TEXT: &[u8] = b"AACTAAGTGTCGGTGGCTACTATATATCAGGTCCT"; + + #[test] + fn test_aligner_indel() { + let mut aligner = WFAlignerIndel::new(AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), 10); + assert_eq!(aligner.cigar(None), "1M1I1D3M1I5M2I2D8M1I1M1I1M1I9M"); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "A-GCTA-GTGTC--AATGGCTACT-T-T-TCAGGTCCT\n| ||| ||||| |||||||| | | |||||||||\nAA-CTAAGTGTCGG--TGGCTACTATATATCAGGTCCT" + ); + } + + #[test] + fn test_aligner_edit() { + let mut aligner = WFAlignerEdit::new(AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = 
aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), 7); + assert_eq!(aligner.cigar(None), "1M1X3M1I5M2X8M1I1M1I1M1I9M"); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "AGCTA-GTGTCAATGGCTACT-T-T-TCAGGTCCT\n| ||| ||||| |||||||| | | |||||||||\nAACTAAGTGTCGGTGGCTACTATATATCAGGTCCT" + ); + } + + #[test] + fn test_aligner_gap_linear() { + let mut aligner = + WFAlignerGapLinear::new(6, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -20); + assert_eq!(aligner.cigar(None), "1M1I1D3M1I5M2I2D8M1I1M1I1M1I9M"); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "A-GCTA-GTGTC--AATGGCTACT-T-T-TCAGGTCCT\n| ||| ||||| |||||||| | | |||||||||\nAA-CTAAGTGTCGG--TGGCTACTATATATCAGGTCCT" + ); + } + + #[test] + fn test_aligner_gap_affine() { + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryLow); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -40); + assert_eq!(aligner.cigar(None), "1M1X3M1I5M2X8M3I1M1X9M"); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "AGCTA-GTGTCAATGGCTACT---TTTCAGGTCCT\n| ||| ||||| |||||||| | |||||||||\nAACTAAGTGTCGGTGGCTACTATATATCAGGTCCT" + ); + } + + #[test] + fn test_aligner_score_only() { + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Score, MemoryModel::MemoryLow); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -40); + assert_eq!(aligner.cigar(None), ""); + let (a, b, c) = aligner.matching(PATTERN, 
TEXT, None); + assert_eq!(format!("{}\n{}\n{}", a, b, c), "\n\n"); + } + + #[test] + fn test_aligner_gap_affine_2pieces() { + let mut aligner = WFAlignerGapAffine2Pieces::new( + 6, + 2, + 2, + 4, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryHigh, + ); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -34); + assert_eq!(aligner.cigar(None), "1M1X3M1I5M2X8M1I1M1I1M1I9M"); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "AGCTA-GTGTCAATGGCTACT-T-T-TCAGGTCCT\n| ||| ||||| |||||||| | | |||||||||\nAACTAAGTGTCGGTGGCTACTATATATCAGGTCCT" + ); + } + + #[test] + fn test_aligner_span_1() { + let pattern = b"AATTTAAGTCTAGGCTACTTTC"; + let text = b"CCGACTACTACGAAATTTAAGTATAGGCTACTTTCCGTACGTACGTACGT"; + let mut aligner = WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryHigh, + ); + let status = aligner.align_ends_free(pattern, 0, 0, text, 0, text.len() as i32); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + let ((xstart, xend), (ystart, yend)) = aligner.find_alignment_span(); + assert_eq!(ystart, 13); + assert_eq!(yend, 35); + assert_eq!(xstart, 0); + assert_eq!(xend, 22); + } + + #[test] + fn test_aligner_span_2() { + let pattern = b"GGGATCCCCGAAAAAGCGGGTTTGGCAAAAGCAAATTTCCCGAGTAAGCAGGCAGAGATCGCGCCAGACGCTCCCCAGAGCAGGGCGTCATGCACAAGAAAGCTTTGCACTTTGCGAACCAACGATAGGTGGGGGTGCGTGGAGGATGGAACACGGACGGCCCGGCTTGCTGCCTTCCCAGGCCTGCAGTTTGCCCATCCACGTCAGGGCCTCAGCCTGGCCGAAAGAAAGAAATGGTCTGTGATCCCCC"; + let text = 
b"AGCAGGGCGTCATGCACAAGAAAGCTTTGCACTTTGCGAACCAACGATAGGTGGGGGTGCGTGGAGGATGGAACACGGACGGCCCGGCTTGCTGCCTTCCCAGGCCTGCAGTTTGCCCATCCACGTCAGGGCCTCAGCCTGGCCGAAAGAAAGAAATGGTCTGTGATCCCCCCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCATTCCCGGCTACAAGGACCCTTCGAGCCCCGTTCGCCGGCCGCGGACCCGGCCCCTCCCTCCCCGGCCGCTAGGGGGCGGGCCCGGATCACAGGACTGGAGCTGGGCGGAGACCCACGCTCGGAGCGGTTGTGAACTGGCAGGCGGTGGGCGCGGCTTCTGTGCCGTGCCCCGGGCACTCAGTCTTCCAACGGGGCCCCGGAGTCGAAGACAGTTCTAGGGTTCAGGGAGCGCGGGCGGCTCCTGGGCGGCGCCAGACTGCGGTGAGTTGGCCGGCGTGGGCCACCAACCCAATGCAGCCCAGGGCGGCGGCACGAGACAGAACAACGGCGAACAGGAGCAGGGAAAGCGCCTCCGATAGGCCAGGCCTAGGGACCTGCGGGGAGAGGGCGAGGTCAACACCCGGCATGGGCCTCTGATTGGCTCCTGGGACTCGCCCCGCCTACGCCCATAGGTGGGCCCGCACTCTTCCCTGCGCCCCGCCCCCGCCCCAACAGCCT"; + let mut aligner = WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryHigh, + ); + aligner.set_heuristic(Heuristic::None); + let status = aligner.align_ends_free(pattern, 0, 0, text, 0, text.len() as i32); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + let ((xstart, xend), (ystart, yend)) = aligner.find_alignment_span(); + + assert_eq!(ystart, 0); + assert_eq!(yend, 172); + assert_eq!(xstart, 78); + assert_eq!(xend, 250); + } + + #[test] + fn test_aligner_ends_free_global() { + let pattern = b"AATTTAAGTCTAGGCTACTTTC"; + let text = b"CCGACTACTACGAAATTTAAGTATAGGCTACTTTCCGTACGTACGTACGT"; + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = aligner.align_ends_free(pattern, 0, 0, text, 0, text.len() as i32); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -36); + assert_eq!(aligner.cigar(None), "13I9M1X12M15I"); + let (a, b, c) = aligner.matching(pattern, text, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "-------------AATTTAAGTCTAGGCTACTTTC---------------\n ||||||||| |||||||||||| \nCCGACTACTACGAAATTTAAGTATAGGCTACTTTCCGTACGTACGTACGT" + ); + } + + #[test] + fn 
test_aligner_ends_free_right_extent() { + let pattern = b"AATTTAAGTCTGCTACTTTCACGCAGCT"; + let text = b"AATTTCAGTCTGGCTACTTTCACGTACGATGACAGACTCT"; + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = + aligner.align_ends_free(pattern, 0, pattern.len() as i32, text, 0, text.len() as i32); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -24); + assert_eq!(aligner.cigar(None), "5M1X6M1I11M4D1M15I"); + let (a, b, c) = aligner.matching(pattern, text, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "AATTTAAGTCTG-CTACTTTCACGCAGCT---------------\n||||| |||||| ||||||||||| | \nAATTTCAGTCTGGCTACTTTCACG----TACGATGACAGACTCT" + ); + } + + #[test] + fn test_aligner_ends_free_left_extent() { + let pattern = b"CTTTCACGTACGTGACAGTCTCT"; + let text = b"AATTTCAGTCTGGCTACTTTCACGTACGATGACAGACTCT"; + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = aligner.align_ends_free(pattern, 0, 0, text, 0, 0); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -48); + assert_eq!(aligner.cigar(None), "16I12M1I6M1X4M"); + let (a, b, c) = aligner.matching(pattern, text, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "----------------CTTTCACGTACG-TGACAGTCTCT\n |||||||||||| |||||| ||||\nAATTTCAGTCTGGCTACTTTCACGTACGATGACAGACTCT" + ); + } + + #[test] + fn test_aligner_ends_free_right_overlap() { + let pattern = b"CGCGTCTGACTGACTGACTAAACTTTCATGTACCTGACA"; + let text = b"AAACTTTCACGTACGTGACATATAGCGATCGATGACT"; + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + let status = aligner.align_ends_free(pattern, 0, 0, text, 0, 0); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -92); + assert_eq!(aligner.cigar(None), "19D9M1X4M1X5M17I"); + let (a, b, c) = aligner.matching(pattern, 
text, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "CGCGTCTGACTGACTGACTAAACTTTCATGTACCTGACA-----------------\n ||||||||| |||| ||||| \n-------------------AAACTTTCACGTACGTGACATATAGCGATCGATGACT" + ); + } + + #[test] + fn test_clipping_score() { + let text_lf = b"AAGGAGCTGAGAATTGTTCTTCCAGATACCTTTCCGACCTCTTCTTGGTT"; + let text_rf = b"GGAGTGCAGTGGTGCAATCTTGGCTCACTACAACCTCCGCATCCTGGGTT"; + + let pattern_lf = b"AAGGAGCTGAGAATTGTTCGTCCAGATACCTTTCCGACCTCTTCTTGGTT"; + let pattern_rf = b"GGAGTGCAGTGGTGCAATCTTGGCTCACTACAACCTCTGCATCCTGGGTT"; + + let motif = b"ATTT"; + + let text = [text_lf, &motif.repeat(10)[..], text_rf].concat(); + let pattern = [pattern_lf, &motif.repeat(8)[..], pattern_rf].concat(); + + let mut aligner = WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryHigh, + ); + let status = aligner.align_end_to_end(&pattern, &text); + + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -36); + assert_eq!(aligner.cigar(None), "19M1X62M8I37M1X12M"); + let (a, b, c) = aligner.matching(&pattern, &text, None); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "AAGGAGCTGAGAATTGTTCGTCCAGATACCTTTCCGACCTCTTCTTGGTTATTTATTTATTTATTTATTTATTTATTTATTT--------GGAGTGCAGTGGTGCAATCTTGGCTCACTACAACCTCTGCATCCTGGGTT\n||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||| ||||||||||||\nAAGGAGCTGAGAATTGTTCTTCCAGATACCTTTCCGACCTCTTCTTGGTTATTTATTTATTTATTTATTTATTTATTTATTTATTTATTTGGAGTGCAGTGGTGCAATCTTGGCTCACTACAACCTCCGCATCCTGGGTT" + ); + assert_eq!(aligner.cigar_score(), -36); + assert_eq!(aligner.cigar_score_clipped(50), -20); + assert_eq!(aligner.cigar(Some(50)), "32M8I"); + let (a, b, c) = aligner.matching(&pattern, &text, Some(50)); + assert_eq!( + format!("{}\n{}\n{}", a, b, c), + "ATTTATTTATTTATTTATTTATTTATTTATTT--------\n|||||||||||||||||||||||||||||||| \nATTTATTTATTTATTTATTTATTTATTTATTTATTTATTT" + ); + } + + #[test] + fn 
test_memory_modes() { + let expected_cigar = "1M1X3M1I5M2X8M3I1M1X9M"; + let expected_matching = "AGCTA-GTGTCAATGGCTACT---TTTCAGGTCCT\n| ||| ||||| |||||||| | |||||||||\nAACTAAGTGTCGGTGGCTACTATATATCAGGTCCT"; + let expected_score = -48; + + struct Test { + memory_mode: MemoryModel, + } + + let tests = vec![ + Test { + memory_mode: MemoryModel::MemoryHigh, + }, + Test { + memory_mode: MemoryModel::MemoryMed, + }, + Test { + memory_mode: MemoryModel::MemoryLow, + }, + // Test { + // memory_mode: MemoryModel::MemoryUltraLow, + // }, + ]; + + for test in tests { + let mut aligner = WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + test.memory_mode, + ); + let status = aligner.align_end_to_end(PATTERN, TEXT); + assert_eq!(status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), expected_score); + assert_eq!(aligner.cigar_score(), expected_score); + assert_eq!(aligner.cigar_score_clipped(0), expected_score); + assert_eq!(aligner.cigar(None), expected_cigar); + let (a, b, c) = aligner.matching(PATTERN, TEXT, None); + assert_eq!(format!("{}\n{}\n{}", a, b, c), expected_matching); + } + } + + #[test] + fn test_set_heuristic() { + let mut aligner = + WFAlignerGapAffine::new(6, 4, 2, AlignmentScope::Alignment, MemoryModel::MemoryHigh); + aligner.set_heuristic(Heuristic::WFmash(1, 2, 3)); + aligner.set_heuristic(Heuristic::BandedStatic(1, 2)); + aligner.set_heuristic(Heuristic::BandedAdaptive(1, 2, 3)); + aligner.set_heuristic(Heuristic::WFadaptive(1, 2, 3)); + aligner.set_heuristic(Heuristic::XDrop(1, 2)); + aligner.set_heuristic(Heuristic::ZDrop(1, 2)); + aligner.set_heuristic(Heuristic::None); + } + + #[test] + fn test_invalid_sequence() { + let read = 
b"GCTGCTACTGGGGTGTCCCCTCTCAAAGGACAAACCCAGGATCTACAGATGTGTGTGCTAAGCCATGTATGCACATGCACGTGTGTGTGTATATATTTAACCTATCTGTATATATGTATTATGTAAACATGAGTTCCTGCTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCCTGCTGGCATATCTGACTATAACTGACCACCTCACAGTCCATTCTGATCTCTATATATGTATTATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATTATGTAAACATGAGTTCCCTGCTGGCATATCTGATTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATTATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGACTATAACTGACCACCTCAGGGTCTATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGATTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATTATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGATCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGGCTGGCATATCTGATTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGATTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCCCGCTGGCTTTTCCATGACTTCCTTATCCAGCTGTGAGAACCCTGACTCTTACTACCCATACTGTATTGACTTATTT"; + let allele = 
b"GCTGCTACTGGGGTGTCCCCTCTCAAAGGACAAACCCAGGATCTACAGATGTGTGTGCTAAGCCATGTATGCACACGCACGTGTGTGTGTATATATTTAACCTATCTGTATATATGTATTATGTAAACATGAGTTCCTGCTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGACTTCCTACTGGCATATCTGACTGTAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGATTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTTCATTCCGATCTGTATATAAGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGACTGTAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGACTATAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATGTATGTATCATGTAAACACGAGTTCCTACTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCCGATCTGTATATAAGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGACTGTAACCGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACACGAGTTCCTGCTGGCATATCTGACTATAACTGACCACCTCAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCACCTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGTCCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCACCTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCATTCTGATCTGCATATATGTATAATATATATTATATATGGTCCTCAGGGTCCATTCTGATCTGTATATATGTATCATGTAAACATGAGTTCCTGCTGGCATATCTGTCTATAACCGACCACCTTAGGGTCCATTCTGATCTGTATATATGTATAATATATATTATATATGGACCTCAGGGTCCCCGCTGGCTTTTCCATGACTTCCTTATCCAGCTGTGAGAACCCTGACTCTTACTACTGTATTGACTTATTTGTGAAACCT"; + + let mut aligner = WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryUltraLow, + ); + let _status = aligner.align_end_to_end(read, allele); + assert_eq!(_status, AlignmentStatus::StatusUnattainable); + assert_eq!(aligner.score(), -2147483648); + + aligner.set_heuristic(Heuristic::None); + let _status = aligner.align_end_to_end(read, allele); + assert_eq!(_status, AlignmentStatus::StatusAlgCompleted); + assert_eq!(aligner.score(), -881); + } +} diff --git a/src/allele.rs b/src/allele.rs new 
file mode 100644 index 0000000..5e38e76 --- /dev/null +++ b/src/allele.rs @@ -0,0 +1,409 @@ +use crate::aligner::{AlignmentStatus, WFAligner}; +use crate::denovo::{self, AlleleOrigin, DenovoStatus}; +use crate::handles; +use crate::locus::Locus; +use crate::read::ReadInfo; +use crate::snp::{self, TrinaryMatrix}; +use crate::stats; +use crate::util::Result; +use anyhow::anyhow; +use itertools::Itertools; +use log; +use noodles::{ + bam, + bgzf::Reader, + sam, + vcf::{ + self, + record::genotypes::{self, sample::value::Genotype}, + record::info::field, + Record, + }, +}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::{ + fs::File, + sync::{Arc, Mutex}, +}; + +#[derive(Debug, PartialEq)] +pub struct Allele { + pub seq: Vec, + pub genotype: usize, + pub read_aligns: Vec<(ReadInfo, i32)>, + pub motif_count: String, + pub index: usize, +} + +impl Allele { + pub fn dummy(reads: Vec) -> Allele { + Allele { + seq: vec![], + genotype: 0, + read_aligns: reads.into_iter().map(|info| (info, 0i32)).collect_vec(), + motif_count: String::from("0"), + index: 0, + } + } +} + +pub static FILTERS: &[&'static (dyn snp::ReadFilter + Sync)] = + &[&snp::FilterByDist, &snp::FilterByFreq]; + +pub fn load_alleles( + locus: &Locus, + subhandle: &handles::SubHandle, + clip_len: usize, + aligner: &mut WFAligner, +) -> Result> { + let (allele_seqs, motif_counts, genotype_indices) = + get_allele_seqs(locus, &subhandle.vcf, &subhandle.vcf_header)?; + let mut reads = get_reads(&subhandle.bam, &subhandle.bam_header, locus)?; + + snp::apply_read_filters(&mut reads, FILTERS); + + let reads_by_allele = assign_reads(&allele_seqs, reads, clip_len, aligner); + let alleles = allele_seqs + .into_iter() + .enumerate() + .map(|(index, allele_seq)| Allele { + seq: allele_seq, + genotype: genotype_indices[index], + read_aligns: reads_by_allele[index].to_owned(), + motif_count: motif_counts[index].to_owned(), + index, + }) + .collect(); + Ok(alleles) +} + +fn assign_reads( + alleles: &[Vec], + 
reads: Vec, + clip_len: usize, + aligner: &mut WFAligner, +) -> Vec> { + let mut reads_by_allele = vec![Vec::new(); alleles.len()]; + let mut index_flip: usize = 0; + for read in reads { + let mut max_score = None; + let mut max_aligns = Vec::new(); + for (i, a) in alleles.iter().enumerate() { + if let AlignmentStatus::StatusAlgCompleted = aligner.align_end_to_end(&read.bases, a) { + let score = aligner.cigar_score_clipped(clip_len); + match max_score { + None => { + max_score = Some(score); + max_aligns = vec![(i, score)]; + } + Some(max) if score > max => { + max_score = Some(score); + max_aligns = vec![(i, score)]; + } + Some(max) if score == max => { + max_aligns.push((i, score)); + } + _ => (), + } + } + } + if !max_aligns.is_empty() { + if max_aligns.len() > 1 { + index_flip = (index_flip + 1) % max_aligns.len(); + } + let (allele_index, align) = max_aligns[index_flip % max_aligns.len()]; + reads_by_allele[allele_index].push((read, align)); + } + } + reads_by_allele +} + +pub fn get_reads( + bam: &Arc>>>, + bam_header: &Arc, + locus: &Locus, +) -> Result> { + let mut bam = bam + .lock() + .map_err(|e| anyhow!("Failed to acquire lock: {}", e))?; + let query = bam.query(bam_header, &locus.region)?; + + let reads: Vec<_> = query + .map(|record| record.map_err(|e| e.into()).map(ReadInfo::new)) + .collect::>>()?; + + Ok(reads) +} + +// TODO: improve, for now it just pulls out the genotype index and checks duplicates +fn is_homozygous(genotypes: &Genotype) -> bool { + let alleles: Vec<_> = genotypes + .iter() + .filter_map(|allele| allele.position()) + .collect(); + alleles.into_iter().unique().count() == 1 +} + +static TRID_KEY: Lazy = Lazy::new(|| "TRID".parse().unwrap()); +static MC_KEY: Lazy = Lazy::new(|| "MC".parse().unwrap()); + +fn get_allele_seqs( + locus: &Locus, + vcf: &Arc>>, + vcf_header: &Arc, +) -> Result<(Vec>, Vec, Vec)> { + let mut vcf = vcf + .lock() + .map_err(|_| anyhow!("Error locking Mutex for vcf::IndexedReader"))?; + + let query = 
vcf.query(vcf_header, &locus.region)?; + let locus_id = &locus.id; + + if let Some(result) = query.into_iter().next() { + let record = result?; + let info = record.info(); + + let trid = info.get(&*TRID_KEY).unwrap().unwrap().to_string(); + if &trid != locus_id { + return Err(anyhow!("TRID={} missing", locus_id)); + } + + let format = record.genotypes().get_index(0).unwrap(); + let genotype = format.genotype().unwrap(); + if genotype.is_err() { + return Err(anyhow!("TRID={} misses genotyping", locus_id)); + } + let genotype = genotype.unwrap(); + + let mc_field = format.get(&*MC_KEY).unwrap().unwrap().to_string(); + let motif_counts: Vec = mc_field.split(',').map(ToString::to_string).collect(); + + let (alleles, genotype_indices) = process_genotypes(&genotype, &record, locus)?; + return Ok((alleles, motif_counts, genotype_indices)); + } + Err(anyhow!("TRID={} missing", &locus.id)) +} + +fn process_genotypes( + genotype: &Genotype, + record: &Record, + locus: &Locus, +) -> Result<(Vec>, Vec)> { + let reference_bases = record.reference_bases().to_string().into_bytes(); + let alternate_bases = record + .alternate_bases() + .iter() + .map(|base| base.to_string().into_bytes()) + .collect::>(); + let mut seq = locus.left_flank.clone(); + let mut alleles = Vec::new(); + let mut genotype_indices = Vec::new(); + + for allele in genotype.iter() { + let allele_index: usize = allele + .position() + .ok_or_else(|| anyhow!("Allele position missing for TRID={}", locus.id))?; + match allele_index { + 0 => seq.extend_from_slice(&reference_bases), + _ => seq.extend_from_slice( + alternate_bases + .get(allele_index - 1) + .ok_or_else(|| anyhow!("Invalid allele index for TRID={}", locus.id))?, + ), + } + seq.extend_from_slice(&locus.right_flank); + alleles.push(seq.clone()); + seq.truncate(locus.left_flank.len()); // reset the sequence for the next allele + + genotype_indices.push(allele_index); + + if genotype.len() > 1 && is_homozygous(genotype) { + break; + } + } + Ok((alleles, 
genotype_indices)) +} + +#[derive(Debug, PartialEq, Serialize)] +pub struct AlleleResult { + pub trid: String, + pub genotype: usize, + pub denovo_coverage: usize, + pub allele_coverage: usize, + #[serde(serialize_with = "serialize_with_precision")] + pub allele_ratio: f64, + pub child_coverage: usize, + #[serde(serialize_with = "serialize_with_precision")] + pub child_ratio: f64, + #[serde(serialize_with = "serialize_with_precision")] + pub mean_diff_father: f32, + #[serde(serialize_with = "serialize_with_precision")] + pub mean_diff_mother: f32, + #[serde(serialize_with = "serialize_with_precision")] + pub father_dropout_prob: f64, + #[serde(serialize_with = "serialize_with_precision")] + pub mother_dropout_prob: f64, + #[serde(serialize_with = "serialize_as_display")] + pub allele_origin: AlleleOrigin, + #[serde(serialize_with = "serialize_as_display")] + pub denovo_status: DenovoStatus, + pub per_allele_reads_father: String, + pub per_allele_reads_mother: String, + pub per_allele_reads_child: String, + pub index: usize, + pub father_motif_counts: String, + pub mother_motif_counts: String, + pub child_motif_counts: String, + #[serde(serialize_with = "serialize_with_precision")] + pub maxlh: f64, +} + +fn serialize_as_display( + value: &T, + serializer: S, +) -> std::result::Result { + serializer.collect_str(value) +} + +fn serialize_with_precision( + value: &T, + serializer: S, +) -> std::result::Result { + let formatted_value = format!("{:.4}", value); + serializer.serialize_str(&formatted_value) +} + +fn load_alleles_handle( + role: char, + locus: &Locus, + subhandle: &handles::SubHandle, + clip_len: usize, + aligner: &mut WFAligner, +) -> Result> { + load_alleles(locus, subhandle, clip_len, aligner).map_err(|err| { + log::warn!("Skipping {} in {}", err, role); + err + }) +} + +pub fn process_alleles( + locus: &Locus, + handle: Arc, + clip_len: usize, + parent_quantile: f64, + aligner: &mut WFAligner, +) -> Result> { + let father_alleles = 
load_alleles_handle('F', locus, &handle.father, clip_len, aligner)?; + let mother_alleles = load_alleles_handle('M', locus, &handle.mother, clip_len, aligner)?; + let child_alleles = load_alleles_handle('C', locus, &handle.child, clip_len, aligner)?; + + // TODO: ongoing work, the maximum likelihood is obtained naively + let max_lhs = TrinaryMatrix::new(&child_alleles, &father_alleles, &mother_alleles) + .and_then(|trinary_mat| snp::inheritance_prob(&trinary_mat)) + .map(|(_inherit_p, max_lh)| max_lh) + .unwrap_or((-1.0, -1.0)); + let max_lhs = vec![max_lhs.0, max_lhs.1]; + + let mother_dropout_prob = stats::get_dropout_prob(&mother_alleles); + let father_dropout_prob = stats::get_dropout_prob(&father_alleles); + + let father_reads = stats::get_per_allele_reads(&father_alleles); + let mother_reads = stats::get_per_allele_reads(&mother_alleles); + let child_reads = stats::get_per_allele_reads(&child_alleles); + + let father_motifs = father_alleles + .iter() + .map(|a| a.motif_count.to_string()) + .collect::>() + .join(","); + let mother_motifs = mother_alleles + .iter() + .map(|a| a.motif_count.to_string()) + .collect::>() + .join(","); + let child_motifs = child_alleles + .iter() + .map(|a| a.motif_count.to_string()) + .collect::>() + .join(","); + + let mut out_vec = Vec::::new(); + for (i, dna) in denovo::assess_denovo( + &mother_alleles, + &father_alleles, + &child_alleles, + clip_len, + parent_quantile, + aligner, + ) + .enumerate() + { + let output = AlleleResult { + trid: locus.id.clone(), + genotype: dna.genotype, + denovo_coverage: dna.denovo_score, + allele_coverage: dna.allele_coverage, + allele_ratio: dna.denovo_score as f64 / dna.allele_coverage as f64, + child_coverage: dna.child_coverage, + child_ratio: dna.denovo_score as f64 / dna.child_coverage as f64, + mean_diff_father: dna.mean_diff_father, + mean_diff_mother: dna.mean_diff_mother, + father_dropout_prob, + mother_dropout_prob, + allele_origin: dna.allele_origin, + denovo_status: 
dna.denovo_status, + per_allele_reads_father: father_reads + .iter() + .map(|a| a.to_string()) + .collect::>() + .join(","), + per_allele_reads_mother: mother_reads + .iter() + .map(|a| a.to_string()) + .collect::>() + .join(","), + per_allele_reads_child: child_reads + .iter() + .map(|a| a.to_string()) + .collect::>() + .join(","), + index: dna.index, + father_motif_counts: father_motifs.clone(), + mother_motif_counts: mother_motifs.clone(), + child_motif_counts: child_motifs.clone(), + maxlh: max_lhs[i], + }; + out_vec.push(output); + } + Ok(out_vec) +} + +// #[cfg(test)] +// mod tests { +// use super::*; + +// #[test] +// fn test_is_homozygous() { +// let mut header = Header::new(); +// let header_contig_line = r#"##contig="#; +// header.push_record(header_contig_line.as_bytes()); +// let header_gt_line = r#"##FORMAT="#; +// header.push_record(header_gt_line.as_bytes()); +// header.push_sample("test_sample".as_bytes()); +// let vcf = Writer::from_stdout(&header, true, Format::Vcf).unwrap(); +// let mut record = vcf.empty_record(); + +// let alleles = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(0)]; +// record.push_genotypes(alleles).unwrap(); +// let genotypes = record.genotypes().unwrap().get(0); +// // assert!(is_homozygous(&genotypes)); + +// record.clear(); + +// let alleles = &[GenotypeAllele::Unphased(2), GenotypeAllele::Phased(1)]; +// record.push_genotypes(alleles).unwrap(); +// let genotypes = record.genotypes().unwrap().get(0); +// // assert!(!is_homozygous(&genotypes)); +// } +// } diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..cdcb922 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,206 @@ +use crate::util::Result; +use anyhow::anyhow; +use chrono::Datelike; +use clap::{ArgAction, ArgGroup, Parser, Subcommand}; +use env_logger::fmt::Color; +use log::{Level, LevelFilter}; +use once_cell::sync::Lazy; +use std::{ + io::Write, + path::{Path, PathBuf}, +}; + +pub static FULL_VERSION: Lazy = Lazy::new(|| { + format!( + "{}-{}", 
+ env!("CARGO_PKG_VERSION"), + env!("VERGEN_GIT_DESCRIBE") + ) +}); + +#[derive(Parser)] +#[command(name="trgt-denovo", + author="Tom Mokveld \nEgor Dolzhenko ", + version=&**FULL_VERSION, + about="Tandem repeat de novo caller", + long_about = None, + after_help = format!("Copyright (C) 2004-{} Pacific Biosciences of California, Inc. + This program comes with ABSOLUTELY NO WARRANTY; it is intended for + Research Use Only and not for use in diagnostic procedures.", chrono::Utc::now().year()), + help_template = "{name} {version}\n{author}{about-section}\n{usage-heading}\n {usage}\n\n{all-args}{after-help}", + )] +pub struct Cli { + #[command(subcommand)] + pub command: Command, + + #[clap(short = 'v')] + #[clap(long = "verbose")] + #[clap(action = ArgAction::Count)] + pub verbosity: u8, +} + +#[derive(Subcommand)] +pub enum Command { + Trio(TrioArgs), +} + +#[derive(Parser, Debug)] +#[command(group(ArgGroup::new("trio")))] +#[command(arg_required_else_help(true))] +pub struct TrioArgs { + #[clap(required = true)] + #[clap(short = 'r')] + #[clap(long = "reference")] + #[clap(help = "Path to reference genome FASTA")] + #[clap(value_name = "FASTA")] + #[arg(value_parser = check_file_exists)] + pub reference_filename: PathBuf, + + #[clap(required = true)] + #[clap(short = 'b')] + #[clap(long = "bed")] + #[clap(help = "BED file with repeat coordinates")] + #[clap(value_name = "BED")] + #[arg(value_parser = check_file_exists)] + pub bed_filename: PathBuf, + + #[clap(required = true)] + #[clap(short = 'm')] + #[clap(long = "mother")] + #[clap(help = "Prefix of mother VCF and spanning reads BAM files")] + #[clap(value_name = "PREFIX")] + #[arg(value_parser = check_prefix_path)] + pub mother_prefix: String, + + #[clap(required = true)] + #[clap(short = 'f')] + #[clap(long = "father")] + #[clap(help = "Prefix of father VCF and spanning reads BAM files")] + #[arg(value_parser = check_prefix_path)] + #[clap(value_name = "PREFIX")] + #[arg(value_parser = check_prefix_path)] + pub 
father_prefix: String, + + #[clap(required = true)] + #[clap(short = 'c')] + #[clap(long = "child")] + #[clap(help = "Prefix of child VCF and spanning reads BAM files")] + #[clap(value_name = "PREFIX")] + #[arg(value_parser = check_prefix_path)] + pub child_prefix: String, + + #[clap(required = true)] + #[clap(short = 'o')] + #[clap(long = "out")] + #[clap(help = "Output csv path")] + #[clap(value_name = "CSV")] + #[arg(value_parser = check_prefix_path)] + pub output_path: String, + + #[clap(long = "trid")] + #[clap(help = "TRID of a specific repeat to analyze, should be in the BED file")] + #[clap(value_name = "TRID")] + pub trid: Option, + + #[clap(short = '@')] + #[clap(value_name = "THREADS")] + #[clap(default_value = "1")] + #[arg(value_parser = threads_in_range)] + pub num_threads: usize, + + #[clap(help_heading("Advanced"))] + #[clap(long = "flank-len")] + #[clap(help = "Amount of additional flanking sequence that should be used during alignment")] + #[clap(value_name = "FLANK_LEN")] + #[clap(default_value = "50")] + pub flank_len: usize, + + #[clap(help_heading("Advanced"))] + #[clap(long = "no-clip-aln")] + #[clap(help = "Score alignments without stripping the flanks")] + #[clap(value_name = "CLIP")] + pub no_clip_aln: bool, + + #[clap(help_heading("Advanced"))] + #[clap(long = "parental-quantile")] + #[clap( + help = "Quantile of alignments scores to determine the parental threshold (default is strict and takes only the top score)" + )] + #[clap(value_name = "QUANTILE")] + #[clap(default_value = "1.0")] + #[arg(value_parser = parse_quantile)] + pub parent_quantile: f64, +} + +pub fn init_verbose(args: &Cli) { + let filter_level: LevelFilter = match args.verbosity { + 0 => LevelFilter::Info, + 1 => LevelFilter::Debug, + _ => LevelFilter::Trace, + }; + + env_logger::Builder::from_default_env() + .format(|buf, record| { + let level = record.level(); + let mut style = buf.style(); + match record.level() { + Level::Error => style.set_color(Color::Red), + 
Level::Warn => style.set_color(Color::Yellow), + Level::Info => style.set_color(Color::Green), + Level::Debug => style.set_color(Color::Blue), + Level::Trace => style.set_color(Color::Cyan), + }; + + writeln!( + buf, + "{} [{}] - {}", + chrono::Local::now().format("%Y-%m-%d %H:%M:%S"), + style.value(level), + record.args() + ) + }) + .filter_level(filter_level) + .init(); +} + +fn check_prefix_path(s: &str) -> Result { + let path = Path::new(s); + if let Some(parent_dir) = path.parent() { + if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() { + return Err(anyhow!("Path does not exist: {}", parent_dir.display())); + } + } + Ok(s.to_string()) +} + +fn threads_in_range(s: &str) -> Result { + let thread: usize = s + .parse::() + .map_err(|_| anyhow!("`{}` is not a valid thread number", s))?; + if thread <= 0 { + return Err(anyhow!("Number of threads must be >= 1")); + } + Ok(thread) +} + +fn parse_quantile(s: &str) -> Result { + let value = s + .parse::() + .map_err(|e| anyhow!("Could not parse float: {}", e))?; + if value < 0.0 || value > 1.0 { + Err(anyhow!( + "The value must be between 0.0 and 1.0, got: {}", + value + )) + } else { + Ok(value) + } +} + +fn check_file_exists(s: &str) -> Result { + let path = Path::new(s); + if !path.exists() { + return Err(anyhow!("File does not exist: {}", path.display())); + } + Ok(path.to_path_buf()) +} diff --git a/src/commands.rs b/src/commands.rs new file mode 100644 index 0000000..8aacde7 --- /dev/null +++ b/src/commands.rs @@ -0,0 +1,3 @@ +pub mod trio; + +pub use self::trio::trio; diff --git a/src/commands/trio.rs b/src/commands/trio.rs new file mode 100644 index 0000000..a0d45b1 --- /dev/null +++ b/src/commands/trio.rs @@ -0,0 +1,146 @@ +use crate::aligner::{AlignmentScope, MemoryModel, WFAligner, WFAlignerGapAffine2Pieces}; +use crate::{ + allele, + cli::TrioArgs, + handles, locus, + util::{self}, +}; +use anyhow::Result; +use csv::WriterBuilder; +use log; +use noodles::{ + bed, + fasta::{self, io::BufReadSeek}, 
+}; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use std::{ + cell::RefCell, + fs, + io::BufReader, + sync::{mpsc::channel, Arc}, + thread, + time::Instant, +}; + +thread_local! { + static ALIGNER: RefCell = RefCell::new(WFAlignerGapAffine2Pieces::new( + 8, + 4, + 2, + 24, + 1, + AlignmentScope::Alignment, + MemoryModel::MemoryLow + )); +} + +pub fn trio(args: TrioArgs) -> Result<()> { + log::info!( + "{}-{} trio start", + env!("CARGO_PKG_NAME"), + *crate::cli::FULL_VERSION + ); + let start_timer = Instant::now(); + let clip_len = if args.no_clip_aln { 0 } else { args.flank_len }; + + let handles = + handles::Handles::new(&args.mother_prefix, &args.father_prefix, &args.child_prefix) + .unwrap_or_else(|err| util::handle_error_and_exit(err)); + let handles_arc = Arc::new(handles); + + let mut catalog_reader = fs::File::open(args.bed_filename) + .map(BufReader::new) + .map(bed::Reader::new)?; + + let mut genome_reader: fasta::IndexedReader> = + fasta::indexed_reader::Builder::default() + .build_from_path(&args.reference_filename) + .unwrap_or_else(|err| util::handle_error_and_exit(err.into())); + + match args.trid { + Some(trid) => { + let locus = locus::get_locus( + &mut genome_reader, + &mut catalog_reader, + &trid, + args.flank_len, + ) + .unwrap_or_else(|err| util::handle_error_and_exit(err)); + + let mut csv_wtr = WriterBuilder::new() + .delimiter(b'\t') + .from_writer(std::io::stdout()); + + ALIGNER.with(|aligner| { + let mut aligner = aligner.borrow_mut(); + if let Ok(result) = allele::process_alleles( + &locus, + handles_arc, + clip_len, + args.parent_quantile, + &mut aligner, + ) { + for row in result { + if let Err(err) = csv_wtr.serialize(row) { + log::error!("Failed to write record: {}", err); + } + csv_wtr.flush().unwrap(); + } + } + }); + } + None => { + let all_loci = locus::get_loci(&mut genome_reader, &mut catalog_reader, args.flank_len) + .collect::>>() + .unwrap_or_else(|err| util::handle_error_and_exit(err)); + + let mut csv_wtr = 
WriterBuilder::new() + .delimiter(b'\t') + .from_path(&args.output_path)?; + + let (sender, receiver) = channel(); + let writer_thread = thread::spawn(move || { + for results in &receiver { + for row in results { + if let Err(err) = csv_wtr.serialize(row) { + log::error!("Failed to write record: {}", err); + } + csv_wtr.flush().unwrap(); + } + } + }); + + log::info!("Starting job pool with {} threads...", args.num_threads); + let pool = ThreadPoolBuilder::new() + .num_threads(args.num_threads) + .build() + .unwrap(); + + pool.install(|| { + all_loci + .into_iter() + .par_bridge() + .for_each_with(sender, |s, locus| { + ALIGNER.with(|aligner| { + let mut aligner = aligner.borrow_mut(); + if let Ok(result) = allele::process_alleles( + &locus, + handles_arc.clone(), + clip_len, + args.parent_quantile, + &mut aligner, + ) { + s.send(result).unwrap(); + } + }); + }); + }); + writer_thread.join().unwrap(); + } + } + log::info!("Total execution time: {:?}", start_timer.elapsed()); + log::info!("{} trio end", env!("CARGO_PKG_NAME")); + + Ok(()) +} diff --git a/src/denovo.rs b/src/denovo.rs new file mode 100644 index 0000000..6c9dc52 --- /dev/null +++ b/src/denovo.rs @@ -0,0 +1,506 @@ +use crate::aligner::WFAligner; +use crate::allele::Allele; +use ndarray::{Array2, ArrayBase, Dim, OwnedRepr}; +use serde::Serialize; +use std::{cmp::Ordering, fmt}; + +#[derive(Debug)] +pub struct DenovoAllele { + pub genotype: usize, + pub denovo_score: usize, + pub child_coverage: usize, + pub allele_coverage: usize, + pub mean_diff_father: f32, + pub mean_diff_mother: f32, + pub allele_origin: AlleleOrigin, + pub denovo_status: DenovoStatus, + pub motif_count: String, + pub index: usize, +} + +#[derive(Debug, PartialEq, Clone, Serialize)] +pub enum DenovoType { + Expansion, + Contraction, + Substitution, + Unclear, +} + +#[derive(Debug, PartialEq, Clone, Serialize)] +pub enum DenovoStatus { + Denovo(DenovoType), + NotDenovo, +} + +impl std::fmt::Display for DenovoType { + fn fmt(&self, f: 
&mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DenovoType::Expansion => write!(f, "+"), + DenovoType::Contraction => write!(f, "-"), + DenovoType::Substitution => write!(f, "="), + DenovoType::Unclear => write!(f, "?"), + } + } +} + +impl std::fmt::Display for DenovoStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DenovoStatus::Denovo(denovo_type) => write!(f, "Y:{}", denovo_type), + DenovoStatus::NotDenovo => write!(f, "X"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub enum AlleleNum { + One, + Two, + Unclear, +} + +#[derive(Debug, PartialEq, Clone, Serialize)] +pub enum AlleleOrigin { + Father { allele: AlleleNum }, + Mother { allele: AlleleNum }, + Unclear, +} + +impl AlleleOrigin { + pub fn new(value: usize) -> Option { + match value { + 0 => Some(AlleleOrigin::Mother { + allele: AlleleNum::One, + }), + 1 => Some(AlleleOrigin::Mother { + allele: AlleleNum::Two, + }), + 2 => Some(AlleleOrigin::Father { + allele: AlleleNum::One, + }), + 3 => Some(AlleleOrigin::Father { + allele: AlleleNum::Two, + }), + _ => None, + } + } +} + +impl fmt::Display for AlleleNum { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + AlleleNum::One => write!(f, "1"), + AlleleNum::Two => write!(f, "2"), + AlleleNum::Unclear => write!(f, "?"), + } + } +} + +impl fmt::Display for AlleleOrigin { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + AlleleOrigin::Father { allele } => write!(f, "F:{}", allele), + AlleleOrigin::Mother { allele } => write!(f, "M:{}", allele), + AlleleOrigin::Unclear => write!(f, "?"), + } + } +} + +// Valid allele combinations +static COMBS_1D: [[(usize, usize); 2]; 4] = [ + [(0, 0), (0, 0)], + [(1, 0), (1, 0)], + [(2, 0), (2, 0)], + [(3, 0), (3, 0)], +]; +static COMBS_2D: [[(usize, usize); 2]; 8] = [ + [(0, 0), (2, 1)], + [(0, 0), (3, 1)], + [(1, 0), (2, 1)], + [(1, 0), (3, 1)], + [(2, 0), (0, 1)], + [(3, 0), (0, 1)], + [(2, 0), 
(1, 1)], + [(3, 0), (1, 1)], +]; + +pub fn assess_denovo<'a>( + mother_gts: &'a [Allele], + father_gts: &'a [Allele], + child_gts: &'a [Allele], + clip_len: usize, + parent_quantile: f64, + aligner: &mut WFAligner, +) -> impl Iterator + 'a { + let mut matrix = Array2::from_elem((4, 2), f64::MIN); + let mut dnrs = Vec::with_capacity(child_gts.len()); + + for (index, denovo_allele) in child_gts.iter().enumerate() { + let mother_align_scores = align(mother_gts, &denovo_allele.seq, clip_len, aligner); + let father_align_scores = align(father_gts, &denovo_allele.seq, clip_len, aligner); + let child_align_scores = align(child_gts, &denovo_allele.seq, clip_len, aligner); + + let (denovo_coverage, mean_diff_father, mean_diff_mother) = get_denovo_coverage( + &mother_align_scores, + &father_align_scores, + &child_align_scores, + parent_quantile, + ); + + update_mean_matrix( + &mut matrix, + index, + &mother_align_scores, + &father_align_scores, + ); + + dnrs.push(DenovoAllele { + genotype: denovo_allele.genotype, + denovo_score: denovo_coverage, + child_coverage: child_align_scores.iter().map(|vec| vec.len()).sum(), + allele_coverage: denovo_allele.read_aligns.len(), + mean_diff_mother, + mean_diff_father, + allele_origin: AlleleOrigin::Unclear, + denovo_status: DenovoStatus::NotDenovo, + motif_count: denovo_allele.motif_count.clone(), + index: denovo_allele.index, + }); + } + + // Get best allele combinations + let mut combs_score: Vec<(f64, [(usize, usize); 2])> = Vec::new(); + if child_gts.len() > 1 { + for c in COMBS_2D.iter() { + combs_score.push((c.iter().map(|&(i, j)| matrix[[i, j]]).sum::(), *c)); + } + } else { + for c in COMBS_1D.iter() { + combs_score.push((matrix[[c[0].0, c[0].1]], *c)); + } + } + combs_score.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap()); + + // Update allele origin and de novo type + let comb_diff = (combs_score[0].0 - combs_score[1].0).abs(); + let comb_0 = combs_score[0].1; + for (index, denovo_allele) in child_gts.iter().enumerate() 
{ + let dna = &mut dnrs[index]; + // TODO: Alignment based: while we might not know the allele origin, may still figure out parental origin + if comb_diff < 1.0 { + dna.allele_origin = AlleleOrigin::Unclear; + } else { + dna.allele_origin = AlleleOrigin::new(comb_0[index].0).unwrap(); + } + + dna.denovo_status = match dna.denovo_score { + 0 => DenovoStatus::NotDenovo, + _ => DenovoStatus::Denovo(get_denovo_type( + mother_gts, + father_gts, + &dna.allele_origin, + &denovo_allele.seq, + )), + }; + } + dnrs.into_iter().filter_map(Some) +} + +pub fn update_mean_matrix( + matrix: &mut ArrayBase, Dim<[usize; 2]>>, + index: usize, + mother_align_scores: &[Vec], + father_align_scores: &[Vec], +) { + let get_mean = |aligns: &[Vec]| -> Vec<(f64, usize)> { + aligns + .iter() + .enumerate() + .map(|(i, v)| { + if v.is_empty() { + (f64::NEG_INFINITY, i) + } else { + (v.iter().sum::() as f64 / v.len() as f64, i) + } + }) + .collect() + }; + + let mother_mean: Vec<(f64, usize)> = get_mean(mother_align_scores); + let father_mean: Vec<(f64, usize)> = get_mean(father_align_scores); + + matrix[[0, index]] = mother_mean[0].0; + matrix[[1, index]] = mother_mean.get(1).map_or(f64::MIN, |v| v.0); + matrix[[2, index]] = father_mean[0].0; + matrix[[3, index]] = father_mean.get(1).map_or(f64::MIN, |v| v.0); +} + +fn compare_seq_lengths(gts: &[Allele], allele: &AlleleNum, child_seq: &[u8]) -> Option { + match allele { + AlleleNum::One => compare_sequences(>s[0].seq, child_seq), + AlleleNum::Two => compare_sequences(>s[1].seq, child_seq), + _ => None, + } +} + +fn compare_sequences(seq: &[u8], child_seq: &[u8]) -> Option { + match seq.len().cmp(&child_seq.len()) { + Ordering::Less => Some(DenovoType::Expansion), + Ordering::Greater => Some(DenovoType::Contraction), + Ordering::Equal => Some(DenovoType::Substitution), + } +} + +fn get_denovo_type( + mother_gts: &[Allele], + father_gts: &[Allele], + allele_origin: &AlleleOrigin, + child_seq: &[u8], +) -> DenovoType { + match allele_origin { + 
AlleleOrigin::Father { allele } => { + compare_seq_lengths(father_gts, allele, child_seq).unwrap_or(DenovoType::Unclear) + } + AlleleOrigin::Mother { allele } => { + compare_seq_lengths(mother_gts, allele, child_seq).unwrap_or(DenovoType::Unclear) + } + _ => DenovoType::Unclear, + } +} + +fn align(gts: &[Allele], target: &[u8], clip_len: usize, aligner: &mut WFAligner) -> Vec> { + let mut align_scores = vec![vec![]; gts.len()]; + for (i, allele) in gts.iter().enumerate() { + for (read, _align) in &allele.read_aligns { + let _status = aligner.align_end_to_end(&read.bases, target); + align_scores[i].push(aligner.cigar_score_clipped(clip_len)); + } + } + align_scores +} + +fn get_parental_count_diff(top_parent_score: f64, child_aligns: &[Vec]) -> (usize, f32) { + let (count, sum) = child_aligns + .iter() + .flatten() + .map(|&a| a as f64) + .filter(|a| a > &top_parent_score) + .fold((0, 0.0), |(count, sum), a| (count + 1, sum + a)); + let mean_diff = if count > 0 { + (sum / count as f64 - top_parent_score).abs() as f32 + } else { + 0.0 + }; + (count, mean_diff) +} + +fn get_score_quantile(xs: &mut [f64], q: f64) -> Option { + if xs.is_empty() { + return None; + } + xs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + + let index = q * (xs.len() as f64 - 1.0); + let lower_index = index.floor() as usize; + let upper_index = lower_index + 1; + let fraction = index - lower_index as f64; + + if upper_index >= xs.len() { + return Some(xs[xs.len() - 1]); + } + + Some(xs[lower_index] + (xs[upper_index] - xs[lower_index]) * fraction) +} + +fn get_top_parent_score(align_scores: &[Vec], quantile: f64) -> Option { + align_scores + .iter() + .filter_map(|scores| { + let mut scores_f64: Vec = scores.iter().map(|&x| x as f64).collect(); + get_score_quantile(&mut scores_f64, quantile) + }) + .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) +} + +fn get_denovo_coverage( + mother_align_scores: &[Vec], + father_align_scores: &[Vec], + 
child_align_scores: &[Vec], + parent_quantile: f64, +) -> (usize, f32, f32) { + let top_mother_score = get_top_parent_score(mother_align_scores, parent_quantile).unwrap(); + let top_father_score = get_top_parent_score(father_align_scores, parent_quantile).unwrap(); + + let (mother_count, mean_diff_mother) = + get_parental_count_diff(top_mother_score, child_align_scores); + let (father_count, mean_diff_father) = + get_parental_count_diff(top_father_score, child_align_scores); + + let top_score = if top_mother_score >= top_father_score { + mother_count + } else { + father_count + }; + + (top_score, mean_diff_father, mean_diff_mother) +} + +#[cfg(test)] +mod tests { + use crate::read::ReadInfoBuilder; + + use super::*; + + #[test] + fn test_parent_origin_matrix_1() { + let mut matrix = Array2::from_elem((4, 2), f64::MIN); + matrix[[0, 0]] = -25.0; + matrix[[0, 1]] = -50.0; + matrix[[1, 0]] = -40.0; + matrix[[1, 1]] = -30.0; + matrix[[2, 0]] = 0.0; + matrix[[2, 1]] = -50.0; + matrix[[3, 0]] = -10.0; + matrix[[3, 1]] = -20.0; + + let mut combs_score: Vec<(f64, [(usize, usize); 2])> = Vec::new(); + for c in COMBS_2D.iter() { + combs_score.push((c.iter().map(|&(i, j)| matrix[[i, j]]).sum::(), *c)); + } + combs_score.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap()); + + let expected_result = (-30.0, [(2, 0), (1, 1)]); + assert_eq!(combs_score[0], expected_result); + + assert_eq!( + AlleleOrigin::new(combs_score[0].1[0].0).unwrap(), + AlleleOrigin::Father { + allele: AlleleNum::One, + }, + ); + + assert_eq!( + AlleleOrigin::new(combs_score[0].1[1].0).unwrap(), + AlleleOrigin::Mother { + allele: AlleleNum::Two, + }, + ); + } + + #[test] + fn test_get_parental_count_diff() { + let top_parent_score = -8.0; + let child_aligns = vec![vec![-30, -20, -30], vec![-6, 3, 0]]; + assert_eq!( + get_parental_count_diff(top_parent_score, &child_aligns), + (3, 7.0) + ); + } + + #[test] + fn test_get_denovo_coverage() { + let mother_aligns = vec![vec![-24, -24, -24], vec![-24, -12, -24]]; + 
let father_aligns = vec![vec![-8, -8, -12], vec![-8, -8, -20]]; + let child_aligns = vec![vec![-30, -20, -30], vec![-6, 0, 0]]; + assert_eq!( + get_denovo_coverage(&mother_aligns, &father_aligns, &child_aligns, 1.0), + (3, 6.0, 10.0) + ); + } + + #[test] + fn test_compare_seq_lengths_substitution() { + let gts = vec![ + Allele { + seq: b"ATATATAT".to_vec(), + genotype: 0, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 0, + }, + Allele { + seq: b"ATATATAT".to_vec(), + genotype: 1, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 1, + }, + ]; + let allele = AlleleNum::One; + let child_seq = b"ATCGATAT".to_vec(); + assert_eq!( + compare_seq_lengths(>s, &allele, &child_seq).unwrap(), + DenovoType::Substitution + ); + } + + #[test] + fn test_compare_seq_lengths_contraction() { + let gts = vec![ + Allele { + seq: b"ATATATAT".to_vec(), + genotype: 0, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 0, + }, + Allele { + seq: b"ATA".to_vec(), + genotype: 1, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 1, + }, + ]; + let allele = AlleleNum::One; + let child_seq = b"ATATAT".to_vec(); + assert_eq!( + compare_seq_lengths(>s, &allele, &child_seq).unwrap(), + DenovoType::Contraction + ); + } + + #[test] + fn test_compare_seq_lengths_expansion() { + let gts = vec![ + Allele { + seq: b"ATATA".to_vec(), + genotype: 0, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 0, + }, + Allele 
{ + seq: b"ATATATAT".to_vec(), + genotype: 1, + read_aligns: vec![ + (ReadInfoBuilder::default().build().unwrap(), 0), + (ReadInfoBuilder::default().build().unwrap(), 1), + ], + motif_count: "".to_string(), + index: 1, + }, + ]; + let allele = AlleleNum::Two; + let child_seq = b"ATATATATAT".to_vec(); + assert_eq!( + compare_seq_lengths(>s, &allele, &child_seq).unwrap(), + DenovoType::Expansion + ); + } +} diff --git a/src/handles.rs b/src/handles.rs new file mode 100644 index 0000000..eb221bd --- /dev/null +++ b/src/handles.rs @@ -0,0 +1,90 @@ +use crate::util::{self, Result}; +use anyhow::anyhow; +use noodles::{bam, bgzf::Reader, sam, vcf}; +use std::{ + fs::File, + path::PathBuf, + sync::{Arc, Mutex}, +}; + +pub fn build_paths(prefix: &str) -> Result> { + let mut paths = Vec::new(); + for ext in &["spanning.sorted.bam", "sorted.vcf.gz"] { + let mut path = PathBuf::from(prefix); + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy().to_string(); + path.set_file_name(format!("{}.{}", file_name, ext)); + } + util::try_exists(&path)?; + paths.push(path); + } + Ok(paths) +} + +#[derive(Clone)] +pub struct Handles { + pub mother: SubHandle, + pub father: SubHandle, + pub child: SubHandle, +} + +impl Handles { + pub fn new(mother_prefix: &str, father_prefix: &str, child_prefix: &str) -> Result { + Ok(Handles { + mother: SubHandle::new(mother_prefix)?, + father: SubHandle::new(father_prefix)?, + child: SubHandle::new(child_prefix)?, + }) + } +} + +#[derive(Clone)] +pub struct SubHandle { + pub vcf: Arc>>, + pub vcf_header: Arc, + pub bam: Arc>>>, + pub bam_header: Arc, +} + +impl SubHandle { + pub fn new(prefix: &str) -> Result { + let paths = build_paths(prefix)?; + let paths_slice = paths.as_slice(); + + if paths_slice.len() < 2 { + return Err(anyhow!("Failed to parse paths")); + } + + let bam_path = &paths_slice[0]; + let vcf_path = &paths_slice[1]; + + let mut bam = bam::indexed_reader::Builder::default() + 
.build_from_path(bam_path) + .map_err(|e| anyhow!("Failed to create bam reader: {}", e))?; + + let bam_header = bam + .read_header() + .map_err(|e| anyhow!("Failed to read bam header: {}", e))?; + + let bam = Arc::new(Mutex::new(bam)); + let bam_header = Arc::new(bam_header); + + let mut vcf = vcf::indexed_reader::Builder::default() + .build_from_path(vcf_path) + .map_err(|e| anyhow!("Failed to create vcf reader: {}", e))?; + + let vcf_header = vcf + .read_header() + .map_err(|e| anyhow!("Failed to read vcf header: {}", e))?; + + let vcf = Arc::new(Mutex::new(vcf)); + let vcf_header = Arc::new(vcf_header); + + Ok(SubHandle { + vcf, + vcf_header, + bam, + bam_header, + }) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..a02f43c --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,12 @@ +pub mod aligner; +pub mod allele; +pub mod cli; +pub mod commands; +pub mod denovo; +pub mod handles; +pub mod locus; +pub mod read; +pub mod snp; +pub mod stats; +pub mod util; +pub mod wfa2; diff --git a/src/locus.rs b/src/locus.rs new file mode 100644 index 0000000..ec69cac --- /dev/null +++ b/src/locus.rs @@ -0,0 +1,203 @@ +use crate::util::Result; +use anyhow::{anyhow, Context}; +use noodles::{ + bed, + core::{Position, Region}, + fasta::{self, io::BufReadSeek, record::Sequence}, +}; +use std::{ + collections::{HashMap, HashSet}, + fs::File, + io::BufReader, +}; + +#[derive(Debug, PartialEq)] +pub struct Locus { + pub id: String, + pub struc: String, + pub motifs: Vec, + pub left_flank: Vec, + pub right_flank: Vec, + pub region: Region, +} + +fn decode_info_field(encoding: &str) -> Result<(&str, &str)> { + let mut name_and_value = encoding.splitn(2, '='); + let name = name_and_value + .next() + .ok_or_else(|| anyhow!("Invalid entry: {}", encoding))?; + let value = name_and_value + .next() + .ok_or_else(|| anyhow!("Invalid entry: {}", encoding))?; + Ok((name, value)) +} + +fn decode_fields(info_fields: &str) -> Result> { + let mut fields = HashMap::new(); + 
for field_encoding in info_fields.split(';') { + let (name, value) = decode_info_field(field_encoding)?; + if fields.insert(name, value.to_string()).is_some() { + return Err(anyhow!("Duplicate field: {}", name)); + } + } + Ok(fields) +} + +impl Locus { + pub fn new( + genome: &mut fasta::IndexedReader>, + chrom_lookup: &HashSet, + line: &bed::Record<3>, + flank_len: usize, + ) -> Result { + if !chrom_lookup.contains(line.reference_sequence_name()) { + return Err(anyhow!( + "Chromosome {} not found in reference", + line.reference_sequence_name() + )); + } + + // -1 because bed is 0-based + let region = Region::new( + line.reference_sequence_name(), + Position::new(usize::from(line.start_position()) - 1).unwrap()..=line.end_position(), + ); + + let fields = decode_fields(&line.optional_fields()[0])?; + + let id = fields + .get("ID") + .ok_or_else(|| anyhow!("ID field missing"))? + .to_string(); + let struc = fields + .get("STRUC") + .ok_or_else(|| anyhow!("STRUC field missing"))? + .to_string(); + let motifs = fields + .get("MOTIFS") + .ok_or_else(|| anyhow!("MOTIFS field missing"))? 
+ .split(',') + .map(|s| s.to_string()) + .collect(); + + let (left_flank, right_flank) = get_flanks(genome, ®ion, flank_len)?; + + Ok(Locus { + id, + struc, + motifs, + left_flank, + right_flank, + region, + }) + } +} + +pub fn get_locus( + genome_reader: &mut fasta::IndexedReader>, + catalog_reader: &mut bed::Reader>, + tr_id: &str, + flank_len: usize, +) -> Result { + let chrom_lookup = HashSet::from_iter( + genome_reader + .index() + .iter() + .map(|entry| entry.name().to_string()), + ); + + let query = format!("ID={tr_id};"); + for (line_number, result) in catalog_reader.records::<3>().enumerate() { + let line = result.with_context(|| format!("Error reading BED line {}", line_number + 1))?; + if line.optional_fields().get(0).unwrap().contains(&query) { + return Locus::new(genome_reader, &chrom_lookup, &line, flank_len) + .with_context(|| format!("Error processing BED line {}", line_number + 1)); + } + } + Err(anyhow!("Unable to find locus {tr_id}")) +} + +pub fn get_loci<'a>( + genome_reader: &'a mut fasta::IndexedReader>, + catalog_reader: &'a mut bed::Reader>, + flank_len: usize, +) -> impl Iterator> + 'a { + let chrom_lookup = HashSet::from_iter( + genome_reader + .index() + .iter() + .map(|entry| entry.name().to_string()), + ); + + catalog_reader + .records::<3>() + .enumerate() + .filter_map(move |(line_number, result_line)| match result_line { + Ok(line) => Some(Ok((line, line_number))), + Err(err) => Some(Err( + anyhow!(err).context(format!("Error at BED line {}", line_number + 1)) + )), + }) + .map(move |result| { + result.and_then(|(line, line_number)| { + Locus::new(genome_reader, &chrom_lookup, &line, flank_len) + .with_context(|| format!("Error processing BED line {}", line_number + 1)) + }) + }) +} + +fn get_flank( + genome: &mut fasta::IndexedReader>, + region: &Region, + start: usize, + end: usize, +) -> Result { + let start_pos = Position::try_from(start + 1).unwrap(); + let end_pos = Position::try_from(end).unwrap(); + let query_region = 
Region::new(region.name(), start_pos..=end_pos); + match genome.query(&query_region) { + Ok(seq) => Ok(seq.sequence().to_owned()), + Err(_) => Err(anyhow!("Unable to extract: {:?}", region)), + } +} + +fn get_flanks( + genome: &mut fasta::IndexedReader>, + region: &Region, + flank_len: usize, +) -> Result<(Vec, Vec)> { + let (lf_start, lf_end) = ( + region.interval().start().unwrap().get() - flank_len, + region.interval().start().unwrap().get(), + ); + let (rf_start, rf_end) = ( + region.interval().end().unwrap().get(), + region.interval().end().unwrap().get() + flank_len, + ); + + let left_flank = get_flank(genome, region, lf_start, lf_end)?; + let right_flank = get_flank(genome, region, rf_start, rf_end)?; + + Ok(( + left_flank.as_ref().to_ascii_uppercase(), + right_flank.as_ref().to_ascii_uppercase(), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_decode_info_field() { + let decoded = decode_info_field("ID=AFF2").unwrap(); + assert_eq!("ID", decoded.0); + assert_eq!("AFF2", decoded.1); + } + + #[test] + #[should_panic] + fn panic_invalid_decode_info_field() { + decode_info_field("ID:AFF2").unwrap(); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..9fa805b --- /dev/null +++ b/src/main.rs @@ -0,0 +1,15 @@ +use clap::Parser; +use trgt_denovo::{ + cli::{init_verbose, Cli, Command}, + commands::trio, + util::Result, +}; + +fn main() -> Result<()> { + let cli = Cli::parse(); + init_verbose(&cli); + match cli.command { + Command::Trio(args) => trio(args)?, + } + Ok(()) +} diff --git a/src/read.rs b/src/read.rs new file mode 100644 index 0000000..a14e103 --- /dev/null +++ b/src/read.rs @@ -0,0 +1,198 @@ +use noodles::sam::alignment::Record; +use noodles::sam::record::data::field::value::Array; +use noodles::sam::record::data::field::{tag, Value}; +use once_cell::sync::Lazy; + +#[derive(Debug, PartialEq, Clone)] +pub struct FlankingReadInfo { + is_left_flank: bool, +} + +#[derive(Debug, PartialEq, Clone)] +pub 
struct ReadInfo { + pub bases: Box<[u8]>, + pub classification: Option, + pub start_offset: Option, + pub end_offset: Option, + pub mismatch_offsets: Option>, + pub flank_info: Option, +} + +static AL_KEY: Lazy = Lazy::new(|| "AL".parse().unwrap()); +static MO_KEY: Lazy = Lazy::new(|| "MO".parse().unwrap()); +static SO_KEY: Lazy = Lazy::new(|| "SO".parse().unwrap()); +static EO_KEY: Lazy = Lazy::new(|| "EO".parse().unwrap()); + +impl ReadInfo { + pub fn new(record: Record) -> Self { + let bases = record + .sequence() + .as_ref() + .iter() + .map(|base| u8::from(*base)) + .collect::>() + .into_boxed_slice(); + + let data = record.data(); + + let classification = data.get(&*AL_KEY).and_then(|value| match value { + Value::UInt8(v) => Some(*v), + _ => None, + }); + + let start_offset = data.get(&*SO_KEY).and_then(|value| match value { + Value::Int32(v) => Some(*v), + _ => None, + }); + + let end_offset = data.get(&*EO_KEY).and_then(|value| match value { + Value::Int32(v) => Some(*v), + _ => None, + }); + + let mismatch_offsets = data + .get(&*MO_KEY) + .and_then(|value| value.as_array()) + .and_then(|array| match array { + Array::Int32(vec) => Some(vec.clone()), + _ => None, + }); + + Self { + bases, + classification, + start_offset, + end_offset, + mismatch_offsets, + flank_info: None, + } + } +} + +#[derive(Debug, PartialEq, Clone)] + +pub struct ReadInfoBuilder { + bases: Box<[u8]>, + classification: Option, + start_offset: Option, + end_offset: Option, + mismatch_offsets: Option>, + flank_info: Option, +} + +impl ReadInfoBuilder { + pub fn new() -> Self { + Self { + bases: Box::new([0u8; 10]), + classification: Some(0), + start_offset: Some(0), + end_offset: Some(0), + mismatch_offsets: Some(vec![0, 0, 0, 0, 0]), + flank_info: None, + } + } + + pub fn with_bases>>(mut self, bases: T) -> Self { + self.bases = bases.into(); + self + } + + pub fn with_classification(mut self, classification: Option) -> Self { + self.classification = classification; + self + } + + pub 
fn with_start_offset(mut self, start_offset: Option) -> Self { + self.start_offset = start_offset; + self + } + + pub fn with_end_offset(mut self, end_offset: Option) -> Self { + self.end_offset = end_offset; + self + } + + pub fn with_mismatch_offsets(mut self, mismatch_offsets: Option>) -> Self { + self.mismatch_offsets = mismatch_offsets; + self + } + + pub fn with_flank_info(mut self, flank_info: Option) -> Self { + self.flank_info = flank_info; + self + } + + pub fn build(self) -> Result { + Ok(ReadInfo { + bases: self.bases, + classification: self.classification, + start_offset: self.start_offset, + end_offset: self.end_offset, + mismatch_offsets: self.mismatch_offsets, + flank_info: self.flank_info, + }) + } +} + +impl Default for ReadInfoBuilder { + fn default() -> Self { + Self { + bases: Box::new([0u8; 10]), + classification: None, + start_offset: None, + end_offset: None, + mismatch_offsets: None, + flank_info: None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_read_info_builder() { + let dummy_read_info = ReadInfoBuilder::default() + .with_bases(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + .with_classification(Some(1)) + .with_start_offset(Some(100)) + .with_end_offset(Some(200)) + .with_mismatch_offsets(Some(vec![1, 2, 3, 4, 5])) + .with_flank_info(Some(FlankingReadInfo { + is_left_flank: true, + })) + .build() + .unwrap(); + + assert_eq!( + dummy_read_info.bases, + vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10].into() + ); + assert_eq!(dummy_read_info.classification, Some(1)); + assert_eq!(dummy_read_info.start_offset, Some(100)); + assert_eq!(dummy_read_info.end_offset, Some(200)); + assert_eq!(dummy_read_info.mismatch_offsets, Some(vec![1, 2, 3, 4, 5])); + assert_eq!( + dummy_read_info.flank_info, + Some(FlankingReadInfo { + is_left_flank: true + }) + ); + } + + #[test] + fn test_read_info_builder_defaults() { + let default_read_info = ReadInfoBuilder::default().build().unwrap(); + + assert_eq!( + default_read_info.bases, + vec![0, 
0, 0, 0, 0, 0, 0, 0, 0, 0].into() + ); + assert_eq!(default_read_info.classification, None); + assert_eq!(default_read_info.start_offset, None); + assert_eq!(default_read_info.end_offset, None); + assert_eq!(default_read_info.mismatch_offsets, None); + assert_eq!(default_read_info.flank_info, None); + } +} diff --git a/src/snp.rs b/src/snp.rs new file mode 100644 index 0000000..26f0f5b --- /dev/null +++ b/src/snp.rs @@ -0,0 +1,644 @@ +use crate::{allele::Allele, read::ReadInfo}; +use ndarray::{s, Array2, ArrayView1, ArrayView2}; +use std::collections::{HashMap, HashSet}; +use std::iter; + +#[cfg(not(test))] +mod constants { + pub const MAX_SNP_DIFF: i32 = 6000; + pub const MIN_SNP_FREQ: f64 = 0.2; +} + +#[cfg(test)] +mod constants { + pub const MAX_SNP_DIFF: i32 = 4000; + pub const MIN_SNP_FREQ: f64 = 0.2; +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum FamilyMember { + Father, + Mother, + Child, +} + +impl FamilyMember { + fn index(&self) -> usize { + match self { + FamilyMember::Father => 0, + FamilyMember::Mother => 1, + FamilyMember::Child => 2, + } + } +} + +#[derive(Debug, Clone, PartialEq, Default)] +pub struct FamilyOffsets { + pub start_offset: usize, + pub end_offset: usize, + pub allele_offsets: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct TrinaryMatrix { + pub matrix: Array2, + pub offsets: [FamilyOffsets; 3], + pub mismatch_offsets: Vec, +} + +impl TrinaryMatrix { + pub fn new( + child: &Vec, + father: &Vec, + mother: &Vec, + ) -> Option { + let mut n_reads = 0; + let mut all_pois: HashSet = HashSet::new(); + let family_members = [father, mother, child]; + + for family_member in &family_members { + for allele in *family_member { + n_reads += allele.read_aligns.len(); + for (read, _) in &allele.read_aligns { + all_pois.extend(read.mismatch_offsets.as_ref().unwrap()); + } + } + } + + if all_pois.is_empty() { + return None; + } + + let mut all_pois: Vec = all_pois.into_iter().collect(); + all_pois.sort_unstable(); + + let mut matrix = 
Array2::from_elem((n_reads, all_pois.len()), 2); + + // TODO: ideally want: offsets = [FamilyOffsets::default(); 3], can't because of vec, maybe box Vec? + let mut offsets = [ + (FamilyOffsets { + start_offset: 0, + end_offset: 0, + allele_offsets: Vec::new(), + }), + (FamilyOffsets { + start_offset: 0, + end_offset: 0, + allele_offsets: Vec::new(), + }), + (FamilyOffsets { + start_offset: 0, + end_offset: 0, + allele_offsets: Vec::new(), + }), + ]; + + let mut row_idx = 0; + let mut allele_offsets = Vec::new(); + for (member_idx, family_member) in family_members.iter().enumerate() { + let start_offset = row_idx; + for (allele_idx, allele) in family_member.iter().enumerate() { + if allele_idx > 0 { + allele_offsets.push(row_idx); + } + for (read, _) in &allele.read_aligns { + let mismatch_offsets = read.mismatch_offsets.as_ref().unwrap(); + let start_col_idx = all_pois + .binary_search(&read.start_offset.unwrap()) + .unwrap_or_else(|x| x); + let end_col_idx = all_pois + .binary_search(&read.end_offset.unwrap()) + .unwrap_or_else(|x| x); + for col_idx in start_col_idx..end_col_idx { + let offset = all_pois[col_idx]; + matrix[[row_idx, col_idx]] = + mismatch_offsets.binary_search(&offset).is_ok() as u8; + } + row_idx += 1; + } + } + offsets[member_idx] = FamilyOffsets { + start_offset, + end_offset: row_idx - 1, + allele_offsets: allele_offsets.clone(), + }; + allele_offsets.clear(); + } + + Some(TrinaryMatrix { + matrix, + offsets, + mismatch_offsets: all_pois, + }) + } + + pub fn family_submatrix(&self, member: FamilyMember) -> ArrayView2 { + let offset = &self.offsets[member.index()]; + self.matrix + .slice(s![offset.start_offset..=offset.end_offset, ..]) + } + + pub fn allele_submatrix( + &self, + member: FamilyMember, + allele_idx: usize, + ) -> Option> { + let offset = &self.offsets[member.index()]; + + if allele_idx > offset.allele_offsets.len() { + return None; + } + + let start_offset = if allele_idx == 0 { + offset.start_offset + } else { + 
offset.allele_offsets[allele_idx - 1] + }; + + let end_offset = if allele_idx < offset.allele_offsets.len() { + offset.allele_offsets[allele_idx] - 1 + } else { + offset.end_offset + }; + + Some(self.matrix.slice(s![start_offset..=end_offset, ..])) + } + + pub fn iter_alleles<'a>( + &'a self, + member: FamilyMember, + ) -> impl Iterator> + 'a { + let offset = &self.offsets[member.index()]; + let start_offsets = + iter::once(offset.start_offset).chain(offset.allele_offsets.iter().cloned()); + let end_offsets = offset + .allele_offsets + .iter() + .cloned() + .chain(iter::once(offset.end_offset + 1)); + + start_offsets + .zip(end_offsets) + .map(move |(start, end)| self.matrix.slice(s![start..end, ..])) + } + + pub fn count_alleles(&self, member: FamilyMember) -> usize { + let offset = &self.offsets[member.index()]; + offset.allele_offsets.len() + 1 + } +} + +pub trait ReadFilter { + fn filter(&self, reads: &mut Vec); +} + +pub struct FilterByDist; +impl ReadFilter for FilterByDist { + fn filter(&self, reads: &mut Vec) { + for read in reads.iter_mut() { + if let Some(offsets) = &mut read.mismatch_offsets { + offsets.drain( + ..offsets + .binary_search(&(-constants::MAX_SNP_DIFF - 1)) + .unwrap_or_else(|x| x), + ); + offsets.drain( + offsets + .binary_search(&(constants::MAX_SNP_DIFF + 1)) + .unwrap_or_else(|x| x).., + ); + } + } + } +} + +fn calc_similarity(child_row: &ArrayView1, parent_row: &ArrayView1) -> f64 { + let mut matching_positions = 0; + let mut total_covered_positions = 0; + + for (&child_value, &parent_value) in child_row.iter().zip(parent_row.iter()) { + if child_value == 2 || parent_value == 2 { + continue; + } + + total_covered_positions += 1; + + if child_value == parent_value { + matching_positions += 1; + } + } + + let total_covered_positions = total_covered_positions as f64; + let total_covered_positions = if total_covered_positions == 0.0 { + 0.00000001 + } else { + total_covered_positions + }; + + matching_positions as f64 / 
total_covered_positions +} + +fn get_likelihood(child_allele: ArrayView2, parent_allele: ArrayView2) -> f64 { + let similarity_scores: Vec = child_allele + .outer_iter() + .flat_map(|child_row| { + parent_allele + .outer_iter() + .map(move |parent_row| calc_similarity(&child_row, &parent_row)) + }) + .collect(); + + let likelihood: f64 = similarity_scores.iter().sum::() / similarity_scores.len() as f64; + likelihood +} + +fn get_individual_prob( + trinary_matrix: &TrinaryMatrix, + child_allele_idx: usize, +) -> Option<(Vec, f64)> { + let child_allele = trinary_matrix.allele_submatrix(FamilyMember::Child, child_allele_idx)?; + if child_allele.is_empty() { + return None; + } + + let father_count = trinary_matrix.count_alleles(FamilyMember::Father) as f64; + let mother_count = trinary_matrix.count_alleles(FamilyMember::Mother) as f64; + + let mut hypotheses = Vec::new(); + let mut priors = Vec::new(); + + for father_allele in trinary_matrix.iter_alleles(FamilyMember::Father) { + if father_allele.is_empty() { + return None; + } + hypotheses.push((child_allele, father_allele)); + priors.push(0.5 / father_count); + } + + for mother_allele in trinary_matrix.iter_alleles(FamilyMember::Mother) { + if mother_allele.is_empty() { + return None; + } + hypotheses.push((child_allele, mother_allele)); + priors.push(0.5 / mother_count); + } + + let likelihoods: Vec = hypotheses + .iter() + .map(|(c, p)| get_likelihood(*c, *p)) + .collect(); + + let max_likelihood = likelihoods + .iter() + .cloned() + .max_by(|a, b| a.partial_cmp(b).unwrap()) + .unwrap(); + + let all_probabilities: f64 = likelihoods.iter().zip(&priors).map(|(l, p)| l * p).sum(); + let posteriors: Vec = likelihoods + .iter() + .zip(&priors) + .map(|(l, p)| l * p / all_probabilities) + .collect(); + + Some((posteriors, max_likelihood)) +} + +fn combine_probabilities( + child_allele1_prob: &[f64], + child_allele2_prob: &[f64], + father_count: usize, + mother_count: usize, +) -> HashMap { + let mut 
combined_probabilities = HashMap::new(); + let mut total_prob = 0.0; + + for i in 0..(father_count + mother_count) { + for j in 0..(father_count + mother_count) { + if (i < father_count && j < father_count) || (i >= father_count && j >= father_count) { + continue; + } + + let probability = child_allele1_prob[i] * child_allele2_prob[j]; + combined_probabilities.insert(format!("H{}_{}", i, j), probability); + total_prob += probability; + } + } + + // Normalize probabilities + for value in combined_probabilities.values_mut() { + *value /= total_prob; + } + + combined_probabilities +} + +pub fn inheritance_prob( + trinary_matrix: &TrinaryMatrix, +) -> Option<(HashMap, (f64, f64))> { + match trinary_matrix.count_alleles(FamilyMember::Child) { + 1 => { + let (child_allele1_prob, max_likelihood) = get_individual_prob(trinary_matrix, 0)?; + let mut result: HashMap = HashMap::new(); + for (i, prob) in child_allele1_prob.iter().enumerate() { + result.insert(i.to_string(), *prob); + } + Some((result, ((max_likelihood, -1.0)))) + } + 2 => { + let (child_allele1_prob, max_likelihood_1) = get_individual_prob(trinary_matrix, 0)?; + let (child_allele2_prob, max_likelihood_2) = get_individual_prob(trinary_matrix, 1)?; + Some(( + combine_probabilities( + &child_allele1_prob, + &child_allele2_prob, + trinary_matrix.count_alleles(FamilyMember::Father), + trinary_matrix.count_alleles(FamilyMember::Mother), + ), + (max_likelihood_1, max_likelihood_2), + )) + } + _ => None, + } +} + +pub struct FilterByFreq; +impl ReadFilter for FilterByFreq { + fn filter(&self, reads: &mut Vec) { + let mut offset_counts: HashMap = HashMap::new(); + let total_reads = reads.len(); + + for read in reads.iter() { + if let Some(offsets) = &read.mismatch_offsets { + for &offset in offsets { + *offset_counts.entry(offset).or_insert(0) += 1; + } + } + } + for read in reads.iter_mut() { + if let Some(offsets) = &mut read.mismatch_offsets { + offsets.retain(|&offset| { + let count = 
offset_counts.get(&offset).unwrap_or(&0); + (*count as f64) / (total_reads as f64) >= constants::MIN_SNP_FREQ + }); + } + } + } +} + +pub fn apply_read_filters(reads: &mut Vec, filters: &[&(dyn ReadFilter + Sync)]) { + for filter in filters { + filter.filter(reads); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::read::ReadInfoBuilder; + use itertools::Itertools; + + fn create_read_info( + mismatch_offsets: Vec, + start_offset: i32, + end_offset: i32, + ) -> ReadInfo { + ReadInfoBuilder::new() + .with_mismatch_offsets(Some(mismatch_offsets)) + .with_start_offset(Some(start_offset)) + .with_end_offset(Some(end_offset)) + .build() + .unwrap() + } + + #[test] + fn test_empty_trinary_matrix() { + let child_alleles: Vec = vec![]; + let father_alleles: Vec = vec![]; + let mother_alleles: Vec = vec![]; + let trinaray_mat: Option = + TrinaryMatrix::new(&child_alleles, &father_alleles, &mother_alleles); + assert_eq!(trinaray_mat, None); + } + + // TODO: split into multiple tests + #[test] + fn test_trinary_matrix() { + let father_alleles = vec![ + Allele::dummy(vec![ + create_read_info(vec![-8, -2, 3, 5, 8, 42], -12, 70), + create_read_info(vec![-2, 3, 5, 8], -6, 30), + create_read_info(vec![3, 5, 8, 42], 0, 50), + ]), + Allele::dummy(vec![ + create_read_info(vec![-8, -4, 2], -22, 30), + create_read_info(vec![-8, -4, 2], -40, 60), + create_read_info(vec![-8, -4, 2], -30, 90), + ]), + ]; + + let mother_alleles = vec![ + Allele::dummy(vec![ + create_read_info(vec![-4, -1, 2, 6], -5, 7), + create_read_info(vec![-4, -1, 2, 6, 23], -20, 70), + create_read_info(vec![2, 6, 23], 1, 120), + ]), + // Allele::dummy(vec![ + // create_read_info(vec![6, 23], -30, 40), + // create_read_info(vec![6, 23], -10, 90), + // create_read_info(vec![6, 23], -2, 120), + // ]), + ]; + + let child_alleles = vec![ + Allele::dummy(vec![ + create_read_info(vec![-8, -2, 3, 5, 8, 42], -12, 70), + create_read_info(vec![-8, -2, 3, 5, 8], -10, 30), + create_read_info(vec![3, 5, 8, 42], 0, 
50), + ]), + Allele::dummy(vec![ + create_read_info(vec![-4, -1, 2, 6, 23], -40, 90), + create_read_info(vec![-4, -1, 2, 6], -32, 20), + create_read_info(vec![2, 6, 23], 1, 120), + ]), + Allele::dummy(vec![create_read_info(vec![], -40, 90)]), + Allele::dummy(vec![ + create_read_info(vec![-1, 2, 6], -32, 20), + create_read_info(vec![2, 6], 1, 10), + ]), + ]; + + let trinaray_mat = + TrinaryMatrix::new(&child_alleles, &father_alleles, &mother_alleles).unwrap(); + + // Test dimensionality + assert_eq!(trinaray_mat.matrix.shape(), &[18, 11]); + + assert_eq!( + get_likelihood( + trinaray_mat + .allele_submatrix(FamilyMember::Child, 0) + .unwrap(), + trinaray_mat + .allele_submatrix(FamilyMember::Child, 0) + .unwrap() + ), + 1.0 + ); + + // Counts alleles + assert_eq!(trinaray_mat.count_alleles(FamilyMember::Child), 4); + assert_eq!(trinaray_mat.count_alleles(FamilyMember::Mother), 1); + assert_eq!(trinaray_mat.count_alleles(FamilyMember::Father), 2); + + // Check for correctness of mismatch offset labels + assert_eq!( + trinaray_mat.mismatch_offsets, + vec![-8, -4, -2, -1, 2, 3, 5, 6, 8, 23, 42,] + ); + + // Row 0 should correspond to the first read of the father + assert_eq!( + trinaray_mat.matrix.row(0), + trinaray_mat + .matrix + .row(trinaray_mat.offsets[FamilyMember::Father.index()].start_offset), + ); + + // Check for correctness of matrix row 0 + assert_eq!( + trinaray_mat.matrix.row(0).to_vec(), + vec![1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1] + ); + + // Check for correctness for the first read of the mother + assert_eq!( + trinaray_mat + .matrix + .row(trinaray_mat.offsets[FamilyMember::Mother.index()].start_offset) + .to_vec(), + vec![2, 1, 0, 1, 1, 0, 0, 1, 2, 2, 2] + ); + + // Mother has only one allele so the mother submatrix should be equivalent to allele submatrix 0 + assert_eq!( + trinaray_mat.family_submatrix(FamilyMember::Mother), + trinaray_mat + .allele_submatrix(FamilyMember::Mother, 0) + .unwrap() + ); + + // Allele submatrix 1 (and beyond) of the mother 
should be None + assert_eq!(None, trinaray_mat.allele_submatrix(FamilyMember::Mother, 1)); + + // Child has 4 alleles, check that allele 3 is correct + assert_eq!( + trinaray_mat + .allele_submatrix(FamilyMember::Child, 2) + .unwrap() + .rows() + .into_iter() + .map(|row| row.to_vec()) + .collect_vec(), + vec![[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + ); + } + + #[test] + fn test_filter_by_dist() { + let mut reads = vec![ + create_read_info( + vec![ + -5000, -4000, -3000, -2000, -1000, 1000, 2000, 3000, 4000, 5000, + ], + 0, + 0, + ), + create_read_info(vec![4001, 6000, 7000, 8000, 9000, 10000], 0, 0), + create_read_info(vec![0, 1000, 2000, 3000, 3999, 4000, 4001], 0, 0), + ]; + + FilterByDist.filter(&mut reads); + + assert_eq!( + reads[0].mismatch_offsets.as_ref().unwrap(), + &vec![-4000, -3000, -2000, -1000, 1000, 2000, 3000, 4000] + ); + assert_eq!(reads[1].mismatch_offsets.as_ref().unwrap(), &vec![]); + assert_eq!( + reads[2].mismatch_offsets.as_ref().unwrap(), + &vec![0, 1000, 2000, 3000, 3999, 4000] + ); + } + + #[test] + fn test_filter_by_freq() { + let mut reads = vec![ + create_read_info(vec![1000, 2000, 3000, 3002, 4000, 5000], 0, 0), + create_read_info(vec![1000, 2000, 3000, 4000, 4090, 5000], 0, 0), + create_read_info(vec![-3213, 1000, 2000, 3000, 4000, 5000], 0, 0), + create_read_info(vec![1, 2, 3, 4, 5], 0, 0), + create_read_info(vec![-3, -2, -1, 0, 2000], 0, 0), + create_read_info(vec![], 0, 0), + ]; + + FilterByFreq.filter(&mut reads); + + assert_eq!( + reads[0].mismatch_offsets.as_ref().unwrap(), + &vec![1000, 2000, 3000, 4000, 5000] + ); + assert_eq!( + reads[1].mismatch_offsets.as_ref().unwrap(), + &vec![1000, 2000, 3000, 4000, 5000] + ); + assert_eq!( + reads[2].mismatch_offsets.as_ref().unwrap(), + &vec![1000, 2000, 3000, 4000, 5000] + ); + assert_eq!(reads[3].mismatch_offsets.as_ref().unwrap(), &vec![]); + assert_eq!(reads[4].mismatch_offsets.as_ref().unwrap(), &vec![2000]); + } + + #[test] + fn test_filter_by_dist_and_freq() { + let mut reads = 
vec![ + create_read_info( + vec![ + -5000, -4000, -3000, -2000, -1000, 1000, 2000, 3000, 3002, 4000, 5000, + ], + 0, + 0, + ), + create_read_info( + vec![ + -5000, -4000, -3000, -2000, -1000, 1000, 2000, 3000, 4000, 4090, 5000, + ], + 0, + 0, + ), + create_read_info(vec![-3213, 1000, 2000, 3000, 4000, 5000], 0, 0), + create_read_info(vec![1, 2, 3, 4, 5], 0, 0), + create_read_info(vec![-3, -2, -1, 0, 2000], 0, 0), + create_read_info(vec![], 0, 0), + ]; + + FilterByDist.filter(&mut reads); + FilterByFreq.filter(&mut reads); + + assert_eq!( + reads[0].mismatch_offsets.as_ref().unwrap(), + &vec![-4000, -3000, -2000, -1000, 1000, 2000, 3000, 4000] + ); + assert_eq!( + reads[1].mismatch_offsets.as_ref().unwrap(), + &vec![-4000, -3000, -2000, -1000, 1000, 2000, 3000, 4000] + ); + assert_eq!( + reads[2].mismatch_offsets.as_ref().unwrap(), + &vec![1000, 2000, 3000, 4000] + ); + assert_eq!(reads[3].mismatch_offsets.as_ref().unwrap(), &vec![]); + assert_eq!(reads[4].mismatch_offsets.as_ref().unwrap(), &vec![2000]); + } +} diff --git a/src/stats.rs b/src/stats.rs new file mode 100644 index 0000000..001abf2 --- /dev/null +++ b/src/stats.rs @@ -0,0 +1,28 @@ +use crate::allele::Allele; + +pub fn get_per_allele_reads(alleles: &[Allele]) -> Vec { + alleles.iter().map(|a| a.read_aligns.len()).collect() +} + +pub fn get_total_reads(alleles: &[Allele]) -> usize { + get_per_allele_reads(alleles).iter().sum() +} + +pub fn get_dropout_prob(alleles: &[Allele]) -> f64 { + assert!(!alleles.is_empty()); + let allele_frac: f64 = 0.5; + let num_reads = get_total_reads(alleles) as f64; + allele_frac.powf(num_reads) +} + +pub fn get_allele_freqs(alleles: &[Allele]) -> Vec { + let num_reads = get_total_reads(alleles) as f64; + let allele_freqs: Vec = alleles + .iter() + .map(|a| a.read_aligns.len() as f64 / num_reads) + .collect(); + allele_freqs +} + +#[cfg(test)] +mod tests {} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..65d6467 --- /dev/null +++ b/src/util.rs @@ 
-0,0 +1,17 @@ +use anyhow::anyhow; +use log; +use std::path::Path; + +pub type Result = anyhow::Result; + +pub fn handle_error_and_exit(err: anyhow::Error) -> ! { + log::error!("{:#}", err); + std::process::exit(1); +} + +pub fn try_exists(path: &Path) -> Result<()> { + if !path.exists() { + return Err(anyhow!("Path does not exist: {}", path.display())); + } + Ok(()) +} diff --git a/src/wfa2.rs b/src/wfa2.rs new file mode 100644 index 0000000..d9c0de6 --- /dev/null +++ b/src/wfa2.rs @@ -0,0 +1,2 @@ +//! Re-export wfa2-sys bindings +pub use wfa2_sys::*;