From cde7bf22de4af5e929abe63ecb4ea78e7e17a29d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Sep 2024 13:29:45 -0700 Subject: [PATCH 01/11] write checksum download failures to file --- Cargo.lock | 459 ++++++++++-------- src/directsketch.rs | 134 ++++- src/lib.rs | 2 + .../sourmash_plugin_directsketch/__init__.py | 4 +- tests/test_gbsketch.py | 44 +- tests/test_urlsketch.py | 3 +- 6 files changed, 415 insertions(+), 231 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c700eb1..c144dd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "aho-corasick" version = "1.1.3" @@ -64,9 +70,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.9" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e9eabd7a98fe442131a17c316bd9349c43695e49e730c3c8e12cfb5f4da2693" +checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" dependencies = [ "bzip2", "deflate64", @@ -76,8 +82,8 @@ dependencies = [ "memchr", "pin-project-lite", "xz2", - "zstd 0.13.1", - "zstd-safe 7.1.0", + "zstd 0.13.2", + "zstd-safe 7.2.1", ] [[package]] @@ -96,11 +102,17 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "az" @@ -110,24 +122,24 @@ checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" [[package]] name = "backtrace" -version = "0.3.71" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] [[package]] name = "base64" -version = "0.22.0" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bgzip" @@ -147,21 +159,15 @@ checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" [[package]] name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "buffer-redux" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c9f8ddd22e0a12391d1e7ada69ec3b0da1914f1cec39c5cf977143c5b2854f5" +checksum = "4e8acf87c5b9f5897cd3ebb9a327f420e0cae9dd4e5c1d2e36f2c84c571a58f1" dependencies = [ "memchr", ] @@ -180,9 +186,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" [[package]] name = "byteorder" @@ -192,9 +198,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "bzip2" @@ -228,13 +234,13 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.95" +version = "1.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32a725bc159af97c3e629873bb9f88fb8cf8a4867175f76dc987815ea07c83b" +checksum = "e9d013ecb737093c0e86b151a7b837993cf9ec6c502946cfb44bedc392421e0b" dependencies = [ "jobserver", "libc", - "once_cell", + "shlex", ] [[package]] @@ -278,9 +284,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "counter" @@ -293,9 +299,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] @@ -321,9 +327,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "csv" @@ -348,15 +354,15 @@ dependencies = [ [[package]] name = "deflate64" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83ace6c86376be0b6cdcf3fb41882e81d94b31587573d1cfa9d01cd06bba210d" +checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" [[package]] name = "either" -version = "1.11.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "encoding_rs" @@ -376,7 +382,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -387,19 +393,19 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fixedbitset" @@ -409,12 +415,12 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.28" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -516,7 +522,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -551,9 +557,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "js-sys", @@ -576,21 +582,21 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "h2" -version = "0.4.4" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ + "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "futures-util", "http", "indexmap", "slab", @@ -601,9 +607,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" @@ -645,9 +651,9 @@ dependencies = [ [[package]] name = "http-body" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http", @@ -655,12 +661,12 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - "futures-core", + "futures-util", "http", "http-body", "pin-project-lite", @@ -668,15 +674,15 @@ dependencies = [ [[package]] name = "httparse" -version = "1.8.0" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] name = "hyper" -version = "1.3.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", @@ -694,9 +700,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http", @@ -727,9 +733,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.3" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", @@ -780,9 +786,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -835,9 +841,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -859,9 +865,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.153" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libm" @@ -871,9 +877,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" @@ -904,9 +910,9 @@ dependencies = [ [[package]] name = "matrixmultiply" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ "autocfg", "rawpointer", @@ -920,9 +926,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memmap2" @@ -950,13 +956,22 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" @@ -966,7 +981,7 @@ dependencies = [ "hermit-abi", "libc", "wasi", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1001,16 +1016,15 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -1059,9 +1073,9 @@ checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" [[package]] name = "num-complex" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", ] @@ -1088,20 +1102,19 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", "libm", @@ -1109,9 +1122,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", ] @@ -1128,7 +1141,7 @@ version = "0.10.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" dependencies = [ - "bitflags 2.5.0", + "bitflags", "cfg-if", "foreign-types", "libc", @@ -1145,7 +1158,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -1156,9 +1169,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "300.2.3+3.2.1" +version = "300.3.2+3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cff92b6f71555b61bb9315f7c64da3ca43d87531622120fea0195fc761b4843" +checksum = "a211a18d945ef7e648cc6e0058f4c548ee46aab922ea203e0d30e966ea23647b" dependencies = [ "cc", ] @@ -1198,7 +1211,7 @@ dependencies = [ "proc-macro2", "proc-macro2-diagnostics", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -1209,9 +1222,9 @@ checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" [[package]] name = "parking_lot" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -1232,9 +1245,9 @@ dependencies = [ [[package]] name = "paste" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "percent-encoding" @@ -1259,7 +1272,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -1298,15 +1311,18 @@ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "portable-atomic" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +checksum = "da544ee218f0d287a911e9c99a39a8c9bc8fcad3cb8db5959940044ecfc67265" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "primal-check" @@ -1343,9 +1359,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.81" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -1358,7 +1374,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", "version_check", "yansi", ] @@ -1411,7 +1427,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -1424,14 +1440,14 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -1504,11 +1520,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ - "bitflags 2.5.0", + "bitflags", ] [[package]] @@ -1525,9 +1541,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" dependencies = [ "aho-corasick", "memchr", @@ -1536,9 +1552,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" @@ -1597,7 +1613,7 @@ dependencies = [ "libc", "spin", "untrusted", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1618,28 +1634,28 @@ checksum = "082f11ffa03bbef6c2c6ea6bea1acafaade2fd9050ae0234ab44a2153742b058" [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "3f55e80d50763938498dd5ebb18647174e0c76dc38c5505294bb224624f30f36" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.23.7" +version = "0.23.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" +checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" dependencies = [ "once_cell", "rustls-pki-types", @@ -1650,9 +1666,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" dependencies = [ "base64", "rustls-pki-types", @@ -1660,15 +1676,15 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.5.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "beb461507cee2c2ff151784c52762cf4d9ff6a61f3e80968600ed24fa837fa54" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" -version = "0.102.3" +version = "0.102.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3bce581c0dd41bce533ce695a1437fa16a7ab5ac3ccfa99fe1a620a7885eabf" +checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56" dependencies = [ "ring", "rustls-pki-types", @@ -1677,15 +1693,15 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "safe_arch" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f398075ce1e6a179b46f51bd88d0598b92b00d3551f1a2d4ac49e771b56ac354" +checksum = "c3460605018fdc9612bce72735cba0d27efbcd9904780d44c7e3a9948f96148a" dependencies = [ "bytemuck", ] @@ -1696,7 +1712,7 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1707,11 +1723,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.10.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 1.3.2", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -1720,9 +1736,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.10.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" +checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" dependencies = [ "core-foundation-sys", "libc", @@ -1745,14 +1761,14 @@ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] name = "serde_json" -version = "1.0.127" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", "memchr", @@ -1772,6 +1788,12 @@ dependencies = [ "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -1817,12 +1839,12 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -1939,9 +1961,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.6.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d0208408ba0c3df17ed26eb06992cb1a1268d41b2c0e12e65203fbe3972cee5" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -1956,9 +1978,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.60" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -1980,7 +2002,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags 2.5.0", + "bitflags", "core-foundation", "system-configuration-sys", ] @@ -1997,47 +2019,48 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.14" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] name = "thiserror" -version = "1.0.59" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.59" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -2063,7 +2086,7 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -2074,7 +2097,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -2100,9 +2123,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" dependencies = [ "bytes", "futures-core", @@ -2125,20 +2148,19 @@ dependencies = [ "tokio", "tower-layer", "tower-service", - "tracing", ] [[package]] name = "tower-layer" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -2146,7 +2168,6 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "log", "pin-project-lite", "tracing-core", ] @@ -2194,7 +2215,7 @@ checksum = "1f718dfaf347dcb5b983bfc87608144b0bad87970aebcbea5ce44d2a30c08e63" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", ] [[package]] @@ -2238,9 +2259,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna", @@ -2270,9 +2291,9 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "want" @@ -2311,15 +2332,15 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -2345,7 +2366,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.60", + "syn 2.0.77", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2381,9 +2402,9 @@ dependencies = [ [[package]] name = "wide" -version = "0.7.16" +version = "0.7.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81a1851a719f11d1d2fea40e15c72f6c00de8c142d7ac47c1441cc7e4d0d5bc6" +checksum = "b828f995bf1e9622031f8009f8481a85406ce1f4d4588ff746d872043e855690" dependencies = [ "bytemuck", "safe_arch", @@ -2437,6 +2458,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -2516,6 +2546,27 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.77", +] + [[package]] name = "zeroize" version = "1.8.1" @@ -2533,11 +2584,11 @@ dependencies = [ [[package]] name = "zstd" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ - "zstd-safe 7.1.0", + "zstd-safe 7.2.1", ] [[package]] @@ -2552,18 +2603,18 @@ dependencies = [ [[package]] name = "zstd-safe" -version = "7.1.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/src/directsketch.rs b/src/directsketch.rs index f3f50a5..7438ca6 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -248,6 +248,15 @@ pub struct FailedDownload { url: Option, } +pub struct FailedChecksum { + accession: String, + name: String, + moltype: String, + md5sum_url: Url, + download_filename: Option, + url: Option, +} + #[allow(clippy::too_many_arguments)] async fn dl_sketch_assembly_accession( client: &Client, @@ -260,10 +269,11 @@ async fn dl_sketch_assembly_accession( genomes_only: bool, proteomes_only: bool, download_only: bool, -) -> Result<(Vec, Vec)> { +) -> Result<(Vec, Vec, Vec)> { let retry_count = retry.unwrap_or(3); // Default retry count let mut sigs = Vec::::new(); - let mut failed = Vec::::new(); + let mut download_failures = Vec::::new(); + let mut checksum_failures = Vec::::new(); let name = accinfo.name; let accession = accinfo.accession; @@ -283,7 +293,7 @@ async fn dl_sketch_assembly_accession( download_filename: None, url: None, }; - failed.push(failed_download_dna); + download_failures.push(failed_download_dna); } if !genomes_only { let failed_download_protein = FailedDownload { @@ -294,21 +304,14 @@ async fn dl_sketch_assembly_accession( download_filename: None, url: None, }; - failed.push(failed_download_protein); + download_failures.push(failed_download_protein); } - return Ok((sigs, failed)); + return Ok((sigs, download_failures, checksum_failures)); } }; let md5sum_url = GenBankFileType::Checksum.url(&base_url, &full_name); - let checksums = match download_and_parse_md5(client, &md5sum_url).await { - Ok(cs) => cs, - Err(e) => { - return Err(e); - } - }; - let mut file_types = vec![ GenBankFileType::Genomic, GenBankFileType::Protein, @@ -320,6 +323,29 @@ async fn dl_sketch_assembly_accession( file_types = vec![GenBankFileType::Protein]; } + let checksums = match download_and_parse_md5(client, &md5sum_url).await { + Ok(cs) => cs, + Err(_e) => { + // if we can't download/parse the md5sum file, write to checksum failures file to allow manual troubleshooting + for file_type in &file_types { + // get filename, filetype info to facilitate downstream + let url = file_type.url(&base_url, &full_name); + let file_name = file_type.filename_to_write(&accession); + let failed_checksum_download: FailedChecksum = FailedChecksum { + accession: accession.clone(), + name: name.clone(), + moltype: file_type.moltype(), + md5sum_url: md5sum_url.clone(), + download_filename: Some(file_name), + url: Some(url), + }; + checksum_failures.push(failed_checksum_download); + } + // return early from function b/c we can't check any checksums + return Ok((sigs, download_failures, checksum_failures)); + } + }; + for file_type in &file_types { let url = file_type.url(&base_url, &full_name); let expected_md5 = checksums.get(&file_type.server_filename(&full_name)); @@ -339,7 +365,7 @@ async fn dl_sketch_assembly_accession( download_filename: Some(file_name), url: Some(url), }; - failed.push(failed_download); + download_failures.push(failed_download); continue; } }; @@ -378,7 +404,7 @@ async fn dl_sketch_assembly_accession( } } - Ok((sigs, failed)) + Ok((sigs, download_failures, checksum_failures)) } #[allow(clippy::too_many_arguments)] @@ -654,6 +680,67 @@ pub fn failures_handle( }) } +pub fn checksum_failures_handle( + checksum_failed_csv: String, + mut recv_failed: tokio::sync::mpsc::Receiver, + error_sender: tokio::sync::mpsc::Sender, // Additional parameter for error channel +) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + match File::create(&checksum_failed_csv).await { + Ok(file) => { + let mut writer = BufWriter::new(file); + + // Attempt to write CSV headers + if let Err(e) = writer + .write_all(b"accession,name,moltype,md5sum_url,download_filename,url\n") + .await + { + let error = Error::new(e).context("Failed to write headers"); + let _ = error_sender.send(error).await; + return; // Exit the task early after reporting the error + } + + while let Some(FailedChecksum { + accession, + name, + moltype, + md5sum_url, + download_filename, + url, + }) = recv_failed.recv().await + { + let record = format!( + "{},{},{},{},{},{}\n", + accession, + name, + moltype, + md5sum_url.to_string(), + download_filename.unwrap_or("".to_string()), + url.map(|u| u.to_string()).unwrap_or("".to_string()) + ); + // Attempt to write each record + if let Err(e) = writer.write_all(record.as_bytes()).await { + let error = Error::new(e).context("Failed to write failed checksum record"); + let _ = error_sender.send(error).await; + continue; // continue to try to write next records + } + } + + // Attempt to flush the writer + if let Err(e) = writer.flush().await { + let error = Error::new(e).context("Failed to flush failed checksum writer"); + let _ = error_sender.send(error).await; + } + } + Err(e) => { + let error = Error::new(e).context("Failed to create failed checksum file"); + let _ = error_sender.send(error).await; + } + } + drop(error_sender); + }) +} + pub fn error_handler( mut recv_errors: tokio::sync::mpsc::Receiver, error_flag: Arc, @@ -676,6 +763,7 @@ pub async fn gbsketch( input_csv: String, param_str: String, failed_csv: String, + failed_checksums_csv: String, retry_times: u32, fasta_location: String, keep_fastas: bool, @@ -702,6 +790,8 @@ pub async fn gbsketch( // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::>(4); let (send_failed, recv_failed) = tokio::sync::mpsc::channel::(4); + let (send_failed_checksums, recv_failed_checksum) = + tokio::sync::mpsc::channel::(4); // Error channel for handling task errors let (error_sender, error_receiver) = tokio::sync::mpsc::channel::(1); @@ -709,11 +799,17 @@ pub async fn gbsketch( let mut handles = Vec::new(); let sig_handle = sigwriter_handle(recv_sigs, output_sigs, error_sender.clone()); let failures_handle = failures_handle(failed_csv, recv_failed, error_sender.clone()); + let checksum_failures_handle = checksum_failures_handle( + failed_checksums_csv, + recv_failed_checksum, + error_sender.clone(), + ); let critical_error_flag = Arc::new(AtomicBool::new(false)); let error_handle = error_handler(error_receiver, critical_error_flag.clone()); handles.push(sig_handle); handles.push(failures_handle); handles.push(error_handle); + handles.push(checksum_failures_handle); // Worker tasks let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads @@ -773,6 +869,7 @@ pub async fn gbsketch( let client_clone = Arc::clone(&client); let send_sigs = send_sigs.clone(); let send_failed = send_failed.clone(); + let checksum_send_failed = send_failed_checksums.clone(); let download_path_clone = download_path.clone(); // Clone the path for each task let send_errors = error_sender.clone(); @@ -806,7 +903,7 @@ pub async fn gbsketch( ) .await; match result { - Ok((sigs, failed_downloads)) => { + Ok((sigs, failed_downloads, failed_checksums)) => { if !sigs.is_empty() { if let Err(e) = send_sigs.send(sigs).await { eprintln!("Failed to send signatures: {}", e); @@ -819,6 +916,12 @@ pub async fn gbsketch( let _ = send_errors.send(e.into()).await; // Send the error through the channel } } + for fail in failed_checksums { + if let Err(e) = checksum_send_failed.send(fail).await { + eprintln!("Failed to send failed checksum info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } } Err(e) => { let _ = send_errors.send(e).await; @@ -830,6 +933,7 @@ pub async fn gbsketch( // drop senders as we're done sending data drop(send_sigs); drop(send_failed); + drop(send_failed_checksums); drop(error_sender); // Wait for all tasks to complete for handle in handles { diff --git a/src/lib.rs b/src/lib.rs index 88e77ca..8eebeb4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,6 +54,7 @@ fn do_gbsketch( input_csv: String, param_str: String, failed_csv: String, + failed_checksums: String, retry_times: u32, fasta_location: String, keep_fastas: bool, @@ -67,6 +68,7 @@ fn do_gbsketch( input_csv, param_str, failed_csv, + failed_checksums, retry_times, fasta_location, keep_fastas, diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index bd52812..17f17cf 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -46,7 +46,8 @@ def __init__(self, p): p.add_argument('-k', '--keep-fasta', action='store_true', help="write FASTA files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') - p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') + p.add_argument('--failed', help='csv of failed accessions and download links (should be mostly protein).', required=True) + p.add_argument('--checksum-download-failed', help="csv of accessions where md5sum file was improperly formatted or could not be downloaded", required=True) p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, @@ -84,6 +85,7 @@ def main(self, args): status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, args.param_string, args.failed, + args.checksum_download_failed, args.retry_times, args.fastas, args.keep_fasta, diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 9ea7578..25ae86c 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -25,6 +25,7 @@ def test_gbsketch_simple(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -35,7 +36,7 @@ def test_gbsketch_simple(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -73,6 +74,7 @@ def test_gbsketch_simple_url(runtmp): acc_csv = get_test_data('acc-with-ftppath.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -83,7 +85,7 @@ def test_gbsketch_simple_url(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -109,6 +111,7 @@ def test_gbsketch_genomes_only(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -117,6 +120,7 @@ def test_gbsketch_genomes_only(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--genomes-only', + '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -139,6 +143,7 @@ def test_gbsketch_proteomes_only(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') # why does this need ksize =30 and not ksize = 10!??? @@ -146,6 +151,7 @@ def test_gbsketch_proteomes_only(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--proteomes-only', + '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -164,6 +170,7 @@ def test_gbsketch_genomes_only_via_params(runtmp, capfd): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -171,7 +178,7 @@ def test_gbsketch_genomes_only_via_params(runtmp, capfd): ss2 = sourmash.load_one_signature(sig2, ksize=31) runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert os.path.exists(output) @@ -197,6 +204,7 @@ def test_gbsketch_proteomes_only_via_params(runtmp, capfd): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig3 = get_test_data('GCA_000961135.2.protein.sig.gz') # why does this need ksize =30 and not ksize = 10!??? @@ -204,6 +212,7 @@ def test_gbsketch_proteomes_only_via_params(runtmp, capfd): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', + '--checksum-download-failed', ch_fail, '--param-str', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -227,6 +236,7 @@ def test_gbsketch_save_fastas(runtmp): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') out_dir = runtmp.output('out_fastas') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') @@ -239,6 +249,7 @@ def test_gbsketch_save_fastas(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -266,6 +277,7 @@ def test_gbsketch_download_only(runtmp, capfd): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') out_dir = runtmp.output('out_fastas') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') @@ -278,6 +290,7 @@ def test_gbsketch_download_only(runtmp, capfd): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only', '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty @@ -303,6 +316,7 @@ def test_gbsketch_bad_acc(runtmp): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -314,6 +328,7 @@ def test_gbsketch_bad_acc(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_mod, '-o', output, '--failed', failed, '-r', '1', #'--fastas', output_fastas, + '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -353,10 +368,11 @@ def test_gbsketch_missing_accfile(runtmp, capfd): acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -370,10 +386,11 @@ def test_gbsketch_empty_accfile(runtmp, capfd): file.write('') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -396,10 +413,11 @@ def test_gbsketch_bad_acc_fail(runtmp, capfd): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000") captured = capfd.readouterr() @@ -413,12 +431,13 @@ def test_gbsketch_version_bug(runtmp): acc_csv = get_test_data('acc-version.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000193795.2.sig.gz') ss1 = sourmash.load_one_signature(sig1, ksize=31) runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert os.path.exists(output) @@ -437,10 +456,11 @@ def test_gbsketch_cols_trailing_commas(runtmp, capfd): acc_csv = get_test_data('acc-cols.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -453,10 +473,11 @@ def test_gbsketch_missing_output(runtmp): acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err @@ -470,6 +491,7 @@ def test_zip_file_permissions(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -479,7 +501,7 @@ def test_zip_file_permissions(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -500,6 +522,7 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') @@ -510,6 +533,7 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', + '--checksum-download-failed', ch_fail, '--param-str',"protein,k=10,scaled=200", '-p', "dayhoff,k=10,scaled=200", '-p', "hp,k=10,scaled=200") diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index edfe92d..aa2ae20 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -273,9 +273,10 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): acc_csv = get_test_data('acc.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(failed) From 1f9ef9bf00204e9f28786a71acc90886e8065037 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Sep 2024 13:43:29 -0700 Subject: [PATCH 02/11] check header of checksum failures file --- tests/test_gbsketch.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 25ae86c..b7008b5 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -347,6 +347,14 @@ def test_gbsketch_bad_acc(runtmp): break else: assert False, "Modified accession not found" + + assert os.path.exists(failed) + with open(ch_fail, 'r') as ch_file: + # Read the lines of the file + lines = ch_file.readlines() + print("CHECKSUM FAILURES") + print(lines) + assert lines == ['accession,name,moltype,md5sum_url,download_filename,url\n'] idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) From 6755296f80d0a859ecc1a81f73e94418b7bfd120 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Sep 2024 14:09:09 -0700 Subject: [PATCH 03/11] write checksum mismatches to checksum failures file --- src/directsketch.rs | 49 ++++++++++++++----- .../sourmash_plugin_directsketch/__init__.py | 4 +- tests/test_gbsketch.py | 36 +++++++------- tests/test_urlsketch.py | 2 +- 4 files changed, 57 insertions(+), 34 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 7438ca6..4ba60a8 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -255,6 +255,8 @@ pub struct FailedChecksum { md5sum_url: Url, download_filename: Option, url: Option, + expected_md5sum: Option, + reason: String, } #[allow(clippy::too_many_arguments)] @@ -338,6 +340,8 @@ async fn dl_sketch_assembly_accession( md5sum_url: md5sum_url.clone(), download_filename: Some(file_name), url: Some(url), + expected_md5sum: None, + reason: "md5sum download or parse failure".to_string(), }; checksum_failures.push(failed_checksum_download); } @@ -355,17 +359,32 @@ async fn dl_sketch_assembly_accession( .await { Ok(data) => data, - Err(_err) => { + Err(e) => { + // did we have a checksum error or a download error? // here --> keep track of accession errors + filetype - let failed_download = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: file_type.moltype(), - md5sum: expected_md5.map(|x| x.to_string()), - download_filename: Some(file_name), - url: Some(url), - }; - download_failures.push(failed_download); + if e.to_string().contains("MD5 hash does not match") { + let checksum_mismatch: FailedChecksum = FailedChecksum { + accession: accession.clone(), + name: name.clone(), + moltype: file_type.moltype(), + md5sum_url: md5sum_url.clone(), + download_filename: Some(file_name.clone()), + url: Some(url.clone()), + expected_md5sum: expected_md5.cloned(), + reason: "checksum mismatch".to_string(), + }; + checksum_failures.push(checksum_mismatch); + } else { + let failed_download = FailedDownload { + accession: accession.clone(), + name: name.clone(), + moltype: file_type.moltype(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename: Some(file_name), + url: Some(url), + }; + download_failures.push(failed_download); + } continue; } }; @@ -692,7 +711,7 @@ pub fn checksum_failures_handle( // Attempt to write CSV headers if let Err(e) = writer - .write_all(b"accession,name,moltype,md5sum_url,download_filename,url\n") + .write_all(b"accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason\n") .await { let error = Error::new(e).context("Failed to write headers"); @@ -707,16 +726,20 @@ pub fn checksum_failures_handle( md5sum_url, download_filename, url, + expected_md5sum, + reason, }) = recv_failed.recv().await { let record = format!( - "{},{},{},{},{},{}\n", + "{},{},{},{},{},{},{},{}\n", accession, name, moltype, md5sum_url.to_string(), download_filename.unwrap_or("".to_string()), - url.map(|u| u.to_string()).unwrap_or("".to_string()) + url.map(|u| u.to_string()).unwrap_or("".to_string()), + expected_md5sum.unwrap_or("".to_string()), + reason, ); // Attempt to write each record if let Err(e) = writer.write_all(record.as_bytes()).await { diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 17f17cf..abf32da 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -47,7 +47,7 @@ def __init__(self, p): help="write FASTA files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') p.add_argument('--failed', help='csv of failed accessions and download links (should be mostly protein).', required=True) - p.add_argument('--checksum-download-failed', help="csv of accessions where md5sum file was improperly formatted or could not be downloaded", required=True) + p.add_argument('--checksum-fail', help="csv of accessions where the md5sum check failed or the md5sum file was improperly formatted or could not be downloaded", required=True) p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, @@ -85,7 +85,7 @@ def main(self, args): status = sourmash_plugin_directsketch.do_gbsketch(args.input_csv, args.param_string, args.failed, - args.checksum_download_failed, + args.checksum_fail, args.retry_times, args.fastas, args.keep_fasta, diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index b7008b5..5f9cad9 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -36,7 +36,7 @@ def test_gbsketch_simple(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -85,7 +85,7 @@ def test_gbsketch_simple_url(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -120,7 +120,7 @@ def test_gbsketch_genomes_only(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--genomes-only', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -151,7 +151,7 @@ def test_gbsketch_proteomes_only(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--proteomes-only', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -178,7 +178,7 @@ def test_gbsketch_genomes_only_via_params(runtmp, capfd): ss2 = sourmash.load_one_signature(sig2, ksize=31) runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert os.path.exists(output) @@ -212,7 +212,7 @@ def test_gbsketch_proteomes_only_via_params(runtmp, capfd): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -249,7 +249,7 @@ def test_gbsketch_save_fastas(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -290,7 +290,7 @@ def test_gbsketch_download_only(runtmp, capfd): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '--download-only', '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty @@ -328,7 +328,7 @@ def test_gbsketch_bad_acc(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_mod, '-o', output, '--failed', failed, '-r', '1', #'--fastas', output_fastas, - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -354,7 +354,7 @@ def test_gbsketch_bad_acc(runtmp): lines = ch_file.readlines() print("CHECKSUM FAILURES") print(lines) - assert lines == ['accession,name,moltype,md5sum_url,download_filename,url\n'] + assert lines == ['accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason\n'] idx = sourmash.load_file_as_index(output) sigs = list(idx.signatures()) @@ -380,7 +380,7 @@ def test_gbsketch_missing_accfile(runtmp, capfd): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -398,7 +398,7 @@ def test_gbsketch_empty_accfile(runtmp, capfd): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -425,7 +425,7 @@ def test_gbsketch_bad_acc_fail(runtmp, capfd): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") captured = capfd.readouterr() @@ -445,7 +445,7 @@ def test_gbsketch_version_bug(runtmp): ss1 = sourmash.load_one_signature(sig1, ksize=31) runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert os.path.exists(output) @@ -468,7 +468,7 @@ def test_gbsketch_cols_trailing_commas(runtmp, capfd): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -485,7 +485,7 @@ def test_gbsketch_missing_output(runtmp): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'gbsketch', acc_csv, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err @@ -509,7 +509,7 @@ def test_zip_file_permissions(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -541,7 +541,7 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', - '--checksum-download-failed', ch_fail, + '--checksum-fail', ch_fail, '--param-str',"protein,k=10,scaled=200", '-p', "dayhoff,k=10,scaled=200", '-p', "hp,k=10,scaled=200") diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index aa2ae20..502d523 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -276,7 +276,7 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): ch_fail = runtmp.output('checksum_dl_failed.csv') runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-download-failed', ch_fail, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(failed) From e883aad829065333510780e032b893135f2adab7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 08:32:51 -0700 Subject: [PATCH 04/11] gbsketch: check http response status for md5sum file --- src/directsketch.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/directsketch.rs b/src/directsketch.rs index 4ba60a8..fd5cdc4 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -115,6 +115,15 @@ async fn download_and_parse_md5(client: &Client, url: &Url) -> Result Date: Sat, 7 Sep 2024 09:22:05 -0700 Subject: [PATCH 05/11] write full error message --- src/directsketch.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index fd5cdc4..c54abd7 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -336,7 +336,9 @@ async fn dl_sketch_assembly_accession( let checksums = match download_and_parse_md5(client, &md5sum_url).await { Ok(cs) => cs, - Err(_e) => { + Err(err) => { + // capture the error message as a string + let error_message = err.to_string(); // if we can't download/parse the md5sum file, write to checksum failures file to allow manual troubleshooting for file_type in &file_types { // get filename, filetype info to facilitate downstream @@ -350,7 +352,7 @@ async fn dl_sketch_assembly_accession( download_filename: Some(file_name), url: Some(url), expected_md5sum: None, - reason: "md5sum download or parse failure".to_string(), + reason: error_message.clone(), // write full error message }; checksum_failures.push(failed_checksum_download); } @@ -369,9 +371,10 @@ async fn dl_sketch_assembly_accession( { Ok(data) => data, Err(e) => { + let error_message = e.to_string(); // did we have a checksum error or a download error? // here --> keep track of accession errors + filetype - if e.to_string().contains("MD5 hash does not match") { + if error_message.contains("MD5 hash does not match") { let checksum_mismatch: FailedChecksum = FailedChecksum { accession: accession.clone(), name: name.clone(), @@ -380,7 +383,7 @@ async fn dl_sketch_assembly_accession( download_filename: Some(file_name.clone()), url: Some(url.clone()), expected_md5sum: expected_md5.cloned(), - reason: "checksum mismatch".to_string(), + reason: error_message.clone(), }; checksum_failures.push(checksum_mismatch); } else { From d5c9e3c304d0ca359915b09d24f47f139e600571 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 10:37:09 -0700 Subject: [PATCH 06/11] add failed checksums to urlsketch; test failed checksum output --- src/directsketch.rs | 78 ++++++++++++------ src/lib.rs | 4 + .../sourmash_plugin_directsketch/__init__.py | 4 +- tests/test_urlsketch.py | 81 ++++++++++++++++--- 4 files changed, 132 insertions(+), 35 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index c54abd7..be0bf20 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -176,7 +176,7 @@ async fn download_with_retry( return Ok(data.to_vec()); } else { last_error = Some(anyhow!( - "MD5 hash does not match. Expected: {}, Found: {}", + "MD5 hash does not match. Expected: {}; Found: {}", md5, computed_hash )); @@ -261,7 +261,7 @@ pub struct FailedChecksum { accession: String, name: String, moltype: String, - md5sum_url: Url, + md5sum_url: Option, download_filename: Option, url: Option, expected_md5sum: Option, @@ -348,7 +348,7 @@ async fn dl_sketch_assembly_accession( accession: accession.clone(), name: name.clone(), moltype: file_type.moltype(), - md5sum_url: md5sum_url.clone(), + md5sum_url: Some(md5sum_url.clone()), download_filename: Some(file_name), url: Some(url), expected_md5sum: None, @@ -379,7 +379,7 @@ async fn dl_sketch_assembly_accession( accession: accession.clone(), name: name.clone(), moltype: file_type.moltype(), - md5sum_url: md5sum_url.clone(), + md5sum_url: Some(md5sum_url.clone()), download_filename: Some(file_name.clone()), url: Some(url.clone()), expected_md5sum: expected_md5.cloned(), @@ -450,10 +450,11 @@ async fn dl_sketch_url( _genomes_only: bool, _proteomes_only: bool, download_only: bool, -) -> Result<(Vec, Vec)> { +) -> Result<(Vec, Vec, Vec)> { let retry_count = retry.unwrap_or(3); // Default retry count let mut sigs = Vec::::new(); let mut failed = Vec::::new(); + let mut checksum_failures = Vec::::new(); let name = accinfo.name; let accession = accinfo.accession; @@ -462,11 +463,8 @@ async fn dl_sketch_url( let download_filename = accinfo.download_filename; let moltype = accinfo.moltype; - match download_with_retry(client, &url, expected_md5.as_deref(), retry_count) - .await - .ok() - { - Some(data) => { + match download_with_retry(client, &url, expected_md5.as_deref(), retry_count).await { + Ok(data) => { // check keep_fastas instead?? if let Some(ref download_filename) = download_filename { let path = location.join(download_filename); @@ -501,20 +499,37 @@ async fn dl_sketch_url( }; } } - None => { - let failed_download = FailedDownload { - accession: accession.clone(), - name: name.clone(), - moltype: moltype.to_string(), - md5sum: expected_md5.map(|x| x.to_string()), - download_filename, - url: Some(url), - }; - failed.push(failed_download); + Err(err) => { + let error_message = err.to_string(); + // did we have a checksum error or a download error? + // here --> keep track of accession errors + filetype + if error_message.contains("MD5 hash does not match") { + let checksum_mismatch: FailedChecksum = FailedChecksum { + accession: accession.clone(), + name: name.clone(), + moltype: moltype.to_string(), + md5sum_url: None, + download_filename, + url: Some(url.clone()), + expected_md5sum: expected_md5.clone(), + reason: error_message.clone(), + }; + checksum_failures.push(checksum_mismatch); + } else { + let failed_download = FailedDownload { + accession: accession.clone(), + name: name.clone(), + moltype: moltype.to_string(), + md5sum: expected_md5.map(|x| x.to_string()), + download_filename, + url: Some(url), + }; + failed.push(failed_download); + } } } - Ok((sigs, failed)) + Ok((sigs, failed, checksum_failures)) } async fn write_sig( @@ -747,7 +762,7 @@ pub fn checksum_failures_handle( accession, name, moltype, - md5sum_url.to_string(), + md5sum_url.map(|u| u.to_string()).unwrap_or("".to_string()), download_filename.unwrap_or("".to_string()), url.map(|u| u.to_string()).unwrap_or("".to_string()), expected_md5sum.unwrap_or("".to_string()), @@ -992,6 +1007,7 @@ pub async fn urlsketch( input_csv: String, param_str: String, failed_csv: String, + failed_checksums_csv: String, retry_times: u32, fasta_location: String, keep_fastas: bool, @@ -1016,6 +1032,8 @@ pub async fn urlsketch( // create channels. buffer size here is 4 b/c we can do 3 downloads simultaneously let (send_sigs, recv_sigs) = tokio::sync::mpsc::channel::>(4); let (send_failed, recv_failed) = tokio::sync::mpsc::channel::(4); + let (send_failed_checksums, recv_failed_checksum) = + tokio::sync::mpsc::channel::(4); // Error channel for handling task errors let (error_sender, error_receiver) = tokio::sync::mpsc::channel::(1); @@ -1023,11 +1041,17 @@ pub async fn urlsketch( let mut handles = Vec::new(); let sig_handle = sigwriter_handle(recv_sigs, output_sigs, error_sender.clone()); let failures_handle = failures_handle(failed_csv, recv_failed, error_sender.clone()); + let checksum_failures_handle = checksum_failures_handle( + failed_checksums_csv, + recv_failed_checksum, + error_sender.clone(), + ); let critical_error_flag = Arc::new(AtomicBool::new(false)); let error_handle = error_handler(error_receiver, critical_error_flag.clone()); handles.push(sig_handle); handles.push(failures_handle); handles.push(error_handle); + handles.push(checksum_failures_handle); // Worker tasks let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads @@ -1086,6 +1110,7 @@ pub async fn urlsketch( let client_clone = Arc::clone(&client); let send_sigs = send_sigs.clone(); let send_failed = send_failed.clone(); + let checksum_send_failed = send_failed_checksums.clone(); let download_path_clone = download_path.clone(); // Clone the path for each task let send_errors = error_sender.clone(); @@ -1119,7 +1144,7 @@ pub async fn urlsketch( ) .await; match result { - Ok((sigs, failed_downloads)) => { + Ok((sigs, failed_downloads, failed_checksums)) => { if !sigs.is_empty() { if let Err(e) = send_sigs.send(sigs).await { eprintln!("Failed to send signatures: {}", e); @@ -1132,6 +1157,12 @@ pub async fn urlsketch( let _ = send_errors.send(e.into()).await; // Send the error through the channel } } + for fail in failed_checksums { + if let Err(e) = checksum_send_failed.send(fail).await { + eprintln!("Failed to send failed checksum info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } } Err(e) => { let _ = send_errors.send(e).await; @@ -1144,6 +1175,7 @@ pub async fn urlsketch( drop(send_sigs); drop(send_failed); drop(error_sender); + drop(send_failed_checksums); // Wait for all tasks to complete for handle in handles { if let Err(e) = handle.await { diff --git a/src/lib.rs b/src/lib.rs index 8eebeb4..f8cf014 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,6 +49,7 @@ fn set_tokio_thread_pool(num_threads: usize) -> PyResult { #[pyfunction] #[allow(clippy::too_many_arguments)] +#[pyo3(signature = (input_csv, param_str, failed_csv, failed_checksums, retry_times, fasta_location, keep_fastas, genomes_only, proteomes_only, download_only, output_sigs=None))] fn do_gbsketch( py: Python, input_csv: String, @@ -87,11 +88,13 @@ fn do_gbsketch( #[pyfunction] #[allow(clippy::too_many_arguments)] +#[pyo3(signature = (input_csv, param_str, failed_csv, failed_checksums, retry_times, fasta_location, keep_fastas, download_only, output_sigs=None))] fn do_urlsketch( py: Python, input_csv: String, param_str: String, failed_csv: String, + failed_checksums: String, retry_times: u32, fasta_location: String, keep_fastas: bool, @@ -103,6 +106,7 @@ fn do_urlsketch( input_csv, param_str, failed_csv, + failed_checksums, retry_times, fasta_location, keep_fastas, diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index abf32da..568cab8 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -118,7 +118,8 @@ def __init__(self, p): p.add_argument('-k', '--keep-fasta', '--keep-fastq', action='store_true', help="write FASTA/Q files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') - p.add_argument('--failed',help='csv of failed accessions and download links (should be mostly protein).') + p.add_argument('--failed',help='csv of failed accessions and download links.', required=True) + p.add_argument('--checksum-fail', help="csv of accessions where the md5sum check failed", required=True) p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, @@ -153,6 +154,7 @@ def main(self, args): status = sourmash_plugin_directsketch.do_urlsketch(args.input_csv, args.param_string, args.failed, + args.checksum_fail, args.retry_times, args.fastas, args.keep_fasta, diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 502d523..1eb4112 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -14,6 +14,7 @@ def get_test_data(filename): thisdir = os.path.dirname(__file__) return os.path.join(thisdir, 'test-data', filename) + def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch') @@ -25,6 +26,7 @@ def test_urlsketch_simple(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -34,7 +36,7 @@ def test_urlsketch_simple(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -72,6 +74,7 @@ def test_urlsketch_save_fastas(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') out_dir = runtmp.output('out_fastas') @@ -85,6 +88,7 @@ def test_urlsketch_save_fastas(runtmp): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -112,6 +116,7 @@ def test_urlsketch_download_only(runtmp, capfd): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') out_dir = runtmp.output('out_fastas') @@ -125,6 +130,7 @@ def test_urlsketch_download_only(runtmp, capfd): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '--download-only', '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', + '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty @@ -150,6 +156,7 @@ def test_urlsketch_bad_acc(runtmp): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -160,7 +167,7 @@ def test_urlsketch_bad_acc(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -195,14 +202,16 @@ def test_urlsketch_bad_acc(runtmp): else: assert sig.md5sum() == ss3.md5sum() + def test_urlsketch_missing_accfile(runtmp, capfd): acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -216,10 +225,11 @@ def test_urlsketch_empty_accfile(runtmp, capfd): file.write('') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -242,10 +252,11 @@ def test_urlsketch_bad_acc_fail(runtmp, capfd): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") captured = capfd.readouterr() @@ -254,16 +265,16 @@ def test_urlsketch_bad_acc_fail(runtmp, capfd): assert "Error: No signatures written, exiting." in captured.err - def test_urlsketch_missing_output(runtmp): # no output sig zipfile provided but also not --download-only acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000") assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err @@ -294,10 +305,12 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): out2 = runtmp.output('failed-retry.zip') fail2 = runtmp.output('fail2.csv') + ch_fail2 = runtmp.output('checksum_dl_failed2.csv') + with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', failed, '-o', out2, - '--failed', fail2, '-r', '1', + '--failed', fail2, '-r', '1', '--checksum-fail', ch_fail2, '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() print(captured.out) @@ -327,6 +340,7 @@ def test_zip_file_permissions(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -336,7 +350,7 @@ def test_zip_file_permissions(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -353,10 +367,11 @@ def test_zip_file_permissions(runtmp): assert external_attr == 0o644 -def test_gbsketch_protein_dayhoff_hp(runtmp): +def test_urlsketch_protein_dayhoff_hp(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') @@ -366,7 +381,7 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, '--param-str',"protein,k=10,scaled=200", '-p', "dayhoff,k=10,scaled=200", '-p', "hp,k=10,scaled=200") @@ -393,3 +408,47 @@ def test_gbsketch_protein_dayhoff_hp(runtmp): assert len(fail_lines) == 1 assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + +def test_urlsketch_md5sum_mismatch(runtmp, capfd): + acc_csv = get_test_data('acc-url-md5sum.csv') + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + ch_fail = runtmp.output('checksum_dl_failed.csv') + + sig1 = get_test_data('GCA_000961135.2.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + for sig in sigs: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + + assert os.path.exists(ch_fail) + with open(ch_fail, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum_url,download_filename,url,expected_md5sum,reason" + for line in failF: + print(line) + acc, name, moltype, md5sum_url, download_filename, url, expected_md5, reason= line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "DNA" + assert md5sum_url == "" + assert expected_md5 == "b1234567" + assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" + assert reason == "MD5 hash does not match. Expected: b1234567; Found: a1a8f1c6dc56999c73fe298871c963d1" From 2b692a6b13fdad77048015d36b9c31c6ef1a0413 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 11:04:48 -0700 Subject: [PATCH 07/11] make md5sum failure file optional for urlsketch --- src/directsketch.rs | 46 +++++++++--- src/lib.rs | 6 +- .../sourmash_plugin_directsketch/__init__.py | 7 +- tests/test_urlsketch.py | 75 +++++++++++++------ 4 files changed, 94 insertions(+), 40 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index be0bf20..2ca3cd7 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -1007,12 +1007,12 @@ pub async fn urlsketch( input_csv: String, param_str: String, failed_csv: String, - failed_checksums_csv: String, retry_times: u32, fasta_location: String, keep_fastas: bool, download_only: bool, output_sigs: Option, + failed_checksums_csv: Option, ) -> Result<(), anyhow::Error> { // if sig output provided but doesn't end in zip, bail if let Some(ref output_sigs) = output_sigs { @@ -1041,17 +1041,23 @@ pub async fn urlsketch( let mut handles = Vec::new(); let sig_handle = sigwriter_handle(recv_sigs, output_sigs, error_sender.clone()); let failures_handle = failures_handle(failed_csv, recv_failed, error_sender.clone()); - let checksum_failures_handle = checksum_failures_handle( - failed_checksums_csv, - recv_failed_checksum, - error_sender.clone(), - ); + + let mut write_failed_checksums = false; + if let Some(ref failed_checksums) = failed_checksums_csv { + let checksum_failures_handle = checksum_failures_handle( + failed_checksums.clone(), + recv_failed_checksum, + error_sender.clone(), + ); + write_failed_checksums = true; + handles.push(checksum_failures_handle); + } + let critical_error_flag = Arc::new(AtomicBool::new(false)); let error_handle = error_handler(error_receiver, critical_error_flag.clone()); handles.push(sig_handle); handles.push(failures_handle); handles.push(error_handle); - handles.push(checksum_failures_handle); // Worker tasks let semaphore = Arc::new(Semaphore::new(3)); // Limiting concurrent downloads @@ -1157,10 +1163,28 @@ pub async fn urlsketch( let _ = send_errors.send(e.into()).await; // Send the error through the channel } } - for fail in failed_checksums { - if let Err(e) = checksum_send_failed.send(fail).await { - eprintln!("Failed to send failed checksum info: {}", e); - let _ = send_errors.send(e.into()).await; // Send the error through the channel + if write_failed_checksums { + for fail in failed_checksums { + if let Err(e) = checksum_send_failed.send(fail).await { + eprintln!("Failed to send failed checksum info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } + } + } else { + // if we don't have a failed checksum file, convert to failed downloads + write there + for fail in failed_checksums { + let dl_fail: FailedDownload = FailedDownload { + accession: fail.accession, + name: fail.name, + moltype: fail.moltype, + md5sum: fail.expected_md5sum, + download_filename: fail.download_filename, + url: fail.url, + }; + if let Err(e) = send_failed.send(dl_fail).await { + eprintln!("Failed to send failed download info: {}", e); + let _ = send_errors.send(e.into()).await; // Send the error through the channel + } } } } diff --git a/src/lib.rs b/src/lib.rs index f8cf014..387b3fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,30 +88,30 @@ fn do_gbsketch( #[pyfunction] #[allow(clippy::too_many_arguments)] -#[pyo3(signature = (input_csv, param_str, failed_csv, failed_checksums, retry_times, fasta_location, keep_fastas, download_only, output_sigs=None))] +#[pyo3(signature = (input_csv, param_str, failed_csv, retry_times, fasta_location, keep_fastas, download_only, output_sigs=None, failed_checksums=None))] fn do_urlsketch( py: Python, input_csv: String, param_str: String, failed_csv: String, - failed_checksums: String, retry_times: u32, fasta_location: String, keep_fastas: bool, download_only: bool, output_sigs: Option, + failed_checksums: Option, ) -> anyhow::Result { match directsketch::urlsketch( py, input_csv, param_str, failed_csv, - failed_checksums, retry_times, fasta_location, keep_fastas, download_only, output_sigs, + failed_checksums, ) { Ok(_) => Ok(0), Err(e) => { diff --git a/src/python/sourmash_plugin_directsketch/__init__.py b/src/python/sourmash_plugin_directsketch/__init__.py index 568cab8..4254d6b 100644 --- a/src/python/sourmash_plugin_directsketch/__init__.py +++ b/src/python/sourmash_plugin_directsketch/__init__.py @@ -119,7 +119,8 @@ def __init__(self, p): help="write FASTA/Q files in addition to sketching. Default: do not write FASTA files") p.add_argument('--download-only', help='just download genomes; do not sketch', action='store_true') p.add_argument('--failed',help='csv of failed accessions and download links.', required=True) - p.add_argument('--checksum-fail', help="csv of accessions where the md5sum check failed", required=True) + # don't require checksum_fail here b/c users don't need to provide checksums + p.add_argument('--checksum-fail', help="csv of accessions where the md5sum check failed. If not provided, md5sum failures will be written to the download failures file (no additional md5sum information).", default=None) p.add_argument('-p', '--param-string', action='append', type=str, default=[], help='parameter string for sketching (default: k=31,scaled=1000)') p.add_argument('-c', '--cores', default=0, type=int, @@ -154,12 +155,12 @@ def main(self, args): status = sourmash_plugin_directsketch.do_urlsketch(args.input_csv, args.param_string, args.failed, - args.checksum_fail, args.retry_times, args.fastas, args.keep_fasta, args.download_only, - args.output) + args.output, + args.checksum_fail) if status == 0: notify("...gbsketch is done!") diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 1eb4112..3c6d6ae 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -26,7 +26,6 @@ def test_urlsketch_simple(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -36,7 +35,7 @@ def test_urlsketch_simple(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -74,7 +73,6 @@ def test_urlsketch_save_fastas(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') out_dir = runtmp.output('out_fastas') @@ -88,7 +86,6 @@ def test_urlsketch_save_fastas(runtmp): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', - '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -116,7 +113,6 @@ def test_urlsketch_download_only(runtmp, capfd): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') out_dir = runtmp.output('out_fastas') @@ -130,7 +126,6 @@ def test_urlsketch_download_only(runtmp, capfd): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '--download-only', '--failed', failed, '-r', '1', '--fastas', out_dir, '--keep-fasta', - '--checksum-fail', ch_fail, '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert not runtmp.last_result.out # stdout should be empty @@ -156,7 +151,6 @@ def test_urlsketch_bad_acc(runtmp): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -167,7 +161,7 @@ def test_urlsketch_bad_acc(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -207,11 +201,10 @@ def test_urlsketch_missing_accfile(runtmp, capfd): acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -225,11 +218,10 @@ def test_urlsketch_empty_accfile(runtmp, capfd): file.write('') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() @@ -252,11 +244,10 @@ def test_urlsketch_bad_acc_fail(runtmp, capfd): output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_mod, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000") captured = capfd.readouterr() @@ -270,11 +261,10 @@ def test_urlsketch_missing_output(runtmp): acc_csv = runtmp.output('acc1.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', acc_csv, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000") assert "Error: output signature zipfile is required if not using '--download-only'." in runtmp.last_result.err @@ -305,12 +295,11 @@ def test_urlsketch_from_gbsketch_failed(runtmp, capfd): out2 = runtmp.output('failed-retry.zip') fail2 = runtmp.output('fail2.csv') - ch_fail2 = runtmp.output('checksum_dl_failed2.csv') with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'urlsketch', failed, '-o', out2, - '--failed', fail2, '-r', '1', '--checksum-fail', ch_fail2, + '--failed', fail2, '-r', '1', '-p', "protein,k=10,scaled=200") captured = capfd.readouterr() print(captured.out) @@ -340,7 +329,6 @@ def test_zip_file_permissions(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000175535.1.sig.gz') sig2 = get_test_data('GCA_000961135.2.sig.gz') @@ -350,7 +338,7 @@ def test_zip_file_permissions(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='protein') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str', "dna,k=31,scaled=1000", '-p', "protein,k=10,scaled=200") assert os.path.exists(output) @@ -371,7 +359,6 @@ def test_urlsketch_protein_dayhoff_hp(runtmp): acc_csv = get_test_data('acc-url.csv') output = runtmp.output('simple.zip') failed = runtmp.output('failed.csv') - ch_fail = runtmp.output('checksum_dl_failed.csv') sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') @@ -381,7 +368,7 @@ def test_urlsketch_protein_dayhoff_hp(runtmp): ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, - '--failed', failed, '-r', '1', '--checksum-fail', ch_fail, + '--failed', failed, '-r', '1', '--param-str',"protein,k=10,scaled=200", '-p', "dayhoff,k=10,scaled=200", '-p', "hp,k=10,scaled=200") @@ -409,7 +396,7 @@ def test_urlsketch_protein_dayhoff_hp(runtmp): assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" -def test_urlsketch_md5sum_mismatch(runtmp, capfd): +def test_urlsketch_md5sum_mismatch_checksum_file(runtmp, capfd): acc_csv = get_test_data('acc-url-md5sum.csv') output = runtmp.output('simple.zip') @@ -452,3 +439,45 @@ def test_urlsketch_md5sum_mismatch(runtmp, capfd): assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" assert reason == "MD5 hash does not match. Expected: b1234567; Found: a1a8f1c6dc56999c73fe298871c963d1" + + +def test_urlsketch_md5sum_mismatch_no_checksum_file(runtmp, capfd): + acc_csv = get_test_data('acc-url-md5sum.csv') + + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000961135.2.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=31) + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str', "dna,k=31,scaled=1000") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + captured = capfd.readouterr() + print(captured.out) + print(captured.err) + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 1 + for sig in sigs: + assert sig.name == ss1.name + assert sig.md5sum() == ss1.md5sum() + + assert os.path.exists(failed) + with open(failed, 'r') as failF: + header = next(failF).strip() + assert header == "accession,name,moltype,md5sum,download_filename,url" + for line in failF: + print(line) + acc, name, moltype, md5sum, download_filename, url= line.strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "DNA" + assert md5sum == "b1234567" + assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" From 3937eaabe4ce34ac19a80a36dddd1186eba034a5 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 11:17:45 -0700 Subject: [PATCH 08/11] add bad md5sum test csv --- tests/test-data/acc-url-md5sum.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/test-data/acc-url-md5sum.csv diff --git a/tests/test-data/acc-url-md5sum.csv b/tests/test-data/acc-url-md5sum.csv new file mode 100644 index 0000000..542cc8a --- /dev/null +++ b/tests/test-data/acc-url-md5sum.csv @@ -0,0 +1,3 @@ +accession,name,moltype,md5sum,download_filename,url +GCA_000961135.2,GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454,dna,47b9fb20c51f0552b87db5d44d5d4566,GCA_000961135.2_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/961/135/GCA_000961135.2_ASM96113v2/GCA_000961135.2_ASM96113v2_genomic.fna.gz +GCA_000175535.1,GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14,dna,b1234567,GCA_000175535.1_genomic.urlsketch.fna.gz,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz From 139c956a41b5e91d9d41c61ab00d3a55f0c6009a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 11:48:06 -0700 Subject: [PATCH 09/11] add quotes to md5 mismatch err --- src/directsketch.rs | 2 +- tests/test_urlsketch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 2ca3cd7..d6d7272 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -176,7 +176,7 @@ async fn download_with_retry( return Ok(data.to_vec()); } else { last_error = Some(anyhow!( - "MD5 hash does not match. Expected: {}; Found: {}", + "MD5 hash does not match. Expected: '{}'; Found: '{}'", md5, computed_hash )); diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 3c6d6ae..51160d2 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -438,7 +438,7 @@ def test_urlsketch_md5sum_mismatch_checksum_file(runtmp, capfd): assert expected_md5 == "b1234567" assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" - assert reason == "MD5 hash does not match. Expected: b1234567; Found: a1a8f1c6dc56999c73fe298871c963d1" + assert reason == "MD5 hash does not match. Expected: 'b1234567'; Found: 'a1a8f1c6dc56999c73fe298871c963d1'" def test_urlsketch_md5sum_mismatch_no_checksum_file(runtmp, capfd): From 323c38957603e1324f4d9b6bd472adad6b99ed6a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 11:58:09 -0700 Subject: [PATCH 10/11] rm quotes :) --- src/directsketch.rs | 2 +- tests/test_urlsketch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index d6d7272..2ca3cd7 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -176,7 +176,7 @@ async fn download_with_retry( return Ok(data.to_vec()); } else { last_error = Some(anyhow!( - "MD5 hash does not match. Expected: '{}'; Found: '{}'", + "MD5 hash does not match. Expected: {}; Found: {}", md5, computed_hash )); diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 51160d2..3c6d6ae 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -438,7 +438,7 @@ def test_urlsketch_md5sum_mismatch_checksum_file(runtmp, capfd): assert expected_md5 == "b1234567" assert download_filename == "GCA_000175535.1_genomic.urlsketch.fna.gz" assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_genomic.fna.gz" - assert reason == "MD5 hash does not match. Expected: 'b1234567'; Found: 'a1a8f1c6dc56999c73fe298871c963d1'" + assert reason == "MD5 hash does not match. Expected: b1234567; Found: a1a8f1c6dc56999c73fe298871c963d1" def test_urlsketch_md5sum_mismatch_no_checksum_file(runtmp, capfd): From a8e6143e11f06f5ee9b1571f99a65502cdfdd117 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Sep 2024 12:04:04 -0700 Subject: [PATCH 11/11] actually, quotes erryywhere --- src/directsketch.rs | 4 ++-- tests/test_urlsketch.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/directsketch.rs b/src/directsketch.rs index 2ca3cd7..c47dd36 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -140,7 +140,7 @@ async fn download_and_parse_md5(client: &Client, url: &Url) -> Result