From 6672f88a8c53a4a40d234c7427d70f6c44a3f20e Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Fri, 14 Jun 2024 09:41:55 -0700 Subject: [PATCH] MRG: enable dayhoff, hp sketching (#55) Adds dayhoff and hp sketching of protein files - Modifies `parse_params_str` and `build_sig` to parse `dayhoff` and `hp` in params and build signature templates. Protein signature templates will now include all protein, dayhoff, hp sigs. These functions are now used both in branchwater `manysketch` and here, so are good candidates for going back into sourmash, probably here: `src/core/src/cmd.rs` General note: we do not yet support translated sketches, so dayhoff and hp (like protein) will only be made from protein files. --- src/directsketch.rs | 1 + src/utils.rs | 32 ++++++++---- .../test-data/GCA_000961135.2.dayhoff.sig.gz | Bin 0 -> 21491 bytes tests/test-data/GCA_000961135.2.hp.sig.gz | Bin 0 -> 339 bytes tests/test_gbsketch.py | 48 ++++++++++++++++++ tests/test_urlsketch.py | 41 +++++++++++++++ 6 files changed, 113 insertions(+), 9 deletions(-) create mode 100644 tests/test-data/GCA_000961135.2.dayhoff.sig.gz create mode 100644 tests/test-data/GCA_000961135.2.hp.sig.gz diff --git a/src/directsketch.rs b/src/directsketch.rs index 34915a0..f3f50a5 100644 --- a/src/directsketch.rs +++ b/src/directsketch.rs @@ -734,6 +734,7 @@ pub async fn gbsketch( } }; let dna_sig_templates = build_siginfo(&params_vec, "DNA"); + // prot will build protein, dayhoff, hp let prot_sig_templates = build_siginfo(&params_vec, "protein"); let mut genomes_only = genomes_only; diff --git a/src/utils.rs b/src/utils.rs index ab91ac7..5b5df70 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -17,7 +17,7 @@ impl InputMolType {} impl fmt::Display for InputMolType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - InputMolType::Dna => write!(f, "dna"), + InputMolType::Dna => write!(f, "DNA"), InputMolType::Protein => write!(f, "protein"), } } @@ -280,6 +280,8 @@ pub struct Params { pub scaled: 
u64, pub seed: u32, pub is_protein: bool, + pub is_dayhoff: bool, + pub is_hp: bool, pub is_dna: bool, } @@ -291,6 +293,8 @@ impl Hash for Params { self.scaled.hash(state); self.seed.hash(state); self.is_protein.hash(state); + self.is_dayhoff.hash(state); + self.is_hp.hash(state); self.is_dna.hash(state); } } @@ -308,7 +312,9 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> { let mut scaled = 1000; let mut seed = 42; let mut is_protein = false; - let mut is_dna = true; + let mut is_dayhoff = false; + let mut is_hp = false; + let mut is_dna = false; for item in items.iter() { match *item { @@ -337,12 +343,16 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> { } "protein" => { is_protein = true; - is_dna = false; } "dna" => { - is_protein = false; is_dna = true; } + "dayhoff" => { + is_dayhoff = true; + } + "hp" => { + is_hp = true; + } _ => return Err(format!("unknown component '{}' in params string", item)), } } @@ -356,6 +366,8 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> { seed, is_protein, is_dna, + is_dayhoff, + is_hp, }; unique_params.insert(param); } @@ -364,19 +376,19 @@ pub fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> { Ok(unique_params.into_iter().collect()) } -pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec { +pub fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec { let mut sigs = Vec::new(); for param in params.iter().cloned() { - match moltype { - // if dna, only build dna sigs. if protein, only build protein sigs + match input_moltype { + // if dna, only build dna sigs. 
if protein, only build protein sigs, etc "dna" | "DNA" if !param.is_dna => continue, - "protein" if !param.is_protein => continue, + "protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue, _ => (), } // Adjust ksize value based on the is_protein flag - let adjusted_ksize = if param.is_protein { + let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp { param.ksize * 3 } else { param.ksize @@ -387,6 +399,8 @@ pub fn build_siginfo(params: &[Params], moltype: &str) -> Vec { .scaled(param.scaled) .protein(param.is_protein) .dna(param.is_dna) + .dayhoff(param.is_dayhoff) + .hp(param.is_hp) .num_hashes(param.num) .track_abundance(param.track_abundance) .build(); diff --git a/tests/test-data/GCA_000961135.2.dayhoff.sig.gz b/tests/test-data/GCA_000961135.2.dayhoff.sig.gz new file mode 100644 index 0000000000000000000000000000000000000000..56380116068d64155cc5c41440ac4f2765ea81e1 GIT binary patch literal 21491 zcmV(`K-0e;iwFP!00004|E>AavTfUKB#8Y=1)sMSKqi0$^j1+1{s(t-RIntfi(!ef zMbTlmqyODG*4ELzydo)C(dV4K)|wbVB9XutbFO{aTzG`+xky zKmEh6fB!%K`|p4Izy0pl|NMu4`pvJte)XGw`1No9-i!YDKi}Hl{o!|i^UuHi$KTl$ zfAtRz_`^R3-LL=QH@5#@|J#54?-6m6$^I++_y5N~{o_CX=C{B5tH1m8 zum9?Q``_*Rr~UrRfBW_C{`R+j>%_nRFaPzQ4E4J|{QkfE{ttiucYpY&U;p#3|DS*O z^>6;mZ-4)ffB)-${>^{+um89Fzf`}pfB)OR`OWWs|FZtK{}zV+SL(n2)nB``-~Hhq ze)ZS!^{fB;?|=Kh+ll`8```TLZ-4dIg|A=z!>|9}|NVcureFQ_0i+O|?UKfB|M1)2 zJ%}7-v3qYK@KtjM4Tu2Zt0#T2tx*VH<8X9UvZCwjN>G-rYaE+Vh06|QpS{yy%(XM8 zm0wxANC+L%J72JV!A%B?t}ixu&bVERsV^$4gY3(Z;;UkAdSjM}ZC~GA#Me@ zh$|B?%deKSj_p)$$CpY~d3_N%eP434t4_qua=$i{Or5=h5noO-5PK$@alTg1jaEEo z`r7NvVQe6Z^K}ud!O#WaWO*oZ2L&gLud}Qbd>XKEUm2Av#pPDq@+-3hExW+x!iMeH zQ>;A<{*_39WkRUywg-AmuJU&6Q1%eIj@|Y>*n{P=8Ic?49=9RPndq8g-P+K58?$ZY zmF7{Wlgn$J^>j#wb&nKhh_Z)s#&PBNndz@=YnW!=xrKAuStu{Kk&4z=&QVIuwR$X9 z3e_>2J$3f`N_9>LrqMHU+RbX@tZi{L=;nflbY~Cq49zg3b5}3YwXBex6#<#sGu+iG zhb-*Fur`XdT*{4gmaBl*!l^EzzF;BQ0ZfFbwV@uj3ho&GzD%GyijHK8X~T&;WkT5( zUqF_frFJ@yHYkX8eLdKO4cnMC$Xuw5!QkW@!pdel3zL@@y9QzC3zQa&&cc;d2e-}> 
z8L?F}&%+h$3ZX`{)7lfWGsu{bvs7Uju4D6b&Qg7aOC0MqjZwp_%L@5rT}#h-W!Da~ zvd5re&~SiejWJ!8YL7wVSTS1_*J-{nG7lMbop~$J+h0&gK-cu5pB|gt)Ks5#+5+8# z(1rFU&YLfWJEB%qxuAK5a)Q#;XtcCP&BR>KpxOet(6Jn2m(JR<$3~BOYQu~D#q7md z#JxFFwP7)vi>Tu~4CJPcp~E)g%WYgm#AUG3=O9T%bu0?F))!gDnmeY#W!lg+vi~{l z_S&$0P}-&Cvc9aXF>?Bnp*7Ps7&_Q>sj^|<9BpnFu|A(Tjj6Ra5p~vMCd5;3vWMvt zG!SXhDkaKkMPP0+fm1j{tk{y!HZ>elGkqF;M6ssJM^u0-i+1^*S;MISP%yEx87wht zJ!fCMj9F`RQFmp~yDqZkdi`=fx~a&Nv^jIKp=m|Je@x6g$|W)llFErndu(+duC(XC zuSzo>oL_^$+JL;KOeF(wNNP9HYnaNH1u?3gBhqD>hYfc<5lu!q1UaL39AepLP!z;Y zy0Q#!k9T<&Ak>lbJmeF`94mw=*MfES*{Vr4UzU|W!mO;r^_(W`vEk;aa@39+?SK<&JOey)FjSSP|dR z47bvrBRnKS-$IE25msQfO?|IB4iOOR>Z_c*OsxF}-R5z1L!2vJs@ooY0ViN(RT=b@ z2PY5^xcKUbp`9W&8yXuXM;&=1?$%f16>}NscWm3Ro9vz*EZQOTqAJR+yT>m#K1C0R zNhN3uZZKjRq2}nT3ySXQy4|U>!F&EPuR=MKUp>K@iPcrbIPGLCmPuvMt?ap2Re5Z; zQ4Ue2De>+!;YPK(;(4vFt$KGf(|!4wGZQ+3c?_Ly>xMRX+$&MWm%6g zL6hH#i}A~sc@3+YM3vEP$f%^DeJOWtlG(>`CwJH5FsWRW@=`s8uK}1b8_lH{Hq`96 zs`N|T^XP(I1Cpp`*etUoMRR>2+lHU6s))L;F(g+Uv_I9@z}$t7O@YQB_qe49X+9zM zBwO~vn&sCNuQkZtAxAdU^d`{otm(WPl;r>h)tFjI_-`9`*8DS^<(hI5K~s5oaKBdJ6~mu)pr|i z-(7R9nm@P}i|cXoYfly`z7%kk&cu?*tPQ8QQO$Y6eLGRck!_B?795fpnPc|d)bdMt z-kc#q6^;H9Twa!2JDl+2ml(m!yJVBRFEOb}gCdJc8(ed)s~ctrHt0HIm3yU|FO7$s z#NDkdfDLFwJ1bz$PCFV_-5HTX*D~u>MFwejV<@nRv7ljBXHnFt+cqf?HqOxT@aCI_v84mtmy zrqT0w*s#d1vw(hmpw3&z=$wi3mm6zWCDZETsJ}Mz+OR`Z(l@?MO|~k)t7(TIBRghx z%NBh(1uI=I8I3hlWZj-+Q!Dm`nD`E9TAUq+T#3il^mI8bqcB28dC!dy;klMbZ{^K~drWk9!SSR2}iu8oDg$uBn%rRgE&&fUKXw$&ENzQ?1YYLeA7 zg15RoB&R8E(dGH8>fKz|65+i4bsaD@3%$Cbd33(PZ}_%0@Fd$CQ|DP%7N21XQLBwi zhljuKDml?CX7rs6*O{ve2aNXXOyag3&%nST>)L}?0dt`RGGW1(#|r6VxI(Zd-h|*Q z!+$V_oSB>IXYoue<0fv;Uz{@XGA&YKGj`5C#)xe&*E0Jyoq?D}$KXbRhYYaq<~ACM zd{KPDt|;Ib&$1Fubz1{ZV+JF3!dz%5mVFpwS#QW|!#zvYxHNV-Z9bgpHbxH(WJ>mN z5gY;s&F;hL9K|KHnd44pm{-AtRWHHEu-Vn6jj-E%qNb{{!o?bEjBMvlVJdINXK=<0 zUt-`Q(XV#t9F<~n+Rus^a@nq-d+J<{an_(hcZ0#*XPE(*cZsXkx5cB9ZdTFxo@GZF znK4F_`{j#MUVA)=kvY3CFD9=dpfw*`w!mdCUf9=*FZ9qdjXr#A>e8k>8bm&vEDwPc 
z$!)#55+SrgiVif5V@oAc5vEP+li;Mg6Bk4IQ6UpGh2WyQ0nd*umImaUVe(@UoMc^B zSCrLA93qgz9;bA7l6j1cI_k8~hn?Q^xe=|d$C7SlkXW_|z!oK7lQ1 z@0*0IE;>nFMnpeVJ8VR{f=KMgQfN1P!V`^swfA_W`etL6_p$6eF)rc>{py;YQ*c)M zbk{dF9to?YV=%Y_%(!*krz*qB6x`*G|Zfiiv8kN30!j{g7)5JISG;e49>m9@C>d4)1Z_ z(TIZDXwfCl58j(VHZQ7Pr4vBxm8wwXoeNdnAadI%@7IE`776Cmnf*RB7CqM{aO(P0 zx~#aKijnFb!`<1*%!>@YQxuoh;;I%KcxNx%FT1mI0<6c!)g?5?RqXhL#7#$-+E8Y0 zWJWN@U^jJuJNPVfA9j=XDrHup*vhDmhr{$_6eDKL_)Q%VTGyE!K!0$MQPNCjHD6r> zOG%43*!m!&)0;haI3t=fS)FNA=Rj62AQsVQf&PkeBO7ZwRUGkkl8bHnzKR_kAIEEe3V^~T)0m;IDM zZj+%p>rC6v>C`yt?XL=8FDaL~S=PD2He0)jiC32`PNte84r9(0-A*8LjZUE3HKO;g}*09l8zr*(uGnI}h2T3AtakjC*s% zJH$PPRhGA;S;xQodo|i2P31(qGqcp*Clj2WLO-0{)#OcA74#Hy=Hn!8NzJRm0&c(x zsb$t*Sy#wcfhy9^XdjCe-3=9Y=kd9@QtZ32Fw4jt)X7ZZTzosbEj~(+V8ls20XlR8 zju2H1_oBn+v!qs%|=cuJ2yH`Xos%Er`;$*z3I;L8sAqt{oAnwOox z^un9K?)Q9%S)5zcJ@Y2K35`W8^EtkWYVKUCEYNV?^H>+le^#ZKddhblYh5L07i%@T zFTFlq_c2+%U)Un4+EqsKJPAFt*K}r9E+4~Le`Z(2WPhS&rHQmU75LVHfAq@_e9ip5rvk^v z8MMOhhz{fI?yp5ov=W(7WTqY36sfTP@ zYv`e)gqMA0q>M_MX@8=`*@<3a_$FcIvagPlP`@*3)Gb=U&54yKYBU`ca5gXSRVu4X zfZ=5L@bQ3JudV1ryngv=HZ-B5;ys7Y){^16qUY-M!n3hTMMB#jeO{9l)3sN8lQ0*T z61_ZhXqO!-*NJ5zo*_H8$Y!CMM%bf;Ay?jms-3B1PSWSgi<*e_Ux&IyaHX0+o0)O@vOhk^sxG*$65K(j zxsW=m6T6Sk%o76L;fcPF&yFP;0(;ZXqKp7}nB40Zr3cXEBJ^bYN@*wGN6yF;p5+>G z8k6zBzPe;f8?&>(aWAyVZqNrW*m^$hTXjs7Zem|uS8fXOau~Q**K@Dh%iPX>`rHfx zo;{D#y{Xe}W=eUNk?*9nrIo$#dfhv4@(3EFSVXd*Xq)b}{oLMfH&bG`1@wrixdx;ttf=dMs8)c!#|oc8&M zrpw>%vhtsf_JLBw==n&i&Z5_wdQ~h|1yoT{pW1hnKoBVK?cCk9Z@aYS7N0)r84HZC z2l$tn>GzWtBe-qyuKg+#E>+cmeNi_(;4IJj>|^*W9byRFbzK)VH8_c8{<`dT+a_05 zo&5Nu{I|TW<(BU|hh%0?BjZrc^(+@xb;}HR>dqNXlU3z%`eW%?1j=<(yywg#8D}~c zC<0dOcZrfO5h1c3Wem7APYQ_h}|#B{_b_VxW)G!w9CBv+9! zs2Pk~gYXj?(ir93F%VfjAC?yo`j?`XCE-heZ9!u zYgVf{!Fvq*Vh9c^()eFUqgap|Wj1|JvE|N?qU%q~Zy&{h%fqK_cvdp zue}-}MA^$x-b|ZEbeI6Gb6%0M$I3_b-1Cw95W`Iq_wGjCrnu{ji18T(L~g;o$U;xM zI|yzpy0TeOo?g4wt*|#$CFererODcA=HqOOy|)4t-1c5Z8mZ3>oj2ew%E>wxH~y0! 
zpK{!s9cfa0*{d^r%3!pOw^RHQ>#TM=)VVG+F5+%zj(IQFv$Q{+WodsKg~iON8BM9$ zCz36wO5~~H(C#~?+b5~it=8)fX3jd>Bj3u$M|EUl%rw7!bPH)@sGe8XX6(td1~REH zz(S~_x?FmWC)1V^BBE}3=Bc`aW^+eFd~{yL9Qut_diCLG6LHCrRdb~b#!D9JEWQrX zF$@uNA)NJL93xYG0QyyG%e4(XZmibxOc;{t4l3biB+w;l&t*kJ?Gu2^N0H=G^zVd0 zwihjMWV6dNPp?#jx|7A+-1F}s%?z|LzP}#b>nu~Z4`4a zdnelg)v4Gw&z#ttE0~_a+#SwD_sOnPh~v!T&OOd< zscpNMUcEDu-bLi>aXa!&6i-R@1VlwW4bOvcVOp+i&XZ6PWbbWgRr|4^ICyAqk6e3Y zFtVz&G1tisv&`tM=(fY}wC?3{E!w~sJ`Cr&E+U7`Cs)?;XbFo%Xd(-_+(SZ#K-vM>KA3jJVtCP25QJfu2PY2a{f;`Ub42u`3b^_1XR1E{i zEWho?vNbBN9~69ix(uA8MVQp9Vk!k}&XBvFsF`YyVyz;%>q(fA4-qAfW0IA0MeAlrwXfewNK360PBhFT2I53r;LL-uW{tMk%nb z`Zt-gUdxhj$$`~SEU?UsaQ!l}z&*All{J0OhjJEHhKB$Q_zohwr1NWXdf(Kka)1gi zTvN}dz^r%<+n%#8sLm4Q%})CGv8t~FRt%?c8^>yBDwdU7M9x|q3d2~vlX&`1&WX1@=ionOb8#JYB&0%7q&0gzPPSV zjPf3z3w3QEZ-{$7mx^{tb=Kei=L@}&tJtcUWIcvgt?E@HvR(&u3r@kNJJ#cK*Ov|w zD;J)GjpRLoQuoxyf^jl3E2VSP=I*en3tE#V7xn?;}v5Pm^OE>FO5BYG(Dx?rf3YOq+T-y~e@)Fzj+uFLatnQ<8wHO>BvY*>WrIBjstwO{t0j@2_N zc~&p)_ILs!TMq~5Ja0Dor{?*f>`N3YG4qh&qVXuL!tMORG495+4U>4!KgV>7DTC4^ zA2M6>!7_5bXSRD~4-~m+rdG@cnsbdwVWiNjNBX;*xzrI^_3!675>Yf^RJw1=5DtxzM z1Ufj!#Gt3XY^k2<&C!j%eX1pB=Mw9Qvd_6M6&64l{+-^ll)8#dS6dG!`9`U+TkoTh zk>TVsSufc>OJ_O}IzsgHI=DAfo6^qQ*@wuX1=uQ&`PfpE`;rZD{`@Rkn9VqLt!PVU z=T#b(@r*cJw^&O$#6uZccjiauKyU0$i%Ixa`g$7PvEr5gMoPh>{UVYu>du^1}V zlwoZThNC3yRfkgCf-Reg#4RE!^pLmLyQOSt;G^nJ_f1_9r01Ej)- zW>=#>x%LJ*N|NYJ=en5eyNj0g!|*~Q`X5loRpbVR1ID)fIzRJXp4WG3lK<`XP1Hd4!9L-{7U%dIrJX9Kp-Kb;cyse#JSDejI0-3t`(_{KR#Zx^6AJS|e`+T2q#>%b6k zn{UR@b9JnV$jo_a%(inMHaO<(y5@HBHlhykb^#R9MJ-^bdD*v$hN>i&?*xEhINc77 zqrW>4d)WzX-9T>=a4^$mR83n;=Q?Jp_C}lgslq$Gx-cGBbAP$+Dtr~O(FXe>gK&0b z#O16NW0vHb5(d;hhD>eN2GTTQjpLX&r*-6r>vN3cmEweWXU@an4rZrNBbCogcEuiM zdMd817<(9H1pBGZ)51YVPr86YKQ%ZV$|mbL4m9T(Aa`Qdq4BcMNX;RpFy}d>A`57^ z@Q8XoBD0D~Jpzx9sM>wB!^rNkujir>$&ReY&_Y=-XVcZ*!`ZxG+U1hu!`WeF4uYPC zdzNd!pVu@U{rFtB8-`>~_b2N5I%*ar)cKU)Z5K^DaWybb1$y2(Gi0zYR?P0E!P{fM z*4a{4b57v} zubGd676_Ua%)?w~&%W9Nv``pL>zGuh-a 
z%27{4pDUM0V;!stBQ@15?}84=Y6{WkFzeH%TntP`EM$hA(LyyUdG5&&&ME>FrYHZAEU z(^1oWQDu~??#gcZw>0cJjmbv7obSx+?kw|7i`KEPu2xUEhMk&ud%1g8L&TQ0=?=Cg z;!qnh)w0gLFlZ;yP|n9P$D4`X8!wvksskQL^RWV+!@g|h!iAymsnqH!h7uwBesRJD z&*A*uJ=WPR&bHRenmn2Fm>RR1?Lmm~Y)sZ!#6s*gU_O(5*QNxlvYdSo(~I8AF%$QF zP&2wUaxUMgK3;Cc)OKa!A>)d!wmda50W-w8?R6crlm9P?W3>|%=lZVtF=Ws1s0!~z zfA6->WY%;Yc{iTJaRvi)Is8(dgsBw0C7N2-_;88`ZdG$i^L!dJ2VVo+x8ok>&JE(? z31o)dwVM-VtM+L)!AUNI6zA17brsc_=?wnOwylTizQj)0Y#ikd!w!~?Z|9Cs6VupY z-98NkkG8Mc9n5bF!!gCgM$F4DRu^2oaYCNeQI*O&BHH^^9~@$P()5<5R+|%HHVGC)Y~!KO|1P4~{f91qNO2$FgaLlc*K- zv0S5IGZwjz;ok5Zl$Ra-G`x&^LzZgAJRid$Q|@qel=sf~bgDJ!D5c7r6^tVjk=Hs; zrrd{BBQ=qMXGwHutH&u4#<4m(Rk*4%t7k!MAUeht&O5)avY*^!m$mgX@>4+jO3+4Eua@#)rWJGqIy zb7L`1bE9?XTI+1f9ay@Vj&B^3@2jywxOvD`%)6LlIl*ICU69+76@m9{eM_NAbuU`; zkQ)l)+OsC^@#*WxU^0||yR#F{gXHyG;6+U?wnmyQ1ATF)#7k#O|G`srI=8Z;qlo$7 zw9}Te*EQZ`uAXQq74uEy2`^L1uON` zxH&dUg^Ex1ixyBg%w?{l{8r>fc%rkOKDKPuWgv_5?x&Xs?p{?u`)2+S)`2ZuY<*iY z8#`MtQoDBN5DYU>>Ck7P%z7Az8h7Q3zE8zq*7Bt5u=4igQf`h!9Y}w&KsAfG?J%YQdtj2kZ zJTtY6^7t%hVyC;V zvS5qzPPNl8dCF(e9PMZ5vREI>viyhX82^T<0*@k|$c@v}M$8u$4XRFD5)8ilG*&>atG85$`?EwP#q&yVixocJ|Dd8e~pds&9(|!>J{A%_lQ!lUQ!?R3e~^@tii~39m${&jZ{;B=q$xZYnpgqrCDg>$AiIm#);bZD9t3McOGo$mBG= z4Zy;#Cn3o_qo&srPqFlpDNUM!C3y^aSM~Z4e2p`5XYJ1Av0~~e)^$4hVzTtPPXMP1&RNo7WmuU`i>RjeHayqnTFjoTfX z+WYuO_Gl|G+WYvpnj=${%XRID)QY{q&YEY*=$goGYN8y4oPgP~2A6GQTAhxLI9`GjZ- zR?l)dvva*s;5|N9tBR;_$){KQDUzgekl`o%V))89x3CGGXGX)4#pH24BVWagI-ILW zyPp+f?$KKxl^OnIACzZ8t~`mRC&6zj0^>%;_^GML(|ODi*sggvRb_7*(O2}0v9{;Z z#U)f@&g`gbj{=xwlFrqdsFFw3BE9agFE#JAGreM;LDGpFd-UgLahUmzNtDw)%WQKy zgOlgyjiPRJ^s`G_)_r`oyH{KGx#`E*>%7ZEaLXZQN?WL9dF*!^mKrX&TNo=vMm&s@#`rs+?$c0$ca7Ol%(Oa|Yhia5j3A#b#?>rDI8> z$eFl%`=aFJIZyWN;nPRts={@}=9`~7FQFqP$M#e8W|B?28S$LG-LC<_n$WGYcedP> z4zM#{uFi>4RHZoKMKR&?ajnc8eI#;77$8Lm<^1f)Cq%vk`XJ0OcciO3G z&+U%|Kq0$W^3MFKCc0eb)UnF{2lOl||H({cyh@uL2Bwi#aj!0_y;RP)SMNU9wm7w2 zJXrGYH>!(0-JYuSKHOJVZ69`sQp`S_I<%r$HOBI6bYH;Z6q>8li|Xr*%0f7JJsYmP 
zYt|-)%X)Q<=L9<0gSiiRI8N_*A6HjTpD9*DAOiIFo$5u5)V0SbJRk2+$Siko`%!hd zy?dQu660CzTYF~|eN1ONYJy(83nr_^*(WAPdPWA2%vnjJvSTC|3pXFq4`iG%;in-zx5iX*uto$5ny>ok~Q1g)Ay`no&VYoNF z3TE`&op+Jib;;LM4KW@^|Mt6-0D4_oTosL@QvG%jcXcD{sqHb8AZQgC^eC4{}T#HhcFY27@&DgSXbTue! zeJthaS?ho+?K%6-4l?Q)FtY!DPc3ZB~8(=cJ z>)~wi0|BV=X}DV>*@#)?C%bP^JIYS=YTe~Ep-QkB;bf2DuFsKHF>%!o+1qqHm{rf) zzmKXtzFSH)YOdaCe1UXP+^jyV8D6NHa;w4680RwEHlbKN{|=-shwH>?uZ-7=qe5{p zSSH_O+QNhGJQ=Lb%gtsFs1GH092Jv1~Zhm2kFR_HiOovy+E=l`7Y5xGK)!O~QkWJ5`Jj zvd$+rC)TTT!B3^ys_+AwN_guMOw@s+4%-nnL<+{bdB%z#s{kq$74T$bt zFMM@`V@6>pPpD7Pj-d6qsdQK!?2{SF$e1&}@>!@xZy}GVvIP^0L%+(r^s!uflunhc zXK2z=v zJd4$EwogMwFT3e7lA9pCVjqFkx-KkQ4_RKZF5&JKKMX}hP7+So6ECWQwae>><~>V? zM{w_;K0d|NjJxaEEq)`n`@$oWJUu=yaVjvwh+8}3zY+m?~EOHRo8%h{n^MFKtwyW-)r29jKxN| z4S>d{dT08QcdHsN@yix4c4x9?`ze-*s=~B?qR)IX7h;jG-+I|IujeW{r?hJ|(oFa; zo8=*U*-nV>#Cl~Dwhnd|eXf!t2vgv5+F1C<(?Z(~hBEp3^ z*D5bu!&NBoa^0~>E0f}u88Xo5;H(d!lurOgpKyjA1Yn&R<0 z`M{>eys+!DhD1jtQDrsOceJLL!}_&r%JLJnEkdLf8w<&H6~tC^ZQPxCJ*7?UY(Vzr zd0JQr4J&A#bFPckjc`FZ@}DWe(ZunPCXExPp+ zZ1bi%B4;x%Kgc%?P6)!3jBHFVG0Cl-bzPNiY7I;&V%qu?64%_E+;qaPF7!s-4hFVa zvBCyU?fz`nJ!EF|Ws6}*dNzuUny&E<_S?%vCE{G2gFKH$F2;l-N|9dqSkyT{RBgR; zqpFSoBF-EgnWKV`E&XikgWY^QcT%U8pvs%M4?V6Z? 
z{LI!RUuUbOM*4qCpJlc{)^rrTPfLBL={yQ|uP$k->CaJ;fA8n0Mc-cJ9K7P}n~9EX zM6CUv1#LCAW*S*f)r+b@^^Q93RdK3-rcpOBp70L2P#`nGd{IrtCd5@ow(GJb>vEiO z%)!XE%!KCZ^5WIitnGx)g{D%UsJ`gBlG|weS9eul$;Fx_Tzgj+XM<&8)9!nBb_-^+ z)mP@+Js6^K;YfemYPZWmWVA~m`uPlOdEh72yL-N%QCG%p>^wfIp2#4?9K9+ebY|x< zT=3(w5!IEIXlw6Yw4=RF&kk7oa&6WDrUP^RwvbUVxtE*!a;^7JDC83S`l2iXfvibf z@nyfMy#wGbW1vl;1498kk;O^GoaIT3KRfG%cJZm^=KlI~r7imeJqndnTGez1I}I zPiGQQET@LWcxu?PVL0+2zU%|R;JE4Qr}NB6mgp9e_TzJ|| z7SS2A^(I`Aoyv0pE#AFJv76l{?84%mQHl$OV_0s*S`AzHQrFtSkYBANuzOk8?&b5z z+IAxv7Mf>-dn zGQ)0D!+zN{P~S<3mR_YAV<4v)>W```M=&Dp4(|y~7ju^QVzUYFQ=QeYM;}G#X?UH8 zoy|jSKZZ9^XmT*r+m$-MSs1V1+&dvRnho@h%iJ=~#zGlcTXa&_zFAqrJ#ohrzk2gE zhKcQs8Q0~kL{~NhQqI2a5i<7)tKEl;PA=}y(Y3C9sXc~b>wtMt^|E%?Et7o6HAX3y zUVCvbsw*&~RoCgP&!@9`&R#RQro7gyhwgi?yI{J^<` z2M8HG3>P}C^gvwth`?<0`+jk#;u*PGvN zX1j`OvFnAjD*7=TL@;{LTk({?85inCY}M8$yUr^O4d;MY=^CiizIR^7o5R~NFv>Fs z1Nbz)3oWuzOW?!MqzwwhWP9Aq@Cq;$Uv|xT!tZqF+5rzJ+~tW-a5fU1HNNLg#C9P& zxw>%`6WOcI4RWc|mo1CH4bfFKWeb2YxJt=xJPTt?ugA@Zxo5GWzASsNmd~P!j0ve` z{LN*;I>`7TNU%P(Yz(+wwus%1VsaFHTb%DI3d}Qu?NHA{F7zvvf{U>76UB5D(?*)A zhvOFHU{YIgdQt6##++;@zAC!UVHa#N_f|{l%D}6CKFRen6B2C;jrKk_p26o zwNGiQvx)M|r+t*wGw5(vxX-fK<_CKw_;Rf>E+TrvU9|EU$*r%CGzt;0X z)P$-!eRIO&14qofrpLDb2g*_KxGAq@-K575HK}SAUnQbb z_jNhD4J3{35+9raji_4In5uI*cDms4n2EPjayt5fIM=05wy~mzXvU@d_w`+k2t!n4 z>rHrWrsEr}9r&qoa?5M$R#_*{gmd{g-*8#O>iDR!JFl!c2XDV=IqVEF86kgm)U_|6 zmW^F+63(toe-2uiPt;nC&IWWUcM!2AQrmfpg1dus1?^fE<3G=W3|xh3qA2eRzdV;HydyQw7WOwSGhYFkyVqyY0#W8n{i!v zbUA#vY{6zl86^GmK2W}LAYzK`IfO9Tlew+-ZuzngC#R`UW`3%Wwr8Ze?!$}PxWilz za`qk{=NXg_u}Y<^Dt0mo+q0|oU3hSnu@h4pQTG(Pt7=*RUh?fh+_gm(BggVsc?PS} z;-oh7&8i*I(Kn|Bo?^_1iH3BE^zT51$#%e=Nxhl=`dGRURj8G$k9~G-X~vB3{mVW( zQ8F2f%m4TsytGwmmT+AJV;7A*wNFALvu@VBt0Lpv3v%0k?Ahw+7gaHubT*hc-DQ@M z4rz=W$GUXz|L4P_RRL1sCJ5VFWAjF;+3lVXj@?D3Qm>E`FjG zAvLCO8S~WG8qn-`(X+4i_2lf%j@oT{x%Or-u(G#%uToE188X93+yBovbe0)CDO&V1 z|5U`4-6tX@dE{E>ip#>znQ3{Fzos&x5=x8T@pAbp=unE? 
zK`T~SQIZJ)F63hwoK+o4H~o;)pU;&}F#243<*8~O^7+lR7dW~;-8S;gYC%#|wYs!# z!pqTewaSQaJ!V)I2g=!UoUBiEL>VeDclt#2AH;CVD7`o zt9Y=j6&{~Pv@C==cX^(chJ7sLwnxsqzNVRKNTwc_d_J_~%({r(s%NP?G-|8u(Vm2E za!{nF5p$1Evyo*vuJS#2d!wW*A;qkYs?lheAZo0rQi zhAQRs6?Avndz;FdPOHX;^Z0FZPUyRj4@+9PChPjC;wE#Z9A*NtViTc6x7fR)Prr!_ zpmed${-P$-*hF$-`rT6|qnam9wo=w-Qoya7KSVy|pT#4WYCr?;f{#VoJBtxzh6DHb zt}Itv_qSydozb|ib?t+yELEqc=hiyg(scVaN%VSJTpo5Z@;E#11Q@m|c@wABCyLG% z!3x;t?ZK&v0{8iL&O@%U)0;$xEIyV=E)B$S+{;xLMO0*??xRneE&95r4ti=dA~6~2 zynIhG(_=aN66}r-pFWXrB~ws6n8ARM_Z}9hF<`pbwlF$J&BMZKaS33;D6^#khZ# z%p7cChVGg*%hJn5^acA7iaYm9B(92Gm>60OwShxodrwclsK)Aw5qXuLx`q`oQL*Z1 zPo1V_w^L>CeTrGID-Y9DRz|zal0sFN1v7o$0<;kqr9}9ZkIx2|yUcaeS!eBD+S&pp z-yCW^38*Z!hUL91t8Bs;ByszyxT6ZC*nEG##;JIiDkPxsY;5@AZd4m<3G9=`GkB%q z%>G$cE8r^Z@t)Ne!L68n4qNWmg4K?Sj$**s*Z$X^3(yRV!v;F8K|}d>%dOwHoWQHH==@FbDrNa?La{vQx-+m)VYNZR8zm`bBjXQw(QH zwpHD($^m<>VFirS$GkqQ%e!$;cZTMp8dSOi_OZn!n@krH4`*}^$12MOzFd|YV=_JV zz^lu2qg`x{IQ5VhC>qQgrVj4lE`$rvuxrhnYgtZm?T3)?8EDIK5SW>;l`s4CaO@AA zp*>?4xm~KMMfE#Lo@GO7-IVBD7q#n-H+nnd3BOp@N{RNYt!Kjrk0P`4CMRjm=htBD zay329?m=EoXgYVY-S^AEOMUAP%jbiXdg&r(OP-d4V@$fxPFar+t81>BUD|zB*k#+8 z7|iG)du-HY)ev@i$c0=F=MmBB$9Z(hP$xOwSYLCq+^gcTs^52q(=26m4xOo&ebInx z(o6?m_RAPWMTQG|-x=?;IJR_^{6zVt)V6jwq*q@GyKiJ>`IXI&#RQ|awN3W3K$_?3 zgxL95wyiM?Gbq@S5xRX0a>F}LkzSY(FYj!7KA6;GaN~FnC!5n-t48!a9E7TdOh7s3 zC@dycF_G(1Y+=@g`PFF5@mVxAgY22;$!Bp?74~2a^eU~WP8G-UZ){pxa>tE!1-otJAX z-#$vq%D$It3lKN`GW@9FqR%@ki4FRjf--H4QC@A)7xHwR$~o_QtI4HN?QdDD`t#^ckA?Q`F=va7nU1ZQ`5Y?-cwfiiL) za;1TmH4N&Lz4~_Ar-@~Ya_tYrc?=_2OxK>hRW1uL#`4)KTjZ`RWXt*4PdQ96t`t4M z%jF4yY7};ld1}n);teCB-dX3UIW{NUu(aEkEuN+a&m^pYEdt%>W#%;f+XJPPb3(XS zXM7s!dgj3OF9x;g`{Hz!?y9FHucEqw z1DcOjMPwBzkS_Y3CAZzpMju*oF96oqZ;CUf@+Dw$!bRApFxDB6P`H{DDQAbhdE}~L z+O7hnMY31e!|NBu!Of+jZ}4)B3p?ilPV3(r3-&g8@uJCd6;Rh@)s_AF%_=qqIvj$h zS39mW9(wy-2Oh&KUuQe2Tks^n&+?m_(y*-Xml!de>)g1?u@8f6Qk_9qJNtm#PGGnp zSuvs_I4ZkU`uAEQju{K=tZJP7S(HIz_vY&t*~@a8tuRyTLr(ULq7|NEPaP|3IyQS1 z=;u={Gsa$*7ay|MX^*nSWqm9w)k(9PLr)YZ&F7jl4ExpPyQxguT0 
z+#2Eaq9!6vF;ar_a*b6biX3X!8kUvul~%0cC%ZP^C(#aivs$|echeN&qF$~;R^FDP zCjGRmy4uRIj4|a5W_U9Q?3}Kqe3H}&?%b+JH*K7bWX&8M;Sb&y9d2MmXd&jfgB=g% zq9KLeeWycCx2fKps4avlHu{|7)n$t=VLdM*%6;!7IqJBM7(JYv zdt~D^0QW42OYQWn;p#EOJ$=y{WqxuolGDY3s^30scHa)i3@h8pH>XOXkS4LOu9{+} zl}k4L-9c>!1Tu?m&UzQva#m`_k4hQ-Uo&-Mq9bEw0&7*DrCoWe*OWe%X=mQhG>6`~ z-Fsek!YEXKe9l}C>k!mDhKg8>u^DX{ztoBpxW~S#cfWW!#sj=QMQ-pIN?nD^7Z~%9 zwY-pALR9G~rnMnsXL1j(@0*1g&S;}QJ>BgI|65S|scSDQ+n1`H_f*ghZf+xL%-s8Z zVthk$c2#rgS#lCEKoNQVKMqYwpfq~n<(>|Ly)|tI@pW)eRXL_dDBhjgmR>Hx(>t}U zbYK?K)5`Pevk^<3>LY?P%)-F*K-AR3_I8=#?>M`Y~ z#xZBISRNwnw}r}xS;33$pu#>>@yb8`mL$b6jRkhVdB3F0V6rFJt=BJW14Q_TeyBLB zp`5eI81SywoZYy9V>^@>dJ@D)p{_-S_p6bqA~&gFUb`+)eZth0uJ19VXOMwgIfifX%R97R-)d!f3{VTI6h?McuiYI<|_ZtM9>E}6aTl;%}22n5Tx2l)3J zHkXkGhe_4GD0xN<)El*!hpbJUUn0Acul6;K+f!ZUwUyU4u5~*uV)s>QJMWSxus&7X z&dO0*7U22O*`dpi6S_+Ou0R{0!B^c9Va~R+Qza~a>2+tLODfdG%=%lKqpk{l-pAZ| zXXa+;Pz*Hjjf||0a7M57J0X2Ipj6yF+S!#Z29uRBvjD7yM^!x2joCSx^D=Rq7IRRa z?CgN^*G7qSPnQFP5iace@u`x1g!JpZXMkwsv; zmz}ZO8Zb4{k24brafUo+{nC+}nn=a=)hBA3EJIA6cY}Ru9s=4Tq-A@SGZXFbVZFh?i^nj-+Mj&C>IE zx>hpb>yM8jPTsZl?b=s`Rbo5NHR*iE-`JQ{qQ;-@o{uwnNTe&4*Nwbgq4!eQ=JUCz zk+LUnsK@!R$6%dCdynBQP>!=zTvw4%1?E1N%QokXc80O6m3_VoZ(62s4IU`i_v53_ zPT6vuC3o3!UZQKkC-8l3=9Srw6L7s@Y=IYv0UXwE5)yOWvDJxd<@4F%DWq$%%{iGN zRpfDSKG`;jjdEaNGU`Q4Vjv9@-7qh96^+_sP3?s2o)-bI} zs+J6^`l*X2($WSd(kH-X0aJ=Y`Z%8#RUI@qcW(LSC(C$oA#*9sV^|Ayii*(TW$zxX zN;%um$D)`(H=Fi(Rg6&6L~khlO?>U_B#Uhv`(F0hy|#y>%X!_=%7=IQrNX_msvBhs zoOpd$y<8jTISr#1~3sZPet+jj?g|1IsM>}JcF^qe4xydLutb^%s_*!`B zE;WRmU%zY_>&e;Z+&59kRx@E%CGSf|**T3-7I5axA<&2%#Fp&oT!52CR)}+aK_Y|Qy&ntjP;p~l?|L(= zc>$MNV)o%o&qN_=ttMX7$aA*Yp%~j8W!SteGq+K%3h$iPc8uz%hqKNSk}GY6Jo9ks zPOiFh3Z4WUDeYpyDdyu-*|^L|IHP+Ci?G0)s_*92eplPGm4_6aosoSzrKoT_9L1;` zUAl&1+{dSf@3;X#yj;xSDkkH$bG*Bm2&@^+MX;QU^c<`w#`1uCKI|+WPTj_Nxu}{f zT;OKt6PwR6qex|OzI|FyZk6BoL3WqwLGsz&;#{(6KaZ zh5f#k^{A;#*BZXN3_JQJr`25O+u>|qK*~p?*)eQXS%V+DQE|{o^!pnZnMLNw+tDa(~!fd!X zA^&yMIay`Kq0&5_Wf{n^sZ;Bob^v7D99Vtk(~ea&G|?=%u8tj_s;1yx#=WSt4BJ4a 
zyD^@zPOAa58*}X&1$rziED9!7#g(D|Wd3{|^)v?k;zXTTVl<1j?!o7`<((A?8R7Gu# z<6ck4DMIe+9I+o9pI5<7`;5`e$G%=0p=XI#h04{q=ultd;h1}?T!kIH;3(;C#2$5( zo~o`z8ekIBpW1b6h7Yz}{mXuV)6i{ ztKak35I1Wazh$@87*c6Ny42wKZZ*!EZgkdO-FuFj0xfa$lc4n&*2$Y?;Ns29m5$MF zIiqmQJCSCL3D=4mSL28I4GxstxC*$P7s>HT@8Yz=fhT5=xouXUoI+a z>dbMzdAaJIjj3&uf4SP38|}zFJ;gM*OMhN zfEmJiIM>DmkqJJOb(Ag6=GriMwJ$1>bX%HZo^K8}QLzJ2knTn0jfKgrUQgn)0EAAR zU<9qkf>mtYvUu-F&zA0{+WR|9{%d_-ik06gW>zmdNfMC81lT9?5s z*7a7dyR((OhTQ3CeWFU|&(S4>MRcx?o^%b7wS6zDQ=8?#(JOhLx69c>DOztlK0S9+ z>;uDMJsg#{PmNc|^|`U_DS>uJx7)Awhv&LYa!&l@I?kalMkPJPMlbx@yszah`|xVR zRP!SC+sDFjGjQzm?+@3ZXmDkp%do>e zFzV>`)WW%JffVf?4(FZNm%dCZz_P2XhAqZZ?%FTzSqxmNnNET2D%=!{zQ|eN8>cdK zP3KC7KA+w#&b3!g@9ecEd8Gg=YxnUP37K($b=1oxe}a0>ej%&s*exWC)xf+;Ma{$& zt%usvVkvn`v;+t*YR`WYs5W~&#jNJ!9j0m>@U`dWb}M;@Q_Yt>A}1$+alg3F0(V>2 zlVV+dEY~blw<V_uFo;&O2;$lZq!IGD#QHpOsAFU zd*x+ZxHa2vA*a5+E!_jfK8m(rYlCHi7|z*ZSXXv=QB&helbo(Dk2tnp2eWeS&S56) zct>^iWMsOk<*Pl%1-tMFbde9|^{>WgCB?{5f~L2mlr8R#%dBi=`?Xy0^Kk+agng=B z?JH)$8^aFhXL;KayNFH~`Er?TL%|7OSN%8Z1p7bkfBbFX@8Wd_rlAf5H2{ z`s3H1B7gey?ZJPE`tkmAX5XLxIO3ss@sHxZZ9n<^xlcd*e!Tx%zhAEW^7>Cl{AI7dwECCU z{&cQq{dv^SKfksA+|M`4_vGkLSKUshN=}&w7()!Eme;?t8$BTSF`Stk!u>N@eUdg|>Lw@-F>GvN9KURNv z{lost_rJWmet7>Sw7<0WBmUoi|Bs>k-0SC`e~JH>)_&>xZ?Ez7msR-*{V($m;A{(Ck4aooSn^elf_rJoR-_d{0_u27B6+ZuGeeeJ0{NPQ;zxM4L^G{{KpPi5&-ap>IG5+}V z&#nDA`yWUD=db9>UJm>#`*)ST`hI_({`2U6D*C5PKjD7o`hNY%`lq6QzWP4@GKQ|_ z+5gnfFMWUV`Q^$luYdCVx1WCL`}6DX&)=^&8g zf6DuZ?U(OA5Pp8={$cxb@Azo&$E*DG><9XfwO?NUu>bk@Z=65B{vTQSk@d&>kCI{jh#teZT&&{rUI*|1{!P|B2H- zKmGdY^{3qL)BE{p?Wa$F&i=#W-&_CTm*(`-=O3T&FLU*c^8K3sdS|3p@h_~u-@jjf zME`jIw(?&*f3iNy_u3EJzj=SZg!;$7|IOe0;qQOrUqk)vum3;)^B@22?|$`v*!=(c e%|HGAxBvLNU;TCbRsA3T>;Df6M`nvkw*UYf3v2NJ literal 0 HcmV?d00001 diff --git a/tests/test-data/GCA_000961135.2.hp.sig.gz b/tests/test-data/GCA_000961135.2.hp.sig.gz new file mode 100644 index 0000000000000000000000000000000000000000..40cbabca6820b0bc44387b6816a5cc11ae9896d7 GIT binary patch literal 339 
zcmV-Z0j&NXiwFP!00004|7}rAPQx$|y-MVDi2VO#S9QT1K&YC=lhz_}N*p6p)Vnib z!3xQX@yzr4cwWERFhMD-XJx7D4CQ(%>5^fqE3%g13~8bvmDhbX);w%!$<*TAOb_a6 zH4Vo!VTKv0|9m{0IOo9%p-d-Fw{_VtsQ;`N)>E`xXV&wUy_a++kfO36M%HWDvKhWk zSIWftAf-}Ha%H{I&RK%ZDen|Q2RhbpZ~2Z0tu{_N8I<8cdg+5{tro%?FRd|38oIPj ziNJ&NQkdXE2>gxIqA9x@7GgxFtsVzyf%Bm7JPw{fK0*{DQVKZN93c<5TQDRcly9{b lZE$v{S|%Log!I?f8+%aa6W8*)6yE8_+Yi1@;5*a-005%Zp*;Wq literal 0 HcmV?d00001 diff --git a/tests/test_gbsketch.py b/tests/test_gbsketch.py index 8c1fd92..9ea7578 100644 --- a/tests/test_gbsketch.py +++ b/tests/test_gbsketch.py @@ -494,3 +494,51 @@ def test_zip_file_permissions(runtmp): print(f"File: {zip_info.filename}, Permissions: {permissions}") # check permissions are 644 (rw-r--r--) assert external_attr == 0o644 + + +def test_gbsketch_protein_dayhoff_hp(runtmp): + acc_csv = get_test_data('acc.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') + sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') + sig3 = get_test_data('GCA_000961135.2.hp.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein') + ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff') + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') + + runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str',"protein,k=10,scaled=200", + '-p', "dayhoff,k=10,scaled=200", + '-p', "hp,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + assert sig.name == ss1.name + if sig.minhash.moltype == 'protein': + assert sig.md5sum() == ss1.md5sum() + elif sig.minhash.moltype == 'dayhoff': + assert sig.md5sum() == ss2.md5sum() + elif sig.minhash.moltype == 'hp': + assert sig.md5sum() == ss3.md5sum() 
+ assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 2 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" + acc, name, moltype, md5sum, download_filename, url = fail_lines[1].strip().split(',') + assert acc == "GCA_000175535.1" + assert name == "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14" + assert moltype == "protein" + assert download_filename == "GCA_000175535.1_protein.faa.gz" + assert url == "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/175/535/GCA_000175535.1_ASM17553v1/GCA_000175535.1_ASM17553v1_protein.faa.gz" + diff --git a/tests/test_urlsketch.py b/tests/test_urlsketch.py index 6967849..edfe92d 100644 --- a/tests/test_urlsketch.py +++ b/tests/test_urlsketch.py @@ -351,3 +351,44 @@ def test_zip_file_permissions(runtmp): # check permissions are 644 (rw-r--r--) assert external_attr == 0o644 + +def test_gbsketch_protein_dayhoff_hp(runtmp): + acc_csv = get_test_data('acc-url.csv') + output = runtmp.output('simple.zip') + failed = runtmp.output('failed.csv') + + sig1 = get_test_data('GCA_000961135.2.protein.sig.gz') + sig2 = get_test_data('GCA_000961135.2.dayhoff.sig.gz') + sig3 = get_test_data('GCA_000961135.2.hp.sig.gz') + ss1 = sourmash.load_one_signature(sig1, ksize=30, select_moltype='protein') + ss2 = sourmash.load_one_signature(sig2, ksize=30, select_moltype='dayhoff') + ss3 = sourmash.load_one_signature(sig3, ksize=30, select_moltype='hp') + + runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output, + '--failed', failed, '-r', '1', + '--param-str',"protein,k=10,scaled=200", + '-p', "dayhoff,k=10,scaled=200", + '-p', "hp,k=10,scaled=200") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + + assert len(sigs) == 3 + for sig in sigs: + assert sig.name == 
ss1.name + if sig.minhash.moltype == 'protein': + assert sig.md5sum() == ss1.md5sum() + elif sig.minhash.moltype == 'dayhoff': + assert sig.md5sum() == ss2.md5sum() + elif sig.minhash.moltype == 'hp': + assert sig.md5sum() == ss3.md5sum() + assert os.path.exists(failed) + with open(failed, 'r') as failF: + fail_lines = failF.readlines() + print(fail_lines) + assert len(fail_lines) == 1 + assert fail_lines[0] == "accession,name,moltype,md5sum,download_filename,url\n" +