From 06ae9d7de9ef0071f7b0109151a61cd3aa636cdf Mon Sep 17 00:00:00 2001 From: Philipp Wolfer Date: Fri, 22 Mar 2024 07:48:38 +0100 Subject: [PATCH 1/2] Fixed utf-32 detection from BOM - the utf-32-le BOM was in wrong order - longer BOMs need to be checked first - added BOM for utf-8-sig --- picard/util/__init__.py | 5 +++-- test/data/eac-utf32le.log | Bin 0 -> 17932 bytes test/test_utils.py | 9 +++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 test/data/eac-utf32le.log diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 6af22fda20..214f8ffef7 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -1167,10 +1167,11 @@ def strxfrm(string): ENCODING_BOMS = { + b'\xff\xfe\x00\x00': 'utf-32-le', + b'\x00\x00\xfe\xff': 'utf-32-be', + b'\xef\xbb\xbf': 'utf-8-sig', b'\xff\xfe': 'utf-16-le', b'\xfe\xff': 'utf-16-be', - b'\00\00\xff\xfe': 'utf-32-le', - b'\00\00\xfe\xff': 'utf-32-be', } diff --git a/test/data/eac-utf32le.log b/test/data/eac-utf32le.log new file mode 100644 index 0000000000000000000000000000000000000000..12df8b62a61699c5feec86c0099b484c0481c1e8 GIT binary patch literal 17932 zcmd_vTaR5w5d~mB^D9b7Mv0Ugb}oR#!{AJaA=tro;&6KyCl2A_0J#z=za3bgw3pO6 zI>#Af&rA|OEOqVP)m5vit9tjD`R{)&T{^yW>C(T0li>Z}vtWXw;Pc>v;4i^Hf(fo| z@;@_v6-@AM@Z;cx;MHJ)4}(tv>mCFXTnVlQ^d`6&)F-`r!TrFx3FyDHj6aABI3ESL zOkj-|aJTj(sNHh^ICB&HJ>cul0gm3AV`G9tykE|{^8Qt1KG>8aGQZ8o%A5RLFacSf zeiq0BG$#4Piyd~!=04hFbGZI1;In*Rj6A_F0&8*kBIx@YoBUq89o>Gch2g^b4p)*T1W2hGnlyi4-!;}-nYWfLHYZT0czgD4g096y-uaRvz8_|b9km3_sWlU<`C*T* zL-k7kNux38RWaNQy0`e0(@Es!#{R28F^Fw~w}Lf?_E660O<+B_HumZBEw9bZME)j} zzuLw{3~vNDs%e;jU3EgXx8B)7m?&fy}byD^ww^KGrYG*8$1-8K7ob}#O2Yvcy^zXgqXBE!DD zurG4YA535!9*1&BpN?2tFXcL!yB6HqM$h=L$3D6K%;9e>-tw`_c`~=6G3@TeJKNfQ zoVqXat8VzkqkA=x&l`hncN88B9 zOAFNbF9RIgpZdnL{j&f2YIQPyQCM>;etE(LS9{*Pse#JgJr~S2C&t#fIq`}g*l6r) zy|KmL1oY(Hdwbd%Uv1K-!!|B@R=#0hJ+p1Sb?jH3$m?F~TQ=#<|B2>s<oEa0~c@p&dY3@m)+?*%& z8>efVtQK33?U~73b7-!6+Is1Cr~P9OfbG`cM1B~`2@moggSP|g*^vvfyzn={8-d)m zF8Q3`nZR!rb)sIL4cI#l%(KxNp2$amTCpGO@i#$pqIcB|8+^VMSmXV^xfe3e*L&Nz zyXSn^tyh_2Bj<|2*U6 z0AKr2fBcZWvwi~hSDUqEltVeHJ!_kn?&rg7;m3ylcjo;js4d*sxEI*h2f?G-*N=SQ zy7E*9_X9q}|1gf$${{;y<=4TDpnP#>n@+jo$u9fqz!|4LzR8zwK3nXwN3Ko1b3dpZ z^5x*rKH+*ZkW*lbUGdcBZjIJH{tvS)X8B-Gj%!c6yL+Y{aAuQzIZ)$9&)?*WpWPZU zCqH62RM$_M$C^)Pn%wHA;{ccDV9lvG^|Zcf?3W{V&&a+f@gi#9B_Gar_0%)rUSu)KL3vH&U0?j#Go$*Ek6r)G2RrgCr$+e}d;ND$#4y2WeDN#y ze2djNvie(N;!`~Q^6C88^@-z}dwTeO5v=*Z4b)<5ihcRxgYH@b`|O-{|M#uMa{{9n zaIA0o{8tB0?`NB{*WO!)GdZ(N?)PTOGhx=@(?jd{fW4(M@D-Zll(6z^r*>9YO z)`)v{&>H$akNe`@&BOY))8>`VOM$zY5nfp2iM;B*l4rVCgDU}Dc3%!ASam%g0zbR& z_O0cE?Vkqx@OzqAt(y(2-ACciZ}Z}JpIF(g?bYsNzS!m5-1NP1-&%U&GrAwk(P?6} zW;U#L<*a)}#-DEQ-m9*8y7JQ+5QlXWth%0yUCwKbRcGJY#wV|Q^K+V5*Id?a>jM9B z7Aqg(nV`C4HX17(x?*lkuDX-?V&%+6_50qqZ>`^e<@G4BmiuZ~tn4*bIuDDr`-*RM z&DHikzLWPW=SPXvK7#euopxX4b~cogoPk)y4PrI7kAGwJzBNowjZILyp)rx@J} zCwMdHJ9h0Q+b03P56j)#d2bEf<{SreL}bapwLVd=@kh_SUl) za2Ce|a@jod-8(6t{K>@x?*)haHAn1x6-;36);4;^iOnm4Sk#Vv^32B}+edkaJJ|ci zBNlw+s6M?DA0DvA^Gu%EXzcwRC0=^+;XQc*yq?`;&wmK62I|5dcz+y-;ne`oKLxXa zKXbRXXM0+khhn`J9sJfj-puo_18d+;@JVnI;M>}n$n00o^De&6{VI0x-VWrZJ!I~e z+g#5jb$wQSsy5i~`Efb&q5W1rkK3pGi2X_Ssybm;za9mxZ}V#OP&{z|I>74}5?z8M=b;5qn zmdlY3)xNrV61^-pPqLTU^L>TagZB0-k-tm5Y%lPaXE7ZGV)1;@z3g0hT6aXzxUg1oZxbB*8MJjPqN>A zw|%emx1QlAdf0g+2078cjl-Gb-9`7XoIb5REcOZR1}DK6!Tn%@qu|59H`$zf^8~xP zXW7H}op%rS+;CqM@8kAx>z=*#LVNnV@*VE|K`glIVX+v!yXYQvPkUN>SnLz%*Dr&U zz~A%r?8ia>2Ih`^mc5Dx`|X?a_}=z>a3`GLN%X3_zrIx8;F%xy&+o$b7T4oo{mz>6 z8~(iR%<`lj*WYixBfcT(^Zh`L_U}2JGyi!|J!{nQY5qUg{=X4@zl|oamMt)6tWVGU ztv)AJO6v1Kpy@Yyd3a(FZg{R zH@*XR13Yd8bZ|ck^b`FXf&W9^4)9TzeEEC4Sgrs>5PXERQ46z~u1l~ehiwudHb;mS6W;k^A{-Td+m=k5RH%`e|@ z-u_?N{PGRw?cZ6!Z*kuKoelMEJa7NKSCzwg`S1Ds^TfOYkGBHnpfh#?z25(8ko!&5 z|IVxloXgH%`Se`@^0h#Xd3NSHZ-73~@!mP$eeV-Tk*(DqdX%2;hiAI<_%qT2@7XfS oD|>Y48~HQu83%UpqF?^js7ZE=jfHOg8NIJPTqjubNS5RO0a!TU!vFvP literal 0 HcmV?d00001 diff --git a/test/test_utils.py b/test/test_utils.py index f3c997b28c..bc93ffd900 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -949,8 +949,8 @@ def test_detect_file_encoding_bom(self): boms = { b'\xff\xfe': 'utf-16-le', b'\xfe\xff': 'utf-16-be', - b'\00\00\xff\xfe': 'utf-32-le', - b'\00\00\xfe\xff': 'utf-32-be', + b'\xff\xfe\x00\x00': 'utf-32-le', + b'\x00\x00\xfe\xff': 'utf-32-be', b'\xef\xbb\xbf': 'utf-8-sig', b'': 'utf-8', b'\00': 'utf-8', @@ -970,6 +970,11 @@ def test_detect_file_encoding_eac_utf_16_le(self): file_path = get_test_data_path('eac-utf16le.log') self.assertEqual(expected_encoding, detect_file_encoding(file_path)) + def test_detect_file_encoding_eac_utf_32_le(self): + expected_encoding = 'utf-32-le' + file_path = get_test_data_path('eac-utf32le.log') + self.assertEqual(expected_encoding, detect_file_encoding(file_path)) + def test_detect_file_encoding_eac_windows_1251(self): expected_encoding = 'windows-1251' file_path = get_test_data_path('eac-windows1251.log') From 3cc33b5516b0ec13f98b2d0b4de50afaccc703c7 Mon Sep 17 00:00:00 2001 From: Philipp Wolfer Date: Fri, 22 Mar 2024 07:50:59 +0100 Subject: [PATCH 2/2] Fix tests for detect_file_encoding without charset-normalizer installed --- test/test_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index bc93ffd900..e95d66cf62 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -60,6 +60,7 @@ album_artist_from_path, any_exception_isinstance, build_qurl, + detect as charset_detect, detect_file_encoding, encoded_queryargs, extract_year_from_date, @@ -960,7 +961,9 @@ def test_detect_file_encoding_bom(self): f = NamedTemporaryFile(delete=False) f.write(bom) f.close() - self.assertEqual(expected_encoding, detect_file_encoding(f.name)) + encoding = detect_file_encoding(f.name) + self.assertEqual(expected_encoding, encoding, + f'BOM {bom!r} detected as {encoding}, expected {expected_encoding}') finally: f.close() os.remove(f.name) @@ -975,6 +978,7 @@ def test_detect_file_encoding_eac_utf_32_le(self): file_path = get_test_data_path('eac-utf32le.log') self.assertEqual(expected_encoding, detect_file_encoding(file_path)) + @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package") def test_detect_file_encoding_eac_windows_1251(self): expected_encoding = 'windows-1251' file_path = get_test_data_path('eac-windows1251.log')