Skip to content

Commit

Permalink
Merge pull request #2375 from phw/charset-detection-fixes
Browse files Browse the repository at this point in the history
Charset detection fixes
  • Loading branch information
zas authored Mar 28, 2024
2 parents 7d97ff1 + 3cc33b5 commit 8af19d4
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
5 changes: 3 additions & 2 deletions picard/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1167,10 +1167,11 @@ def strxfrm(string):


# Byte-order marks mapped to the codec name reported for each of them.
#
# Entry order is significant for any consumer that tests these keys as
# prefixes in insertion order (presumably detect_file_encoding does this;
# confirm against its implementation): the UTF-32-LE BOM begins with the
# same two bytes as the UTF-16-LE BOM, so the 4-byte marks must be listed
# before the 2-byte ones or UTF-32-LE data would be reported as UTF-16-LE.
ENCODING_BOMS = {
    b'\xff\xfe\x00\x00': 'utf-32-le',
    b'\x00\x00\xfe\xff': 'utf-32-be',
    b'\xef\xbb\xbf': 'utf-8-sig',
    b'\xff\xfe': 'utf-16-le',
    b'\xfe\xff': 'utf-16-be',
}


Expand Down
Binary file added test/data/eac-utf32le.log
Binary file not shown.
15 changes: 12 additions & 3 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
album_artist_from_path,
any_exception_isinstance,
build_qurl,
detect as charset_detect,
detect_file_encoding,
encoded_queryargs,
extract_year_from_date,
Expand Down Expand Up @@ -949,8 +950,8 @@ def test_detect_file_encoding_bom(self):
boms = {
b'\xff\xfe': 'utf-16-le',
b'\xfe\xff': 'utf-16-be',
b'\00\00\xff\xfe': 'utf-32-le',
b'\00\00\xfe\xff': 'utf-32-be',
b'\xff\xfe\x00\x00': 'utf-32-le',
b'\x00\x00\xfe\xff': 'utf-32-be',
b'\xef\xbb\xbf': 'utf-8-sig',
b'': 'utf-8',
b'\00': 'utf-8',
Expand All @@ -960,7 +961,9 @@ def test_detect_file_encoding_bom(self):
f = NamedTemporaryFile(delete=False)
f.write(bom)
f.close()
self.assertEqual(expected_encoding, detect_file_encoding(f.name))
encoding = detect_file_encoding(f.name)
self.assertEqual(expected_encoding, encoding,
f'BOM {bom!r} detected as {encoding}, expected {expected_encoding}')
finally:
f.close()
os.remove(f.name)
Expand All @@ -970,6 +973,12 @@ def test_detect_file_encoding_eac_utf_16_le(self):
file_path = get_test_data_path('eac-utf16le.log')
self.assertEqual(expected_encoding, detect_file_encoding(file_path))

def test_detect_file_encoding_eac_utf_32_le(self):
    """Check that the bundled eac-utf32le.log fixture is detected as utf-32-le."""
    log_path = get_test_data_path('eac-utf32le.log')
    detected = detect_file_encoding(log_path)
    self.assertEqual('utf-32-le', detected)

@unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
def test_detect_file_encoding_eac_windows_1251(self):
expected_encoding = 'windows-1251'
file_path = get_test_data_path('eac-windows1251.log')
Expand Down

0 comments on commit 8af19d4

Please sign in to comment.