Skip to content

Commit

Permalink
Always report ASCII as UTF-8 in detect_file_encoding
Browse files Browse the repository at this point in the history
This ensures that partially analyzed documents which are detected as ASCII
but contain UTF-8 encoded characters later in the file still get loaded. It
also standardizes charset detection between chardet and charset_normalizer,
which previously differed on the \00 test case.
phw committed Apr 30, 2024
1 parent 43f7603 commit 01a3ee3
Showing 2 changed files with 10 additions and 3 deletions.
9 changes: 7 additions & 2 deletions picard/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1219,6 +1219,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
result = detect(f.read(max_bytes_to_read))
if result['encoding'] is None:
log.warning("Couldn't detect encoding for file %r", path)
result['encoding'] = 'UTF-8'
encoding = 'utf-8'
elif result['encoding'].lower() == 'ascii':
# Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
encoding = 'utf-8'
else:
encoding = result['encoding'].lower()

return result['encoding'].lower()
return encoding
4 changes: 3 additions & 1 deletion test/test_utils.py
Original file line number Diff line number Diff line change
@@ -992,6 +992,7 @@ def test_nested_with(self):

class DetectUnicodeEncodingTest(PicardTestCase):

@unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
def test_detect_file_encoding_bom(self):
boms = {
b'\xff\xfe': 'utf-16-le',
@@ -1001,6 +1002,7 @@ def test_detect_file_encoding_bom(self):
b'\xef\xbb\xbf': 'utf-8-sig',
b'': 'utf-8',
b'\00': 'utf-8',
b'no BOM, only ASCII': 'utf-8',
}
for bom, expected_encoding in boms.items():
try:
@@ -1024,7 +1026,7 @@ def test_detect_file_encoding_eac_utf_32_le(self):
file_path = get_test_data_path('eac-utf32le.log')
self.assertEqual(expected_encoding, detect_file_encoding(file_path))

@unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
@unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
def test_detect_file_encoding_eac_windows_1251(self):
expected_encoding = 'windows-1251'
file_path = get_test_data_path('eac-windows1251.log')

0 comments on commit 01a3ee3

Please sign in to comment.