diff --git a/picard/util/__init__.py b/picard/util/__init__.py
index 39cfc785c0..0d1be5fac0 100644
--- a/picard/util/__init__.py
+++ b/picard/util/__init__.py
@@ -1219,6 +1219,11 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256):
         result = detect(f.read(max_bytes_to_read))
         if result['encoding'] is None:
             log.warning("Couldn't detect encoding for file %r", path)
-            result['encoding'] = 'UTF-8'
+            encoding = 'utf-8'
+        elif result['encoding'].lower() == 'ascii':
+            # Treat ASCII as UTF-8 (an ASCII document is also valid UTF-8)
+            encoding = 'utf-8'
+        else:
+            encoding = result['encoding'].lower()
-        return result['encoding'].lower()
+        return encoding
diff --git a/test/test_utils.py b/test/test_utils.py
index ffdbfcf160..e7ab0ddf54 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -992,6 +992,7 @@ def test_nested_with(self):
 
 
 class DetectUnicodeEncodingTest(PicardTestCase):
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_bom(self):
         boms = {
             b'\xff\xfe': 'utf-16-le',
@@ -1001,6 +1002,7 @@ def test_detect_file_encoding_bom(self):
             b'\xef\xbb\xbf': 'utf-8-sig',
             b'': 'utf-8',
             b'\00': 'utf-8',
+            b'no BOM, only ASCII': 'utf-8',
         }
         for bom, expected_encoding in boms.items():
             try:
@@ -1024,7 +1026,7 @@ def test_detect_file_encoding_eac_utf_32_le(self):
         file_path = get_test_data_path('eac-utf32le.log')
         self.assertEqual(expected_encoding, detect_file_encoding(file_path))
 
-    @unittest.skipUnless(charset_detect, "test requires charset-normalizer or chardet package")
+    @unittest.skipUnless(charset_detect, "test requires charset_normalizer or chardet package")
     def test_detect_file_encoding_eac_windows_1251(self):
         expected_encoding = 'windows-1251'
         file_path = get_test_data_path('eac-windows1251.log')
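
A minimal sketch of the behavior this patch introduces, assuming a Picard source checkout is importable and a detection backend (charset_normalizer or chardet) is installed; the temporary file below is illustrative only and not part of the patch. With the change, a pure-ASCII file that the detector classifies as 'ascii' is reported as 'utf-8', since ASCII text is also valid UTF-8.

# Sketch only: assumes picard.util is on sys.path and charset_normalizer
# or chardet is installed so detect() has a backend to use.
import os
import tempfile

from picard.util import detect_file_encoding

with tempfile.NamedTemporaryFile(suffix='.log', delete=False) as f:
    f.write(b'no BOM, only ASCII')  # pure ASCII bytes, no byte order mark
    path = f.name

try:
    # A detector typically reports 'ascii' for this content; the patched
    # function now normalizes that result to 'utf-8' before returning.
    print(detect_file_encoding(path))  # expected: 'utf-8'
finally:
    os.unlink(path)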