From e0ba1a03de076478354c3804c627be10e3f61042 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 5 Jul 2024 16:27:07 +0200 Subject: [PATCH] Display binary data even if text data is expected I've been going back and forth on whether we should raise an exception or not; however, I think the advantage here is that often you can tell from the binary data what the correct datatype is (e.g. BAM files assigned as fastqsanger.gz will start with "BAM"). We could also raise a message exception that just says "this isn't text data, check your datatype" ... I don't know what's better, but this is easier. Fixes https://sentry.galaxyproject.org/share/issue/a8843884527f4e4089b32fd14a2f126d/: ``` UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 4: invalid start byte File "galaxy/web/framework/middleware/error.py", line 167, in __call__ app_iter = self.application(environ, sr_checker) File "galaxy/web/framework/middleware/statsd.py", line 29, in __call__ req = self.application(environ, start_response) File "/cvmfs/main.galaxyproject.org/venv/lib/python3.11/site-packages/paste/httpexceptions.py", line 635, in __call__ return self.application(environ, start_response) File "galaxy/web/framework/base.py", line 174, in __call__ return self.handle_request(request_id, path_info, environ, start_response) File "galaxy/web/framework/base.py", line 263, in handle_request body = method(trans, **kwargs) File "galaxy/webapps/galaxy/controllers/dataset.py", line 152, in display display_data, headers = data.datatype.display_data( File "galaxy/datatypes/sequence.py", line 785, in display_data "/dataset/large_file.mako", truncated_data=fh.read(max_peek_size), data=dataset File "<frozen codecs>", line 322, in decode ``` Which is a BAM file assigned to fastqsanger.gz --- lib/galaxy/datatypes/sequence.py | 2 +- lib/galaxy/datatypes/tabular.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index 
911a1a2db31a..71fbc0ec8d9c 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -774,7 +774,7 @@ def display_data( ): headers = kwd.get("headers", {}) if preview: - with compression_utils.get_fileobj(dataset.get_file_name()) as fh: + with compression_utils.get_fileobj(dataset.get_file_name(), "rb") as fh: max_peek_size = 1000000 # 1 MB if os.stat(dataset.get_file_name()).st_size < max_peek_size: mime = "text/plain" diff --git a/lib/galaxy/datatypes/tabular.py b/lib/galaxy/datatypes/tabular.py index 5b4e1d523e12..ea381d1f9c0b 100644 --- a/lib/galaxy/datatypes/tabular.py +++ b/lib/galaxy/datatypes/tabular.py @@ -154,12 +154,12 @@ def get_chunk(self, trans, dataset: HasFileName, offset: int = 0, ck_size: Optio ) def _read_chunk(self, trans, dataset: HasFileName, offset: int, ck_size: Optional[int] = None): - with compression_utils.get_fileobj(dataset.get_file_name()) as f: + with compression_utils.get_fileobj(dataset.get_file_name(), "rb") as f: f.seek(offset) ck_data = f.read(ck_size or trans.app.config.display_chunk_size) - if ck_data and ck_data[-1] != "\n": + if ck_data and ck_data[-1] != b"\n": cursor = f.read(1) - while cursor and cursor != "\n": + while cursor and cursor != b"\n": ck_data += cursor cursor = f.read(1) last_read = f.tell()