From e0ba1a03de076478354c3804c627be10e3f61042 Mon Sep 17 00:00:00 2001
From: mvdbeek <m.vandenbeek@gmail.com>
Date: Fri, 5 Jul 2024 16:27:07 +0200
Subject: [PATCH] Display binary data even if text data is expected

I've been going back and forth on whether we should raise an exception
or not. However, I think the advantage of displaying the data anyway is
that you can often tell from the binary data what the correct datatype
is (e.g. BAM files assigned as fastqsanger.gz will start with "BAM").
We could also raise a message exception that just says "this isn't text
data, check your datatype" ... I don't know what's better, but this is
easier.

Fixes
https://sentry.galaxyproject.org/share/issue/a8843884527f4e4089b32fd14a2f126d/:
```
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 4: invalid start byte
  File "galaxy/web/framework/middleware/error.py", line 167, in __call__
    app_iter = self.application(environ, sr_checker)
  File "galaxy/web/framework/middleware/statsd.py", line 29, in __call__
    req = self.application(environ, start_response)
  File "/cvmfs/main.galaxyproject.org/venv/lib/python3.11/site-packages/paste/httpexceptions.py", line 635, in __call__
    return self.application(environ, start_response)
  File "galaxy/web/framework/base.py", line 174, in __call__
    return self.handle_request(request_id, path_info, environ, start_response)
  File "galaxy/web/framework/base.py", line 263, in handle_request
    body = method(trans, **kwargs)
  File "galaxy/webapps/galaxy/controllers/dataset.py", line 152, in display
    display_data, headers = data.datatype.display_data(
  File "galaxy/datatypes/sequence.py", line 785, in display_data
    "/dataset/large_file.mako", truncated_data=fh.read(max_peek_size), data=dataset
  File "<frozen codecs>", line 322, in decode

```
The dataset that triggered this error is a BAM file assigned to fastqsanger.gz.
---
 lib/galaxy/datatypes/sequence.py | 2 +-
 lib/galaxy/datatypes/tabular.py  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py
index 911a1a2db31a..71fbc0ec8d9c 100644
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -774,7 +774,7 @@ def display_data(
     ):
         headers = kwd.get("headers", {})
         if preview:
-            with compression_utils.get_fileobj(dataset.get_file_name()) as fh:
+            with compression_utils.get_fileobj(dataset.get_file_name(), "rb") as fh:
                 max_peek_size = 1000000  # 1 MB
                 if os.stat(dataset.get_file_name()).st_size < max_peek_size:
                     mime = "text/plain"
diff --git a/lib/galaxy/datatypes/tabular.py b/lib/galaxy/datatypes/tabular.py
index 5b4e1d523e12..ea381d1f9c0b 100644
--- a/lib/galaxy/datatypes/tabular.py
+++ b/lib/galaxy/datatypes/tabular.py
@@ -154,12 +154,12 @@ def get_chunk(self, trans, dataset: HasFileName, offset: int = 0, ck_size: Optio
         )
 
     def _read_chunk(self, trans, dataset: HasFileName, offset: int, ck_size: Optional[int] = None):
-        with compression_utils.get_fileobj(dataset.get_file_name()) as f:
+        with compression_utils.get_fileobj(dataset.get_file_name(), "rb") as f:
             f.seek(offset)
             ck_data = f.read(ck_size or trans.app.config.display_chunk_size)
-            if ck_data and ck_data[-1] != "\n":
+            if ck_data and ck_data[-1] != b"\n":
                 cursor = f.read(1)
-                while cursor and cursor != "\n":
+                while cursor and cursor != b"\n":
                     ck_data += cursor
                     cursor = f.read(1)
             last_read = f.tell()