From 22df68eb1c3a5ab3b75b8e23531d9181798374fe Mon Sep 17 00:00:00 2001 From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com> Date: Tue, 12 Nov 2024 05:16:00 +0800 Subject: [PATCH] BUG: `read_csv` with chained fsspec TAR file and `compression="infer"` (#60100) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/common.py | 3 +++ pandas/tests/io/data/tar/test-csv.tar | Bin 0 -> 10240 bytes pandas/tests/io/test_common.py | 14 ++++++++++++++ 4 files changed, 18 insertions(+) create mode 100644 pandas/tests/io/data/tar/test-csv.tar diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 89bc942cb7250..de69166b8c196 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -784,6 +784,7 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :func:`read_csv` where reading a chained fsspec TAR URL with ``compression="infer"`` failed with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8da3ca0218983..e0076eb486976 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -584,6 +584,9 @@ def infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. 
pathlib.Path) to strings + if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer: + # chained URLs contain :: + filepath_or_buffer = filepath_or_buffer.split("::")[0] filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression diff --git a/pandas/tests/io/data/tar/test-csv.tar b/pandas/tests/io/data/tar/test-csv.tar new file mode 100644 index 0000000000000000000000000000000000000000..c3b3091348426791f9bb09e2cbd8196465074c49 GIT binary patch literal 10240 zcmeIy!3u&f9LMpUeTtqy_ur;VBIww$SCCo|(Ir>(_-P`9P?sQ8|uSTYwP;G_K4D=jTp6fjPf;uqPIF$*QWj8^<0)_xwypBC9K6; zZFE^mnfhkF%xy9kgE{|a40TNR^?gi(Hq?ddGVY7K%er~byjSA9Xepd|cWX`0%yGKpQeX`0g&0R#|0009ILKmY**5I_I{1Q0*~0R#|0009IL SKmY**5I_I{1Q0;rXMrbBPb;PX literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 10e3af601b7ef..4f3f613f71542 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -25,6 +25,7 @@ WASM, is_platform_windows, ) +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -642,6 +643,19 @@ def close(self): handles.created_handles.append(TestError()) +@td.skip_if_no("fsspec", min_version="2023.1.0") +@pytest.mark.parametrize("compression", [None, "infer"]) +def test_read_csv_chained_url_no_error(compression): + # GH 60100 + tar_file_path = "pandas/tests/io/data/tar/test-csv.tar" + chained_file_url = f"tar://test.csv::file://{tar_file_path}" + + result = pd.read_csv(chained_file_url, compression=compression, sep=";") + expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}}) + + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( "reader", [