From 22df68eb1c3a5ab3b75b8e23531d9181798374fe Mon Sep 17 00:00:00 2001
From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com>
Date: Tue, 12 Nov 2024 05:16:00 +0800
Subject: [PATCH] BUG: `read_csv` with chained fsspec TAR file and
 `compression="infer"` (#60100)

---
 doc/source/whatsnew/v3.0.0.rst        |   1 +
 pandas/io/common.py                   |   3 +++
 pandas/tests/io/data/tar/test-csv.tar | Bin 0 -> 10240 bytes
 pandas/tests/io/test_common.py        |  14 ++++++++++++++
 4 files changed, 18 insertions(+)
 create mode 100644 pandas/tests/io/data/tar/test-csv.tar

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 89bc942cb7250..de69166b8c196 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -784,6 +784,7 @@ Other
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
+- Bug in :func:`read_csv` where reading a chained fsspec URL to a TAR file with ``compression="infer"`` failed with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 8da3ca0218983..e0076eb486976 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -584,6 +584,9 @@ def infer_compression(
     # Infer compression
     if compression == "infer":
         # Convert all path types (e.g. pathlib.Path) to strings
+        if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer:
+            # chained URLs contain ::
+            filepath_or_buffer = filepath_or_buffer.split("::")[0]
         filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True)
         if not isinstance(filepath_or_buffer, str):
             # Cannot infer compression of a buffer, assume no compression
diff --git a/pandas/tests/io/data/tar/test-csv.tar b/pandas/tests/io/data/tar/test-csv.tar
new file mode 100644
index 0000000000000000000000000000000000000000..c3b3091348426791f9bb09e2cbd8196465074c49
GIT binary patch
literal 10240
zcmeIy!3u&f9LMpUeTtqy_ur;VBIww$SCCo|(Ir>(_-P`9P?s<des^(m<NwzW{FY{E
zw%JvWZKw|6szw>Q8|uSTYwP;G_K4D=jTp6fjPf;uqPIF$*QWj8^<0)_xwypBC9K6;
zZFE^mnfhkF%xy9kgE{|a40TNR^?gi(Hq?ddGVY7K%er~byjSA9Xepd|<zGzR<o_?3
z?DKyP-wm2;;&>cWX`0%yGKpQeX`0g&0R#|0009ILKmY**5I_I{1Q0*~0R#|0009IL
SKmY**5I_I{1Q0;rXMrbBPb;PX

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 10e3af601b7ef..4f3f613f71542 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -25,6 +25,7 @@
     WASM,
     is_platform_windows,
 )
+import pandas.util._test_decorators as td
 
 import pandas as pd
 import pandas._testing as tm
@@ -642,6 +643,19 @@ def close(self):
                 handles.created_handles.append(TestError())
 
 
+@td.skip_if_no("fsspec", min_version="2023.1.0")
+@pytest.mark.parametrize("compression", [None, "infer"])
+def test_read_csv_chained_url_no_error(compression):
+    # GH 60100 (fixes GH 60028): chained fsspec URL must not break compression="infer"
+    tar_file_path = "pandas/tests/io/data/tar/test-csv.tar"
+    chained_file_url = f"tar://test.csv::file://{tar_file_path}"
+
+    result = pd.read_csv(chained_file_url, compression=compression, sep=";")
+    expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}})
+
+    tm.assert_frame_equal(expected, result)
+
+
 @pytest.mark.parametrize(
     "reader",
     [