From cf5288c41c4cee71a1640c3f1229bb127171ed94 Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Tue, 12 Sep 2023 23:53:45 -0400 Subject: [PATCH 1/8] add pyarrow autogenerated prefix --- pandas/io/parsers/arrow_parser_wrapper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index bb6bcd3c4d6a0..63da9928aec8b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -100,6 +100,12 @@ def _get_pyarrow_options(self) -> None: ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header From 7d1738a2e4701326465d8f358a3e259076d29f10 Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Tue, 12 Sep 2023 23:54:15 -0400 Subject: [PATCH 2/8] whats new bug fix --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 249f08c7e387b..57226843aa167 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -188,6 +188,7 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :class:`pandas.io.parsers.ArrowParserWrapper` where ``usecols`` wasn't working when using pyarrow to read a csv with no headers (:issue:`54459`) Categorical ^^^^^^^^^^^ From 
cb39950ee19110a22caab1dc833f1a274731f7a4 Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Thu, 14 Sep 2023 21:23:53 -0400 Subject: [PATCH 3/8] test with no head and pyarrow --- pandas/tests/io/parser/test_header.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index d72174c40478e..4d262e4375e45 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -684,3 +684,21 @@ def test_header_delim_whitespace(all_parsers): result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(all_parsers): + parser = all_parsers + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="object", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]]) + tm.assert_frame_equal(result, expected) From b1d7353897c74ba6649e537206d1789973f6f3be Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Fri, 15 Sep 2023 07:43:23 -0400 Subject: [PATCH 4/8] only test pyarrow --- pandas/tests/io/parser/test_header.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4d262e4375e45..1c6526b7ca04a 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -686,8 +686,8 @@ def test_header_delim_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -def test_usecols_no_header_pyarrow(all_parsers): - parser = all_parsers +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only data = """ a,i,x b,j,y From 955e108f65661f170ec265e94208fb58f4f3427a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20=C5=A0=C3=ADcho?= Date: Wed, 13 Sep 2023 19:14:59 +0200 Subject: 
[PATCH 5/8] BUG: This fixes #55009 (`raw=True` caused `apply` method of `DataFrame` to ignore passed arguments) (#55089) * fixes #55009 * update documentation * write documentation * add test * change formatting * cite DataFrame directly in docs Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 59627f17d148a..cc41949ac4825 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -222,10 +222,8 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) -- Bug in :class:`pandas.io.parsers.ArrowParserWrapper` where ``usecols`` wasn't working when using pyarrow to read a csv with no headers (:issue:`54459`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) @@ -284,6 +282,7 @@ MultiIndex I/O ^^^ +- Bug in :class:`pandas.io.parsers.ArrowParserWrapper` where ``usecols`` wasn't working when using pyarrow to read a csv with no headers (:issue:`54459`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a 
Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) From e9af3208a2e317ec9093f7ef4587794e7417fa0b Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Tue, 19 Sep 2023 22:38:25 -0400 Subject: [PATCH 6/8] PR review feedback --- pandas/tests/io/parser/test_header.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 1c6526b7ca04a..d6eab59074dd6 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -696,9 +696,9 @@ def test_usecols_no_header_pyarrow(pyarrow_parser_only): StringIO(data), header=None, usecols=[0, 1], - dtype="object", + dtype="string[pyarrow]", dtype_backend="pyarrow", engine="pyarrow", ) - expected = DataFrame([["a", "i"], ["b", "j"]]) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") tm.assert_frame_equal(result, expected) From ee0fba4f34c5007bb64620490cf3e535eb93b746 Mon Sep 17 00:00:00 2001 From: Hedeer El Showk <144284759+hedeershowk@users.noreply.github.com> Date: Fri, 22 Sep 2023 18:15:40 -0400 Subject: [PATCH 7/8] Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index cc41949ac4825..2f6631bf4d351 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -282,7 +282,7 @@ MultiIndex I/O ^^^ -- Bug in :class:`pandas.io.parsers.ArrowParserWrapper` where ``usecols`` wasn't working when using pyarrow to read a csv with no headers (:issue:`54459`) +- Bug in :func:`read_csv` with 
``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) From f19b3e92dda9e95245f9af5fa9e4788c98ef90a0 Mon Sep 17 00:00:00 2001 From: hedeershowk Date: Tue, 26 Sep 2023 14:47:30 -0400 Subject: [PATCH 8/8] alphabetical whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c1c3addf4a2de..445b93705cde5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -313,8 +313,8 @@ MultiIndex I/O ^^^ -- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)