diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7abfcd167afda..8163da022e3d5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -316,6 +316,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 765a4ffcd2cb9..35965c90ee7fb 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -130,6 +130,12 @@ def handle_warning(invalid_row): ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index d72174c40478e..d6eab59074dd6 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -684,3 +684,21 @@ def test_header_delim_whitespace(all_parsers): result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="string[pyarrow]", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") + tm.assert_frame_equal(result, expected)