REGR: fix read_parquet with column of large strings (avoid overflow from concat)
jorisvandenbossche committed Oct 25, 2023
1 parent 074ab2f commit dc7270d
Showing 3 changed files with 20 additions and 2 deletions.
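
Background on the regression: pyarrow's regular string type uses 32-bit offsets, so a single array can hold at most 2 GiB of character data. The code removed below combined all chunks of a column with pyarrow.concat_arrays before converting to NumPy, and once the chunks together crossed that limit the concatenation failed with an offset overflow. A minimal sketch of the failure mode (not part of the commit; assumes pyarrow is installed and several GB of RAM are free):

import pyarrow as pa

# Roughly 1.1 GB of character data per chunk; each chunk alone fits
# comfortably under the 2 GiB (int32 offset) cap of pa.string().
chunk = pa.array(["x" * 100_000_000] * 11, type=pa.string())

try:
    # Two such chunks together exceed 2**31 - 1 bytes of character
    # data, which a single pa.string() array cannot address.
    pa.concat_arrays([chunk, chunk])
except pa.ArrowInvalid as exc:
    print(exc)  # e.g. "offset overflow while concatenating arrays"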
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`)
 - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
 - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`)
+- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_212.bug_fixes:
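
For context on when this path is hit: writing a pandas "string"-dtype column records the dtype in the file's pandas metadata, so a plain round trip routes the read back through StringDtype.__from_arrow__, the method patched below. A small sketch (not part of the commit; assumes write access to the current directory):

import pandas as pd

# The "string" dtype is stored in the parquet file's pandas metadata,
# so read_parquet reconstructs the column via StringDtype.__from_arrow__.
df = pd.DataFrame({"col": ["a", "b", None]}, dtype="string")
df.to_parquet("example.parquet")
roundtrip = pd.read_parquet("example.parquet")
print(roundtrip["col"].dtype)  # string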
10 changes: 8 additions & 2 deletions pandas/core/arrays/string_.py
@@ -228,11 +228,17 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
+            results = []
+            for arr in chunks:
+                arr = arr.to_numpy(zero_copy_only=False)
+                arr = ensure_string_array(arr, na_value=libmissing.NA)
+                results.append(arr)
+
             if len(chunks) == 0:
                 arr = np.array([], dtype=object)
             else:
-                arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
-                arr = ensure_string_array(arr, na_value=libmissing.NA)
+                arr = np.concatenate(results)
 
             # Bypass validation inside StringArray constructor, see GH#47781
             new_string_array = StringArray.__new__(StringArray)
             NDArrayBacked.__init__(
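
Why the per-chunk conversion avoids the overflow: each chunk individually fits pyarrow's 32-bit offsets, and the NumPy object arrays produced by to_numpy store per-element pointers rather than a shared offset buffer, so np.concatenate has no 2 GiB cap. A standalone sketch of the same idea; chunked_to_object_array is an illustrative name, not pandas API:

import numpy as np
import pyarrow as pa

def chunked_to_object_array(chunked: pa.ChunkedArray) -> np.ndarray:
    # Convert chunk by chunk so no single pyarrow string array with
    # more than 2 GiB of character data is ever materialized.
    results = [chunk.to_numpy(zero_copy_only=False) for chunk in chunked.chunks]
    if not results:
        return np.array([], dtype=object)
    # Object arrays hold pointers, so this concatenation cannot
    # overflow any offset buffer.
    return np.concatenate(results)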
11 changes: 11 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -1141,6 +1141,17 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_string_column_above_2GB(self, tmp_path, pa):
+        # https://github.com/pandas-dev/pandas/issues/55606
+        # above 2GB of string data
+        v1 = b"x" * 100000000
+        v2 = b"x" * 147483646
+        df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
+        df.to_parquet(tmp_path / "test.parquet")
+        result = read_parquet(tmp_path / "test.parquet")
+        assert result["strings"].dtype == "string"
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
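
The test's sizes land the total character payload just past the 2**31 - 1 byte limit, so the old concat-based path would overflow; the arithmetic, for the record:

# 20 copies of v1, one v2, and 20 one-byte "x" strings:
total = 20 * 100_000_000 + 147_483_646 + 20
assert total == 2_147_483_666   # 19 bytes past the limit
assert total > 2**31 - 1        # int32 offset limit: 2,147,483,647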
