diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8ff7d290ea9d1..14abcdbfdb310 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . 
python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6759f7b82eacf..95b48ee4b7426 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -168,7 +168,7 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f8fa04a60a378..82f41811bf56e 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8bf5ea51c4bf3..8422aba5239f9 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -15,7 +15,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - 
pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 9a6c05a138f9b..695bd483fc572 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -18,7 +18,6 @@ dependencies: # causes an InternalError within pytest - pytest-xdist>=2.2.0, <3 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # pandas dependencies - python-dateutil diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d7e5523dd2545..b5df68968ea46 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -15,7 +15,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # required dependencies - python-dateutil diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 7f8cca9e65fe1..cb1771c485297 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 5d34940aa27e1..e4db8cb0a540b 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index f5ea116e51dd4..745781298b86a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index a489690776be3..ce02ea2b9a090 100644 --- a/ci/deps/actions-pypy-39.yaml +++ 
b/ci/deps/actions-pypy-39.yaml @@ -16,7 +16,6 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-asyncio>=0.17.0 - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a71523e8c3579..08a422c6d538a 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/meta.yaml b/ci/meta.yaml index 77f536a18a204..5b343f60b3dce 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,7 +64,6 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-asyncio >=0.17.0 - pytest-xdist >=2.2.0 - pytest-cov - hypothesis >=6.46.1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..6a70ea1df3e71 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE +PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2e299da5e5794..d3c9d83b943ce 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -408,20 +408,6 @@ raise a ValueError: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) -Note that this is different from the NumPy behavior where a comparison can -be broadcast: - -.. 
ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index baab20845a2a2..3af7a06428f0c 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,8 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.4 + v2.1.3 v2.1.2 v2.1.1 v2.1.0 diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index abbda2ffc9be2..5a668e5fabd9c 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -726,10 +726,10 @@ be broadcast: or it can return False if broadcasting can not be done: -.. ipython:: python - :okwarning: +.. code-block:: ipython - np.array([1, 2, 3]) == np.array([1, 2]) + In [11]: np.array([1, 2, 3]) == np.array([1, 2]) + Out[11]: False Changes to boolean comparisons vs. None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index c52f7db8ec3ec..6101333ae760c 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -52,4 +52,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.0..v2.1.1|HEAD +.. contributors:: v2.1.0..v2.1.1 diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 2c56a1481f0a9..38416afc1c94c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_212: -What's new in 2.1.2 (October 25, 2023) +What's new in 2.1.2 (October 26, 2023) --------------------------------------- These are the changes in pandas 2.1.2. 
See :ref:`release` for a full changelog @@ -28,7 +28,9 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) +- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) +- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: @@ -65,3 +67,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.1..v2.1.2 diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst new file mode 100644 index 0000000000000..af626895a9e0e --- /dev/null +++ b/doc/source/whatsnew/v2.1.3.rst @@ -0,0 +1,33 @@ +.. _whatsnew_213: + +What's new in 2.1.3 (November 10, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) + +.. 
--------------------------------------------------------------------------- +.. _whatsnew_213.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.2..v2.1.3|HEAD diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst new file mode 100644 index 0000000000000..3a72c0864d29c --- /dev/null +++ b/doc/source/whatsnew/v2.1.4.rst @@ -0,0 +1,46 @@ +.. _whatsnew_214: + +What's new in 2.1.4 (December ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_214.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v2.1.3..v2.1.4|HEAD diff --git a/environment.yml b/environment.yml index 98c39378d1ae1..3f6f0de68748b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - coverage # required dependencies diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e17d264333264..af27acf16b771 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -332,13 +332,16 @@ timedelta-like} # Shift the delta_idx by if the UTC offset of # the target tz is greater than 0 and we're moving forward # or vice versa - first_delta = info.deltas[0] - if (shift_forward or shift_delta > 0) and first_delta > 0: - delta_idx_offset = 1 - elif (shift_backward or shift_delta < 0) and first_delta < 0: - delta_idx_offset = 1 - else: - delta_idx_offset = 0 + # TODO: delta_idx_offset and info.deltas are needed for zoneinfo timezones, + # but are not applicable for all timezones. Setting the former to 0 and + # length checking the latter avoids UB, but this could use a larger refactor + delta_idx_offset = 0 + if len(info.deltas): + first_delta = info.deltas[0] + if (shift_forward or shift_delta > 0) and first_delta > 0: + delta_idx_offset = 1 + elif (shift_backward or shift_delta < 0) and first_delta < 0: + delta_idx_offset = 1 for i in range(n): val = vals[i] diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 11cf60ef36a9c..c1c70605d9648 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -13,6 +13,8 @@ ) import warnings +from pandas.compat import PY311 + if TYPE_CHECKING: from collections.abc import ( Generator, @@ -179,6 +181,11 @@ def _assert_caught_no_extra_warnings( # due to these open files. 
if any("matplotlib" in mod for mod in sys.modules): continue + if PY311 and actual_warning.category == EncodingWarning: + # EncodingWarnings are checked in the CI + # pyproject.toml errors on EncodingWarnings in pandas + # Ignore EncodingWarnings from other libraries + continue extra_warnings.append( ( actual_warning.category.__name__, diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 684e9dccdc0f9..6896945344b24 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -188,6 +189,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 51c9892b64a08..0552301b59400 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -11,6 +11,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") +np_version_gt2 = _nlv >= Version("2.0.0.dev0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" @@ -26,7 +27,7 @@ np_long: type np_ulong: type -if _nlv >= Version("2.0.0.dev0"): +if np_version_gt2: try: with warnings.catch_warnings(): warnings.filterwarnings( diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 12f58be109d98..be3a038d472ab 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,8 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") + pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -25,3 +27,5 
@@ pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True + pa_version_under15p0 = True diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e5683359c2fb9..43bc26f6eb637 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -923,7 +923,9 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + result = np.apply_along_axis( + wrap_function(self.func), self.axis, self.values, *self.args, **self.kwargs + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9c8f28d660450..a08f3ba44f417 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -552,6 +552,18 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 3249c1c829546..36d536bf86847 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed 
deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 58c638e7f59b5..a41214ae17044 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -223,11 +223,19 @@ def __from_arrow__( # pyarrow.ChunkedArray chunks = array.chunks + results = [] + for arr in chunks: + # convert chunk by chunk to numpy and 
concatenate then, to avoid + # overflow for large string data when concatenating the pyarrow arrays + arr = arr.to_numpy(zero_copy_only=False) + arr = ensure_string_array(arr, na_value=libmissing.NA) + results.append(arr) + if len(chunks) == 0: arr = np.array([], dtype=object) else: - arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = np.concatenate(results) + # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5903187769f08..39f0ddf172e33 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -548,6 +549,10 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -601,6 +606,13 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8ed57e9bf5532..9a9a8ed22f282 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -700,7 +700,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif 
issubclass(dtype.type, np.integer): - if not np.can_cast(fill_value, dtype): + if not np_can_cast_scalar(fill_value, dtype): # type: ignore[arg-type] # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) @@ -1892,4 +1892,25 @@ def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ if not len(rng): return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) + return np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype) + + +def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool: + """ + np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar + inference + + Parameters + ---------- + element : Scalar + dtype : np.dtype + + Returns + ------- + bool + """ + try: + np_can_hold_element(dtype, element) + return True + except (LossySetitemError, NotImplementedError): + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1614ecf1d7037..605cf49856e16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -639,14 +639,12 @@ def _constructor(self) -> Callable[..., DataFrame]: return DataFrame def _constructor_from_mgr(self, mgr, axes): - df = self._from_mgr(mgr, axes=axes) - - if type(self) is DataFrame: - # fastpath avoiding constructor call - return df + if self._constructor is DataFrame: + # we are pandas.DataFrame (or a subclass that doesn't override _constructor) + return DataFrame._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes - return self._constructor(df, copy=False) + return self._constructor(mgr) _constructor_sliced: Callable[..., Series] = Series @@ -654,13 +652,12 @@ def _sliced_from_mgr(self, mgr, axes) -> Series: return Series._from_mgr(mgr, axes) def _constructor_sliced_from_mgr(self, mgr, axes): - ser = self._sliced_from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is DataFrame: - # fastpath avoiding constructor call + if 
self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name return ser assert axes is mgr.axes - return self._constructor_sliced(ser, copy=False) + return self._constructor_sliced(mgr) # ---------------------------------------------------------------------- # Constructors diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3281d245dca56..564d572254f8d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9456,6 +9456,10 @@ def first(self, offset) -> Self: """ Select initial periods of time series data based on a date offset. + .. deprecated:: 2.1 + :meth:`.first` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. + For a DataFrame with a sorted DatetimeIndex, this function can select the first few rows based on a date offset. @@ -9535,6 +9539,10 @@ def last(self, offset) -> Self: """ Select final periods of time series data based on a date offset. + .. deprecated:: 2.1 + :meth:`.last` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. + For a DataFrame with a sorted DatetimeIndex, this function selects the last few rows based on a date offset. 
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..b516227cf8b06 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -102,7 +102,7 @@ def get_window_bounds( closed: str | None = None, step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: - if center: + if center or self.window_size == 0: offset = (self.window_size - 1) // 2 else: offset = 0 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b4ef5380d2b30..aaea5a58723a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -6948,7 +6949,14 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - return Index._with_infer(new_values, name=self.name) + idx = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(idx.dtype) + and new_values.dtype == object + ): + idx = idx.astype(new_values.dtype) + return idx def drop( self, @@ -7024,7 +7032,8 @@ def infer_objects(self, copy: bool = True) -> Index: result._references.add_index_reference(result) return result - def diff(self, periods: int = 1): + @final + def diff(self, periods: int = 1) -> Index: """ Computes the difference between consecutive values in the Index object. 
@@ -7050,7 +7059,7 @@ def diff(self, periods: int = 1): Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') """ - return self._constructor(self.to_series().diff(periods)) + return Index(self.to_series().diff(periods)) def round(self, decimals: int = 0): """ diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 214fbf9f36435..d45ae37890ba7 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,10 +266,9 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, protocol_data_dtype = buffers["data"] + data_buff, _ = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert protocol_data_dtype[1] == 8 - assert protocol_data_dtype[2] in ( + assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 @@ -377,15 +376,16 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any """ buffers = col.get_buffers() - _, _, format_str, _ = col.dtype - dbuf, dtype = buffers["data"] + _, col_bit_width, format_str, _ = col.dtype + dbuf, _ = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( dbuf, ( - DtypeKind.UINT, - dtype[1], - getattr(ArrowCTypes, f"UINT{dtype[1]}"), + DtypeKind.INT, + col_bit_width, + getattr(ArrowCTypes, f"INT{col_bit_width}"), Endianness.NATIVE, ), offset=col.offset, diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d83643722ece..7b22d89bfe22d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -579,14 +579,14 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - ser = self._from_mgr(mgr, axes=axes) - ser._name = None # caller is 
responsible for setting real name - if type(self) is Series: - # fastpath avoiding constructor call + if self._constructor is Series: + # we are pandas.Series (or a subclass that doesn't override _constructor) + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name return ser else: assert axes is mgr.axes - return self._constructor(ser, copy=False) + return self._constructor(mgr) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -610,12 +610,12 @@ def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: return DataFrame._from_mgr(mgr, axes=mgr.axes) def _constructor_expanddim_from_mgr(self, mgr, axes): - df = self._expanddim_from_mgr(mgr, axes) - if type(self) is Series: - # fastpath avoiding constructor - return df + from pandas.core.frame import DataFrame + + if self._constructor_expanddim is DataFrame: + return self._expanddim_from_mgr(mgr, axes) assert axes is mgr.axes - return self._constructor_expanddim(df, copy=False) + return self._constructor_expanddim(mgr) # types @property diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b299f5d6deab3..e2a3b9378a4f7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. 
- dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: - dtype = vdtype + _dtype = vdtype + elif vdtype is not None: + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -2317,7 +2318,8 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index a50dbeb110bff..f1b14cdc58b13 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -224,7 +224,8 @@ def to_numeric( set(), coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype), + or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy", ) except (ValueError, TypeError): if errors == "raise": @@ -239,6 +240,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy" ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff 
--git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..b018b5720d126 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + check_dtype_backend(dtype_backend) with get_handle( diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d3055f0ad2a38..2a72f7d32b1e7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -56,7 +56,6 @@ is_bool_dtype, is_complex_dtype, is_list_like, - is_object_dtype, is_string_dtype, needs_i8_conversion, ) @@ -2645,7 +2644,7 @@ class DataIndexableCol(DataCol): is_data_indexable = True def validate_names(self) -> None: - if not is_object_dtype(Index(self.values).dtype): + if not is_string_dtype(Index(self.values).dtype): # TODO: should the message here be more specifically non-str? 
raise ValueError("cannot have non-object label DataIndexableCol") diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..92612861e551a 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -38,8 +38,9 @@ def test_apply(float_frame): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) expected = float_frame + 1 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index c0623faccd325..68c59706a6c3b 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,19 +1,16 @@ import pytest -from pandas.util._test_decorators import async_mark - import pandas._testing as tm class TestCategoricalWarnings: - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = pd.Categorical([])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 43b2df4290eed..91d4855dcefe5 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -39,7 +39,7 @@ def check_reduce(self, s, op_name, skipna): expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, 
op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): # Find the expected dtype when the given reduction is done on a DataFrame # column with this array. The default assumes float64-like behavior, # i.e. retains the dtype. @@ -58,7 +58,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} - cmp_dtype = self._get_expected_reduction_dtype(arr, op_name) + cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) # The DataFrame method just calls arr._reduce with keepdims=True, # so this first check is perfunctory. diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fef4fbea2e485..61474aa94d1c8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -543,7 +543,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index c4195be8ea121..588a2fb58d9be 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -20,6 +20,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -40,7 +41,7 @@ ) from pandas.tests.extension import base -is_windows_or_32bit = is_platform_windows() or not IS64 +is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64 pytestmark = [ pytest.mark.filterwarnings( @@ -325,7 +326,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def 
_get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if tm.is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: @@ -335,16 +336,32 @@ def _get_expected_reduction_dtype(self, arr, op_name: str): elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + # TODO: Why does Window Numpy 2.0 dtype depend on skipna? + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + cmp_dtype = ( + "UInt32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "UInt64" + ) elif arr.dtype.kind == "b": if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) else: raise TypeError("not supposed to reach this") else: @@ -360,7 +377,7 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 length = 64 - if not IS64 or is_platform_windows(): + if is_windows_or_32bit: # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "itemsize" if not ser.dtype.itemsize == 8: # type: ignore[union-attr] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index de8df15a9d747..51b0a0a13d90b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ 
b/pandas/tests/frame/indexing/test_indexing.py @@ -1913,7 +1913,7 @@ def test_add_new_column_infer_string(): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype=object), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..fc2e817b1600e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -753,6 +753,15 @@ def test_setitem_frame_overwrite_with_ea_dtype(self, any_numeric_ea_dtype): ) tm.assert_frame_equal(df, expected) + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + def test_setitem_frame_midx_columns(self): # GH#49121 df = DataFrame({("a", "b"): [10]}) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..678fec835aa82 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -131,7 +132,7 @@ class TestDataFrameSelectReindex: # test_indexing @pytest.mark.xfail( - not IS64 or is_platform_windows(), + not IS64 or (is_platform_windows() and not np_version_gt2), reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux builds", ) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..aa7aa8964a059 100644 --- 
a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,8 +7,6 @@ from pandas._config.config import option_context -from pandas.util._test_decorators import async_mark - import pandas as pd from pandas import ( DataFrame, @@ -288,8 +286,7 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - @async_mark() - async def test_tab_complete_warning(self, ip, frame_or_series): + def test_tab_complete_warning(self, ip, frame_or_series): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter @@ -299,8 +296,7 @@ async def test_tab_complete_warning(self, ip, frame_or_series): else: code = "from pandas import Series; obj = Series(dtype=object)" - await ip.run_code(code) - + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 with tm.assert_produces_warning(None, raise_on_extra_warnings=False): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 670d4d4f554a6..e5a8feb7a89d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2078,6 +2078,9 @@ def test_frame_sub_nullable_int(any_int_ea_dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 61d7c125ee5e7..bec1fcd1e7462 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -32,6 +33,7 @@ nanops, ) +is_windows_np2_or_is32 = 
(is_platform_windows() and not np_version_gt2) or not IS64 is_windows_or_is32 = is_platform_windows() or not IS64 @@ -1766,13 +1768,13 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), - ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"), @@ -1787,6 +1789,8 @@ def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype expected = Series([exp_value, exp_value], dtype=exp_dtype) tm.assert_series_equal(result, expected) + # TODO: why does min_count=1 impact the resulting Windows dtype + # differently than min_count=0? 
@pytest.mark.parametrize( "opname, dtype, exp_dtype", [ diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 37ca52eba6451..ef78ae62cb4d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -10,6 +10,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.fixture() def gpd_style_subclass_df(): @@ -734,8 +738,77 @@ def test_replace_list_method(self): # https://github.com/pandas-dev/pandas/pull/46018 df = tm.SubclassedDataFrame({"A": [0, 1, 2]}) msg = "The 'method' keyword in SubclassedDataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=False + ): result = df.replace([1, 2], method="ffill") expected = tm.SubclassedDataFrame({"A": [0, 0, 0]}) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) + + +class MySubclassWithMetadata(DataFrame): + _metadata = ["my_metadata"] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + my_metadata = kwargs.pop("my_metadata", None) + if args and isinstance(args[0], MySubclassWithMetadata): + my_metadata = args[0].my_metadata # type: ignore[has-type] + self.my_metadata = my_metadata + + @property + def _constructor(self): + return MySubclassWithMetadata + + +def test_constructor_with_metadata(): + # https://github.com/pandas-dev/pandas/pull/54922 + # https://github.com/pandas-dev/pandas/issues/55120 + df = MySubclassWithMetadata( + np.random.default_rng(2).random((5, 3)), columns=["A", "B", "C"] + ) + subset = df[["A", "B"]] + assert isinstance(subset, MySubclassWithMetadata) + + +class SimpleDataFrameSubClass(DataFrame): + """A subclass of DataFrame that does not define a constructor.""" + + +class SimpleSeriesSubClass(Series): + """A 
subclass of Series that does not define a constructor.""" + + +class TestSubclassWithoutConstructor: + def test_copy_df(self): + expected = DataFrame({"a": [1, 2, 3]}) + result = SimpleDataFrameSubClass(expected).copy() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_copy_series(self): + expected = Series([1, 2, 3]) + result = SimpleSeriesSubClass(expected).copy() + + tm.assert_series_equal(result, expected) + + def test_series_to_frame(self): + orig = Series([1, 2, 3]) + expected = orig.to_frame() + result = SimpleSeriesSubClass(orig).to_frame() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_groupby(self): + df = SimpleDataFrameSubClass(DataFrame({"a": [1, 2, 3]})) + + for _, v in df.groupby("a"): + assert type(v) is DataFrame diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..678211ea4a053 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -11,6 +11,10 @@ import pandas._testing as tm from pandas.tests.groupby import get_groupby_method_args +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): 
class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -167,12 +175,23 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) + tm.assert_index_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8fd1e296fb79a..da4b44227bef3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -10,7 +10,6 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -1218,14 +1217,13 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = 
"import pandas as pd; idx = pd.Index([1, 2])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 71cc7f29c62bc..5ad2e9b2f717e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -159,3 +159,11 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_diff(self, unit): + # GH 55080 + dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) + result = dti.diff(1) + expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 8a25a2c1889f3..97a388569e261 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -4,6 +4,10 @@ import pytest from pandas._libs.tslibs import iNaT +from pandas.compat import ( + is_ci_environment, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -14,6 +18,7 @@ DtypeKind, ) from pandas.core.interchange.from_dataframe import from_dataframe +from pandas.core.interchange.utils import ArrowCTypes @pytest.fixture @@ -309,11 +314,21 @@ def test_datetimetzdtype(tz, unit): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -def test_interchange_from_non_pandas_tz_aware(): +def test_interchange_from_non_pandas_tz_aware(request): # GH 54239, 54287 pa = pytest.importorskip("pyarrow", "11.0.0") import pyarrow.compute as pc + if is_platform_windows() and is_ci_environment(): + mark = pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for 
pyarrow." + ), + ) + request.node.add_marker(mark) + arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)]) arr = pc.assume_timezone(arr, "Asia/Kathmandu") table = pa.table({"arr": arr}) @@ -326,3 +341,24 @@ def test_interchange_from_non_pandas_tz_aware(): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__() + interchange = df.__dataframe__() + column = interchange.get_column_by_name("a") + buffers = column.get_buffers() + buffers_data = buffers["data"] + buffer_dtype = buffers_data[1] + buffer_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + buffer_dtype[3], + ) + buffers["data"] = (buffers_data[0], buffer_dtype) + column.get_buffers = lambda: buffers + interchange.get_column_by_name = lambda _: column + monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) + pd.api.interchange.from_dataframe(df) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 00a26a755756f..442aa5ef87b10 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -86,7 +86,9 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -177,7 +179,9 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + 
result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9f7840588f89e..c6afff56de16b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -979,12 +979,15 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -def test_parse_tz_aware(all_parsers, request): +def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -2231,6 +2234,9 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 085db5f521a9f..3bc8c0c4f8e30 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -526,3 +526,18 @@ def test_round_trip_equals(tmp_path, setup_path): tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) + + +def test_infer_string_columns(tmp_path, setup_path): + # GH# + pytest.importorskip("pyarrow") + path = tmp_path / setup_path + with pd.option_context("future.infer_string", True): + df = DataFrame(1, columns=list("ABCD"), index=list(range(10))).set_index( + ["A", "B"] + ) + expected = df.copy() + df.to_hdf(path, key="df", format="table") + + result = read_hdf(path, "df") + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 41ae4042e9ae0..699f8952e17b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -19,6 +19,7 @@ pa_version_under8p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under15p0, ) import pandas as pd @@ -755,7 +756,10 @@ def test_unsupported_float16(self, pa): # Not able to write float 16 column using pyarrow. data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + if pa_version_under15p0: + self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + else: + check_round_trip(df, pa) @pytest.mark.xfail( is_platform_windows(), @@ -764,6 +768,7 @@ def test_unsupported_float16(self, pa): "dtypes are passed to_parquet function in windows" ), ) + @pytest.mark.skipif(not pa_version_under15p0, reason="float16 works on 15") @pytest.mark.parametrize("path_type", [str, pathlib.Path]) def test_unsupported_float16_cleanup(self, pa, path_type): # #44847, #44914 @@ -1144,6 +1149,18 @@ def test_infer_string_large_string_type(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) + # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) + # @pytest.mark.slow + # def test_string_column_above_2GB(self, tmp_path, pa): + # # https://github.com/pandas-dev/pandas/issues/55606 + # # above 2GB of string data + # v1 = b"x" * 100000000 + # v2 = b"x" * 147483646 + # df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string") + # df.to_parquet(tmp_path / "test.parquet") + # result = read_parquet(tmp_path / "test.parquet") + # assert result["strings"].dtype == "string" + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 
6abdde0f5ccff..b5288c793dafa 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -4,7 +4,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.util._test_decorators import async_mark import pandas as pd from pandas import ( @@ -26,8 +25,7 @@ def test_frame(): ) -@async_mark() -async def test_tab_complete_ipython6_warning(ip): +def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( @@ -37,7 +35,7 @@ async def test_tab_complete_ipython6_warning(ip): rs = s.resample("D") """ ) - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5dde863f246d1..7863aa55152f1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -591,6 +591,9 @@ def test_duplicate_keys_same_frame(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) @pytest.mark.parametrize( "obj", [ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 304ec37b1e453..6868f7344c153 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -689,6 +689,9 @@ def test_merge_nan_right2(self): )[["i1", "i2", "i1_", "i3"]] tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, df, df2): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 1d0d4e3eb554b..cfb4e92fb45cd 100644 --- 
a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -70,6 +70,9 @@ def test_multigroup(self, left, right): result = merge_ordered(left, right, on="key", left_by="group") assert result["group"].notna().all() + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, left, right): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/series/methods/test_to_frame.py b/pandas/tests/series/methods/test_to_frame.py index 01e547aa34b47..0eadf696b34cc 100644 --- a/pandas/tests/series/methods/test_to_frame.py +++ b/pandas/tests/series/methods/test_to_frame.py @@ -1,3 +1,5 @@ +import pytest + from pandas import ( DataFrame, Index, @@ -40,6 +42,9 @@ def test_to_frame(self, datetime_series): ) tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_to_frame_expanddim(self): # GH#9762 diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index a3550c6de6780..c2d5afcf884b1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -4,6 +4,10 @@ import pandas as pd import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + class TestSeriesSubclassing: @pytest.mark.parametrize( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..bd64a5dce3b9a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -5,6 +5,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -893,7 +894,10 @@ def test_find_nan(any_string_dtype): # 
-------------------------------------------------------------------------------------- -def test_translate(index_or_series, any_string_dtype): +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c541c5792ec7c..8e3d7780224fe 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -348,11 +348,9 @@ def test_dataframe_consortium() -> None: expected_1 = ["a", "b"] assert result_1 == expected_1 - ser = Series([1, 2, 3]) + ser = Series([1, 2, 3], name="a") col = ser.__column_consortium_standard__() - result_2 = col.get_value(1) - expected_2 = 2 - assert result_2 == expected_2 + assert col.name == "a" def test_xarray_coerce_unit(): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1d969e648b752..7f37e6003f313 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -4,12 +4,15 @@ from numpy import iinfo import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( ArrowDtype, DataFrame, Index, Series, + option_context, to_numeric, ) import pandas._testing as tm @@ -67,10 +70,14 @@ def test_empty(input_kwargs, result_kwargs): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("last_val", ["7", 7]) -def test_series(last_val): - ser = Series(["1", "-3.14", last_val]) - result = to_numeric(ser) +def test_series(last_val, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["1", "-3.14", last_val]) + result = to_numeric(ser) expected = Series([1, -3.14, 7]) 
tm.assert_series_equal(result, expected) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 03011a1ffe622..93c46274fdc1b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,11 +38,11 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + from pandas.compat import ( IS64, is_platform_windows, ) -from pandas.compat._optional import import_optional_dependency from pandas.core.computation.expressions import ( NUMEXPR_INSTALLED, @@ -214,16 +214,6 @@ def documented_fixture(fixture): return documented_fixture -def async_mark(): - try: - import_optional_dependency("pytest_asyncio") - async_mark = pytest.mark.asyncio - except ImportError: - async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") - - return async_mark - - def mark_array_manager_not_yet_implemented(request) -> None: mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") request.node.add_marker(mark) diff --git a/pyproject.toml b/pyproject.toml index e9e3e2f67e2f1..77b49d4474334 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.4', 'numba>=0.55.2', 'numexpr>=2.8.0'] computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] @@ -109,7 +109,6 @@ all = ['beautifulsoup4>=4.11.1', 'pyreadstat>=1.1.5', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', @@ -151,7 +150,7 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = 
"hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -481,6 +480,10 @@ filterwarnings = [ "error:::pandas", "error::ResourceWarning", "error::pytest.PytestUnraisableExceptionWarning", + # TODO(PY311-minimum): Specify EncodingWarning + # Ignore 3rd party EncodingWarning but raise on pandas' + "ignore:.*encoding.* argument not specified", + "error:.*encoding.* argument not specified::pandas", "ignore:.*ssl.SSLSocket:pytest.PytestUnraisableExceptionWarning", "ignore:.*ssl.SSLSocket:ResourceWarning", # GH 44844: Can remove once minimum matplotlib version >= 3.7 @@ -514,7 +517,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/requirements-dev.txt b/requirements-dev.txt index 855eaccfd4f5f..af4f82b068d31 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,7 +9,6 @@ meson-python==0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 -pytest-asyncio>=0.17.0 coverage python-dateutil numpy<2 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..bebb25b9e1bb8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - psutil - - pytest-asyncio>=0.17.0 - boto3 # required dependencies diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..5fb51856e841e 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -55,7 +55,7 @@ repository = 
'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] @@ -102,7 +102,6 @@ all = ['beautifulsoup4>=5.9.3', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', @@ -141,7 +140,7 @@ parentdir_prefix = "pandas-" [tool.cibuildwheel] skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*" build-verbosity = "3" -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = "python {project}/ci/test_wheels.py" [tool.cibuildwheel.macos] @@ -385,7 +384,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 503eb3c7c7734..e54482282fcc8 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - psutil - - pytest-asyncio>=0.17 - boto3 # required dependencies