Merge remote-tracking branch 'upstream/main' into na_rep-bug
rsm-23 committed Oct 17, 2023
2 parents 7087110 + 746e5ee commit 7aad289
Showing 19 changed files with 143 additions and 51 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -358,6 +358,7 @@ I/O
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)

Period
^^^^^^
4 changes: 2 additions & 2 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -1610,9 +1610,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PyArray_DescrFromType(NPY_DOUBLE));
tc->type = JT_DOUBLE;
return;
-  } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) {
+  } else if (PyArray_CheckScalar(obj)) {
PyErr_Format(PyExc_TypeError,
"%R (0d array) is not JSON serializable at the moment",
"%R (numpy-scalar) is not JSON serializable at the moment",
obj);
goto INVALID;
} else if (object_is_na_type(obj)) {
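The user-visible effect of the change above, as a minimal sketch (assuming a pandas build that includes this commit):

    import numpy as np
    import pandas as pd

    # float16 has no native ujson encoder; serializing it previously
    # escaped as an OverflowError and now raises TypeError (GH 55403).
    df = pd.DataFrame({"a": [np.float16(1.5)]})
    try:
        df.to_json()
    except TypeError as err:
        print(err)  # "... (numpy-scalar) is not JSON serializable at the moment"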
5 changes: 4 additions & 1 deletion pandas/_testing/asserters.py
@@ -440,7 +440,10 @@ def assert_is_sorted(seq) -> None:
if isinstance(seq, (Index, Series)):
seq = seq.values
# sorting does not change precisions
-    assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+    if isinstance(seq, np.ndarray):
+        assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+    else:
+        assert_extension_array_equal(seq, seq[seq.argsort()])


def assert_categorical_equal(
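A quick sketch of the two branches (assert_is_sorted is private testing API; the import path comes from this file):

    import pandas as pd
    from pandas._testing.asserters import assert_is_sorted

    # ndarrays still go through np.sort; extension arrays are instead
    # compared against themselves reordered via argsort, which keeps the
    # extension dtype (and any mask) intact.
    assert_is_sorted(pd.Series([1.0, 2.0]))               # ndarray branch
    assert_is_sorted(pd.array([1, 2, 3], dtype="Int64"))  # EA branch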
3 changes: 3 additions & 0 deletions pandas/_typing.py
@@ -410,6 +410,9 @@ def closed(self) -> bool:
# read_xml parsers
XMLParsers = Literal["lxml", "etree"]

# read_html flavors
HTMLFlavors = Literal["lxml", "html5lib", "bs4"]

# Interval closed type
IntervalLeftRight = Literal["left", "right"]
IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]]
33 changes: 33 additions & 0 deletions pandas/core/arrays/base.py
@@ -86,6 +86,7 @@
AstypeArg,
AxisInt,
Dtype,
DtypeObj,
FillnaOptions,
InterpolateOptions,
NumpySorter,
@@ -293,6 +294,38 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
"""
raise AbstractMethodError(cls)

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
"""
Strict analogue to _from_sequence, allowing only sequences of scalars
that should be specifically inferred to the given dtype.

Parameters
----------
scalars : sequence
dtype : ExtensionDtype

Raises
------
TypeError or ValueError

Notes
-----
This is called in a try/except block when casting the result of a
pointwise operation.
"""
try:
return cls._from_sequence(scalars, dtype=dtype, copy=False)
except (ValueError, TypeError):
raise
except Exception:
warnings.warn(
"_from_scalars should only raise ValueError or TypeError. "
"Consider overriding _from_scalars where appropriate.",
stacklevel=find_stack_level(),
)
raise

@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
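For array types that do not override it, the default simply defers to _from_sequence. A sketch using the private API (IntegerArray here is just a convenient non-overriding example):

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # The base-class default calls _from_sequence(..., copy=False),
    # re-raises TypeError/ValueError unchanged, and warns if any other
    # exception type escapes.
    out = type(arr)._from_scalars([1, 2, None], dtype=arr.dtype)
    print(out.dtype)  # Int64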
17 changes: 17 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -101,6 +101,7 @@
AstypeArg,
AxisInt,
Dtype,
DtypeObj,
NpDtype,
Ordered,
Self,
@@ -509,6 +510,22 @@ def _from_sequence(
) -> Self:
return cls(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if dtype is None:
# The _from_scalars strictness doesn't make much sense in this case.
raise NotImplementedError

res = cls._from_sequence(scalars, dtype=dtype)

# if there are any non-category elements in scalars, these will be
# converted to NAs in res.
mask = isna(scalars)
if not (mask == res.isna()).all():
# Some non-category element in scalars got converted to NA in res.
raise ValueError
return res

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
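A sketch of the strictness this adds (private constructor API, assuming this commit):

    import pandas as pd
    from pandas import Categorical

    dtype = pd.CategoricalDtype(["a", "b"])
    print(Categorical._from_scalars(["a", "b", "a"], dtype=dtype))

    try:
        # "z" is not a category; _from_sequence would silently coerce it
        # to NA, which the isna() comparison above detects and rejects.
        Categorical._from_scalars(["a", "z"], dtype=dtype)
    except ValueError:
        print("rejected: non-category scalar")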
9 changes: 9 additions & 0 deletions pandas/core/arrays/datetimes.py
@@ -77,6 +77,7 @@

from pandas._typing import (
DateTimeErrorChoices,
DtypeObj,
IntervalClosedType,
Self,
TimeAmbiguous,
@@ -266,6 +267,14 @@ def _scalar_type(self) -> type[Timestamp]:
_freq: BaseOffset | None = None
_default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if lib.infer_dtype(scalars, skipna=True) not in ["datetime", "datetime64"]:
# TODO: require any NAs be valid-for-DTA
# TODO: if dtype is passed, check for tzawareness compat?
raise ValueError
return cls._from_sequence(scalars, dtype=dtype)

@classmethod
def _validate_dtype(cls, values, dtype):
# used in TimeLikeOps.__init__
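A sketch of the datetime version (private API; dtype handling per the TODOs above is still loose):

    import numpy as np
    import pandas as pd
    from pandas.core.arrays import DatetimeArray

    dtype = np.dtype("M8[ns]")
    dta = DatetimeArray._from_scalars(
        [pd.Timestamp("2023-01-01"), pd.Timestamp("2023-01-02")], dtype=dtype
    )
    print(dta)

    try:
        # Strings infer as "string", not "datetime"/"datetime64", so they
        # are rejected rather than parsed.
        DatetimeArray._from_scalars(["2023-01-01"], dtype=dtype)
    except ValueError:
        print("rejected: not datetime scalars")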
8 changes: 8 additions & 0 deletions pandas/core/arrays/string_.py
@@ -56,6 +56,7 @@
from pandas._typing import (
AxisInt,
Dtype,
DtypeObj,
NumpySorter,
NumpyValueArrayLike,
Scalar,
@@ -253,6 +254,13 @@ def tolist(self):
return [x.tolist() for x in self]
return list(self.to_numpy())

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if lib.infer_dtype(scalars, skipna=True) != "string":
# TODO: require any NAs be valid-for-string
raise ValueError
return cls._from_sequence(scalars, dtype=dtype)


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
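And the string version, same pattern (private API):

    import pandas as pd
    from pandas.core.arrays.string_ import StringArray

    dtype = pd.StringDtype()
    print(StringArray._from_scalars(["x", "y"], dtype=dtype))

    try:
        # Integers infer as "integer", so the strict constructor refuses
        # to stringify them, unlike plain _from_sequence.
        StringArray._from_scalars([1, 2], dtype=dtype)
    except ValueError:
        print("rejected: not string scalars")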
26 changes: 12 additions & 14 deletions pandas/core/dtypes/cast.py
@@ -464,16 +464,11 @@ def maybe_cast_pointwise_result(
"""

if isinstance(dtype, ExtensionDtype):
-        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
-            # TODO: avoid this special-casing
-            # We have to special case categorical so as not to upcast
-            # things like counts back to categorical
-
-            cls = dtype.construct_array_type()
-            if same_dtype:
-                result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
-            else:
-                result = _maybe_cast_to_extension_array(cls, result)
+        cls = dtype.construct_array_type()
+        if same_dtype:
+            result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
+        else:
+            result = _maybe_cast_to_extension_array(cls, result)

elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)
@@ -498,11 +493,14 @@ def _maybe_cast_to_extension_array(
-------
ExtensionArray or obj
"""
-    from pandas.core.arrays.string_ import BaseStringArray
    result: ArrayLike

-    # Everything can be converted to StringArrays, but we may not want to convert
-    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
-        return obj
+    if dtype is not None:
+        try:
+            result = cls._from_scalars(obj, dtype=dtype)
+        except (TypeError, ValueError):
+            return obj
+        return result

try:
result = cls._from_sequence(obj, dtype=dtype)
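With the special-casing gone, the strict _from_scalars hook decides whether a pointwise result is cast back. A sketch of the flow (internal API):

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import maybe_cast_pointwise_result

    dtype = pd.CategoricalDtype(["a", "b"])
    vals = np.array(["a", "b", "a"], dtype=object)
    # Every scalar fits the dtype, so Categorical._from_scalars accepts the
    # cast; a stray "z" would raise inside it and leave `vals` unchanged.
    out = maybe_cast_pointwise_result(vals, dtype, same_dtype=True)
    print(type(out).__name__)  # Categorical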
13 changes: 11 additions & 2 deletions pandas/core/series.py
@@ -75,7 +75,10 @@
pandas_dtype,
validate_all_hashable,
)
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    ExtensionDtype,
+)
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import (
@@ -100,6 +103,7 @@
from pandas.core.arrays.arrow import StructAccessor
from pandas.core.arrays.categorical import CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import (
extract_array,
sanitize_array,
@@ -3377,7 +3381,12 @@ def combine(

# try_float=False is to match agg_series
npvalues = lib.maybe_convert_objects(new_values, try_float=False)
-        res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False)
+        # same_dtype here is a kludge to avoid casting e.g. [True, False] to
+        # ["True", "False"]
+        same_dtype = isinstance(self.dtype, (StringDtype, CategoricalDtype))
+        res_values = maybe_cast_pointwise_result(
+            npvalues, self.dtype, same_dtype=same_dtype
+        )
return self._constructor(res_values, index=new_index, name=new_name, copy=False)

def combine_first(self, other) -> Series:
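A sketch of what the kludge prevents (assuming this commit):

    import pandas as pd

    s = pd.Series(["a", "bb"], dtype="string")
    # The pointwise results are booleans; StringArray._from_scalars rejects
    # them, so combine no longer casts [True, ...] to the strings "True"/...
    out = s.combine(s, lambda x, y: len(x) == len(y))
    print(out.dtype)  # bool, not string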
9 changes: 5 additions & 4 deletions pandas/io/html.py
@@ -57,6 +57,7 @@
BaseBuffer,
DtypeBackend,
FilePath,
HTMLFlavors,
ReadBuffer,
StorageOptions,
)
@@ -889,13 +890,13 @@ def _data_to_frame(**kwargs):
}


-def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
+def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
"""
Choose the parser based on the input flavor.

Parameters
----------
-flavor : str
+flavor : {{"lxml", "html5lib", "bs4"}} or None
The type of parser to use. This must be a valid backend.

Returns
@@ -1033,7 +1034,7 @@ def read_html(
io: FilePath | ReadBuffer[str],
*,
match: str | Pattern = ".+",
-    flavor: str | Sequence[str] | None = None,
+    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
header: int | Sequence[int] | None = None,
index_col: int | Sequence[int] | None = None,
skiprows: int | Sequence[int] | slice | None = None,
@@ -1074,7 +1075,7 @@
This value is converted to a regular expression so that there is
consistent behavior between Beautiful Soup and lxml.
-flavor : str or list-like, optional
+flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
The parsing engine (or list of parsing engines) to use. 'bs4' and
'html5lib' are synonymous with each other, they are both there for
backwards compatibility. The default of ``None`` tries to use ``lxml``
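Usage is unchanged; the Literal only narrows what type checkers accept for flavor. A sketch (assuming lxml is installed):

    from io import StringIO
    import pandas as pd

    html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
    # flavor must now be one of "lxml", "html5lib", "bs4" per HTMLFlavors;
    # anything else still fails at runtime in _parser_dispatch.
    df = pd.read_html(StringIO(html), flavor="lxml")[0]
    print(df)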
4 changes: 1 addition & 3 deletions pandas/tests/arrays/boolean/test_reduction.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest

-from pandas.compat.numpy import np_long

import pandas as pd


@@ -53,7 +51,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
s = s.dropna()

if op in ("sum", "prod"):
-        assert isinstance(getattr(s, op)(), np_long)
+        assert isinstance(getattr(s, op)(), np.int_)
elif op == "count":
# Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
assert isinstance(getattr(s, op)(), np.integer)
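The assertion these tests make, as a standalone sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series([True, False, True], dtype="boolean")
    # sum/prod of the masked boolean array return NumPy scalars of the
    # platform default integer type, spelled np.int_ again rather than
    # the removed np_long shim.
    print(isinstance(s.sum(), np.int_), isinstance(s.prod(), np.int_))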
9 changes: 4 additions & 5 deletions pandas/tests/frame/methods/test_shift.py
@@ -1,7 +1,6 @@
import numpy as np
import pytest

-from pandas.compat.numpy import np_long
import pandas.util._test_decorators as td

import pandas as pd
@@ -472,22 +471,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self):
df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int))
df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int))
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
-        result = df3.shift(2, axis=1, fill_value=np_long(0))
+        result = df3.shift(2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2

expected = df3.take([-1, -1, 0, 1], axis=1)
-        expected.iloc[:, :2] = np_long(0)
+        expected.iloc[:, :2] = np.int_(0)
expected.columns = df3.columns

tm.assert_frame_equal(result, expected)

# Case with periods < 0
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
-        result = df3.shift(-2, axis=1, fill_value=np_long(0))
+        result = df3.shift(-2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2

expected = df3.take([2, 3, -1, -1], axis=1)
-        expected.iloc[:, -2:] = np_long(0)
+        expected.iloc[:, -2:] = np.int_(0)
expected.columns = df3.columns

tm.assert_frame_equal(result, expected)
8 changes: 2 additions & 6 deletions pandas/tests/frame/test_reductions.py
@@ -10,10 +10,6 @@
IS64,
is_platform_windows,
)
-from pandas.compat.numpy import (
-    np_long,
-    np_ulong,
-)
import pandas.util._test_decorators as td

import pandas as pd
@@ -1726,11 +1722,11 @@ class TestEmptyDataFrameReductions:
"opname, dtype, exp_value, exp_dtype",
[
("sum", np.int8, 0, np.int64),
("prod", np.int8, 1, np_long),
("prod", np.int8, 1, np.int_),
("sum", np.int64, 0, np.int64),
("prod", np.int64, 1, np.int64),
("sum", np.uint8, 0, np.uint64),
("prod", np.uint8, 1, np_ulong),
("prod", np.uint8, 1, np.uint),
("sum", np.uint64, 0, np.uint64),
("prod", np.uint64, 1, np.uint64),
("sum", np.float32, 0, np.float32),
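The parametrization above pins the dtypes of empty reductions; as a sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": pd.Series([], dtype=np.int8)})
    # Empty sum of int8 upcasts to int64, while empty prod upcasts to the
    # platform default integer np.int_ (np.uint for unsigned inputs).
    print(df.sum().dtype, df.prod().dtype)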
11 changes: 10 additions & 1 deletion pandas/tests/io/json/test_ujson.py
@@ -814,10 +814,19 @@ def test_array_float(self):

def test_0d_array(self):
# gh-18878
msg = re.escape("array(1) (0d array) is not JSON serializable at the moment")
msg = re.escape(
"array(1) (numpy-scalar) is not JSON serializable at the moment"
)
with pytest.raises(TypeError, match=msg):
ujson.ujson_dumps(np.array(1))

def test_array_long_double(self):
msg = re.compile(
"1234.5.* \\(numpy-scalar\\) is not JSON serializable at the moment"
)
with pytest.raises(TypeError, match=msg):
ujson.ujson_dumps(np.longdouble(1234.5))


class TestPandasJSONTests:
def test_dataframe(self, orient):