Merge remote-tracking branch 'upstream/main' into na_rep-bug
rsm-23 committed Oct 17, 2023
2 parents 7087110 + 746e5ee commit 7aad289
Showing 19 changed files with 143 additions and 51 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -358,6 +358,7 @@ I/O
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)

Period
^^^^^^
4 changes: 2 additions & 2 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -1610,9 +1610,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PyArray_DescrFromType(NPY_DOUBLE));
tc->type = JT_DOUBLE;
return;
-  } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) {
+  } else if (PyArray_CheckScalar(obj)) {
PyErr_Format(PyExc_TypeError,
"%R (0d array) is not JSON serializable at the moment",
"%R (numpy-scalar) is not JSON serializable at the moment",
obj);
goto INVALID;
} else if (object_is_na_type(obj)) {
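The user-visible effect of the change above, as a minimal sketch (assuming a pandas build that includes this commit):

    import numpy as np
    import pandas as pd

    # float16 has no native ujson encoder; serializing it previously
    # escaped as an OverflowError and now raises TypeError (GH 55403).
    df = pd.DataFrame({"a": [np.float16(1.5)]})
    try:
        df.to_json()
    except TypeError as err:
        print(err)  # "... (numpy-scalar) is not JSON serializable at the moment"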
5 changes: 4 additions & 1 deletion pandas/_testing/asserters.py
@@ -440,7 +440,10 @@ def assert_is_sorted(seq) -> None:
if isinstance(seq, (Index, Series)):
seq = seq.values
# sorting does not change precisions
-    assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+    if isinstance(seq, np.ndarray):
+        assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+    else:
+        assert_extension_array_equal(seq, seq[seq.argsort()])


def assert_categorical_equal(
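A quick sketch of the two branches (assert_is_sorted is private testing API; the import path comes from this file):

    import pandas as pd
    from pandas._testing.asserters import assert_is_sorted

    # ndarrays still go through np.sort; extension arrays are instead
    # compared against themselves reordered via argsort, which keeps the
    # extension dtype (and any mask) intact.
    assert_is_sorted(pd.Series([1.0, 2.0]))               # ndarray branch
    assert_is_sorted(pd.array([1, 2, 3], dtype="Int64"))  # EA branch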
3 changes: 3 additions & 0 deletions pandas/_typing.py
@@ -410,6 +410,9 @@ def closed(self) -> bool:
# read_xml parsers
XMLParsers = Literal["lxml", "etree"]

# read_html flavors
HTMLFlavors = Literal["lxml", "html5lib", "bs4"]

# Interval closed type
IntervalLeftRight = Literal["left", "right"]
IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]]
33 changes: 33 additions & 0 deletions pandas/core/arrays/base.py
@@ -86,6 +86,7 @@
AstypeArg,
AxisInt,
Dtype,
DtypeObj,
FillnaOptions,
InterpolateOptions,
NumpySorter,
@@ -293,6 +294,38 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
"""
raise AbstractMethodError(cls)

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
"""
Strict analogue to _from_sequence, allowing only sequences of scalars
that should be specifically inferred to the given dtype.

Parameters
----------
scalars : sequence
dtype : ExtensionDtype

Raises
------
TypeError or ValueError

Notes
-----
This is called in a try/except block when casting the result of a
pointwise operation.
"""
try:
return cls._from_sequence(scalars, dtype=dtype, copy=False)
except (ValueError, TypeError):
raise
except Exception:
warnings.warn(
"_from_scalars should only raise ValueError or TypeError. "
"Consider overriding _from_scalars where appropriate.",
stacklevel=find_stack_level(),
)
raise

@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype: Dtype | None = None, copy: bool = False
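For array types that do not override it, the default simply defers to _from_sequence. A sketch using the private API (IntegerArray here is just a convenient non-overriding example):

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # The base-class default calls _from_sequence(..., copy=False),
    # re-raises TypeError/ValueError unchanged, and warns if any other
    # exception type escapes.
    out = type(arr)._from_scalars([1, 2, None], dtype=arr.dtype)
    print(out.dtype)  # Int64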
17 changes: 17 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -101,6 +101,7 @@
AstypeArg,
AxisInt,
Dtype,
DtypeObj,
NpDtype,
Ordered,
Self,
@@ -509,6 +510,22 @@ def _from_sequence(
) -> Self:
return cls(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if dtype is None:
# The _from_scalars strictness doesn't make much sense in this case.
raise NotImplementedError

res = cls._from_sequence(scalars, dtype=dtype)

# if there are any non-category elements in scalars, these will be
# converted to NAs in res.
mask = isna(scalars)
if not (mask == res.isna()).all():
# Some non-category element in scalars got converted to NA in res.
raise ValueError
return res

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
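A sketch of the strictness this adds (private constructor API, assuming this commit):

    import pandas as pd
    from pandas import Categorical

    dtype = pd.CategoricalDtype(["a", "b"])
    print(Categorical._from_scalars(["a", "b", "a"], dtype=dtype))

    try:
        # "z" is not a category; _from_sequence would silently coerce it
        # to NA, which the isna() comparison above detects and rejects.
        Categorical._from_scalars(["a", "z"], dtype=dtype)
    except ValueError:
        print("rejected: non-category scalar")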
9 changes: 9 additions & 0 deletions pandas/core/arrays/datetimes.py
@@ -77,6 +77,7 @@

from pandas._typing import (
DateTimeErrorChoices,
DtypeObj,
IntervalClosedType,
Self,
TimeAmbiguous,
@@ -266,6 +267,14 @@ def _scalar_type(self) -> type[Timestamp]:
_freq: BaseOffset | None = None
_default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if lib.infer_dtype(scalars, skipna=True) not in ["datetime", "datetime64"]:
# TODO: require any NAs be valid-for-DTA
# TODO: if dtype is passed, check for tzawareness compat?
raise ValueError
return cls._from_sequence(scalars, dtype=dtype)

@classmethod
def _validate_dtype(cls, values, dtype):
# used in TimeLikeOps.__init__
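A sketch of the datetime version (private API; dtype handling per the TODOs above is still loose):

    import numpy as np
    import pandas as pd
    from pandas.core.arrays import DatetimeArray

    dtype = np.dtype("M8[ns]")
    dta = DatetimeArray._from_scalars(
        [pd.Timestamp("2023-01-01"), pd.Timestamp("2023-01-02")], dtype=dtype
    )
    print(dta)

    try:
        # Strings infer as "string", not "datetime"/"datetime64", so they
        # are rejected rather than parsed.
        DatetimeArray._from_scalars(["2023-01-01"], dtype=dtype)
    except ValueError:
        print("rejected: not datetime scalars")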
8 changes: 8 additions & 0 deletions pandas/core/arrays/string_.py
@@ -56,6 +56,7 @@
from pandas._typing import (
AxisInt,
Dtype,
DtypeObj,
NumpySorter,
NumpyValueArrayLike,
Scalar,
@@ -253,6 +254,13 @@ def tolist(self):
return [x.tolist() for x in self]
return list(self.to_numpy())

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
if lib.infer_dtype(scalars, skipna=True) != "string":
# TODO: require any NAs be valid-for-string
raise ValueError
return cls._from_sequence(scalars, dtype=dtype)


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
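And the string version, same pattern (private API):

    import pandas as pd
    from pandas.core.arrays.string_ import StringArray

    dtype = pd.StringDtype()
    print(StringArray._from_scalars(["x", "y"], dtype=dtype))

    try:
        # Integers infer as "integer", so the strict constructor refuses
        # to stringify them, unlike plain _from_sequence.
        StringArray._from_scalars([1, 2], dtype=dtype)
    except ValueError:
        print("rejected: not string scalars")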
26 changes: 12 additions & 14 deletions pandas/core/dtypes/cast.py
@@ -464,16 +464,11 @@ def maybe_cast_pointwise_result(
"""

if isinstance(dtype, ExtensionDtype):
-        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
-            # TODO: avoid this special-casing
-            # We have to special case categorical so as not to upcast
-            # things like counts back to categorical
-
-            cls = dtype.construct_array_type()
-            if same_dtype:
-                result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
-            else:
-                result = _maybe_cast_to_extension_array(cls, result)
+        cls = dtype.construct_array_type()
+        if same_dtype:
+            result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
+        else:
+            result = _maybe_cast_to_extension_array(cls, result)

elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)
@@ -498,11 +493,14 @@ def _maybe_cast_to_extension_array(
-------
ExtensionArray or obj
"""
-    from pandas.core.arrays.string_ import BaseStringArray
    result: ArrayLike

-    # Everything can be converted to StringArrays, but we may not want to convert
-    if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
-        return obj
+    if dtype is not None:
+        try:
+            result = cls._from_scalars(obj, dtype=dtype)
+        except (TypeError, ValueError):
+            return obj
+        return result

try:
result = cls._from_sequence(obj, dtype=dtype)
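With the special-casing gone, the strict _from_scalars hook decides whether a pointwise result is cast back. A sketch of the flow (internal API):

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import maybe_cast_pointwise_result

    dtype = pd.CategoricalDtype(["a", "b"])
    vals = np.array(["a", "b", "a"], dtype=object)
    # Every scalar fits the dtype, so Categorical._from_scalars accepts the
    # cast; a stray "z" would raise inside it and leave `vals` unchanged.
    out = maybe_cast_pointwise_result(vals, dtype, same_dtype=True)
    print(type(out).__name__)  # Categorical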
13 changes: 11 additions & 2 deletions pandas/core/series.py
@@ -75,7 +75,10 @@
pandas_dtype,
validate_all_hashable,
)
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    ExtensionDtype,
+)
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import (
@@ -100,6 +103,7 @@
from pandas.core.arrays.arrow import StructAccessor
from pandas.core.arrays.categorical import CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import (
extract_array,
sanitize_array,
@@ -3377,7 +3381,12 @@ def combine(

# try_float=False is to match agg_series
npvalues = lib.maybe_convert_objects(new_values, try_float=False)
-        res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False)
+        # same_dtype here is a kludge to avoid casting e.g. [True, False] to
+        # ["True", "False"]
+        same_dtype = isinstance(self.dtype, (StringDtype, CategoricalDtype))
+        res_values = maybe_cast_pointwise_result(
+            npvalues, self.dtype, same_dtype=same_dtype
+        )
return self._constructor(res_values, index=new_index, name=new_name, copy=False)

def combine_first(self, other) -> Series:
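A sketch of what the kludge prevents (assuming this commit):

    import pandas as pd

    s = pd.Series(["a", "bb"], dtype="string")
    # The pointwise results are booleans; StringArray._from_scalars rejects
    # them, so combine no longer casts [True, ...] to the strings "True"/...
    out = s.combine(s, lambda x, y: len(x) == len(y))
    print(out.dtype)  # bool, not string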
9 changes: 5 additions & 4 deletions pandas/io/html.py
@@ -57,6 +57,7 @@
BaseBuffer,
DtypeBackend,
FilePath,
HTMLFlavors,
ReadBuffer,
StorageOptions,
)
@@ -889,13 +890,13 @@ def _data_to_frame(**kwargs):
}


-def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
+def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
"""
Choose the parser based on the input flavor.

Parameters
----------
-flavor : str
+flavor : {{"lxml", "html5lib", "bs4"}} or None
The type of parser to use. This must be a valid backend.

Returns
@@ -1033,7 +1034,7 @@ def read_html(
io: FilePath | ReadBuffer[str],
*,
match: str | Pattern = ".+",
-    flavor: str | Sequence[str] | None = None,
+    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
header: int | Sequence[int] | None = None,
index_col: int | Sequence[int] | None = None,
skiprows: int | Sequence[int] | slice | None = None,
@@ -1074,7 +1075,7 @@
This value is converted to a regular expression so that there is
consistent behavior between Beautiful Soup and lxml.
-flavor : str or list-like, optional
+flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
The parsing engine (or list of parsing engines) to use. 'bs4' and
'html5lib' are synonymous with each other, they are both there for
backwards compatibility. The default of ``None`` tries to use ``lxml``
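Usage is unchanged; the Literal only narrows what type checkers accept for flavor. A sketch (assuming lxml is installed):

    from io import StringIO
    import pandas as pd

    html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
    # flavor must now be one of "lxml", "html5lib", "bs4" per HTMLFlavors;
    # anything else still fails at runtime in _parser_dispatch.
    df = pd.read_html(StringIO(html), flavor="lxml")[0]
    print(df)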
4 changes: 1 addition & 3 deletions pandas/tests/arrays/boolean/test_reduction.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest

-from pandas.compat.numpy import np_long

import pandas as pd


@@ -53,7 +51,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
s = s.dropna()

if op in ("sum", "prod"):
-        assert isinstance(getattr(s, op)(), np_long)
+        assert isinstance(getattr(s, op)(), np.int_)
elif op == "count":
# Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
assert isinstance(getattr(s, op)(), np.integer)
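The assertion these tests make, as a standalone sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series([True, False, True], dtype="boolean")
    # sum/prod of the masked boolean array return NumPy scalars of the
    # platform default integer type, spelled np.int_ again rather than
    # the removed np_long shim.
    print(isinstance(s.sum(), np.int_), isinstance(s.prod(), np.int_))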
9 changes: 4 additions & 5 deletions pandas/tests/frame/methods/test_shift.py
@@ -1,7 +1,6 @@
import numpy as np
import pytest

-from pandas.compat.numpy import np_long
import pandas.util._test_decorators as td

import pandas as pd
@@ -472,22 +471,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self):
df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int))
df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int))
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
-        result = df3.shift(2, axis=1, fill_value=np_long(0))
+        result = df3.shift(2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2

expected = df3.take([-1, -1, 0, 1], axis=1)
-        expected.iloc[:, :2] = np_long(0)
+        expected.iloc[:, :2] = np.int_(0)
expected.columns = df3.columns

tm.assert_frame_equal(result, expected)

# Case with periods < 0
df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1)
-        result = df3.shift(-2, axis=1, fill_value=np_long(0))
+        result = df3.shift(-2, axis=1, fill_value=np.int_(0))
assert len(df3._mgr.blocks) == 2

expected = df3.take([2, 3, -1, -1], axis=1)
-        expected.iloc[:, -2:] = np_long(0)
+        expected.iloc[:, -2:] = np.int_(0)
expected.columns = df3.columns

tm.assert_frame_equal(result, expected)
8 changes: 2 additions & 6 deletions pandas/tests/frame/test_reductions.py
@@ -10,10 +10,6 @@
IS64,
is_platform_windows,
)
-from pandas.compat.numpy import (
-    np_long,
-    np_ulong,
-)
import pandas.util._test_decorators as td

import pandas as pd
@@ -1726,11 +1722,11 @@ class TestEmptyDataFrameReductions:
"opname, dtype, exp_value, exp_dtype",
[
("sum", np.int8, 0, np.int64),
("prod", np.int8, 1, np_long),
("prod", np.int8, 1, np.int_),
("sum", np.int64, 0, np.int64),
("prod", np.int64, 1, np.int64),
("sum", np.uint8, 0, np.uint64),
("prod", np.uint8, 1, np_ulong),
("prod", np.uint8, 1, np.uint),
("sum", np.uint64, 0, np.uint64),
("prod", np.uint64, 1, np.uint64),
("sum", np.float32, 0, np.float32),
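The parametrization above pins the dtypes of empty reductions; as a sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": pd.Series([], dtype=np.int8)})
    # Empty sum of int8 upcasts to int64, while empty prod upcasts to the
    # platform default integer np.int_ (np.uint for unsigned inputs).
    print(df.sum().dtype, df.prod().dtype)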
11 changes: 10 additions & 1 deletion pandas/tests/io/json/test_ujson.py
@@ -814,10 +814,19 @@ def test_array_float(self):

def test_0d_array(self):
# gh-18878
msg = re.escape("array(1) (0d array) is not JSON serializable at the moment")
msg = re.escape(
"array(1) (numpy-scalar) is not JSON serializable at the moment"
)
with pytest.raises(TypeError, match=msg):
ujson.ujson_dumps(np.array(1))

def test_array_long_double(self):
msg = re.compile(
"1234.5.* \\(numpy-scalar\\) is not JSON serializable at the moment"
)
with pytest.raises(TypeError, match=msg):
ujson.ujson_dumps(np.longdouble(1234.5))


class TestPandasJSONTests:
def test_dataframe(self, orient):