Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

REF: use maybe_convert_objects in pd.array #56484

Merged
merged 6 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 34 additions & 9 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2668,7 +2668,11 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True
break
elif val is C_NA:
seen.object_ = True
if convert_to_nullable_dtype:
seen.null_ = True
mask[i] = True
else:
seen.object_ = True
continue
else:
seen.object_ = True
Expand Down Expand Up @@ -2731,6 +2735,12 @@ def maybe_convert_objects(ndarray[object] objects,
dtype = StringDtype(storage="pyarrow_numpy")
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

seen.object_ = True
elif seen.interval_:
if is_interval_array(objects):
Expand Down Expand Up @@ -2774,12 +2784,12 @@ def maybe_convert_objects(ndarray[object] objects,
return objects

if seen.bool_:
if seen.is_bool:
# is_bool property rules out everything else
return bools.view(np.bool_)
elif convert_to_nullable_dtype and seen.is_bool_or_na:
if convert_to_nullable_dtype and seen.is_bool_or_na:
from pandas.core.arrays import BooleanArray
return BooleanArray(bools.view(np.bool_), mask)
elif seen.is_bool:
# is_bool property rules out everything else
return bools.view(np.bool_)
seen.object_ = True

if not seen.object_:
Expand All @@ -2792,11 +2802,11 @@ def maybe_convert_objects(ndarray[object] objects,
result = floats
elif seen.int_ or seen.uint_:
if convert_to_nullable_dtype:
from pandas.core.arrays import IntegerArray
# Below we will wrap in IntegerArray
if seen.uint_:
result = IntegerArray(uints, mask)
result = uints
else:
result = IntegerArray(ints, mask)
result = ints
else:
result = floats
elif seen.nan_:
Expand All @@ -2811,7 +2821,6 @@ def maybe_convert_objects(ndarray[object] objects,
result = uints
else:
result = ints

else:
# don't cast int to float, etc.
if seen.null_:
Expand All @@ -2834,6 +2843,22 @@ def maybe_convert_objects(ndarray[object] objects,
else:
result = ints

# TODO: do these after the itemsize check?
if (result is ints or result is uints) and convert_to_nullable_dtype:
from pandas.core.arrays import IntegerArray

# Set these values to 1 to be deterministic, match
# IntegerArray._internal_fill_value
result[mask] = 1
result = IntegerArray(result, mask)
elif result is floats and convert_to_nullable_dtype:
from pandas.core.arrays import FloatingArray

# Set these values to 1.0 to be deterministic, match
# the fill value (1) used for IntegerArray in the integer branch above
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
result[mask] = 1.0
result = FloatingArray(result, mask)

if result is uints or result is ints or result is floats or result is complexes:
# cast to the largest itemsize when all values are NumPy scalars
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
Expand Down
100 changes: 57 additions & 43 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,8 @@
"""
from __future__ import annotations

from collections.abc import Sequence
from typing import (
TYPE_CHECKING,
Optional,
Union,
cast,
overload,
)
Expand All @@ -23,17 +20,9 @@

from pandas._libs import lib
from pandas._libs.tslibs import (
Period,
get_supported_dtype,
is_supported_dtype,
)
from pandas._typing import (
AnyArrayLike,
ArrayLike,
Dtype,
DtypeObj,
T,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import ExtensionDtype
Expand All @@ -47,6 +36,7 @@
maybe_promote,
)
from pandas.core.dtypes.common import (
ensure_object,
is_list_like,
is_object_dtype,
is_string_dtype,
Expand All @@ -64,11 +54,25 @@
import pandas.core.common as com

if TYPE_CHECKING:
from collections.abc import Sequence

from pandas._typing import (
AnyArrayLike,
ArrayLike,
Dtype,
DtypeObj,
T,
)

from pandas import (
Index,
Series,
)
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
TimedeltaArray,
)


def array(
Expand Down Expand Up @@ -293,9 +297,7 @@ def array(
ExtensionArray,
FloatingArray,
IntegerArray,
IntervalArray,
NumpyExtensionArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringDtype
Expand Down Expand Up @@ -327,46 +329,58 @@ def array(
return cls._from_sequence(data, dtype=dtype, copy=copy)

if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=True)
if inferred_dtype == "period":
period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
return PeriodArray._from_sequence(period_data, copy=copy)

elif inferred_dtype == "interval":
return IntervalArray(data, copy=copy)

elif inferred_dtype.startswith("datetime"):
# datetime, datetime64
try:
return DatetimeArray._from_sequence(data, copy=copy)
except ValueError:
# Mixture of timezones, fall back to NumpyExtensionArray
pass

elif inferred_dtype.startswith("timedelta"):
# timedelta, timedelta64
return TimedeltaArray._from_sequence(data, copy=copy)

elif inferred_dtype == "string":
was_ndarray = isinstance(data, np.ndarray)
# error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray |
# ndarray[Any, Any]" has no attribute "dtype"
if not was_ndarray or data.dtype == object: # type: ignore[union-attr]
result = lib.maybe_convert_objects(
ensure_object(data),
convert_non_numeric=True,
convert_to_nullable_dtype=True,
dtype_if_all_nat=None,
)
result = ensure_wrapped_if_datetimelike(result)
if isinstance(result, np.ndarray):
if len(result) == 0 and not was_ndarray:
# e.g. empty list
return FloatingArray._from_sequence(data, dtype="Float64")
return NumpyExtensionArray._from_sequence(
data, dtype=result.dtype, copy=copy
)
if result is data and copy:
return result.copy()
return result

data = cast(np.ndarray, data)
result = ensure_wrapped_if_datetimelike(data)
if result is not data:
result = cast("DatetimeArray | TimedeltaArray", result)
if copy and result.dtype == data.dtype:
return result.copy()
return result

if data.dtype.kind in "SU":
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
dtype = StringDtype()
cls = dtype.construct_array_type()
return cls._from_sequence(data, dtype=dtype, copy=copy)

elif inferred_dtype == "integer":
elif data.dtype.kind in "iu":
return IntegerArray._from_sequence(data, copy=copy)
elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data):
return FloatingArray._from_sequence(data, copy=copy)
elif (
inferred_dtype in ("floating", "mixed-integer-float")
and getattr(data, "dtype", None) != np.float16
):
elif data.dtype.kind == "f":
# GH#44715 Exclude np.float16 bc FloatingArray does not support it;
# we will fall back to NumpyExtensionArray.
if data.dtype == np.float16:
return NumpyExtensionArray._from_sequence(
data, dtype=data.dtype, copy=copy
)
return FloatingArray._from_sequence(data, copy=copy)

elif inferred_dtype == "boolean":
elif data.dtype.kind == "b":
return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
else:
# e.g. complex
return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy)

# Pandas overrides NumPy for
# 1. datetime64[ns,us,ms,s]
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,14 @@ def test_dt64_array(dtype_unit):
.construct_array_type()
._from_sequence(["a", None], dtype=pd.StringDtype()),
),
(
# numpy array with string dtype
np.array(["a", "b"], dtype=str),
None,
pd.StringDtype()
.construct_array_type()
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
),
# Boolean
(
[True, None],
Expand Down Expand Up @@ -246,6 +254,14 @@ def test_dt64_array(dtype_unit):
"category",
pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
),
# Complex
(
np.array([complex(1), complex(2)], dtype=np.complex128),
None,
NumpyExtensionArray(
np.array([complex(1), complex(2)], dtype=np.complex128)
),
),
],
)
def test_array(data, dtype, expected):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,9 +937,9 @@ def test_maybe_convert_objects_bool_nan(self):
def test_maybe_convert_objects_nullable_boolean(self):
# GH50047
arr = np.array([True, False], dtype=object)
exp = np.array([True, False])
exp = BooleanArray._from_sequence([True, False], dtype="boolean")
out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
tm.assert_numpy_array_equal(out, exp)
tm.assert_extension_array_equal(out, exp)

arr = np.array([True, False, pd.NaT], dtype=object)
exp = np.array([True, False, pd.NaT], dtype=object)
Expand Down