Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: pass dtype to _from_sequence #56436

Merged
merged 8 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1471,7 +1471,9 @@ def _maybe_upcast(

elif arr.dtype == np.object_:
if use_dtype_backend:
-arr = StringDtype().construct_array_type()._from_sequence(arr)
+dtype = StringDtype()
+cls = dtype.construct_array_type()
+arr = cls._from_sequence(arr, dtype=dtype)

if use_dtype_backend and dtype_backend == "pyarrow":
import pyarrow as pa
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
is_integer,
is_list_like,
is_scalar,
+pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -273,6 +274,10 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
"""
Construct a new ExtensionArray from a sequence of scalars.
"""
+if dtype is not None and isinstance(dtype, str):
+# FIXME: in tests.extension.test_arrow we pass pyarrow _type_ objects
+# which raise when passed to pandas_dtype
+dtype = pandas_dtype(dtype)
pa_type = to_pyarrow_type(dtype)
pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
arr = cls(pa_array)
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/arrays/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,12 @@ def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarr
raise AbstractMethodError(cls)


-def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
+def _coerce_to_data_and_mask(
+values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
+):
checker = dtype_cls._checker

+mask = None
inferred_type = None

if dtype is None and hasattr(values, "dtype"):
Expand Down Expand Up @@ -190,7 +193,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
if dtype is None:
dtype = default_dtype
else:
-dtype = dtype.type
+dtype = dtype.numpy_dtype

if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
Expand Down Expand Up @@ -260,9 +263,8 @@ def _coerce_to_array(
) -> tuple[np.ndarray, np.ndarray]:
dtype_cls = cls._dtype_cls
default_dtype = dtype_cls._default_np_dtype
-mask = None
values, mask, _, _ = _coerce_to_data_and_mask(
-value, mask, dtype, copy, dtype_cls, default_dtype
+value, dtype, copy, dtype_cls, default_dtype
)
return values, mask

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,9 @@ def period_array(
return PeriodArray(ordinals, dtype=dtype)

data = ensure_object(arrdata)
-
+if freq is None:
+freq = libperiod.extract_freq(data)
+dtype = PeriodDtype(freq)
return PeriodArray._from_sequence(data, dtype=dtype)


Expand Down
6 changes: 4 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,9 @@ def array(

elif inferred_dtype == "string":
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
-return StringDtype().construct_array_type()._from_sequence(data, copy=copy)
+dtype = StringDtype()
+cls = dtype.construct_array_type()
+return cls._from_sequence(data, dtype=dtype, copy=copy)

elif inferred_dtype == "integer":
return IntegerArray._from_sequence(data, copy=copy)
Expand All @@ -364,7 +366,7 @@ def array(
return FloatingArray._from_sequence(data, copy=copy)

elif inferred_dtype == "boolean":
-return BooleanArray._from_sequence(data, copy=copy)
+return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)

# Pandas overrides NumPy for
# 1. datetime64[ns,us,ms,s]
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2330,7 +2330,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
elif isinstance(bvalues, ArrowExtensionArray) and not isinstance(
bvalues.dtype, StringDtype
):
-return type(bvalues)._from_sequence(counted[0])
+return type(bvalues)._from_sequence(counted[0], dtype="int64[pyarrow]")
if is_series:
assert counted.ndim == 2
assert counted.shape[0] == 1
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5194,12 +5194,12 @@ def _get_join_target(self) -> np.ndarray:
def _from_join_target(self, result: np.ndarray) -> ArrayLike:
"""
Cast the ndarray returned from one of the libjoin.foo_indexer functions
-back to type(self)._data.
+back to type(self._data).
"""
if isinstance(self.values, BaseMaskedArray):
return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
elif isinstance(self.values, (ArrowExtensionArray, StringArray)):
-return type(self.values)._from_sequence(result)
+return type(self.values)._from_sequence(result, dtype=self.dtype)
return result

@doc(IndexOpsMixin._memory_usage)
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,7 +1044,9 @@ def convert(arr):
# i.e. maybe_convert_objects didn't convert
arr = maybe_infer_to_datetimelike(arr)
if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
-arr = StringDtype().construct_array_type()._from_sequence(arr)
+new_dtype = StringDtype()
+arr_cls = new_dtype.construct_array_type()
+arr = arr_cls._from_sequence(arr, dtype=new_dtype)
elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
if arr.dtype.kind in "iufb":
arr = pd_array(arr, copy=False)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def rep(x, r):
)
if isinstance(self, BaseStringArray):
# Not going through map, so we have to do this here.
-result = type(self)._from_sequence(result)
+result = type(self)._from_sequence(result, dtype=self.dtype)
return result

def _str_match(
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,9 @@ def _infer_types(
elif result.dtype == np.object_ and non_default_dtype_backend:
# read_excel sends array of datetime objects
if not lib.is_datetime_array(result, skipna=True):
-result = StringDtype().construct_array_type()._from_sequence(values)
+dtype = StringDtype()
+cls = dtype.construct_array_type()
+result = cls._from_sequence(values, dtype=dtype)

if dtype_backend == "pyarrow":
pa = import_optional_dependency("pyarrow")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arithmetic/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -1282,7 +1282,7 @@ def test_parr_add_sub_td64_nat(self, box_with_array, transpose):
"other",
[
np.array(["NaT"] * 9, dtype="m8[ns]"),
-TimedeltaArray._from_sequence(["NaT"] * 9),
+TimedeltaArray._from_sequence(["NaT"] * 9, dtype="m8[ns]"),
],
)
def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,9 @@ def test_interval(self):

def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
-arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
+arr = pd.arrays.StringArray._from_sequence(
+[nulls_fixture] * 2, dtype=pd.StringDtype()
+)
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
Expand Down
12 changes: 7 additions & 5 deletions pandas/tests/arrays/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class TestDatetimeArrayConstructor:
def test_from_sequence_invalid_type(self):
mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
-DatetimeArray._from_sequence(mi)
+DatetimeArray._from_sequence(mi, dtype="M8[ns]")

def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
Expand Down Expand Up @@ -66,7 +66,7 @@ def test_mixing_naive_tzaware_raises(self, meth):
def test_from_pandas_array(self):
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9

-result = DatetimeArray._from_sequence(arr)._with_freq("infer")
+result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer")

expected = pd.date_range("1970-01-01", periods=5, freq="h")._data
tm.assert_datetime_array_equal(result, expected)
Expand Down Expand Up @@ -100,7 +100,7 @@ def test_bool_dtype_raises(self):

msg = r"dtype bool cannot be converted to datetime64\[ns\]"
with pytest.raises(TypeError, match=msg):
-DatetimeArray._from_sequence(arr)
+DatetimeArray._from_sequence(arr, dtype="M8[ns]")

with pytest.raises(TypeError, match=msg):
pd.DatetimeIndex(arr)
Expand Down Expand Up @@ -171,8 +171,10 @@ def test_2d(self, order):
if order == "F":
arr = arr.T

-res = DatetimeArray._from_sequence(arr)
-expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
+res = DatetimeArray._from_sequence(arr, dtype=dti.dtype)
+expected = DatetimeArray._from_sequence(arr.ravel(), dtype=dti.dtype).reshape(
+arr.shape
+)
tm.assert_datetime_array_equal(res, expected)


Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/arrays/datetimes/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ def test_accumulators_freq(self):
"2000-01-01",
"2000-01-02",
"2000-01-03",
-]
+],
+dtype="M8[ns]",
)._with_freq("infer")
result = arr._accumulate("cummin")
-expected = DatetimeArray._from_sequence(["2000-01-01"] * 3)
+expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]")
tm.assert_datetime_array_equal(result, expected)

result = arr._accumulate("cummax")
Expand All @@ -36,6 +37,7 @@ def test_accumulators_disallowed(self, func):
"2000-01-01",
"2000-01-02",
],
+dtype="M8[ns]",
)._with_freq("infer")
with pytest.raises(TypeError, match=f"Accumulation {func}"):
arr._accumulate(func)
2 changes: 1 addition & 1 deletion pandas/tests/arrays/datetimes/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_median_2d(self, arr1d):

# axis = 1
result = arr.median(axis=1)
-expected = type(arr)._from_sequence([arr1d.median()])
+expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype)
tm.assert_equal(result, expected)

result = arr.median(axis=1, skipna=False)
Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ def test_repr(dtype):
assert repr(df.A.array) == expected


-def test_none_to_nan(cls):
-a = cls._from_sequence(["a", None, "b"])
+def test_none_to_nan(cls, dtype):
+a = cls._from_sequence(["a", None, "b"], dtype=dtype)
assert a[1] is not None
assert a[1] is na_val(a.dtype)


-def test_setitem_validates(cls):
-arr = cls._from_sequence(["a", "b"])
+def test_setitem_validates(cls, dtype):
+arr = cls._from_sequence(["a", "b"], dtype=dtype)

if cls is pd.arrays.StringArray:
msg = "Cannot set non-string value '10' into a StringArray."
Expand Down Expand Up @@ -361,12 +361,12 @@ def test_constructor_nan_like(na):


@pytest.mark.parametrize("copy", [True, False])
-def test_from_sequence_no_mutate(copy, cls, request):
+def test_from_sequence_no_mutate(copy, cls, dtype):
nan_arr = np.array(["a", np.nan], dtype=object)
expected_input = nan_arr.copy()
na_arr = np.array(["a", pd.NA], dtype=object)

-result = cls._from_sequence(nan_arr, copy=copy)
+result = cls._from_sequence(nan_arr, dtype=dtype, copy=copy)

if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
import pyarrow as pa
Expand Down Expand Up @@ -436,7 +436,7 @@ def test_reduce_missing(skipna, dtype):

@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
-def test_min_max(method, skipna, dtype, request):
+def test_min_max(method, skipna, dtype):
arr = pd.Series(["a", "b", "c", None], dtype=dtype)
result = getattr(arr, method)(skipna=skipna)
if skipna:
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,8 @@ def test_config(string_storage, request, using_infer_string):
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

-expected = (
-StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
-)
+dtype = StringDtype(string_storage)
+expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
tm.assert_equal(result, expected)


Expand Down
Loading