Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into ci/debug
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 6, 2023
2 parents dc4f3cb + 2c12853 commit 182f432
Show file tree
Hide file tree
Showing 36 changed files with 348 additions and 271 deletions.
2 changes: 1 addition & 1 deletion doc/source/user_guide/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2013,7 +2013,7 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
p == pd.Period("2012-01", freq="3M")
If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.
If ``Period`` freq is daily or higher (``D``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.

.. ipython:: python
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
-
- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)

.. ---------------------------------------------------------------------------
.. _whatsnew_213.other:
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,10 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

Expand All @@ -346,6 +348,7 @@ Datetimelike
- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`)
- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
Expand Down
7 changes: 0 additions & 7 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,6 @@ class BaseMultiIndexCodesEngine:
) -> None: ...
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
def get_indexer_with_fill(
self,
target: np.ndarray, # np.ndarray[object] of tuples
values: np.ndarray, # np.ndarray[object] of tuples
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
def __init__(self, values: ExtensionArray) -> None: ...
Expand Down
109 changes: 16 additions & 93 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ cdef class IndexEngine:
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end
bint check_na_values = False

values = self.values
Expand All @@ -364,6 +364,7 @@ cdef class IndexEngine:

n = len(values)
n_t = len(targets)
max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
Expand Down Expand Up @@ -453,7 +454,9 @@ cdef class IndexEngine:

# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)

result[count] = j
Expand All @@ -463,7 +466,9 @@ cdef class IndexEngine:
else:

if count >= n_alloc:
n_alloc += 10_000
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)
result[count] = -1
count += 1
Expand Down Expand Up @@ -748,91 +753,6 @@ cdef class BaseMultiIndexCodesEngine:
"""
return self._base.get_indexer(self, target)

def get_indexer_with_fill(self, ndarray target, ndarray values,
str method, object limit) -> np.ndarray:
"""
Returns an array giving the positions of each value of `target` in
`values`, where -1 represents a value in `target` which does not
appear in `values`

If `method` is "backfill" then the position for a value in `target`
which does not appear in `values` is that of the next greater value
in `values` (if one exists), and -1 if there is no such value.

Similarly, if the method is "pad" then the position for a value in
`target` which does not appear in `values` is that of the next smaller
value in `values` (if one exists), and -1 if there is no such value.

Parameters
----------
target: ndarray[object] of tuples
need not be sorted, but all must have the same length, which must be
the same as the length of all tuples in `values`
values : ndarray[object] of tuples
must be sorted and all have the same length. Should be the set of
the MultiIndex's values.
method: string
"backfill" or "pad"
limit: int or None
if provided, limit the number of fills to this value

Returns
-------
np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
filled with the `method` (and optionally `limit`) specified
"""
assert method in ("backfill", "pad")
cdef:
int64_t i, j, next_code
int64_t num_values, num_target_values
ndarray[int64_t, ndim=1] target_order
ndarray[object, ndim=1] target_values
ndarray[int64_t, ndim=1] new_codes, new_target_codes
ndarray[intp_t, ndim=1] sorted_indexer

target_order = np.argsort(target).astype("int64")
target_values = target[target_order]
num_values, num_target_values = len(values), len(target_values)
new_codes, new_target_codes = (
np.empty((num_values,)).astype("int64"),
np.empty((num_target_values,)).astype("int64"),
)

# `values` and `target_values` are both sorted, so we walk through them
# and memoize the (ordered) set of indices in the (implicit) merged-and
# sorted list of the two which belong to each of them
# the effect of this is to create a factorization for the (sorted)
# merger of the index values, where `new_codes` and `new_target_codes`
# are the subset of the factors which appear in `values` and `target`,
# respectively
i, j, next_code = 0, 0, 0
while i < num_values and j < num_target_values:
val, target_val = values[i], target_values[j]
if val <= target_val:
new_codes[i] = next_code
i += 1
if target_val <= val:
new_target_codes[j] = next_code
j += 1
next_code += 1

# at this point, at least one should have reached the end
# the remaining values of the other should be added to the end
assert i == num_values or j == num_target_values
while i < num_values:
new_codes[i] = next_code
i += 1
next_code += 1
while j < num_target_values:
new_target_codes[j] = next_code
j += 1
next_code += 1

# get the indexer, and undo the sorting of `target.values`
algo = algos.backfill if method == "backfill" else algos.pad
sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
return sorted_indexer[np.argsort(target_order)]

def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError(f"'{key}' is an invalid key")
Expand Down Expand Up @@ -1211,7 +1131,7 @@ cdef class MaskedIndexEngine(IndexEngine):
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx

target_vals = self._get_data(targets)
target_mask = self._get_mask(targets)
Expand All @@ -1224,6 +1144,7 @@ cdef class MaskedIndexEngine(IndexEngine):

n = len(values)
n_t = len(target_vals)
max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
Expand Down Expand Up @@ -1274,8 +1195,9 @@ cdef class MaskedIndexEngine(IndexEngine):
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = na_idx
count += 1
Expand All @@ -1289,8 +1211,9 @@ cdef class MaskedIndexEngine(IndexEngine):

# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = j
count += 1
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ subdir('tslibs')
libs_sources = {
# Dict of extension name -> dict of {sources, include_dirs, and deps}
# numpy include dir is implicitly included
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]},
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
'arrays': {'sources': ['arrays.pyx']},
'groupby': {'sources': ['groupby.pyx']},
'hashing': {'sources': ['hashing.pyx']},
'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]},
'index': {'sources': ['index.pyx', _index_class_helper]},
'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep},
'indexing': {'sources': ['indexing.pyx']},
'internals': {'sources': ['internals.pyx']},
'interval': {'sources': ['interval.pyx', _intervaltree_helper],
Expand Down
21 changes: 21 additions & 0 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,8 @@ def array_to_datetime_with_tz(
object item
int64_t ival
_TSObject tsobj
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
DatetimeParseState state = DatetimeParseState(creso)

for i in range(n):
# Analogous to `item = values[i]`
Expand All @@ -707,6 +709,9 @@ def array_to_datetime_with_tz(
item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0
)
if tsobj.value != NPY_NAT:
state.update_creso(tsobj.creso)
if infer_reso:
creso = state.creso
tsobj.ensure_reso(creso, item, round_ok=True)
ival = tsobj.value

Expand All @@ -715,4 +720,20 @@ def array_to_datetime_with_tz(

cnp.PyArray_MultiIter_NEXT(mi)

if infer_reso:
if state.creso_ever_changed:
# We encountered mismatched resolutions, need to re-parse with
# the correct one.
return array_to_datetime_with_tz(values, tz=tz, creso=creso)

# Otherwise we can use the single reso that we encountered and avoid
# a second pass.
abbrev = npy_unit_to_abbrev(creso)
result = result.view(f"M8[{abbrev}]")
elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# We didn't find any non-NaT to infer from, default to "ns"
result = result.view("M8[ns]")
else:
abbrev = npy_unit_to_abbrev(creso)
result = result.view(f"M8[{abbrev}]")
return result
25 changes: 23 additions & 2 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
is_float_dtype,
is_sequence,
is_signed_integer_dtype,
is_string_dtype,
is_unsigned_integer_dtype,
pandas_dtype,
)
Expand Down Expand Up @@ -1019,6 +1020,17 @@ def iat(x):

# -----------------------------------------------------------------------------

_UNITS = ["s", "ms", "us", "ns"]


def get_finest_unit(left: str, right: str):
"""
Find the higher of two datetime64 units.
"""
if _UNITS.index(left) >= _UNITS.index(right):
return left
return right


def shares_memory(left, right) -> bool:
"""
Expand All @@ -1044,10 +1056,18 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] # noqa: E501
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] # noqa: E501
):
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
right_pa_data = right._pa_array
Expand Down Expand Up @@ -1121,6 +1141,7 @@ def shares_memory(left, right) -> bool:
"getitem",
"get_locales",
"getMixedTypeDict",
"get_finest_unit",
"get_obj",
"get_op_from_name",
"getPeriodData",
Expand Down
2 changes: 1 addition & 1 deletion pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def assert_extension_array_equal(
else:
l_unit = np.datetime_data(left.dtype)[0]
if not isinstance(right.dtype, np.dtype):
r_unit = cast(DatetimeTZDtype, left.dtype).unit
r_unit = cast(DatetimeTZDtype, right.dtype).unit
else:
r_unit = np.datetime_data(right.dtype)[0]
if (
Expand Down
3 changes: 3 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,9 @@ def unit(request):
return request.param


unit2 = unit


# ----------------------------------------------------------------
# Dtypes
# ----------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2241,14 +2241,14 @@ def _sequence_to_dt64(
data = data.astype(np.int64)
elif tz is not None and ambiguous == "raise":
obj_data = np.asarray(data, dtype=object)
i8data = tslib.array_to_datetime_with_tz(
result = tslib.array_to_datetime_with_tz(
obj_data,
tz=tz,
dayfirst=dayfirst,
yearfirst=yearfirst,
creso=abbrev_to_npy_unit(out_unit),
)
return i8data.view(out_dtype), tz, None
return result, tz, None
else:
# data comes back here as either i8 to denote UTC timestamps
# or M8[ns] to denote wall times
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
if not len(value_set):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
result = pc.is_in(
self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
Expand Down
Loading

0 comments on commit 182f432

Please sign in to comment.