Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into tst/skips
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 17, 2023
2 parents 2dd3e84 + ef2c61a commit ea23aa5
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 112 deletions.
117 changes: 10 additions & 107 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1145,107 +1145,7 @@ cdef void rank_sorted_1d(
# that sorted value for retrieval back from the original
# values / masked_vals arrays
# TODO(cython3): de-duplicate once cython supports conditional nogil
if numeric_object_t is object:
with gil:
for i in range(N):
at_end = i == N - 1

# dups and sum_ranks will be incremented each loop where
# the value / group remains the same, and should be reset
# when either of those change. Used to calculate tiebreakers
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
masked_vals[sort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
group_changed = at_end or (check_labels and
(labels[sort_indexer[i]]
!= labels[sort_indexer[i+1]]))

# Update out only when there is a transition of values or labels.
# When a new value or group is encountered, go back #dups steps (
# the number of occurrences of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if (next_val_diff or group_changed or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):

# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and check_mask and mask[sort_indexer[i]]:
grp_na_count = dups
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = NaN
elif tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = sum_ranks / <float64_t>dups
elif tiebreak == TIEBREAK_MIN:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = i - grp_start - dups + 2
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = i - grp_start + 1

# With n as the previous rank in the group and m as the number
# of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
# then rankings should be n + 1, n + 2 ... n + m
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = j + 1 - grp_start

# If TIEBREAK_FIRST and descending, the ranking should be
# n + m, n + (m - 1) ... n + 1. This is equivalent to
# (i - dups + 1) + (i - j + 1) - grp_start
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
elif tiebreak == TIEBREAK_DENSE:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = grp_vals_seen

# Look forward to the next value (using the sorting in
# sort_indexer). If the value does not equal the current
# value then we need to reset the dups and sum_ranks, knowing
# that a new value is coming up. The conditional also needs
# to handle nan equality and the end of iteration. If group
# changes we do not record seeing a new value in the group
if not group_changed and (next_val_diff or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):
dups = sum_ranks = 0
grp_vals_seen += 1

# Similar to the previous conditional, check now if we are
# moving to a new group. If so, keep track of the index where
# the new group occurs, so the tiebreaker calculations can
# decrement that from their position. Fill in the size of each
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if group_changed:

# If not dense tiebreak, group size used to compute
# percentile will be # of non-null elements in group
if tiebreak != TIEBREAK_DENSE:
grp_size = i - grp_start + 1 - grp_na_count

# Otherwise, it will be the number of distinct values
# in the group, subtracting 1 if NaNs are present
# since that is a distinct value we shouldn't count
else:
grp_size = grp_vals_seen - (grp_na_count > 0)

for j in range(grp_start, i + 1):
grp_sizes[sort_indexer[j]] = grp_size

dups = sum_ranks = 0
grp_na_count = 0
grp_start = i + 1
grp_vals_seen = 1
else:
with gil(numeric_object_t is object):
for i in range(N):
at_end = i == N - 1

Expand All @@ -1255,8 +1155,12 @@ cdef void rank_sorted_1d(
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or (masked_vals[sort_indexer[i]]
!= masked_vals[sort_indexer[i+1]])
if numeric_object_t is object:
next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
masked_vals[sort_indexer[i+1]])
else:
next_val_diff = at_end or (masked_vals[sort_indexer[i]]
!= masked_vals[sort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
Expand All @@ -1269,10 +1173,9 @@ cdef void rank_sorted_1d(
# the number of occurrences of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if (next_val_diff or group_changed
or (check_mask and
(mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):

if (next_val_diff or group_changed or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):
# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and check_mask and mask[sort_indexer[i]]:
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12378,7 +12378,12 @@ def _inplace_method(self, other, op) -> Self:

result = op(self, other)

if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype:
if (
self.ndim == 1
and result._indexed_same(self)
and result.dtype == self.dtype
and not using_copy_on_write()
):
# GH#36498 this inplace op can _actually_ be inplace.
# Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
# BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/methods/selectn.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ def compute(self, method: str) -> Series:

# arr passed into kth_smallest must be contiguous. We copy
# here because kth_smallest will modify its input
kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
# avoid OOB access with kth_smallest_c when n <= 0
kth_val = libalgos.kth_smallest(arr.copy(order="C"), max(n - 1, 0))
(ns,) = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind="mergesort")]

Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1815,12 +1815,22 @@ def test_update_chained_assignment(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


def test_inplace_arithmetic_series():
def test_inplace_arithmetic_series(using_copy_on_write):
ser = Series([1, 2, 3])
ser_orig = ser.copy()
data = get_array(ser)
ser *= 2
assert np.shares_memory(get_array(ser), data)
tm.assert_numpy_array_equal(data, get_array(ser))
if using_copy_on_write:
# https://github.com/pandas-dev/pandas/pull/55745
# changed to NOT update inplace because there is no benefit (actual
# operation already done non-inplace). This was only for the optics
# of updating the backing array inplace, but we no longer want to make
# that guarantee
assert not np.shares_memory(get_array(ser), data)
tm.assert_numpy_array_equal(data, get_array(ser_orig))
else:
assert np.shares_memory(get_array(ser), data)
tm.assert_numpy_array_equal(data, get_array(ser))


def test_inplace_arithmetic_series_with_reference(
Expand Down

0 comments on commit ea23aa5

Please sign in to comment.