Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: use conditional-nogil in libalgos #56025

Merged
merged 1 commit into from
Nov 17, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 10 additions & 107 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1145,107 +1145,7 @@ cdef void rank_sorted_1d(
# that sorted value for retrieval back from the original
# values / masked_vals arrays
# TODO(cython3): de-duplicate once cython supports conditional nogil
if numeric_object_t is object:
with gil:
for i in range(N):
at_end = i == N - 1

# dups and sum_ranks will be incremented each loop where
# the value / group remains the same, and should be reset
# when either of those change. Used to calculate tiebreakers
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
masked_vals[sort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
group_changed = at_end or (check_labels and
(labels[sort_indexer[i]]
!= labels[sort_indexer[i+1]]))

# Update out only when there is a transition of values or labels.
# When a new value or group is encountered, go back #dups steps(
# the number of occurrence of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if (next_val_diff or group_changed or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):

# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and check_mask and mask[sort_indexer[i]]:
grp_na_count = dups
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = NaN
elif tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = sum_ranks / <float64_t>dups
elif tiebreak == TIEBREAK_MIN:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = i - grp_start - dups + 2
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = i - grp_start + 1

# With n as the previous rank in the group and m as the number
# of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
# then rankings should be n + 1, n + 2 ... n + m
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = j + 1 - grp_start

# If TIEBREAK_FIRST and descending, the ranking should be
# n + m, n + (m - 1) ... n + 1. This is equivalent to
# (i - dups + 1) + (i - j + 1) - grp_start
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
elif tiebreak == TIEBREAK_DENSE:
for j in range(i - dups + 1, i + 1):
out[sort_indexer[j]] = grp_vals_seen

# Look forward to the next value (using the sorting in
# lexsort_indexer). If the value does not equal the current
# value then we need to reset the dups and sum_ranks, knowing
# that a new value is coming up. The conditional also needs
# to handle nan equality and the end of iteration. If group
# changes we do not record seeing a new value in the group
if not group_changed and (next_val_diff or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):
dups = sum_ranks = 0
grp_vals_seen += 1

# Similar to the previous conditional, check now if we are
# moving to a new group. If so, keep track of the index where
# the new group occurs, so the tiebreaker calculations can
# decrement that from their position. Fill in the size of each
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if group_changed:

# If not dense tiebreak, group size used to compute
# percentile will be # of non-null elements in group
if tiebreak != TIEBREAK_DENSE:
grp_size = i - grp_start + 1 - grp_na_count

# Otherwise, it will be the number of distinct values
# in the group, subtracting 1 if NaNs are present
# since that is a distinct value we shouldn't count
else:
grp_size = grp_vals_seen - (grp_na_count > 0)

for j in range(grp_start, i + 1):
grp_sizes[sort_indexer[j]] = grp_size

dups = sum_ranks = 0
grp_na_count = 0
grp_start = i + 1
grp_vals_seen = 1
else:
with gil(numeric_object_t is object):
for i in range(N):
at_end = i == N - 1

Expand All @@ -1255,8 +1155,12 @@ cdef void rank_sorted_1d(
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or (masked_vals[sort_indexer[i]]
!= masked_vals[sort_indexer[i+1]])
if numeric_object_t is object:
next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
masked_vals[sort_indexer[i+1]])
else:
next_val_diff = at_end or (masked_vals[sort_indexer[i]]
!= masked_vals[sort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
Expand All @@ -1269,10 +1173,9 @@ cdef void rank_sorted_1d(
# the number of occurrence of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if (next_val_diff or group_changed
or (check_mask and
(mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):

if (next_val_diff or group_changed or (check_mask and
(mask[sort_indexer[i]]
^ mask[sort_indexer[i+1]]))):
# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and check_mask and mask[sort_indexer[i]]:
Expand Down
Loading