diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index b7dce4f63f94c..810764754b7e1 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -41,7 +41,7 @@
     // pip (with all the conda available packages installed first,
     // followed by the pip installed packages).
     "matrix": {
-        "Cython": ["3.0.0"],
+        "Cython": ["0.29.33"],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index d8186d09bb9d4..ffa7732c604a0 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index a40640e99265a..5a6a26c2e1ad8 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -7,7 +7,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 3d5e2e99feb9c..f24e866af0439 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -7,7 +7,7 @@ dependencies:
   # build dependencies
   - versioneer[toml]
   - meson[ninja]=1.0.1
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson-python=0.13.1
 
   # test dependencies
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 67203201ea637..9d60d734db5b3 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
index c1c7b986fe8a4..0e2fcf87c2d6e 100644
--- a/ci/deps/actions-39-minimum_versions.yaml
+++ b/ci/deps/actions-39-minimum_versions.yaml
@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 658dfe032a42b..6ea0d41b947dc 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index 292565e9640d9..035395d55eb3a 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -9,7 +9,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
index 60bdc2b828dae..df4e8e285bd02 100644
--- a/ci/deps/circle-310-arm64.yaml
+++ b/ci/deps/circle-310-arm64.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=3.0.0
+  - cython>=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 998efdedb1b57..313bf61ecabf9 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -873,7 +873,6 @@ Other
 - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`)
 - Bug in :meth:`period_range` the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`)
 - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`)
-- The minimum version of Cython needed to compile pandas is now ``3.0.0`` (:issue:`54335`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.contributors:
diff --git a/environment.yml b/environment.yml
index 44c0ce37c2957..3a0da0bfc703d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython=3.0.0
+  - cython=0.29.33
   - meson[ninja]=1.0.1
   - meson-python=0.13.1
 
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index ed251c401c277..0b6ea58f987d4 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -998,7 +998,8 @@ def rank_1d(
 
     N = len(values)
     if labels is not None:
-        assert len(labels) == N
+        # TODO(cython3): cast won't be necessary (#2992)
+        assert <Py_ssize_t>len(labels) == N
     out = np.empty(N)
     grp_sizes = np.ones(N, dtype=np.int64)
 
@@ -1087,7 +1088,8 @@ cdef void rank_sorted_1d(
     float64_t[::1] out,
     int64_t[::1] grp_sizes,
     const intp_t[:] sort_indexer,
-    const numeric_object_t[:] masked_vals,
+    # TODO(cython3): make const (https://github.com/cython/cython/issues/3222)
+    numeric_object_t[:] masked_vals,
     const uint8_t[:] mask,
     bint check_mask,
     Py_ssize_t N,
@@ -1142,7 +1144,108 @@ cdef void rank_sorted_1d(
     # array that we sorted previously, which gives us the location of
     # that sorted value for retrieval back from the original
     # values / masked_vals arrays
-    with gil(numeric_object_t is object):
+    # TODO(cython3): de-duplicate once cython supports conditional nogil
+    if numeric_object_t is object:
+        with gil:
+            for i in range(N):
+                at_end = i == N - 1
+
+                # dups and sum_ranks will be incremented each loop where
+                # the value / group remains the same, and should be reset
+                # when either of those change. Used to calculate tiebreakers
+                dups += 1
+                sum_ranks += i - grp_start + 1
+
+                next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
+                                                   masked_vals[sort_indexer[i+1]])
+
+                # We'll need this check later anyway to determine group size, so just
+                # compute it here since shortcircuiting won't help
+                group_changed = at_end or (check_labels and
+                                           (labels[sort_indexer[i]]
+                                            != labels[sort_indexer[i+1]]))
+
+                # Update out only when there is a transition of values or labels.
+                # When a new value or group is encountered, go back #dups steps(
+                # the number of occurrence of current value) and assign the ranks
+                # based on the starting index of the current group (grp_start)
+                # and the current index
+                if (next_val_diff or group_changed or (check_mask and
+                                                       (mask[sort_indexer[i]]
+                                                        ^ mask[sort_indexer[i+1]]))):
+
+                    # If keep_na, check for missing values and assign back
+                    # to the result where appropriate
+                    if keep_na and check_mask and mask[sort_indexer[i]]:
+                        grp_na_count = dups
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = NaN
+                    elif tiebreak == TIEBREAK_AVERAGE:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = sum_ranks / <float64_t>dups
+                    elif tiebreak == TIEBREAK_MIN:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = i - grp_start - dups + 2
+                    elif tiebreak == TIEBREAK_MAX:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = i - grp_start + 1
+
+                    # With n as the previous rank in the group and m as the number
+                    # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
+                    # then rankings should be n + 1, n + 2 ... n + m
+                    elif tiebreak == TIEBREAK_FIRST:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = j + 1 - grp_start
+
+                    # If TIEBREAK_FIRST and descending, the ranking should be
+                    # n + m, n + (m - 1) ... n + 1. This is equivalent to
+                    # (i - dups + 1) + (i - j + 1) - grp_start
+                    elif tiebreak == TIEBREAK_FIRST_DESCENDING:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
+                    elif tiebreak == TIEBREAK_DENSE:
+                        for j in range(i - dups + 1, i + 1):
+                            out[sort_indexer[j]] = grp_vals_seen
+
+                    # Look forward to the next value (using the sorting in
+                    # lexsort_indexer). If the value does not equal the current
+                    # value then we need to reset the dups and sum_ranks, knowing
+                    # that a new value is coming up. The conditional also needs
+                    # to handle nan equality and the end of iteration. If group
+                    # changes we do not record seeing a new value in the group
+                    if not group_changed and (next_val_diff or (check_mask and
+                                              (mask[sort_indexer[i]]
+                                               ^ mask[sort_indexer[i+1]]))):
+                        dups = sum_ranks = 0
+                        grp_vals_seen += 1
+
+                    # Similar to the previous conditional, check now if we are
+                    # moving to a new group. If so, keep track of the index where
+                    # the new group occurs, so the tiebreaker calculations can
+                    # decrement that from their position. Fill in the size of each
+                    # group encountered (used by pct calculations later). Also be
+                    # sure to reset any of the items helping to calculate dups
+                    if group_changed:
+
+                        # If not dense tiebreak, group size used to compute
+                        # percentile will be # of non-null elements in group
+                        if tiebreak != TIEBREAK_DENSE:
+                            grp_size = i - grp_start + 1 - grp_na_count
+
+                        # Otherwise, it will be the number of distinct values
+                        # in the group, subtracting 1 if NaNs are present
+                        # since that is a distinct value we shouldn't count
+                        else:
+                            grp_size = grp_vals_seen - (grp_na_count > 0)
+
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[sort_indexer[j]] = grp_size
+
+                        dups = sum_ranks = 0
+                        grp_na_count = 0
+                        grp_start = i + 1
+                        grp_vals_seen = 1
+    else:
         for i in range(N):
             at_end = i == N - 1
 
@@ -1371,9 +1474,8 @@ ctypedef fused out_t:
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def diff_2d(
-    # TODO: cython bug (post Cython 3) prevents update to "const diff_t[:, :] arr"
-    ndarray[diff_t, ndim=2] arr,
-    out_t[:, :] out,
+    ndarray[diff_t, ndim=2] arr,  # TODO(cython3) update to "const diff_t[:, :] arr"
+    ndarray[out_t, ndim=2] out,
     Py_ssize_t periods,
     int axis,
     bint datetimelike=False,
@@ -1381,8 +1483,7 @@ def diff_2d(
     cdef:
         Py_ssize_t i, j, sx, sy, start, stop
         bint f_contig = arr.flags.f_contiguous
-        # TODO: change to this when arr becomes a memoryview
-        # bint f_contig = arr.is_f_contig()
+        # bint f_contig = arr.is_f_contig()  # TODO(cython3)
         diff_t left, right
 
     # Disable for unsupported dtype combinations,
diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
index 86f69c3cdfc75..78fee8f01319c 100644
--- a/pandas/_libs/arrays.pyi
+++ b/pandas/_libs/arrays.pyi
@@ -26,7 +26,7 @@ class NDArrayBacked:
     def size(self) -> int: ...
     @property
     def nbytes(self) -> int: ...
-    def copy(self, order=...): ...
+    def copy(self): ...
     def delete(self, loc, axis=...): ...
     def swapaxes(self, axis1, axis2): ...
     def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx
index 9889436a542c1..718fb358e26bc 100644
--- a/pandas/_libs/arrays.pyx
+++ b/pandas/_libs/arrays.pyx
@@ -126,7 +126,8 @@ cdef class NDArrayBacked:
 
     @property
     def size(self) -> int:
-        return self._ndarray.size
+        # TODO(cython3): use self._ndarray.size
+        return cnp.PyArray_SIZE(self._ndarray)
 
     @property
     def nbytes(self) -> int:
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 019b30900547d..d165ddd6c8afa 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -44,6 +44,7 @@ def group_fillna_indexer(
     labels: np.ndarray,  # ndarray[int64_t]
     sorted_labels: npt.NDArray[np.intp],
     mask: npt.NDArray[np.uint8],
+    direction: Literal["ffill", "bfill"],
     limit: int,  # int64_t
     dropna: bool,
 ) -> None: ...
@@ -54,7 +55,7 @@ def group_any_all(
     mask: np.ndarray,  # const uint8_t[::1]
     val_test: Literal["any", "all"],
     skipna: bool,
-    result_mask: np.ndarray | None,
+    nullable: bool,
 ) -> None: ...
 def group_sum(
     out: np.ndarray,  # complexfloatingintuint_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 3384060f74c20..20499016f951e 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -695,8 +695,6 @@ def group_sum(
 
     N, K = (<object>values).shape
 
-    # TODO: Port this to use conditional nogil
-    # Note: There are some test failures since the object/non-object paths have diverged
     if sum_t is object:
         # NB: this does not use 'compensation' like the non-object track does.
         for i in range(N):
@@ -757,9 +755,9 @@ def group_sum(
                             compensation[lab, j] = 0
                         sumx[lab, j] = t
 
-        _check_below_mincount(
-            out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
-        )
+            _check_below_mincount(
+                out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
+            )
 
 
 @cython.wraparound(False)
@@ -811,9 +809,9 @@ def group_prod(
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
 
-    _check_below_mincount(
-        out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
-    )
+        _check_below_mincount(
+            out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
+        )
 
 
 @cython.wraparound(False)
@@ -1371,7 +1369,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
 
 
 ctypedef fused mincount_t:
-    numeric_object_t
+    numeric_t
     complex64_t
     complex128_t
 
@@ -1387,7 +1385,7 @@ cdef inline void _check_below_mincount(
     int64_t[:, ::1] nobs,
     int64_t min_count,
     mincount_t[:, ::1] resx,
-) noexcept:
+) noexcept nogil:
     """
     Check if the number of observations for a group is below min_count,
     and if so set the result for that group to the appropriate NA-like value.
@@ -1395,49 +1393,48 @@ cdef inline void _check_below_mincount(
     cdef:
         Py_ssize_t i, j
 
-    with nogil(mincount_t is not object):
-        for i in range(ncounts):
-            for j in range(K):
+    for i in range(ncounts):
+        for j in range(K):
 
-                if nobs[i, j] < min_count:
-                    # if we are integer dtype, not is_datetimelike, and
-                    #  not uses_mask, then getting here implies that
-                    #  counts[i] < min_count, which means we will
-                    #  be cast to float64 and masked at the end
-                    #  of WrappedCythonOp._call_cython_op. So we can safely
-                    #  set a placeholder value in out[i, j].
-                    if uses_mask:
-                        result_mask[i, j] = True
-                        # set out[i, j] to 0 to be deterministic, as
-                        #  it was initialized with np.empty. Also ensures
-                        #  we can downcast out if appropriate.
-                        out[i, j] = 0
-                    elif (
-                        mincount_t is float32_t
-                        or mincount_t is float64_t
-                        or mincount_t is complex64_t
-                        or mincount_t is complex128_t
-                    ):
-                        out[i, j] = NAN
-                    elif mincount_t is int64_t:
-                        # Per above, this is a placeholder in
-                        #  non-is_datetimelike cases.
-                        out[i, j] = NPY_NAT
-                    elif mincount_t is object:
-                        out[i, j] = None
-                    else:
-                        # placeholder, see above
-                        out[i, j] = 0
+            if nobs[i, j] < min_count:
+                # if we are integer dtype, not is_datetimelike, and
+                #  not uses_mask, then getting here implies that
+                #  counts[i] < min_count, which means we will
+                #  be cast to float64 and masked at the end
+                #  of WrappedCythonOp._call_cython_op. So we can safely
+                #  set a placeholder value in out[i, j].
+                if uses_mask:
+                    result_mask[i, j] = True
+                    # set out[i, j] to 0 to be deterministic, as
+                    #  it was initialized with np.empty. Also ensures
+                    #  we can downcast out if appropriate.
+                    out[i, j] = 0
+                elif (
+                    mincount_t is float32_t
+                    or mincount_t is float64_t
+                    or mincount_t is complex64_t
+                    or mincount_t is complex128_t
+                ):
+                    out[i, j] = NAN
+                elif mincount_t is int64_t:
+                    # Per above, this is a placeholder in
+                    #  non-is_datetimelike cases.
+                    out[i, j] = NPY_NAT
                 else:
-                    out[i, j] = resx[i, j]
+                    # placeholder, see above
+                    out[i, j] = 0
+            else:
+                out[i, j] = resx[i, j]
 
 
+# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can
+#  use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_last(
     numeric_object_t[:, ::1] out,
     int64_t[::1] counts,
-    const numeric_object_t[:, :] values,
+    ndarray[numeric_object_t, ndim=2] values,
     const intp_t[::1] labels,
     const uint8_t[:, :] mask,
     uint8_t[:, ::1] result_mask=None,
@@ -1455,7 +1452,9 @@ def group_last(
         bint uses_mask = mask is not None
         bint isna_entry
 
-    if not len(values) == len(labels):
+    # TODO(cython3):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
     min_count = max(min_count, 1)
@@ -1467,7 +1466,8 @@ def group_last(
 
     N, K = (<object>values).shape
 
-    with nogil(numeric_object_t is not object):
+    if numeric_object_t is object:
+        # TODO(cython3): De-duplicate once conditional-nogil is available
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -1480,28 +1480,53 @@ def group_last(
                 if uses_mask:
                     isna_entry = mask[i, j]
                 else:
-                    # TODO: just make _treat_as_na support this?
-                    # remove notimplemented for object dtype there
-                    if numeric_object_t is object:
-                        isna_entry = checknull(val)
-                    else:
-                        isna_entry = _treat_as_na(val, is_datetimelike)
+                    isna_entry = checknull(val)
 
                 if not isna_entry:
+                    # TODO(cython3): use _treat_as_na here once
+                    #  conditional-nogil is available.
                     nobs[lab, j] += 1
                     resx[lab, j] = val
 
-    _check_below_mincount(
-        out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
-    )
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] < min_count:
+                    out[i, j] = None
+                else:
+                    out[i, j] = resx[i, j]
+    else:
+        with nogil:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    if uses_mask:
+                        isna_entry = mask[i, j]
+                    else:
+                        isna_entry = _treat_as_na(val, is_datetimelike)
+
+                    if not isna_entry:
+                        nobs[lab, j] += 1
+                        resx[lab, j] = val
 
+            _check_below_mincount(
+                out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
+            )
 
+
+# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can
+#  use `const numeric_object_t[:, :] values`
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth(
     numeric_object_t[:, ::1] out,
     int64_t[::1] counts,
-    const numeric_object_t[:, :] values,
+    ndarray[numeric_object_t, ndim=2] values,
     const intp_t[::1] labels,
     const uint8_t[:, :] mask,
     uint8_t[:, ::1] result_mask=None,
@@ -1520,7 +1545,9 @@ def group_nth(
         bint uses_mask = mask is not None
         bint isna_entry
 
-    if not len(values) == len(labels):
+    # TODO(cython3):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
     min_count = max(min_count, 1)
@@ -1532,7 +1559,8 @@ def group_nth(
 
     N, K = (<object>values).shape
 
-    with nogil(numeric_object_t is not object):
+    if numeric_object_t is object:
+        # TODO(cython3): De-duplicate once conditional-nogil is available
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -1545,21 +1573,46 @@ def group_nth(
                 if uses_mask:
                     isna_entry = mask[i, j]
                 else:
-                    # TODO: just make _treat_as_na support this?
-                    # remove notimplemented for object dtype there
-                    if numeric_object_t is object:
-                        isna_entry = checknull(val)
-                    else:
-                        isna_entry = _treat_as_na(val, is_datetimelike)
+                    isna_entry = checknull(val)
 
                 if not isna_entry:
+                    # TODO(cython3): use _treat_as_na here once
+                    #  conditional-nogil is available.
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
 
-    _check_below_mincount(
-        out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
-    )
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] < min_count:
+                    out[i, j] = None
+                else:
+                    out[i, j] = resx[i, j]
+
+    else:
+        with nogil:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    if uses_mask:
+                        isna_entry = mask[i, j]
+                    else:
+                        isna_entry = _treat_as_na(val, is_datetimelike)
+
+                    if not isna_entry:
+                        nobs[lab, j] += 1
+                        if nobs[lab, j] == rank:
+                            resx[lab, j] = val
+
+            _check_below_mincount(
+                out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
+            )
 
 
 @cython.boundscheck(False)
@@ -1651,7 +1704,7 @@ def group_rank(
 cdef group_min_max(
     numeric_t[:, ::1] out,
     int64_t[::1] counts,
-    const numeric_t[:, :] values,
+    ndarray[numeric_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
@@ -1699,7 +1752,9 @@ cdef group_min_max(
         bint uses_mask = mask is not None
         bint isna_entry
 
-    if not len(values) == len(labels):
+    # TODO(cython3):
+    # Instead of `labels.shape[0]` use `len(labels)`
+    if not len(values) == labels.shape[0]:
         raise AssertionError("len(index) != len(labels)")
 
     min_count = max(min_count, 1)
@@ -1734,9 +1789,9 @@ cdef group_min_max(
                         if val < group_min_or_max[lab, j]:
                             group_min_or_max[lab, j] = val
 
-    _check_below_mincount(
-        out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
-    )
+        _check_below_mincount(
+            out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
+        )
 
 
 @cython.wraparound(False)
@@ -1744,7 +1799,7 @@ cdef group_min_max(
 def group_max(
     numeric_t[:, ::1] out,
     int64_t[::1] counts,
-    const numeric_t[:, :] values,
+    ndarray[numeric_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
@@ -1770,7 +1825,7 @@ def group_max(
 def group_min(
     numeric_t[:, ::1] out,
     int64_t[::1] counts,
-    const numeric_t[:, :] values,
+    ndarray[numeric_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
     bint is_datetimelike=False,
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 8069637a9bff4..2bc6d74fe6aee 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -20,6 +20,7 @@ class Factorizer:
     def factorize(
         self,
         values: np.ndarray,
+        sort: bool = ...,
         na_sentinel=...,
         na_value=...,
         mask=...,
@@ -156,9 +157,9 @@ class HashTable:
     def __contains__(self, key: Hashable) -> bool: ...
     def sizeof(self, deep: bool = ...) -> int: ...
     def get_state(self) -> dict[str, int]: ...
-    # TODO: `val/key` type is subclass-specific
-    def get_item(self, val): ...  # TODO: return type?
-    def set_item(self, key, val) -> None: ...
+    # TODO: `item` type is subclass-specific
+    def get_item(self, item): ...  # TODO: return type?
+    def set_item(self, item, val) -> None: ...
     def get_na(self): ...  # TODO: return type?
     def set_na(self, val) -> None: ...
     def map_locations(
@@ -184,7 +185,6 @@ class HashTable:
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
         return_inverse: bool = ...,
-        mask=...,
     ) -> (
         tuple[
             np.ndarray,  # np.ndarray[subclass-specific]
@@ -198,7 +198,6 @@ class HashTable:
         na_sentinel: int = ...,
         na_value: object = ...,
         mask=...,
-        ignore_na: bool = True,
     ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...  # np.ndarray[subclass-specific]
 
 class Complex128HashTable(HashTable): ...
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c0723392496c1..1cf5d734705af 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -1239,10 +1239,9 @@ cdef class StringHashTable(HashTable):
                             na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
-    # Add unused mask parameter for compat with other signatures
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None, object mask=None):
+                   object na_value=None):
         # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
@@ -1497,10 +1496,9 @@ cdef class PyObjectHashTable(HashTable):
                             na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
-    # Add unused mask parameter for compat with other signatures
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None, object mask=None):
+                   object na_value=None):
         # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
index 83ea99c13b153..adf4e8c926fa3 100644
--- a/pandas/_libs/internals.pyx
+++ b/pandas/_libs/internals.pyx
@@ -2,10 +2,14 @@ from collections import defaultdict
 import weakref
 
 cimport cython
-from cpython.pyport cimport PY_SSIZE_T_MAX
 from cpython.slice cimport PySlice_GetIndicesEx
 from cython cimport Py_ssize_t
 
+
+cdef extern from "Python.h":
+    # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
+    Py_ssize_t PY_SSIZE_T_MAX
+
 import numpy as np
 
 cimport numpy as cnp
diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
index 44f54bb451283..e07d80dd04b31 100644
--- a/pandas/_libs/interval.pyx
+++ b/pandas/_libs/interval.pyx
@@ -511,6 +511,17 @@ cdef class Interval(IntervalMixin):
             or is_timedelta64_object(y)
         ):
             return Interval(self.left + y, self.right + y, closed=self.closed)
+        elif (
+            # __radd__ pattern
+            # TODO(cython3): remove this
+            isinstance(y, Interval)
+            and (
+                isinstance(self, numbers.Number)
+                or PyDelta_Check(self)
+                or is_timedelta64_object(self)
+            )
+        ):
+            return Interval(y.left + self, y.right + self, closed=y.closed)
         return NotImplemented
 
     def __radd__(self, other):
@@ -534,6 +545,10 @@ cdef class Interval(IntervalMixin):
     def __mul__(self, y):
         if isinstance(y, numbers.Number):
             return Interval(self.left * y, self.right * y, closed=self.closed)
+        elif isinstance(y, Interval) and isinstance(self, numbers.Number):
+            # __rmul__ semantics
+            # TODO(cython3): remove this
+            return Interval(y.left * self, y.right * self, closed=y.closed)
         return NotImplemented
 
     def __rmul__(self, other):
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 7e92032a73325..32641319a6b96 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -45,24 +45,22 @@ def is_scalar(val: object) -> bool: ...
 def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
 def is_pyarrow_array(obj: object) -> bool: ...
 def is_period(val: object) -> TypeGuard[Period]: ...
-def is_interval(obj: object) -> TypeGuard[Interval]: ...
-def is_decimal(obj: object) -> TypeGuard[Decimal]: ...
-def is_complex(obj: object) -> TypeGuard[complex]: ...
-def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ...
-def is_integer(obj: object) -> TypeGuard[int | np.integer]: ...
+def is_interval(val: object) -> TypeGuard[Interval]: ...
+def is_decimal(val: object) -> TypeGuard[Decimal]: ...
+def is_complex(val: object) -> TypeGuard[complex]: ...
+def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ...
+def is_integer(val: object) -> TypeGuard[int | np.integer]: ...
 def is_int_or_none(obj) -> bool: ...
-def is_float(obj: object) -> TypeGuard[float]: ...
+def is_float(val: object) -> TypeGuard[float]: ...
 def is_interval_array(values: np.ndarray) -> bool: ...
-def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ...
-def is_timedelta_or_timedelta64_array(
-    values: np.ndarray, skipna: bool = True
-) -> bool: ...
+def is_datetime64_array(values: np.ndarray) -> bool: ...
+def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
 def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
 def is_time_array(values: np.ndarray, skipna: bool = ...): ...
 def is_date_array(values: np.ndarray, skipna: bool = ...): ...
 def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
 def is_string_array(values: np.ndarray, skipna: bool = ...): ...
-def is_float_array(values: np.ndarray): ...
+def is_float_array(values: np.ndarray, skipna: bool = ...): ...
 def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
 def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
 def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
@@ -183,7 +181,7 @@ def count_level_2d(
     max_bin: int,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
 def get_level_sorter(
-    codes: np.ndarray,  # const int64_t[:]
+    label: np.ndarray,  # const int64_t[:]
     starts: np.ndarray,  # const intp_t[:]
 ) -> np.ndarray: ...  # np.ndarray[np.intp, ndim=1]
 def generate_bins_dt64(
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 38695fbb8222b..55819ebd1f15e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -512,7 +512,8 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def has_infs(const floating[:] arr) -> bool:
+# TODO(cython3): Can add const once cython#1772 is resolved
+def has_infs(floating[:] arr) -> bool:
     cdef:
         Py_ssize_t i, n = len(arr)
         floating inf, neginf, val
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
index 6738a1dff4a9e..515f7aa53ba15 100644
--- a/pandas/_libs/ops.pyi
+++ b/pandas/_libs/ops.pyi
@@ -37,8 +37,8 @@ def vec_binop(
 @overload
 def maybe_convert_bool(
     arr: npt.NDArray[np.object_],
-    true_values: Iterable | None = None,
-    false_values: Iterable | None = None,
+    true_values: Iterable = ...,
+    false_values: Iterable = ...,
     convert_to_masked_nullable: Literal[False] = ...,
 ) -> tuple[np.ndarray, None]: ...
 @overload
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index e447d3b0f5920..6d66e21ce49f5 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -35,7 +35,6 @@ from cpython.unicode cimport (
     PyUnicode_AsUTF8String,
     PyUnicode_Decode,
     PyUnicode_DecodeUTF8,
-    PyUnicode_FromString,
 )
 from cython cimport Py_ssize_t
 from libc.stdlib cimport free
@@ -45,6 +44,12 @@ from libc.string cimport (
     strncpy,
 )
 
+
+cdef extern from "Python.h":
+    # TODO(cython3): get this from cpython.unicode
+    object PyUnicode_FromString(char *v)
+
+
 import numpy as np
 
 cimport numpy as cnp
diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
index 536265b25425e..9e5cecc61e5ca 100644
--- a/pandas/_libs/sparse.pyi
+++ b/pandas/_libs/sparse.pyi
@@ -39,10 +39,6 @@ class BlockIndex(SparseIndex):
         self, length: int, blocs: np.ndarray, blengths: np.ndarray
     ) -> None: ...
 
-    # Override to have correct parameters
-    def intersect(self, other: SparseIndex) -> Self: ...
-    def make_union(self, y: SparseIndex) -> Self: ...
-
 def make_mask_object_ndarray(
     arr: npt.NDArray[np.object_], fill_value
 ) -> npt.NDArray[np.bool_]: ...
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
index 6426e32c52304..d564d767f7f05 100644
--- a/pandas/_libs/tslibs/conversion.pyi
+++ b/pandas/_libs/tslibs/conversion.pyi
@@ -10,6 +10,5 @@ TD64NS_DTYPE: np.dtype
 
 def precision_from_unit(
     unit: str,
-    out_reso: int = ...,
 ) -> tuple[int, int]: ...  # (int64_t, _)
 def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index 2a2a0f347ce12..45c4d7809fe7a 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -5,7 +5,6 @@ from libc.math cimport log10
 from numpy cimport (
     int32_t,
     int64_t,
-    npy_datetime,
 )
 
 cnp.import_array()
@@ -44,6 +43,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     get_datetime64_value,
     get_implementation_bounds,
     import_pandas_datetime,
+    npy_datetime,
     npy_datetimestruct,
     npy_datetimestruct_to_datetime,
     pandas_datetime_to_datetimestruct,
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
index b0293d2e0fcf2..bea3e18273318 100644
--- a/pandas/_libs/tslibs/dtypes.pyi
+++ b/pandas/_libs/tslibs/dtypes.pyi
@@ -5,10 +5,10 @@ from enum import Enum
 _attrname_to_abbrevs: dict[str, str]
 _period_code_map: dict[str, int]
 
-def periods_per_day(reso: int = ...) -> int: ...
+def periods_per_day(reso: int) -> int: ...
 def periods_per_second(reso: int) -> int: ...
 def is_supported_unit(reso: int) -> bool: ...
-def npy_unit_to_abbrev(unit: int) -> str: ...
+def npy_unit_to_abbrev(reso: int) -> str: ...
 def get_supported_reso(reso: int) -> int: ...
 def abbrev_to_npy_unit(abbrev: str) -> int: ...
 
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 04a6858297aee..7d75fa3114d2b 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -128,6 +128,11 @@ cdef class _NaT(datetime):
         return NotImplemented
 
     def __add__(self, other):
+        if self is not c_NaT:
+            # TODO(cython3): remove this; it moved to __radd__
+            # cython __radd__ semantics
+            self, other = other, self
+
         if PyDateTime_Check(other):
             return c_NaT
         elif PyDelta_Check(other):
@@ -157,6 +162,15 @@ cdef class _NaT(datetime):
     def __sub__(self, other):
         # Duplicate some logic from _Timestamp.__sub__ to avoid needing
         # to subclass; allows us to @final(_Timestamp.__sub__)
+        cdef:
+            bint is_rsub = False
+
+        if self is not c_NaT:
+            # cython __rsub__ semantics
+            # TODO(cython3): remove __rsub__ logic from here
+            self, other = other, self
+            is_rsub = True
+
         if PyDateTime_Check(other):
             return c_NaT
         elif PyDelta_Check(other):
@@ -170,9 +184,19 @@ cdef class _NaT(datetime):
 
         elif util.is_array(other):
             if other.dtype.kind == "m":
-                # NaT - timedelta64 we treat NaT as datetime64, so result
-                #  is datetime64
-                result = np.empty(other.shape, dtype="datetime64[ns]")
+                if not is_rsub:
+                    # NaT - timedelta64 we treat NaT as datetime64, so result
+                    #  is datetime64
+                    result = np.empty(other.shape, dtype="datetime64[ns]")
+                    result.fill("NaT")
+                    return result
+
+                # __rsub__ logic here
+                # TODO(cython3): remove this, move above code out of
+                # ``if not is_rsub`` block
+                # timedelta64 - NaT we have to treat NaT as timedelta64
+                # for this to be meaningful, and the result is timedelta64
+                result = np.empty(other.shape, dtype="timedelta64[ns]")
                 result.fill("NaT")
                 return result
 
diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd
index bf29184d7a94b..60532174e8bdc 100644
--- a/pandas/_libs/tslibs/np_datetime.pxd
+++ b/pandas/_libs/tslibs/np_datetime.pxd
@@ -6,22 +6,35 @@ from cpython.datetime cimport (
 from numpy cimport (
     int32_t,
     int64_t,
-    npy_datetime,
-    npy_timedelta,
 )
 
 
+# TODO(cython3): most of these can be cimported directly from numpy
+cdef extern from "numpy/ndarrayobject.h":
+    ctypedef int64_t npy_timedelta
+    ctypedef int64_t npy_datetime
+
 cdef extern from "numpy/ndarraytypes.h":
     ctypedef struct PyArray_DatetimeMetaData:
         NPY_DATETIMEUNIT base
         int64_t num
 
+cdef extern from "numpy/arrayscalars.h":
+    ctypedef struct PyDatetimeScalarObject:
+        # PyObject_HEAD
+        npy_datetime obval
+        PyArray_DatetimeMetaData obmeta
+
+    ctypedef struct PyTimedeltaScalarObject:
+        # PyObject_HEAD
+        npy_timedelta obval
+        PyArray_DatetimeMetaData obmeta
+
 cdef extern from "numpy/ndarraytypes.h":
     ctypedef struct npy_datetimestruct:
         int64_t year
         int32_t month, day, hour, min, sec, us, ps, as
-    # TODO: Can remove this once NPY_FR_GENERIC is added to
-    # the Cython __init__.pxd for numpy
+
     ctypedef enum NPY_DATETIMEUNIT:
         NPY_FR_Y
         NPY_FR_M
diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi
index c42bc43ac9d89..0cb0e3b0237d7 100644
--- a/pandas/_libs/tslibs/np_datetime.pyi
+++ b/pandas/_libs/tslibs/np_datetime.pyi
@@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ...
 def py_get_unit_from_dtype(dtype: np.dtype): ...
 def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
 def astype_overflowsafe(
-    values: np.ndarray,
+    arr: np.ndarray,
     dtype: np.dtype,
     copy: bool = ...,
     round_ok: bool = ...,
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
index 8873695c23381..7b2ee68c73ad2 100644
--- a/pandas/_libs/tslibs/np_datetime.pyx
+++ b/pandas/_libs/tslibs/np_datetime.pyx
@@ -28,11 +28,8 @@ cimport numpy as cnp
 
 cnp.import_array()
 from numpy cimport (
-    PyDatetimeScalarObject,
-    PyTimedeltaScalarObject,
     int64_t,
     ndarray,
-    npy_datetime,
     uint8_t,
 )
 
diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
index 1d37477573023..1a4742111db89 100644
--- a/pandas/_libs/tslibs/offsets.pyi
+++ b/pandas/_libs/tslibs/offsets.pyi
@@ -277,10 +277,7 @@ def roll_qtrday(
 INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"]
 
 def shift_months(
-    dtindex: npt.NDArray[np.int64],
-    months: int,
-    day_opt: str | None = ...,
-    reso: int = ...,
+    dtindex: npt.NDArray[np.int64], months: int, day_opt: str | None = ...
 ) -> npt.NDArray[np.int64]: ...
 
 _offset_map: dict[str, BaseOffset]
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index f330a0cea1917..958fe1181d309 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -479,7 +479,12 @@ cdef class BaseOffset:
         return type(self)(n=1, normalize=self.normalize, **self.kwds)
 
     def __add__(self, other):
-        if util.is_array(other) and other.dtype == object:
+        if not isinstance(self, BaseOffset):
+            # cython semantics; this is __radd__
+            # TODO(cython3): remove this, this moved to __radd__
+            return other.__add__(self)
+
+        elif util.is_array(other) and other.dtype == object:
             return np.array([self + x for x in other])
 
         try:
@@ -496,6 +501,10 @@ cdef class BaseOffset:
         elif type(other) is type(self):
             return type(self)(self.n - other.n, normalize=self.normalize,
                               **self.kwds)
+        elif not isinstance(self, BaseOffset):
+            # TODO(cython3): remove, this moved to __rsub__
+            # cython semantics, this is __rsub__
+            return (-other).__add__(self)
         else:
             # e.g. PeriodIndex
             return NotImplemented
@@ -509,6 +518,10 @@ cdef class BaseOffset:
         elif is_integer_object(other):
             return type(self)(n=other * self.n, normalize=self.normalize,
                               **self.kwds)
+        elif not isinstance(self, BaseOffset):
+            # TODO(cython3): remove this, this moved to __rmul__
+            # cython semantics, this is __rmul__
+            return other.__mul__(self)
         return NotImplemented
 
     def __rmul__(self, other):
@@ -997,6 +1010,10 @@ cdef class Tick(SingleConstructorOffset):
         return self.delta.__gt__(other)
 
     def __mul__(self, other):
+        if not isinstance(self, Tick):
+            # TODO(cython3): remove this, this moved to __rmul__
+            # cython semantics, this is __rmul__
+            return other.__mul__(self)
         if is_float_object(other):
             n = other * self.n
             # If the new `n` is an integer, we can represent it using the
@@ -1024,6 +1041,11 @@ cdef class Tick(SingleConstructorOffset):
         return _wrap_timedelta_result(result)
 
     def __add__(self, other):
+        if not isinstance(self, Tick):
+            # cython semantics; this is __radd__
+            # TODO(cython3): remove this, this moved to __radd__
+            return other.__add__(self)
+
         if isinstance(other, Tick):
             if type(self) is type(other):
                 return type(self)(self.n + other.n)
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 5e3ed8d99c659..3643c840a50a6 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -774,7 +774,8 @@ def try_parse_year_month_day(
         object[::1] result
 
     n = len(years)
-    if len(months) != n or len(days) != n:
+    # TODO(cython3): Use len instead of `shape[0]`
+    if months.shape[0] != n or days.shape[0] != n:
         raise ValueError("Length of years/months/days must all be equal")
     result = np.empty(n, dtype="O")
 
diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
index b3aa6c34e323f..8826757e31c32 100644
--- a/pandas/_libs/tslibs/period.pyi
+++ b/pandas/_libs/tslibs/period.pyi
@@ -89,7 +89,7 @@ class Period(PeriodMixin):
     @classmethod
     def _from_ordinal(cls, ordinal: int, freq) -> Period: ...
     @classmethod
-    def now(cls, freq: BaseOffset) -> Period: ...
+    def now(cls, freq: BaseOffset = ...) -> Period: ...
     def strftime(self, fmt: str) -> str: ...
     def to_timestamp(
         self,
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index eadb23e0a94ca..c37e9cd7ef1f3 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1838,6 +1838,10 @@ cdef class _Period(PeriodMixin):
 
     def __add__(self, other):
         if not is_period_object(self):
+            # cython semantics; this is analogous to a call to __radd__
+            # TODO(cython3): remove this
+            if self is NaT:
+                return NaT
             return other.__add__(self)
 
         if is_any_td_scalar(other):
@@ -1872,6 +1876,10 @@ cdef class _Period(PeriodMixin):
 
     def __sub__(self, other):
         if not is_period_object(self):
+            # cython semantics; this is like a call to __rsub__
+            # TODO(cython3): remove this
+            if self is NaT:
+                return NaT
             return NotImplemented
 
         elif (
@@ -2503,7 +2511,7 @@ cdef class _Period(PeriodMixin):
         object_state = None, self.freq, self.ordinal
         return (Period, object_state)
 
-    def strftime(self, fmt: str | None) -> str:
+    def strftime(self, fmt: str) -> str:
         r"""
         Returns a formatted string representation of the :class:`Period`.
 
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index da88ad32d625b..aba9b25b23154 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -68,7 +68,7 @@ UnitChoices: TypeAlias = Literal[
 _S = TypeVar("_S", bound=timedelta)
 
 def ints_to_pytimedelta(
-    m8values: npt.NDArray[np.timedelta64],
+    arr: npt.NDArray[np.timedelta64],
     box: bool = ...,
 ) -> npt.NDArray[np.object_]: ...
 def array_to_timedelta64(
@@ -162,10 +162,8 @@ class Timedelta(timedelta):
     def __gt__(self, other: timedelta) -> bool: ...
     def __hash__(self) -> int: ...
     def isoformat(self) -> str: ...
-    def to_numpy(
-        self, dtype: npt.DTypeLike = ..., copy: bool = False
-    ) -> np.timedelta64: ...
-    def view(self, dtype: npt.DTypeLike) -> object: ...
+    def to_numpy(self) -> np.timedelta64: ...
+    def view(self, dtype: npt.DTypeLike = ...) -> object: ...
     @property
     def unit(self) -> str: ...
     def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ...
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index d2b57f447c350..ffa9a67542e21 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1043,9 +1043,8 @@ cdef class _Timedelta(timedelta):
         """
         return npy_unit_to_abbrev(self._creso)
 
-    # TODO: make cdef property once this works in Cython
     @property
-    def days(self) -> int:
+    def days(self) -> int:  # TODO(cython3): make cdef property
         """
         Returns the days of the timedelta.
 
@@ -1068,9 +1067,8 @@ cdef class _Timedelta(timedelta):
         self._ensure_components()
         return self._d
 
-    # TODO: make cdef property once this works in Cython
     @property
-    def seconds(self) -> int:
+    def seconds(self) -> int:  # TODO(cython3): make cdef property
         """
         Return the total hours, minutes, and seconds of the timedelta as seconds.
 
@@ -1107,9 +1105,8 @@ cdef class _Timedelta(timedelta):
         self._ensure_components()
         return self._h * 3600 + self._m * 60 + self._s
 
-    # TODO: make cdef property once this works in Cython
     @property
-    def microseconds(self) -> int:
+    def microseconds(self) -> int:  # TODO(cython3): make cdef property
         # NB: using the python C-API PyDateTime_DELTA_GET_MICROSECONDS will fail
         #  (or be incorrect)
         self._ensure_components()
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
index 24c0a07eb7985..36ae2d6d892f1 100644
--- a/pandas/_libs/tslibs/timestamps.pyi
+++ b/pandas/_libs/tslibs/timestamps.pyi
@@ -180,7 +180,7 @@ class Timestamp(datetime):
     def is_year_end(self) -> bool: ...
     def to_pydatetime(self, warn: bool = ...) -> datetime: ...
     def to_datetime64(self) -> np.datetime64: ...
-    def to_period(self, freq: BaseOffset | str | None = None) -> Period: ...
+    def to_period(self, freq: BaseOffset | str = ...) -> Period: ...
     def to_julian_date(self) -> np.float64: ...
     @property
     def asm8(self) -> np.datetime64: ...
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 536a8372c64a8..844fc8f0ed187 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -476,6 +476,11 @@ cdef class _Timestamp(ABCTimestamp):
                     dtype=object,
                 )
 
+        elif not isinstance(self, _Timestamp):
+            # cython semantics, args have been switched and this is __radd__
+            # TODO(cython3): remove this; it moved to __radd__
+            return other.__add__(self)
+
         return NotImplemented
 
     def __radd__(self, other):
@@ -509,10 +514,13 @@ cdef class _Timestamp(ABCTimestamp):
                 and (PyDateTime_Check(other) or is_datetime64_object(other))):
             # both_timestamps is to determine whether Timedelta(self - other)
             # should raise the OOB error, or fall back returning a timedelta.
+            # TODO(cython3): clean out the bits that moved to __rsub__
             both_timestamps = (isinstance(other, _Timestamp) and
                                isinstance(self, _Timestamp))
             if isinstance(self, _Timestamp):
                 other = type(self)(other)
+            else:
+                self = type(other)(self)
 
             if (self.tzinfo is None) ^ (other.tzinfo is None):
                 raise TypeError(
@@ -542,6 +550,11 @@ cdef class _Timestamp(ABCTimestamp):
                 # We get here in stata tests, fall back to stdlib datetime
                 #  method and return stdlib timedelta object
                 pass
+        elif is_datetime64_object(self):
+            # GH#28286 cython semantics for __rsub__, `other` is actually
+            #  the Timestamp
+            # TODO(cython3): remove this, this moved to __rsub__
+            return type(other)(self) - other
 
         return NotImplemented
 
diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi
index 2108fa0f35547..a354765a348ec 100644
--- a/pandas/_libs/tslibs/tzconversion.pyi
+++ b/pandas/_libs/tslibs/tzconversion.pyi
@@ -10,7 +10,7 @@ from pandas._typing import npt
 
 # tz_convert_from_utc_single exposed for testing
 def tz_convert_from_utc_single(
-    utc_val: np.int64, tz: tzinfo, creso: int = ...
+    val: np.int64, tz: tzinfo, creso: int = ...
 ) -> np.int64: ...
 def tz_localize_to_utc(
     vals: npt.NDArray[np.int64],
diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi
index de19f592da62b..3fd9e2501e611 100644
--- a/pandas/_libs/tslibs/vectorized.pyi
+++ b/pandas/_libs/tslibs/vectorized.pyi
@@ -31,7 +31,7 @@ def get_resolution(
     reso: int = ...,  # NPY_DATETIMEUNIT
 ) -> Resolution: ...
 def ints_to_pydatetime(
-    stamps: npt.NDArray[np.int64],
+    arr: npt.NDArray[np.int64],
     tz: tzinfo | None = ...,
     box: str = ...,
     reso: int = ...,  # NPY_DATETIMEUNIT
diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
index a6cfbec9b15b9..b926a7cb73425 100644
--- a/pandas/_libs/window/aggregations.pyi
+++ b/pandas/_libs/window/aggregations.pyi
@@ -111,8 +111,8 @@ def ewm(
     com: float,  # float64_t
     adjust: bool,
     ignore_na: bool,
-    deltas: np.ndarray | None = None,  # const float64_t[:]
-    normalize: bool = True,
+    deltas: np.ndarray,  # const float64_t[:]
+    normalize: bool,
 ) -> np.ndarray: ...  # np.ndarray[np.float64]
 def ewmcov(
     input_x: np.ndarray,  # const float64_t[:]
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 2b43b090a43e0..c3b8d1c0e79e8 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -2260,7 +2260,8 @@ def _concat_same_type(
         return new_obj
 
     def copy(self, order: str = "C") -> Self:
-        new_obj = super().copy(order=order)
+        # error: Unexpected keyword argument "order" for "copy"
+        new_obj = super().copy(order=order)  # type: ignore[call-arg]
         new_obj._freq = self.freq
         return new_obj
 
diff --git a/pyproject.toml b/pyproject.toml
index ae658329f42ee..1034196baa15e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "meson-python==0.13.1",
     "meson==1.0.1",
     "wheel",
-    "Cython>=3.0.0",  # Note: sync with setup.py, environment.yml and asv.conf.json
+    "Cython>=0.29.33,<3",  # Note: sync with setup.py, environment.yml and asv.conf.json
     # Note: numpy 1.25 has a backwards compatible C API by default
     # we don't want to force users to compile with 1.25 though
     # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4af4351413a5b..0944acbc36c9b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,7 +3,7 @@
 
 pip
 versioneer[toml]
-cython==3.0.0
+cython==0.29.33
 meson[ninja]==1.0.1
 meson-python==0.13.1
 pytest>=7.3.2
diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py
index 35cbbef08124e..dedcdb5532593 100644
--- a/scripts/run_stubtest.py
+++ b/scripts/run_stubtest.py
@@ -47,8 +47,6 @@
     # stubtest might be too sensitive
     "pandas._libs.lib.NoDefault",
     "pandas._libs.lib._NoDefault.no_default",
-    # stubtest/Cython is not recognizing the default value for the dtype parameter
-    "pandas._libs.lib.map_infer_mask",
     # internal type alias (should probably be private)
     "pandas._libs.lib.ndarray_obj_2d",
     # runtime argument "owner" has a default value but stub argument does not
diff --git a/setup.py b/setup.py
index 1ea7a502505b5..663bbd3952eab 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ def is_platform_mac():
 
 
 # note: sync with pyproject.toml, environment.yml and asv.conf.json
-min_cython_ver = "3.0.0"
+min_cython_ver = "0.29.33"
 
 try:
     from Cython import (