Merge branch 'main' into reset_index
lithomas1 authored Dec 7, 2023
2 parents fb8552c + d36fb98 commit cf59f20
Showing 398 changed files with 7,466 additions and 5,217 deletions.
2 changes: 1 addition & 1 deletion .circleci/setup_env.sh
@@ -55,6 +55,6 @@ if pip show pandas 1>/dev/null; then
fi

echo "Install pandas"
-python -m pip install --no-build-isolation -ve .
+python -m pip install --no-build-isolation -ve . --config-settings=setup-args="--werror"

echo "done"
6 changes: 4 additions & 2 deletions .github/actions/build_pandas/action.yml
@@ -25,8 +25,10 @@ runs:
- name: Build Pandas
run: |
if [[ ${{ inputs.editable }} == "true" ]]; then
-pip install -e . --no-build-isolation -v --no-deps
+pip install -e . --no-build-isolation -v --no-deps \
+  --config-settings=setup-args="--werror"
else
-pip install . --no-build-isolation -v --no-deps
+pip install . --no-build-isolation -v --no-deps \
+  --config-settings=setup-args="--werror"
fi
shell: bash -el {0}
31 changes: 22 additions & 9 deletions .github/workflows/unit-tests.yml
@@ -26,7 +26,7 @@ jobs:
timeout-minutes: 90
strategy:
matrix:
-env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
include:
@@ -69,10 +69,22 @@ jobs:
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Copy-on-Write 3.12"
env_file: actions-312.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Copy-on-Write 3.11 (warnings)"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "warn"
- name: "Copy-on-Write 3.10 (warnings)"
env_file: actions-310.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "warn"
- name: "Copy-on-Write 3.9 (warnings)"
env_file: actions-39.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "warn"
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
@@ -88,14 +100,15 @@ jobs:
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
env:
PATTERN: ${{ matrix.pattern }}
-EXTRA_APT: ${{ matrix.extra_apt || '' }}
LANG: ${{ matrix.lang || 'C.UTF-8' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: 'auto'
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
+# Clipboard tests
+QT_QPA_PLATFORM: offscreen
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
@@ -145,8 +158,8 @@ jobs:
fetch-depth: 0

- name: Extra installs
-# xsel for clipboard tests
-run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
+run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }}
+if: ${{ matrix.extra_apt }}

- name: Generate extra locales
# These extra locales will be available for locale.setlocale() calls in tests
@@ -181,7 +194,7 @@ jobs:
strategy:
matrix:
os: [macos-latest, windows-latest]
-env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
fail-fast: false
runs-on: ${{ matrix.os }}
name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -241,7 +254,7 @@ jobs:
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-python -m pip install --no-cache-dir --no-build-isolation -e .
+python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
export PANDAS_CI=1
python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -279,7 +292,7 @@ jobs:
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-python -m pip install --no-cache-dir --no-build-isolation -e .
+python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
python -m pip list --no-cache-dir
- name: Run Tests
@@ -312,7 +325,7 @@ jobs:
# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
# to the corresponding posix/windows-macos/sdist etc. workflows.
# Feel free to modify this comment as necessary.
-#if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+if: false # Uncomment this to freeze the workflow, comment it to unfreeze
defaults:
run:
shell: bash -eou pipefail {0}
@@ -352,7 +365,7 @@ jobs:
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
python -m pip install versioneer[toml]
python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov
-python -m pip install -ve . --no-build-isolation --no-index --no-deps
+python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
python -m pip list
- name: Run Tests
6 changes: 4 additions & 2 deletions .gitignore
@@ -39,6 +39,7 @@
.mesonpy-native-file.ini
MANIFEST
compile_commands.json
debug
.debug

# Python files #
@@ -104,10 +105,11 @@ scikits
# Generated Sources #
#####################
!skts.c
-!np_datetime.c
-!np_datetime_strings.c
*.c
*.cpp
+!pandas/_libs/src/**/*.c
+!pandas/_libs/src/**/*.h
+!pandas/_libs/include/**/*.h

# Unit / Performance Testing #
##############################
13 changes: 7 additions & 6 deletions .pre-commit-config.yaml
@@ -20,11 +20,11 @@ ci:
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
-rev: 23.10.1
+rev: 23.11.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.1.4
+rev: v0.1.6
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -47,7 +47,7 @@ repos:
types_or: [python, rst, markdown, cython, c]
additional_dependencies: [tomli]
- repo: https://github.com/MarcoGorelli/cython-lint
-rev: v0.15.0
+rev: v0.16.0
hooks:
- id: cython-lint
- id: double-quote-cython-strings
@@ -111,11 +111,11 @@ repos:
types: [text] # overwrite types: [rst]
types_or: [python, rst]
- repo: https://github.com/sphinx-contrib/sphinx-lint
-rev: v0.8.1
+rev: v0.9.0
hooks:
- id: sphinx-lint
- repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v17.0.4
+rev: v17.0.6
hooks:
- id: clang-format
files: ^pandas/_libs/src|^pandas/_libs/include
@@ -240,8 +240,9 @@ repos:
# pytest raises without context
|\s\ pytest.raises
+# TODO
# pytest.warns (use tm.assert_produces_warning instead)
-|pytest\.warns
+# |pytest\.warns
# os.remove
|os\.remove
15 changes: 9 additions & 6 deletions asv_bench/benchmarks/algorithms.py
@@ -4,8 +4,6 @@

import pandas as pd

-from .pandas_vb_common import tm
-
for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
@@ -47,9 +45,12 @@ def setup(self, unique, sort, dtype):
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
-data = tm.makeStringIndex(N)
+data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "string[pyarrow]":
-data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
+data = pd.array(
+    pd.Index([f"i-{i}" for i in range(N)], dtype=object),
+    dtype="string[pyarrow]",
+)
else:
raise NotImplementedError

@@ -88,7 +89,7 @@ def setup(self, unique, keep, dtype):
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype="float64")
elif dtype == "string":
-data = tm.makeStringIndex(N)
+data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
@@ -136,7 +137,9 @@ def setup_cache(self):
df = pd.DataFrame(
{
"strings": pd.Series(
-tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
+pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take(
+    np.random.randint(0, 10000, size=N)
+)
),
"floats": np.random.randn(N),
"ints": np.arange(N),
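Across these benchmark files, the removed tm.makeStringIndex helper is replaced by an explicitly constructed object-dtype string Index. A minimal illustrative sketch of the new pattern, not part of the diff itself (sizes reduced; the Arrow-backed variant assumes pyarrow is installed):

import numpy as np
import pandas as pd

N = 1_000  # illustrative; the benchmarks use 10**5 and larger

# Object-dtype Index of synthetic strings, as built in the updated setup methods
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)

# The same values backed by Arrow strings (requires pyarrow)
arrow_data = pd.array(data, dtype="string[pyarrow]")

# The setup_cache hunk above samples with replacement to produce duplicated strings
strings = pd.Series(data.take(np.random.randint(0, N, size=N)))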
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/algos/isin.py
@@ -8,8 +8,6 @@
date_range,
)

-from ..pandas_vb_common import tm
-

class IsIn:
params = [
@@ -60,7 +58,9 @@ def setup(self, dtype):

elif dtype in ["str", "string[python]", "string[pyarrow]"]:
try:
-self.series = Series(tm.makeStringIndex(N), dtype=dtype)
+self.series = Series(
+    Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype
+)
except ImportError:
raise NotImplementedError
self.values = list(self.series[:2])
8 changes: 5 additions & 3 deletions asv_bench/benchmarks/arithmetic.py
@@ -6,12 +6,12 @@
import pandas as pd
from pandas import (
DataFrame,
+Index,
Series,
Timestamp,
date_range,
to_timedelta,
)
-import pandas._testing as tm
from pandas.core.algorithms import checked_add_with_arr

from .pandas_vb_common import numeric_dtypes
@@ -323,8 +323,10 @@ class IndexArithmetic:

def setup(self, dtype):
N = 10**6
indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
self.index = getattr(tm, indexes[dtype])(N)
if dtype == "float":
self.index = Index(np.arange(N), dtype=np.float64)
elif dtype == "int":
self.index = Index(np.arange(N), dtype=np.int64)

def time_add(self, dtype):
self.index + 2
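The IndexArithmetic.setup change above drops the string-keyed getattr(tm, ...) lookup in favour of explicit Index constructions. A small sketch of the resulting setup, not part of the diff (size reduced for illustration):

import numpy as np
from pandas import Index

N = 10_000  # the benchmark uses 10**6

# Explicit constructions replacing tm.makeIntIndex / tm.makeFloatIndex
int_index = Index(np.arange(N), dtype=np.int64)
float_index = Index(np.arange(N), dtype=np.float64)

# time_add then times simple whole-index arithmetic such as
_ = int_index + 2
_ = float_index + 2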
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/categoricals.py
@@ -6,8 +6,6 @@

import pandas as pd

-from .pandas_vb_common import tm
-
try:
from pandas.api.types import union_categoricals
except ImportError:
@@ -189,7 +187,7 @@ def setup(self):
N = 10**5
ncats = 15

-self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
+self.s_str = pd.Series(np.random.randint(0, ncats, size=N).astype(str))
self.s_str_cat = pd.Series(self.s_str, dtype="category")
with warnings.catch_warnings(record=True):
str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
@@ -242,7 +240,7 @@ def time_categorical_series_is_monotonic_decreasing(self):
class Contains:
def setup(self):
N = 10**5
-self.ci = tm.makeCategoricalIndex(N)
+self.ci = pd.CategoricalIndex(np.arange(N))
self.c = self.ci.values
self.key = self.ci.categories[0]

@@ -325,7 +323,7 @@ def time_sort_values(self):
class SearchSorted:
def setup(self):
N = 10**5
-self.ci = tm.makeCategoricalIndex(N).sort_values()
+self.ci = pd.CategoricalIndex(np.arange(N)).sort_values()
self.c = self.ci.values
self.key = self.ci.categories[1]

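Likewise, tm.makeCategoricalIndex(N) in the Contains and SearchSorted setups gives way to a plain integer-backed CategoricalIndex. A minimal sketch, not part of the diff (illustrative size):

import numpy as np
import pandas as pd

N = 1_000  # the benchmarks use 10**5

# Replaces tm.makeCategoricalIndex(N); the categories are the integers 0..N-1
ci = pd.CategoricalIndex(np.arange(N))
c = ci.values            # the underlying Categorical
key = ci.categories[0]   # a known category, e.g. for "key in ci" membership checks

# The SearchSorted setup additionally sorts the index before timing
ci_sorted = pd.CategoricalIndex(np.arange(N)).sort_values()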
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/ctors.py
@@ -9,8 +9,6 @@
date_range,
)

-from .pandas_vb_common import tm
-

def no_change(arr):
return arr
@@ -115,7 +113,7 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
def setup(self):
N = 10**4
-self.iterables = [tm.makeStringIndex(N), range(20)]
+self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)]

def time_multiindex_from_iterables(self):
MultiIndex.from_product(self.iterables)
9 changes: 6 additions & 3 deletions asv_bench/benchmarks/dtypes.py
@@ -3,7 +3,10 @@
import numpy as np

import pandas as pd
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Index,
+)
import pandas._testing as tm
from pandas.api.types import (
is_extension_array_dtype,
@@ -73,8 +76,8 @@ class SelectDtypes:

def setup(self, dtype):
N, K = 5000, 50
-self.index = tm.makeStringIndex(N)
-self.columns = tm.makeStringIndex(K)
+self.index = Index([f"i-{i}" for i in range(N)], dtype=object)
+self.columns = Index([f"i-{i}" for i in range(K)], dtype=object)

def create_df(data):
return DataFrame(data, index=self.index, columns=self.columns)
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/frame_ctor.py
@@ -12,8 +12,6 @@
date_range,
)

-from .pandas_vb_common import tm
-
try:
from pandas.tseries.offsets import (
Hour,
@@ -30,8 +28,8 @@
class FromDicts:
def setup(self):
N, K = 5000, 50
-self.index = tm.makeStringIndex(N)
-self.columns = tm.makeStringIndex(K)
+self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
+self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object)
frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
self.dict_list = frame.to_dict(orient="records")
(diffs for the remaining changed files are not shown)

