Merge remote-tracking branch 'upstream/main' into matplotlib

pandas-dev · Nov 14, 2023 · 0a79c0b · 0a79c0b
2 parents 00f30cd + 00d88e9
commit 0a79c0b
Show file tree

Hide file tree

Showing 129 changed files with 1,517 additions and 765 deletions.
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -1,7 +1,6 @@
 from importlib import import_module
 
 import numpy as np
-import pyarrow as pa
 
 import pandas as pd
 
@@ -20,9 +19,9 @@ class Factorize:
         [True, False],
         [True, False],
         [
-            "int",
-            "uint",
-            "float",
+            "int64",
+            "uint64",
+            "float64",
             "object",
             "object_str",
             "datetime64[ns]",
@@ -36,28 +35,24 @@ class Factorize:
 
     def setup(self, unique, sort, dtype):
         N = 10**5
-        string_index = tm.makeStringIndex(N)
-        string_arrow = None
-        if dtype == "string[pyarrow]":
-            try:
-                string_arrow = pd.array(string_index, dtype="string[pyarrow]")
-            except ImportError:
-                raise NotImplementedError
-
-        data = {
-            "int": pd.Index(np.arange(N), dtype="int64"),
-            "uint": pd.Index(np.arange(N), dtype="uint64"),
-            "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "object_str": string_index,
-            "object": pd.Index(np.arange(N), dtype="object"),
-            "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
-            "datetime64[ns, tz]": pd.date_range(
-                "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
-            ),
-            "Int64": pd.array(np.arange(N), dtype="Int64"),
-            "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
-            "string[pyarrow]": string_arrow,
-        }[dtype]
+
+        if dtype in ["int64", "uint64", "Int64", "object"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        elif dtype == "float64":
+            data = pd.Index(np.random.randn(N), dtype=dtype)
+        elif dtype == "boolean":
+            data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
+        elif dtype == "datetime64[ns]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N)
+        elif dtype == "datetime64[ns, tz]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+        elif dtype == "object_str":
+            data = tm.makeStringIndex(N)
+        elif dtype == "string[pyarrow]":
+            data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
+        else:
+            raise NotImplementedError
+
         if not unique:
             data = data.repeat(5)
         self.data = data
@@ -74,9 +69,9 @@ class Duplicated:
         [True, False],
         ["first", "last", False],
         [
-            "int",
-            "uint",
-            "float",
+            "int64",
+            "uint64",
+            "float64",
             "string",
             "datetime64[ns]",
             "datetime64[ns, tz]",
@@ -88,22 +83,20 @@ class Duplicated:
 
     def setup(self, unique, keep, dtype):
         N = 10**5
-        data = {
-            "int": pd.Index(np.arange(N), dtype="int64"),
-            "uint": pd.Index(np.arange(N), dtype="uint64"),
-            "float": pd.Index(np.random.randn(N), dtype="float64"),
-            "string": tm.makeStringIndex(N),
-            "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
-            "datetime64[ns, tz]": pd.date_range(
-                "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
-            ),
-            "timestamp[ms][pyarrow]": pd.Index(
-                np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
-            ),
-            "duration[s][pyarrow]": pd.Index(
-                np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
-            ),
-        }[dtype]
+        if dtype in ["int64", "uint64"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        elif dtype == "float64":
+            data = pd.Index(np.random.randn(N), dtype="float64")
+        elif dtype == "string":
+            data = tm.makeStringIndex(N)
+        elif dtype == "datetime64[ns]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N)
+        elif dtype == "datetime64[ns, tz]":
+            data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
+        elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
+            data = pd.Index(np.arange(N), dtype=dtype)
+        else:
+            raise NotImplementedError
         if not unique:
             data = data.repeat(5)
         self.idx = data
@@ -181,21 +174,22 @@ class Quantile:
     params = [
         [0, 0.5, 1],
         ["linear", "nearest", "lower", "higher", "midpoint"],
-        ["float", "int", "uint"],
+        ["float64", "int64", "uint64"],
     ]
     param_names = ["quantile", "interpolation", "dtype"]
 
     def setup(self, quantile, interpolation, dtype):
         N = 10**5
-        data = {
-            "int": np.arange(N),
-            "uint": np.arange(N).astype(np.uint64),
-            "float": np.random.randn(N),
-        }
-        self.idx = pd.Series(data[dtype].repeat(5))
+        if dtype in ["int64", "uint64"]:
+            data = np.arange(N, dtype=dtype)
+        elif dtype == "float64":
+            data = np.random.randn(N)
+        else:
+            raise NotImplementedError
+        self.ser = pd.Series(data.repeat(5))
 
     def time_quantile(self, quantile, interpolation, dtype):
-        self.idx.quantile(quantile, interpolation=interpolation)
+        self.ser.quantile(quantile, interpolation=interpolation)
 
 
 class SortIntegerArray:

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -491,7 +491,7 @@ class BinaryOpsMultiIndex:
     param_names = ["func"]
 
     def setup(self, func):
-        array = date_range("20200101 00:00", "20200102 0:00", freq="S")
+        array = date_range("20200101 00:00", "20200102 0:00", freq="s")
         level_0_names = [str(i) for i in range(30)]
 
         index = pd.MultiIndex.from_product([level_0_names, array])

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -31,9 +31,9 @@ def time_from_float_array(self):
 class IntegerArray:
     def setup(self):
         N = 250_000
-        self.values_integer = np.array([1, 0, 1, 0] * N)
-        self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
-        self.mask = np.array([False, False, True, False] * N)
+        self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
+        self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
+        self.mask = np.tile(np.array([False, False, True, False]), N)
 
     def time_constructor(self):
         pd.arrays.IntegerArray(self.data, self.mask)

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -260,18 +260,16 @@ class CategoricalSlicing:
     def setup(self, index):
         N = 10**6
         categories = ["a", "b", "c"]
-        values = [0] * N + [1] * N + [2] * N
         if index == "monotonic_incr":
-            self.data = pd.Categorical.from_codes(values, categories=categories)
+            codes = np.repeat([0, 1, 2], N)
         elif index == "monotonic_decr":
-            self.data = pd.Categorical.from_codes(
-                list(reversed(values)), categories=categories
-            )
+            codes = np.repeat([2, 1, 0], N)
         elif index == "non_monotonic":
-            self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
+            codes = np.tile([0, 1, 2], N)
         else:
             raise ValueError(f"Invalid index param: {index}")
 
+        self.data = pd.Categorical.from_codes(codes, categories=categories)
         self.scalar = 10000
         self.list = list(range(10000))
         self.cat_scalar = "b"

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -439,9 +439,9 @@ def setup(self, inplace, dtype):
         N, M = 10000, 100
         if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
             data = {
-                "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
+                "datetime64[ns]": date_range("2011-01-01", freq="h", periods=N),
                 "datetime64[ns, tz]": date_range(
-                    "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
+                    "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
                 ),
                 "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
             }
@@ -640,7 +640,8 @@ def time_frame_nunique(self):
 
 class SeriesNuniqueWithNan:
     def setup(self):
-        self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
+        values = 100 * [np.nan] + list(range(100))
+        self.ser = Series(np.tile(values, 10000), dtype=float)
 
     def time_series_nunique_nan(self):
         self.ser.nunique()
@@ -649,7 +650,7 @@ def time_series_nunique_nan(self):
 class Duplicated:
     def setup(self):
         n = 1 << 20
-        t = date_range("2015-01-01", freq="S", periods=(n // 64))
+        t = date_range("2015-01-01", freq="s", periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame(
             {

diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
@@ -212,7 +212,7 @@ def run(dti):
     def time_datetime_to_period(self):
         @test_parallel(num_threads=2)
         def run(dti):
-            dti.to_period("S")
+            dti.to_period("s")
 
         run(self.dti)
 
@@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
     def setup(self, dtype):
         rows = 10000
         cols = 50
-        data = {
-            "float": DataFrame(np.random.randn(rows, cols)),
-            "datetime": DataFrame(
+        if dtype == "float":
+            df = DataFrame(np.random.randn(rows, cols))
+        elif dtype == "datetime":
+            df = DataFrame(
                 np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
-            ),
-            "object": DataFrame(
+            )
+        elif dtype == "object":
+            df = DataFrame(
                 "foo", index=range(rows), columns=["object%03d" for _ in range(5)]
-            ),
-        }
+            )
+        else:
+            raise NotImplementedError
 
         self.fname = f"__test_{dtype}__.csv"
-        df = data[dtype]
         df.to_csv(self.fname)
 
         @test_parallel(num_threads=2)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -238,7 +238,7 @@ def time_series_nth(self, dtype):
 
 class DateAttributes:
     def setup(self):
-        rng = date_range("1/1/2000", "12/31/2005", freq="H")
+        rng = date_range("1/1/2000", "12/31/2005", freq="h")
         self.year, self.month, self.day = rng.year, rng.month, rng.day
         self.ts = Series(np.random.randn(len(rng)), index=rng)
 
@@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
         if dtype == "datetime64":
             data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
         else:
-            data = np.array([1] * N, dtype=dtype)
+            data = np.ones(N, dtype=dtype)
         self.df = DataFrame({"values": data, "key": ["foo"] * N})
 
     def time_rank_ties(self, dtype, tie_method):

diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -161,9 +161,7 @@ def setup(self, dtype):
         self.sorted = self.idx.sort_values()
         half = N // 2
         self.non_unique = self.idx[:half].append(self.idx[:half])
-        self.non_unique_sorted = (
-            self.sorted[:half].append(self.sorted[:half]).sort_values()
-        )
+        self.non_unique_sorted = self.sorted[:half].repeat(2)
         self.key = self.sorted[N // 4]
 
     def time_boolean_array(self, dtype):

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -232,7 +232,7 @@ def setup(self, index):
         N = 100000
         indexes = {
             "int": Index(np.arange(N), dtype=np.int64),
-            "datetime": date_range("2011-01-01", freq="S", periods=N),
+            "datetime": date_range("2011-01-01", freq="s", periods=N),
         }
         index = indexes[index]
         self.s = Series(np.random.rand(N), index=index)
@@ -465,7 +465,7 @@ def time_loc_row(self, unique_cols):
 class AssignTimeseriesIndex:
     def setup(self):
         N = 100000
-        idx = date_range("1/1/2000", periods=N, freq="H")
+        idx = date_range("1/1/2000", periods=N, freq="h")
         self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)
 
     def time_frame_assign_timeseries_index(self):
@@ -515,6 +515,18 @@ def time_setitem_list(self):
         self.df[[100, 200, 300]] = 100
 
 
+class SetitemObjectDtype:
+    # GH#19299
+
+    def setup(self):
+        N = 1000
+        cols = 500
+        self.df = DataFrame(index=range(N), columns=range(cols), dtype=object)
+
+    def time_setitem_object_dtype(self):
+        self.df.loc[0, 1] = 1.0
+
+
 class ChainIndexing:
     params = [None, "warn"]
     param_names = ["mode"]

diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
@@ -71,22 +71,20 @@ def setup(self, engine_and_dtype, index_type, unique, N):
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
         elif index_type == "monotonic_decr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)[::-1]
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)[::-1]
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
         else:
             assert index_type == "non_monotonic"
             if unique:
                 arr = np.empty(N * 3, dtype=dtype)
                 arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
                 arr[N:] = np.arange(N * 2, dtype=dtype)
             else:
-                arr = np.array([1, 2, 3] * N, dtype=dtype)
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
 
         self.data = engine(arr)
         # code belows avoids populating the mapping etc. while timing.
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:
 
     def setup(self, engine_and_dtype, index_type, unique, N):
         engine, dtype = engine_and_dtype
+        dtype = dtype.lower()
 
         if index_type == "monotonic_incr":
             if unique:
-                arr = np.arange(N * 3, dtype=dtype.lower())
+                arr = np.arange(N * 3, dtype=dtype)
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype.lower())
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
             mask = np.zeros(N * 3, dtype=np.bool_)
         elif index_type == "monotonic_decr":
             if unique:
-                arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
+                arr = np.arange(N * 3, dtype=dtype)[::-1]
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype.lower())[::-1]
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
             mask = np.zeros(N * 3, dtype=np.bool_)
         else:
             assert index_type == "non_monotonic"
             if unique:
-                arr = np.zeros(N * 3, dtype=dtype.lower())
-                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
-                arr[N:] = np.arange(N * 2, dtype=dtype.lower())
+                arr = np.zeros(N * 3, dtype=dtype)
+                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
+                arr[N:] = np.arange(N * 2, dtype=dtype)
 
             else:
-                arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
             mask = np.zeros(N * 3, dtype=np.bool_)
             mask[-1] = True