Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into matplotlib
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 14, 2023
2 parents 00f30cd + 00d88e9 commit 0a79c0b
Show file tree
Hide file tree
Showing 129 changed files with 1,517 additions and 765 deletions.
100 changes: 47 additions & 53 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from importlib import import_module

import numpy as np
import pyarrow as pa

import pandas as pd

Expand All @@ -20,9 +19,9 @@ class Factorize:
[True, False],
[True, False],
[
"int",
"uint",
"float",
"int64",
"uint64",
"float64",
"object",
"object_str",
"datetime64[ns]",
Expand All @@ -36,28 +35,24 @@ class Factorize:

def setup(self, unique, sort, dtype):
N = 10**5
string_index = tm.makeStringIndex(N)
string_arrow = None
if dtype == "string[pyarrow]":
try:
string_arrow = pd.array(string_index, dtype="string[pyarrow]")
except ImportError:
raise NotImplementedError

data = {
"int": pd.Index(np.arange(N), dtype="int64"),
"uint": pd.Index(np.arange(N), dtype="uint64"),
"float": pd.Index(np.random.randn(N), dtype="float64"),
"object_str": string_index,
"object": pd.Index(np.arange(N), dtype="object"),
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
"string[pyarrow]": string_arrow,
}[dtype]

if dtype in ["int64", "uint64", "Int64", "object"]:
data = pd.Index(np.arange(N), dtype=dtype)
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype=dtype)
elif dtype == "boolean":
data = pd.array(np.random.randint(0, 2, N), dtype=dtype)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
data = tm.makeStringIndex(N)
elif dtype == "string[pyarrow]":
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
else:
raise NotImplementedError

if not unique:
data = data.repeat(5)
self.data = data
Expand All @@ -74,9 +69,9 @@ class Duplicated:
[True, False],
["first", "last", False],
[
"int",
"uint",
"float",
"int64",
"uint64",
"float64",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
Expand All @@ -88,22 +83,20 @@ class Duplicated:

def setup(self, unique, keep, dtype):
N = 10**5
data = {
"int": pd.Index(np.arange(N), dtype="int64"),
"uint": pd.Index(np.arange(N), dtype="uint64"),
"float": pd.Index(np.random.randn(N), dtype="float64"),
"string": tm.makeStringIndex(N),
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timestamp[ms][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
),
"duration[s][pyarrow]": pd.Index(
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
),
}[dtype]
if dtype in ["int64", "uint64"]:
data = pd.Index(np.arange(N), dtype=dtype)
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype="float64")
elif dtype == "string":
data = tm.makeStringIndex(N)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]:
data = pd.Index(np.arange(N), dtype=dtype)
else:
raise NotImplementedError
if not unique:
data = data.repeat(5)
self.idx = data
Expand Down Expand Up @@ -181,21 +174,22 @@ class Quantile:
params = [
[0, 0.5, 1],
["linear", "nearest", "lower", "higher", "midpoint"],
["float", "int", "uint"],
["float64", "int64", "uint64"],
]
param_names = ["quantile", "interpolation", "dtype"]

def setup(self, quantile, interpolation, dtype):
N = 10**5
data = {
"int": np.arange(N),
"uint": np.arange(N).astype(np.uint64),
"float": np.random.randn(N),
}
self.idx = pd.Series(data[dtype].repeat(5))
if dtype in ["int64", "uint64"]:
data = np.arange(N, dtype=dtype)
elif dtype == "float64":
data = np.random.randn(N)
else:
raise NotImplementedError
self.ser = pd.Series(data.repeat(5))

def time_quantile(self, quantile, interpolation, dtype):
self.idx.quantile(quantile, interpolation=interpolation)
self.ser.quantile(quantile, interpolation=interpolation)


class SortIntegerArray:
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ class BinaryOpsMultiIndex:
param_names = ["func"]

def setup(self, func):
array = date_range("20200101 00:00", "20200102 0:00", freq="S")
array = date_range("20200101 00:00", "20200102 0:00", freq="s")
level_0_names = [str(i) for i in range(30)]

index = pd.MultiIndex.from_product([level_0_names, array])
Expand Down
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def time_from_float_array(self):
class IntegerArray:
def setup(self):
N = 250_000
self.values_integer = np.array([1, 0, 1, 0] * N)
self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
self.mask = np.array([False, False, True, False] * N)
self.values_integer = np.tile(np.array([1, 0, 1, 0]), N)
self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N)
self.mask = np.tile(np.array([False, False, True, False]), N)

def time_constructor(self):
pd.arrays.IntegerArray(self.data, self.mask)
Expand Down
10 changes: 4 additions & 6 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,18 +260,16 @@ class CategoricalSlicing:
def setup(self, index):
N = 10**6
categories = ["a", "b", "c"]
values = [0] * N + [1] * N + [2] * N
if index == "monotonic_incr":
self.data = pd.Categorical.from_codes(values, categories=categories)
codes = np.repeat([0, 1, 2], N)
elif index == "monotonic_decr":
self.data = pd.Categorical.from_codes(
list(reversed(values)), categories=categories
)
codes = np.repeat([2, 1, 0], N)
elif index == "non_monotonic":
self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
codes = np.tile([0, 1, 2], N)
else:
raise ValueError(f"Invalid index param: {index}")

self.data = pd.Categorical.from_codes(codes, categories=categories)
self.scalar = 10000
self.list = list(range(10000))
self.cat_scalar = "b"
Expand Down
9 changes: 5 additions & 4 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,9 @@ def setup(self, inplace, dtype):
N, M = 10000, 100
if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
data = {
"datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns]": date_range("2011-01-01", freq="h", periods=N),
"datetime64[ns, tz]": date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
"2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
),
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
}
Expand Down Expand Up @@ -640,7 +640,8 @@ def time_frame_nunique(self):

class SeriesNuniqueWithNan:
def setup(self):
self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
values = 100 * [np.nan] + list(range(100))
self.ser = Series(np.tile(values, 10000), dtype=float)

def time_series_nunique_nan(self):
self.ser.nunique()
Expand All @@ -649,7 +650,7 @@ def time_series_nunique_nan(self):
class Duplicated:
def setup(self):
n = 1 << 20
t = date_range("2015-01-01", freq="S", periods=(n // 64))
t = date_range("2015-01-01", freq="s", periods=(n // 64))
xs = np.random.randn(n // 64).round(2)
self.df = DataFrame(
{
Expand Down
20 changes: 11 additions & 9 deletions asv_bench/benchmarks/gil.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def run(dti):
def time_datetime_to_period(self):
@test_parallel(num_threads=2)
def run(dti):
dti.to_period("S")
dti.to_period("s")

run(self.dti)

Expand Down Expand Up @@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO):
def setup(self, dtype):
rows = 10000
cols = 50
data = {
"float": DataFrame(np.random.randn(rows, cols)),
"datetime": DataFrame(
if dtype == "float":
df = DataFrame(np.random.randn(rows, cols))
elif dtype == "datetime":
df = DataFrame(
np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
),
"object": DataFrame(
)
elif dtype == "object":
df = DataFrame(
"foo", index=range(rows), columns=["object%03d" for _ in range(5)]
),
}
)
else:
raise NotImplementedError

self.fname = f"__test_{dtype}__.csv"
df = data[dtype]
df.to_csv(self.fname)

@test_parallel(num_threads=2)
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def time_series_nth(self, dtype):

class DateAttributes:
def setup(self):
rng = date_range("1/1/2000", "12/31/2005", freq="H")
rng = date_range("1/1/2000", "12/31/2005", freq="h")
self.year, self.month, self.day = rng.year, rng.month, rng.day
self.ts = Series(np.random.randn(len(rng)), index=rng)

Expand Down Expand Up @@ -713,7 +713,7 @@ def setup(self, dtype, tie_method):
if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
data = np.array([1] * N, dtype=dtype)
data = np.ones(N, dtype=dtype)
self.df = DataFrame({"values": data, "key": ["foo"] * N})

def time_rank_ties(self, dtype, tie_method):
Expand Down
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,7 @@ def setup(self, dtype):
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
self.non_unique_sorted = (
self.sorted[:half].append(self.sorted[:half]).sort_values()
)
self.non_unique_sorted = self.sorted[:half].repeat(2)
self.key = self.sorted[N // 4]

def time_boolean_array(self, dtype):
Expand Down
16 changes: 14 additions & 2 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def setup(self, index):
N = 100000
indexes = {
"int": Index(np.arange(N), dtype=np.int64),
"datetime": date_range("2011-01-01", freq="S", periods=N),
"datetime": date_range("2011-01-01", freq="s", periods=N),
}
index = indexes[index]
self.s = Series(np.random.rand(N), index=index)
Expand Down Expand Up @@ -465,7 +465,7 @@ def time_loc_row(self, unique_cols):
class AssignTimeseriesIndex:
def setup(self):
N = 100000
idx = date_range("1/1/2000", periods=N, freq="H")
idx = date_range("1/1/2000", periods=N, freq="h")
self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)

def time_frame_assign_timeseries_index(self):
Expand Down Expand Up @@ -515,6 +515,18 @@ def time_setitem_list(self):
self.df[[100, 200, 300]] = 100


# Benchmark for writing a single scalar into one cell of an object-dtype DataFrame.
class SetitemObjectDtype:
# GH#19299

def setup(self):
# Build a frame with 1000 row labels and 500 column labels, passing no data
# and requesting dtype=object explicitly.
N = 1000
cols = 500
self.df = DataFrame(index=range(N), columns=range(cols), dtype=object)

def time_setitem_object_dtype(self):
# The measured statement: a .loc write of the float 1.0 at row label 0,
# column label 1 of the frame built in setup().
self.df.loc[0, 1] = 1.0


class ChainIndexing:
params = [None, "warn"]
param_names = ["mode"]
Expand Down
27 changes: 12 additions & 15 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,22 +71,20 @@ def setup(self, engine_and_dtype, index_type, unique, N):
if unique:
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype)[::-1]
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
else:
assert index_type == "non_monotonic"
if unique:
arr = np.empty(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)
else:
arr = np.array([1, 2, 3] * N, dtype=dtype)
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)

self.data = engine(arr)
# code below avoids populating the mapping etc. while timing.
Expand Down Expand Up @@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing:

def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
dtype = dtype.lower()

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype.lower())
arr = np.arange(N * 3, dtype=dtype)
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype.lower())
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
elif index_type == "monotonic_decr":
if unique:
arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
arr = np.arange(N * 3, dtype=dtype)[::-1]
else:
values = list([1] * N + [2] * N + [3] * N)
arr = np.array(values, dtype=dtype.lower())[::-1]
arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
else:
assert index_type == "non_monotonic"
if unique:
arr = np.zeros(N * 3, dtype=dtype.lower())
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
arr[N:] = np.arange(N * 2, dtype=dtype.lower())
arr = np.zeros(N * 3, dtype=dtype)
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
arr[N:] = np.arange(N * 2, dtype=dtype)

else:
arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
mask = np.zeros(N * 3, dtype=np.bool_)
mask[-1] = True

Expand Down
Loading

0 comments on commit 0a79c0b

Please sign in to comment.