Commit

* ENH: non float64 result support in numba groupby
* refactor & simplify
* fix CI
* maybe green?
* skip unsupported ops in other bench as well
* updates from code review
* remove commented code
* update whatsnew
* debug benchmarks
* Skip min/max benchmarks
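For context, a minimal sketch of the behavior this change benchmarks (illustrative only, not part of the commit; assumes a pandas build containing this enhancement and numba installed): grouped aggregations routed through the numba engine can keep non-float64 result dtypes instead of always upcasting to float64.

```python
# Hypothetical illustration: with the numba engine, an int64 column
# aggregated with sum can now keep an integer result dtype.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "key": np.random.randint(0, 100, size=100_000),
        "value": np.random.randint(0, 1_000, size=100_000),  # int64 values
    }
)

result = df.groupby("key")["value"].sum(engine="numba")
print(result.dtype)  # expected: int64 rather than a float64 upcast
```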
@@ -57,6 +57,38 @@
     },
 }
 
+# These aggregations don't have a kernel implemented for them yet
+_numba_unsupported_methods = [
+    "all",
+    "any",
+    "bfill",
+    "count",
+    "cumcount",
+    "cummax",
+    "cummin",
+    "cumprod",
+    "cumsum",
+    "describe",
+    "diff",
+    "ffill",
+    "first",
+    "head",
+    "last",
+    "median",
+    "nunique",
+    "pct_change",
+    "prod",
+    "quantile",
+    "rank",
+    "sem",
+    "shift",
+    "size",
+    "skew",
+    "tail",
+    "unique",
+    "value_counts",
+]
+
 
 class ApplyDictReturn:
     def setup(self):
@@ -453,9 +485,10 @@ class GroupByMethods:
         ],
         ["direct", "transformation"],
         [1, 5],
+        ["cython", "numba"],
     ]
 
-    def setup(self, dtype, method, application, ncols):
+    def setup(self, dtype, method, application, ncols, engine):
         if method in method_blocklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
 
@@ -474,6 +507,19 @@ def setup(self, dtype, method, application, ncols):
             # DataFrameGroupBy doesn't have these methods
             raise NotImplementedError
 
+        # Numba currently doesn't support
+        # multiple transform functions or strs for transform,
+        # grouping on multiple columns
+        # and we lack kernels for a bunch of methods
+        if (
+            engine == "numba"
+            and method in _numba_unsupported_methods
+            or ncols > 1
+            or application == "transformation"
+            or dtype == "datetime"
+        ):
+            raise NotImplementedError
+
         if method == "describe":
             ngroups = 20
         elif method == "skew":
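To make the skip condition above concrete, here is a small illustrative sketch (the data and function names are placeholders, not from the benchmark; assumes numba is installed) of the kind of transform call the numba engine does accept: a single user-defined function and a single grouping column.

```python
# Illustrative only. With engine="numba", transform takes one user-defined
# function with the (values, index) signature rather than a string alias;
# the benchmark above skips string methods, multi-column keys, the
# "transformation" application, and the datetime dtype accordingly.
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2, 2], "value": [1.0, 2.0, 3.0, 4.0]})


def demean(values, index):  # JIT-compiled by pandas under engine="numba"
    return values - values.mean()


out = df.groupby("key")["value"].transform(demean, engine="numba")
print(out)
```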
@@ -505,17 +551,30 @@ def setup(self, dtype, method, application, ncols):
         if len(cols) == 1:
             cols = cols[0]
 
+        # Not everything supports the engine keyword yet
+        kwargs = {}
+        if engine == "numba":
+            kwargs["engine"] = engine
+
         if application == "transformation":
-            self.as_group_method = lambda: df.groupby("key")[cols].transform(method)
-            self.as_field_method = lambda: df.groupby(cols)["key"].transform(method)
+            self.as_group_method = lambda: df.groupby("key")[cols].transform(
+                method, **kwargs
+            )
+            self.as_field_method = lambda: df.groupby(cols)["key"].transform(
+                method, **kwargs
+            )
         else:
-            self.as_group_method = getattr(df.groupby("key")[cols], method)
-            self.as_field_method = getattr(df.groupby(cols)["key"], method)
+            self.as_group_method = partial(
+                getattr(df.groupby("key")[cols], method), **kwargs
+            )
+            self.as_field_method = partial(
+                getattr(df.groupby(cols)["key"], method), **kwargs
+            )
 
-    def time_dtype_as_group(self, dtype, method, application, ncols):
+    def time_dtype_as_group(self, dtype, method, application, ncols, engine):
         self.as_group_method()
 
-    def time_dtype_as_field(self, dtype, method, application, ncols):
+    def time_dtype_as_field(self, dtype, method, application, ncols, engine):
         self.as_field_method()
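The partial-based rewrite above keeps the timed attribute a zero-argument callable whether or not the engine keyword is passed. A standalone sketch of that pattern (column and method names are placeholders; engine="numba" requires numba installed):

```python
# Sketch of the pattern used in the hunk above: bind the optional engine
# keyword onto the bound groupby method once during setup(), so the timing
# function only ever calls self.as_group_method() with no arguments.
from functools import partial

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": np.arange(1_000) % 5, "values": np.random.rand(1_000)})

engine = "numba"  # or "cython", in which case no keyword is passed at all
kwargs = {}
if engine == "numba":
    kwargs["engine"] = engine

as_group_method = partial(getattr(df.groupby("key")["values"], "sum"), **kwargs)
as_group_method()  # same as df.groupby("key")["values"].sum(engine="numba")
```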
@@ -532,8 +591,12 @@ class GroupByCythonAgg:
         [
             "sum",
             "prod",
-            "min",
-            "max",
+            # TODO: uncomment min/max
+            # Currently, min/max implemented very inefficiently
+            # because it re-uses the Window min/max kernel
+            # so it will time out ASVs
+            # "min",
+            # "max",
             "mean",
             "median",
             "var",
@@ -554,6 +617,22 @@ def time_frame_agg(self, dtype, method):
         self.df.groupby("key").agg(method)
 
 
+class GroupByNumbaAgg(GroupByCythonAgg):
+    """
+    Benchmarks specifically targeting our numba aggregation algorithms
+    (using a big enough dataframe with simple key, so a large part of the
+    time is actually spent in the grouped aggregation).
+    """
+
+    def setup(self, dtype, method):
+        if method in _numba_unsupported_methods:
+            raise NotImplementedError
+        super().setup(dtype, method)
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method, engine="numba")
+
+
 class GroupByCythonAggEaDtypes:
     """
     Benchmarks specifically targeting our cython aggregation algorithms
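One way to smoke-test the new GroupByNumbaAgg benchmark outside of asv is to drive it by hand. The import path and parameter values below are assumptions based on the pandas asv_bench layout, not part of this commit:

```python
# Hypothetical manual run (execute from the asv_bench/ directory with numba
# installed); "float64" and "sum" are assumed to be valid parameter values.
from benchmarks.groupby import GroupByNumbaAgg

bench = GroupByNumbaAgg()
bench.setup("float64", "sum")  # raises NotImplementedError for unsupported methods
bench.time_frame_agg("float64", "sum")  # runs .agg("sum", engine="numba")
```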
@lithomas1 - not sure I understand what this means; perhaps it was meant for another benchmark? For the Cython min/max implementation in groupby, I don't believe we're reusing the Window kernels, and I'm seeing better performance for min/max than e.g. sum.
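A rough way to check the performance claim above (an illustrative script, not the reviewer's actual measurement; absolute numbers will vary by machine and pandas version):

```python
# Compare cython groupby min/max against sum on the same frame; per the
# review comment, min/max is not expected to be slower than sum here.
import timeit

import numpy as np
import pandas as pd

N = 1_000_000
df = pd.DataFrame(
    {"key": np.random.randint(0, 100, size=N), "value": np.random.rand(N)}
)

for method in ("min", "max", "sum"):
    secs = timeit.timeit(lambda m=method: df.groupby("key").agg(m), number=10)
    print(f"{method}: {secs / 10 * 1000:.2f} ms per call")
```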