TST: Make old tests more performant (#55746)
* Use zeros instead of random data

* Mark numba apply tests as single cpu

* Parameterize and make test_grow_boundary_at_cap more specific

* Parameterize and test fewer values in test_precise_conversion

* Parameterize and mark test_parse_trim_buffers as slow

* Reduce resample size of test_nearest_upsample_with_limit

* Use start_caching_at for test_bad_date_parse

* Parameterize test_series_groupby_value_counts

* Monkeypatch magic number in test_isin_large_series_mixed_dtypes_and_nan (pattern sketched just after this list)

* Use _SIZE_CUTOFF for test_loc_setitem_with_expansion_large_dataframe

* Use switch_numexpr_min_elements for test_floordiv_axis0_numexpr_path

* Remove redundant test
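A pattern that recurs through these commits: instead of allocating enough data to cross a module-level size threshold, the test shrinks the threshold itself with pytest's monkeypatch fixture and keeps the input tiny. A minimal sketch of the idea, modeled on the isin change further down (the patched value of 5 is arbitrary; any small length the Series actually reaches works):

import numpy as np

from pandas import Series
from pandas.core import algorithms


def test_isin_large_path_cheaply(monkeypatch):
    # Lower the "large array" cutoff so a 15-element Series exercises
    # the same code path a multi-million-element one used to.
    with monkeypatch.context() as m:
        m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", 5)
        ser = Series([1, 2, np.nan] * 5)
        assert not ser.isin({"foo", "bar"}).any()

The same shrink-the-cutoff move drives the libindex._SIZE_CUTOFF and expressions._MIN_ELEMENTS changes below.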
mroeschke authored Oct 29, 2023
1 parent 8425c97 commit adae693
Showing 9 changed files with 75 additions and 104 deletions.
2 changes: 1 addition & 1 deletion pandas/tests/apply/test_numba.py
@@ -9,7 +9,7 @@
)
import pandas._testing as tm

pytestmark = td.skip_if_no("numba")
pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu]


def test_numba_vs_python_noop(float_frame, apply_axis):
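For reference, a module-level pytestmark accepts a single mark or a list, and every test in the file inherits all of them; single_cpu appears to be pandas' own registered marker, which its CI uses to keep such tests out of the parallel xdist run. A generic sketch using only stock pytest marks:

import pytest

# Every test in this module inherits both marks.
pytestmark = [pytest.mark.slow, pytest.mark.skipif(False, reason="demo")]


def test_inherits_module_marks():
    assert True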
32 changes: 1 addition & 31 deletions pandas/tests/frame/test_arithmetic.py
@@ -22,15 +22,13 @@
)
import pandas._testing as tm
from pandas.core.computation import expressions as expr
from pandas.core.computation.expressions import _MIN_ELEMENTS
from pandas.tests.frame.common import (
_check_mixed_float,
_check_mixed_int,
)
from pandas.util.version import Version


@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
def switch_numexpr_min_elements(request, monkeypatch):
with monkeypatch.context() as m:
m.setattr(expr, "_MIN_ELEMENTS", request.param)
@@ -499,34 +497,6 @@ def test_floordiv_axis0(self):
result2 = df.floordiv(ser.values, axis=0)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("opname", ["floordiv", "pow"])
def test_floordiv_axis0_numexpr_path(self, opname, request):
# case that goes through numexpr and has to fall back to masked_arith_op
ne = pytest.importorskip("numexpr")
if (
Version(ne.__version__) >= Version("2.8.7")
and opname == "pow"
and "python" in request.node.callspec.id
):
request.applymarker(
pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454")
)

op = getattr(operator, opname)

arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100
df = DataFrame(arr)
df["C"] = 1.0

ser = df[0]
result = getattr(df, opname)(ser, axis=0)

expected = DataFrame({col: op(df[col], ser) for col in df.columns})
tm.assert_frame_equal(result, expected)

result2 = getattr(df, opname)(ser.values, axis=0)
tm.assert_frame_equal(result2, expected)

def test_df_add_td64_columnwise(self):
# GH 22534 Check that column-wise addition broadcasts correctly
dti = pd.date_range("2016-01-01", periods=10)
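The switch_numexpr_min_elements fixture at the top of this diff works because pandas only routes whole-frame arithmetic through numexpr once the operands pass the expressions._MIN_ELEMENTS size gate; patching the gate to 0 or 100 lets a small frame cover both the numexpr and plain-Python engines. A paraphrase of the gate, not the literal pandas source (the real check also looks at the op and dtypes):

import numpy as np

_MIN_ELEMENTS = 1_000_000  # illustrative default; the fixture patches it to 0 or 100


def can_use_numexpr(a: np.ndarray) -> bool:
    # numexpr's setup overhead only pays off on large operands
    return a.size > _MIN_ELEMENTS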
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_stack_unstack.py
@@ -2186,7 +2186,7 @@ def __init__(self, *args, **kwargs) -> None:
with monkeypatch.context() as m:
m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
df = DataFrame(
np.random.default_rng(2).standard_normal((2**16, 2)),
np.zeros((2**16, 2)),
index=[np.arange(2**16), np.arange(2**16)],
)
msg = "The following operation may generate"
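The swap above is the commit's simplest trick: the test only checks that unstacking a 2**16-row frame emits the expected warning, so the cell values never matter and zeros skip the cost of drawing 131,072 normals:

import numpy as np

n = 2**16
values = np.zeros((n, 2))  # same shape/dtype as the RNG version, far cheaper
# previously: np.random.default_rng(2).standard_normal((n, 2))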
35 changes: 18 additions & 17 deletions pandas/tests/groupby/methods/test_value_counts.py
@@ -4,7 +4,6 @@
and proper parameter handling
"""

from itertools import product

import numpy as np
import pytest
@@ -46,7 +45,6 @@ def tests_value_counts_index_names_category_column():
tm.assert_series_equal(result, expected)


# our starting frame
def seed_df(seed_nans, n, m):
days = date_range("2015-08-24", periods=10)

@@ -70,29 +68,32 @@ def seed_df(seed_nans, n, m):
return frame


# create input df, keys, and the bins
binned = []
ids = []
for seed_nans in [True, False]:
for n, m in product((100, 1000), (5, 20)):
df = seed_df(seed_nans, n, m)
bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
keys = "1st", "2nd", ["1st", "2nd"]
for k, b in product(keys, bins):
binned.append((df, k, b, n, m))
ids.append(f"{k}-{n}-{m}")


@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("seed_nans", [True, False])
@pytest.mark.parametrize("num_rows", [10, 50])
@pytest.mark.parametrize("max_int", [5, 20])
@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr)
@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
seed_nans,
num_rows,
max_int,
keys,
bins,
isort,
normalize,
name,
sort,
ascending,
dropna,
):
df = seed_df(seed_nans, num_rows, max_int)

def rebuild_index(df):
arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
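The rewrite replaces a module-level itertools.product loop, which built every input DataFrame at import time, with stacked parametrize decorators; pytest forms the cross-product of stacked decorators at collection, and the DataFrame is now constructed lazily inside the test. A self-contained illustration of the stacking behavior:

import pytest


@pytest.mark.parametrize("num_rows", [10, 50])
@pytest.mark.parametrize("seed_nans", [True, False])
def test_stacked_parametrize(seed_nans, num_rows):
    # Collected as 2 * 2 = 4 ids, e.g. test_stacked_parametrize[True-10]
    assert num_rows in (10, 50)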
13 changes: 8 additions & 5 deletions pandas/tests/indexing/test_loc.py
@@ -12,6 +12,7 @@
import numpy as np
import pytest

from pandas._libs import index as libindex
from pandas.errors import IndexingError
import pandas.util._test_decorators as td

@@ -1974,12 +1975,14 @@ def test_loc_drops_level(self):


class TestLocSetitemWithExpansion:
@pytest.mark.slow
def test_loc_setitem_with_expansion_large_dataframe(self):
def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch):
# GH#10692
result = DataFrame({"x": range(10**6)}, dtype="int64")
result.loc[len(result)] = len(result) + 1
expected = DataFrame({"x": range(10**6 + 1)}, dtype="int64")
size_cutoff = 50
with monkeypatch.context():
monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff)
result = DataFrame({"x": range(size_cutoff)}, dtype="int64")
result.loc[size_cutoff] = size_cutoff
expected = DataFrame({"x": range(size_cutoff + 1)}, dtype="int64")
tm.assert_frame_equal(result, expected)

def test_loc_setitem_empty_series(self):
74 changes: 33 additions & 41 deletions pandas/tests/io/parser/test_c_parser_only.py
@@ -147,7 +147,9 @@ def test_unsupported_dtype(c_parser_only, match, kwargs):

@td.skip_if_32bit
@pytest.mark.slow
def test_precise_conversion(c_parser_only):
# test numbers between 1 and 2
@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
def test_precise_conversion(c_parser_only, num):
parser = c_parser_only

normal_errors = []
@@ -156,27 +158,23 @@ def error(val: float, actual_val: Decimal) -> Decimal:
def error(val: float, actual_val: Decimal) -> Decimal:
return abs(Decimal(f"{val:.100}") - actual_val)

# test numbers between 1 and 2
for num in np.linspace(1.0, 2.0, num=500):
# 25 decimal digits of precision
text = f"a\n{num:.25}"
# 25 decimal digits of precision
text = f"a\n{num:.25}"

normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
)
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])
normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
roundtrip_val = float(
parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
)
actual_val = Decimal(text[2:])

normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))
normal_errors.append(error(normal_val, actual_val))
precise_errors.append(error(precise_val, actual_val))

# round-trip should match float()
assert roundtrip_val == float(text[2:])
# round-trip should match float()
assert roundtrip_val == float(text[2:])

assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
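For context on what the assertions compare: the C engine's float_precision option accepts "legacy", "high", and "round_trip", and the round-trip converter is expected to reproduce exactly what Python's float() yields. A standalone check along those lines:

from io import StringIO

import pandas as pd

text = "a\n1.2345678901234567890123456"
val = pd.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
assert val == float("1.2345678901234567890123456")  # bit-for-bit round trip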
@@ -287,7 +285,8 @@ def test_tokenize_CR_with_quoting(c_parser_only):


@pytest.mark.slow
def test_grow_boundary_at_cap(c_parser_only):
@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
def test_grow_boundary_at_cap(c_parser_only, count):
# See gh-12494
#
# Cause of error was that the C parser
@@ -296,19 +295,18 @@ def test_grow_boundary_at_cap(c_parser_only):
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream.
# 3 * 2^n commas were observed to break the parser
parser = c_parser_only

def test_empty_header_read(count):
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)

for cnt in range(1, 101):
test_empty_header_read(cnt)
with StringIO("," * count) as s:
expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
df = parser.read_csv(s)
tm.assert_frame_equal(df, expected)

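The parametrization trades the old exhaustive sweep over 1-100 commas for six geometrically spaced widths; per the comment above, 3 * 2^n was the shape observed to trigger the original buffer bug, so only those sizes are kept:

counts = [3 * 2**n for n in range(6)]
assert counts == [3, 6, 12, 24, 48, 96]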

def test_parse_trim_buffers(c_parser_only):
@pytest.mark.slow
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_parse_trim_buffers(c_parser_only, encoding):
# This test is part of a bugfix for gh-13703. It attempts
# to stress the system memory allocator, to cause it to move the
# stream buffer and either let the OS reclaim the region, or let
@@ -319,6 +317,9 @@ def test_parse_trim_buffers(c_parser_only):
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.

# Also force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.

parser = c_parser_only

# Generate a large mixed-type CSV file on-the-fly (one record is
@@ -374,25 +375,16 @@
)

# Iterate over the CSV file in chunks of `chunksize` lines
with parser.read_csv(
StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)

# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)

# This extra test was added to replicate the fault in gh-5291.
# Force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
with parser.read_csv(
StringIO(csv_data),
header=None,
dtype=object,
chunksize=chunksize,
encoding="utf_8",
encoding=encoding,
) as chunks_:
result = concat(chunks_, axis=0, ignore_index=True)

# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)


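Both encoding variants now flow through the single chunked read above. The chunksize/concat idiom it relies on is public pandas API: read_csv with chunksize returns a reader that is iterable and, since pandas 1.2, usable as a context manager. A minimal standalone version:

from io import StringIO

import pandas as pd

csv_data = "x,y\n" + "\n".join(f"{i},{i * i}" for i in range(10))
with pd.read_csv(StringIO(csv_data), chunksize=4) as reader:
    result = pd.concat(reader, ignore_index=True)
assert len(result) == 10  # 4 + 4 + 2 rows, reassembled in order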
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/test_parse_dates.py
@@ -32,6 +32,7 @@
import pandas._testing as tm
from pandas._testing._hypothesis import DATETIME_NO_TZ
from pandas.core.indexes.datetimes import date_range
from pandas.core.tools.datetimes import start_caching_at

from pandas.io.parsers import read_csv

@@ -1285,7 +1286,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
# if we have an invalid date make sure that we handle this with
# and w/o the cache properly
parser = all_parsers
s = StringIO((f"{value},\n") * 50000)
s = StringIO((f"{value},\n") * (start_caching_at + 1))

warn = None
msg = "Passing a BlockManager to DataFrame"
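start_caching_at is the internal element count at which to_datetime starts caching unique string-to-datetime conversions (50 in the pandas source, if I recall the value right); sizing the test input off the constant keeps it just past the caching threshold instead of the old hard-coded 50,000 rows. A standalone sketch of the same sizing trick:

from io import StringIO

import pandas as pd
from pandas.core.tools.datetimes import start_caching_at

# Smallest input that still exercises the cached-conversion branch:
s = StringIO("2015-01-01,\n" * (start_caching_at + 1))
df = pd.read_csv(s, header=None)
out = pd.to_datetime(df[0])  # large enough that to_datetime builds its cache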
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_datetime_index.py
@@ -516,7 +516,7 @@ def test_upsample_with_limit(unit):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"])
@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"])
@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"])
def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit):
# GH 33939
16 changes: 10 additions & 6 deletions pandas/tests/series/methods/test_isin.py
@@ -7,6 +7,7 @@
date_range,
)
import pandas._testing as tm
from pandas.core import algorithms
from pandas.core.arrays import PeriodArray


@@ -197,13 +198,16 @@ def test_isin_masked_types(self, dtype, data, values, expected):
tm.assert_series_equal(result, expected)


@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
# https://github.com/pandas-dev/pandas/issues/37094
# combination of object dtype for the values and > 1_000_000 elements
ser = Series([1, 2, np.nan] * 1_000_000)
result = ser.isin({"foo", "bar"})
expected = Series([False] * 3 * 1_000_000)
# combination of object dtype for the values
# and > _MINIMUM_COMP_ARR_LEN elements
min_isin_comp = 5
ser = Series([1, 2, np.nan] * min_isin_comp)
with monkeypatch.context() as m:
m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
result = ser.isin({"foo", "bar"})
expected = Series([False] * 3 * min_isin_comp)
tm.assert_series_equal(result, expected)

