diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 1826291034dee..a45315f63d62e 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -621,4 +621,15 @@ def time_read_csv_index_col(self): ) +class ReadCSVCParserLowMemory: + # GH 16798 + def setup(self): + self.csv = StringIO( + "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]) + ) + + def peakmem_over_2gb_input(self): + read_csv(self.csv, engine="c", low_memory=False) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index d3552ab5d39f5..ab6cacc4cc860 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -51,23 +51,7 @@ def xml_file(datapath): @pytest.fixture -def s3so(worker_id): - if is_ci_environment(): - url = "http://localhost:5000/" - else: - worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") - url = f"http://127.0.0.1:555{worker_id}/" - return {"client_kwargs": {"endpoint_url": url}} - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def monkeysession(): - with pytest.MonkeyPatch.context() as mp: - yield mp - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def s3_base(worker_id, monkeysession): +def s3_base(worker_id, monkeypatch): """ Fixture for mocking S3 interaction. 
@@ -79,8 +63,8 @@ def s3_base(worker_id, monkeysession): # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 - monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): # NOT RUN on Windows/macOS/ARM, only Ubuntu @@ -93,6 +77,7 @@ def s3_base(worker_id, monkeysession): "Windows, macOS or ARM platforms" ) else: + # set in .github/workflows/unit-tests.yml yield "http://localhost:5000" else: requests = pytest.importorskip("requests") @@ -128,6 +113,11 @@ def s3_base(worker_id, monkeysession): proc.terminate() +@pytest.fixture +def s3so(s3_base): + return {"client_kwargs": {"endpoint_url": s3_base}} + + @pytest.fixture def s3_resource(s3_base): import boto3 diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index a9540c94ce10e..500863dce84ee 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,6 @@ import numpy as np import pytest -from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -531,24 +530,6 @@ def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): tm.assert_frame_equal(out, expected) -@pytest.mark.single_cpu -@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") -def test_bytes_exceed_2gb(c_parser_only): - # see gh-16798 - # - # Read from a "CSV" that has a column larger than 2GB. 
- parser = c_parser_only - - if parser.low_memory: - pytest.skip("not a low_memory test") - - # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test - # spikes up to 10.4GB on the c_high case - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) - df = parser.read_csv(csv) - assert not df.empty - - def test_chunk_whitespace_on_boundary(c_parser_only): # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 28e5f5ad9bb70..9351387dfc337 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -2,16 +2,13 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. """ -from io import ( - BytesIO, - StringIO, -) +from io import BytesIO import logging +import re import numpy as np import pytest -from pandas.compat import is_ci_environment import pandas.util._test_decorators as td from pandas import DataFrame @@ -292,39 +289,23 @@ def test_read_csv_handles_boto_s3_object( tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu - @pytest.mark.skipif( - is_ci_environment(), - reason="GH: 45651: This test can hang in our CI min_versions build", - ) def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so): # 8 MB, S3FS uses 5MB chunks - import s3fs - - df = DataFrame( - np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd") - ) - str_buf = StringIO() - - df.to_csv(str_buf) - - buf = BytesIO(str_buf.getvalue().encode("utf-8")) - - s3_public_bucket.put_object(Key="large-file.csv", Body=buf) - - # Possibly some state leaking in between tests. - # If we don't clear this cache, we saw `GetObject operation: Forbidden`. 
- # Presumably the s3fs instance is being cached, with the directory listing - # from *before* we add the large-file.csv in the s3_public_bucket_with_data. - s3fs.S3FileSystem.clear_instance_cache() - - with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv( - f"s3://{s3_public_bucket.name}/large-file.csv", - nrows=5, - storage_options=s3so, - ) - # log of fetch_range (start, stop) - assert (0, 5505024) in (x.args[-2:] for x in caplog.records) + df = DataFrame(np.zeros((100000, 4)), columns=list("abcd")) + with BytesIO(df.to_csv().encode("utf-8")) as buf: + s3_public_bucket.put_object(Key="large-file.csv", Body=buf) + uri = f"{s3_public_bucket.name}/large-file.csv" + match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$") + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv( + f"s3://{uri}", + nrows=5, + storage_options=s3so, + ) + for log in caplog.messages: + if match := re.match(match_re, log): + # Less than 8 MB + assert int(match.group("stop")) < 8000000 def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so): # GH 25945 diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 9ee3c09631d0e..79473895b662d 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -30,15 +30,10 @@ def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 - - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 pytest.importorskip("s3fs") - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, diff --git 
a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index f5ef6a00e0b32..b1cc7ec186f19 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,11 +1,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -17,15 +12,7 @@ ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu @pytest.fixture(params=["single", "table"]) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 8c4fb1fe6872b..14d3a39107bc4 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,27 +1,13 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) - from pandas import ( DataFrame, Series, ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu pytest.importorskip("numba")