Skip to content

Commit

Permalink
CI: Make is_ci_environment less necessary (#56058)
Browse files Browse the repository at this point in the history
* CI: Make is_ci_environment less necessary

* Add back env setting

* Add back comment

* Refactor test_read_csv_chunked_download
  • Loading branch information
mroeschke authored Nov 20, 2023
1 parent 91af4fa commit 8438fe7
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 109 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,4 +621,15 @@ def time_read_csv_index_col(self):
)


class ReadCSVCParserLowMemory:
    """Peak-memory benchmark for the C parser on a >2 GB single-column CSV."""

    # GH 16798
    def setup(self):
        # 2100 rows of 1 MiB ("x" * 2**20) strings -> ~2.1 GB of raw column
        # bytes. Built in setup() so the construction cost is excluded from
        # the measured benchmark.
        self.csv = StringIO(
            "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])
        )

    def peakmem_over_2gb_input(self):
        # asv's peakmem_* prefix records peak resident memory of this call;
        # low_memory=False forces the C parser to read in a single chunk.
        read_csv(self.csv, engine="c", low_memory=False)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
28 changes: 9 additions & 19 deletions pandas/tests/io/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,7 @@ def xml_file(datapath):


@pytest.fixture
def s3so(worker_id):
    """Storage options routing S3 access to the local moto endpoint.

    On CI a single shared server listens on port 5000; locally each
    pytest-xdist worker talks to its own port derived from the worker id.
    """
    if is_ci_environment():
        endpoint = "http://localhost:5000/"
    else:
        # "master" means no xdist; otherwise ids look like "gw0", "gw1", ...
        suffix = "5" if worker_id == "master" else worker_id.lstrip("gw")
        endpoint = f"http://127.0.0.1:555{suffix}/"
    return {"client_kwargs": {"endpoint_url": endpoint}}


@pytest.fixture(scope="function" if is_ci_environment() else "session")
def monkeysession():
    # Session-scoped MonkeyPatch context (function-scoped on CI) so that
    # env vars patched for the mocked S3 server can outlive a single test.
    with pytest.MonkeyPatch.context() as mp:
        yield mp


@pytest.fixture(scope="function" if is_ci_environment() else "session")
def s3_base(worker_id, monkeysession):
def s3_base(worker_id, monkeypatch):
"""
Fixture for mocking S3 interaction.
Expand All @@ -79,8 +63,8 @@ def s3_base(worker_id, monkeysession):

# temporary workaround as moto fails for botocore >= 1.11 otherwise,
# see https://github.com/spulec/moto/issues/1924 & 1952
monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
if is_ci_environment():
if is_platform_arm() or is_platform_mac() or is_platform_windows():
# NOT RUN on Windows/macOS/ARM, only Ubuntu
Expand All @@ -93,6 +77,7 @@ def s3_base(worker_id, monkeysession):
"Windows, macOS or ARM platforms"
)
else:
# set in .github/workflows/unit-tests.yml
yield "http://localhost:5000"
else:
requests = pytest.importorskip("requests")
Expand Down Expand Up @@ -128,6 +113,11 @@ def s3_base(worker_id, monkeysession):
proc.terminate()


@pytest.fixture
def s3so(s3_base):
    """Return storage_options pointing S3 I/O at the mocked ``s3_base`` URL."""
    client_kwargs = {"endpoint_url": s3_base}
    return {"client_kwargs": client_kwargs}


@pytest.fixture
def s3_resource(s3_base):
import boto3
Expand Down
19 changes: 0 additions & 19 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import numpy as np
import pytest

from pandas.compat import is_ci_environment
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
ParserError,
Expand Down Expand Up @@ -531,24 +530,6 @@ def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
tm.assert_frame_equal(out, expected)


@pytest.mark.single_cpu
@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.")
def test_bytes_exceed_2gb(c_parser_only):
    # see gh-16798: read a "CSV" whose single column holds more than 2GB
    # of raw bytes (2100 rows of 1 MiB strings).
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a low_memory test")

    # Building the csv takes ~10 seconds and spikes memory to 8GB+; the
    # whole test can reach 10.4GB on the c_high case.
    rows = ("x" * (1 << 20) for _ in range(2100))
    payload = "strings\n" + "\n".join(rows)
    result = parser.read_csv(StringIO(payload))
    assert not result.empty


def test_chunk_whitespace_on_boundary(c_parser_only):
# see gh-9735: this issue is C parser-specific (bug when
# parsing whitespace and characters at chunk boundary)
Expand Down
53 changes: 17 additions & 36 deletions pandas/tests/io/parser/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
from io import (
BytesIO,
StringIO,
)
from io import BytesIO
import logging
import re

import numpy as np
import pytest

from pandas.compat import is_ci_environment
import pandas.util._test_decorators as td

from pandas import DataFrame
Expand Down Expand Up @@ -292,39 +289,23 @@ def test_read_csv_handles_boto_s3_object(
tm.assert_frame_equal(result, expected)

@pytest.mark.single_cpu
@pytest.mark.skipif(
is_ci_environment(),
reason="GH: 45651: This test can hang in our CI min_versions build",
)
def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
# 8 MB, S3FS uses 5MB chunks
import s3fs

df = DataFrame(
np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd")
)
str_buf = StringIO()

df.to_csv(str_buf)

buf = BytesIO(str_buf.getvalue().encode("utf-8"))

s3_public_bucket.put_object(Key="large-file.csv", Body=buf)

# Possibly some state leaking in between tests.
# If we don't clear this cache, we saw `GetObject operation: Forbidden`.
# Presumably the s3fs instance is being cached, with the directory listing
# from *before* we add the large-file.csv in the s3_public_bucket_with_data.
s3fs.S3FileSystem.clear_instance_cache()

with caplog.at_level(logging.DEBUG, logger="s3fs"):
read_csv(
f"s3://{s3_public_bucket.name}/large-file.csv",
nrows=5,
storage_options=s3so,
)
# log of fetch_range (start, stop)
assert (0, 5505024) in (x.args[-2:] for x in caplog.records)
df = DataFrame(np.zeros((100000, 4)), columns=list("abcd"))
with BytesIO(df.to_csv().encode("utf-8")) as buf:
s3_public_bucket.put_object(Key="large-file.csv", Body=buf)
uri = f"{s3_public_bucket.name}/large-file.csv"
match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$")
with caplog.at_level(logging.DEBUG, logger="s3fs"):
read_csv(
f"s3://{uri}",
nrows=5,
storage_options=s3so,
)
for log in caplog.messages:
if match := re.match(match_re, log):
# Less than 8 MB
assert int(match.group("stop")) < 8000000

def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
# GH 25945
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/io/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,10 @@ def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):


@pytest.mark.single_cpu
def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so):
def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
# Ensure we can read from a public bucket with credentials
# GH 34626

# temporary workaround as moto fails for botocore >= 1.11 otherwise,
# see https://github.com/spulec/moto/issues/1924 & 1952
pytest.importorskip("s3fs")
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
df = read_csv(
f"s3://{s3_public_bucket_with_data.name}/tips.csv",
nrows=5,
Expand Down
15 changes: 1 addition & 14 deletions pandas/tests/window/test_numba.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
import numpy as np
import pytest

from pandas.compat import (
is_ci_environment,
is_platform_mac,
is_platform_windows,
)
from pandas.errors import NumbaUtilError
import pandas.util._test_decorators as td

Expand All @@ -17,15 +12,7 @@
)
import pandas._testing as tm

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.skipif(
is_ci_environment() and (is_platform_windows() or is_platform_mac()),
reason="On GHA CI, Windows can fail with "
"'Windows fatal exception: stack overflow' "
"and macOS can timeout",
),
]
pytestmark = pytest.mark.single_cpu


@pytest.fixture(params=["single", "table"])
Expand Down
16 changes: 1 addition & 15 deletions pandas/tests/window/test_online.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,13 @@
import numpy as np
import pytest

from pandas.compat import (
is_ci_environment,
is_platform_mac,
is_platform_windows,
)

from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm

pytestmark = [
pytest.mark.single_cpu,
pytest.mark.skipif(
is_ci_environment() and (is_platform_windows() or is_platform_mac()),
reason="On GHA CI, Windows can fail with "
"'Windows fatal exception: stack overflow' "
"and macOS can timeout",
),
]
pytestmark = pytest.mark.single_cpu

pytest.importorskip("numba")

Expand Down

0 comments on commit 8438fe7

Please sign in to comment.