Skip to content

Commit

Permalink
TST: Make test_user_agent run in CI (#56057)
Browse files Browse the repository at this point in the history
* TST: Make test_user_agent run in CI

* Fix module skip name
  • Loading branch information
mroeschke authored Nov 20, 2023
1 parent caab88b commit d5e97d0
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 403 deletions.
172 changes: 172 additions & 0 deletions pandas/tests/io/test_http_headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""
Tests for the pandas custom headers in http(s) requests
"""
from functools import partial
import gzip
from io import BytesIO

import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

# Module-level marks applied to every test in this file: these tests talk to a
# local HTTP server, hence single_cpu + network; the filterwarnings entry
# silences a known pandas DeprecationWarning triggered on the read paths.
pytestmark = [
    pytest.mark.single_cpu,
    pytest.mark.network,
    pytest.mark.filterwarnings(
        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
    ),
]


def gzip_bytes(response_bytes):
    """Gzip-compress *response_bytes* and return the compressed payload."""
    buffer = BytesIO()
    with gzip.GzipFile(fileobj=buffer, mode="w") as compressor:
        compressor.write(response_bytes)
    return buffer.getvalue()


def csv_responder(df):
    """Serialize *df* to CSV (without the index) as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")


def gz_csv_responder(df):
    """Serialize *df* to CSV, then gzip-compress the result."""
    raw = csv_responder(df)
    return gzip_bytes(raw)


def json_responder(df):
    """Serialize *df* to its JSON representation as UTF-8 bytes."""
    json_text = df.to_json()
    return bytes(json_text, "utf-8")


def gz_json_responder(df):
    """Serialize *df* to JSON, then gzip-compress the result."""
    raw = json_responder(df)
    return gzip_bytes(raw)


def html_responder(df):
    """Render *df* as an HTML table (without the index) encoded as UTF-8."""
    html_text = df.to_html(index=False)
    return html_text.encode("utf-8")


def parquetpyarrow_reponder(df):
    """Serialize *df* to Parquet bytes using the pyarrow engine."""
    # NOTE(review): name keeps the existing "reponder" typo because the
    # parametrize list below references this function by that name.
    return df.to_parquet(engine="pyarrow", index=False)


def parquetfastparquet_responder(df):
    """Serialize *df* to Parquet bytes using the fastparquet engine.

    fastparquet does not write cleanly into an in-memory buffer: it can be
    given an ``open_with`` callable, but it then closes (and wipes) the buffer
    itself.  Instead, write to an fsspec in-memory filesystem and read the
    serialized bytes back out.
    """
    # protected by an importorskip in the respective test
    import fsspec

    target = "memory://fastparquet_user_agent.parquet"
    df.to_parquet(target, engine="fastparquet", index=False, compression=None)
    with fsspec.open(target, "rb") as fh:
        return fh.read()


def pickle_respnder(df):
    """Pickle *df* and return the serialized bytes."""
    # NOTE(review): name keeps the existing "respnder" typo because the
    # parametrize list below references this function by that name.
    buffer = BytesIO()
    df.to_pickle(buffer)
    return buffer.getvalue()


def stata_responder(df):
    """Write *df* to the Stata .dta format (without the index) as bytes."""
    buffer = BytesIO()
    df.to_stata(buffer, write_index=False)
    return buffer.getvalue()


@pytest.mark.parametrize(
    "responder, read_method",
    [
        (csv_responder, pd.read_csv),
        (json_responder, pd.read_json),
        (
            html_responder,
            lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
        ),
        pytest.param(
            parquetpyarrow_reponder,
            partial(pd.read_parquet, engine="pyarrow"),
            marks=td.skip_if_no("pyarrow"),
        ),
        pytest.param(
            parquetfastparquet_responder,
            partial(pd.read_parquet, engine="fastparquet"),
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_if_no("fastparquet"),
                td.skip_if_no("fsspec"),
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (pickle_respnder, pd.read_pickle),
        (stata_responder, pd.read_stata),
        (gz_csv_responder, pd.read_csv),
        (gz_json_responder, pd.read_json),
    ],
)
@pytest.mark.parametrize(
    "storage_options",
    [
        None,
        {"User-Agent": "foo"},
        {"User-Agent": "foo", "Auth": "bar"},
    ],
)
def test_request_headers(responder, read_method, httpserver, storage_options):
    """Serve a frame over HTTP and check the headers pandas sends back.

    ``storage_options`` entries must appear verbatim in the request headers,
    and no unexpected extra headers may be sent.
    """
    expected = pd.DataFrame({"a": ["b"]})
    default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"]
    if "gz" in responder.__name__:
        extra = {"Content-Encoding": "gzip"}
        if storage_options is None:
            storage_options = extra
        else:
            # Build a new dict instead of ``storage_options |= extra``: the
            # in-place union would mutate the shared dict objects in the
            # parametrize list above, leaking "Content-Encoding" into every
            # subsequent test case that reuses the same parameter.
            storage_options = {**storage_options, **extra}
    else:
        extra = None
    expected_headers = set(default_headers).union(
        storage_options.keys() if storage_options else []
    )
    httpserver.serve_content(content=responder(expected), headers=extra)
    result = read_method(httpserver.url, storage_options=storage_options)
    tm.assert_frame_equal(result, expected)

    request_headers = dict(httpserver.requests[0].headers)
    for header in expected_headers:
        exp = request_headers.pop(header)
        if storage_options and header in storage_options:
            assert exp == storage_options[header]
    # No extra headers added
    assert not request_headers


@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    """Writing Parquet to a plain local path with storage_options must raise.

    storage_options only make sense for fsspec-style URLs, so both engines are
    expected to reject them with a ValueError for a non-fsspec file path.
    """
    pytest.importorskip(engine)

    headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }
    true_df = pd.DataFrame({"column_name": ["column_value"]})
    msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=msg):
        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)
Loading

0 comments on commit d5e97d0

Please sign in to comment.