Skip to content

Commit

Permalink
TST: Remove unnecessary read_csv usage during testing (#55643)
Browse files Browse the repository at this point in the history
* Remove unnecessary read_csv usage

* Remove csvs

* Evaluate splits

* Typo
  • Loading branch information
mroeschke authored Oct 30, 2023
1 parent 73e085e commit 0287cde
Show file tree
Hide file tree
Showing 18 changed files with 2,087 additions and 503 deletions.
31 changes: 15 additions & 16 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from datetime import datetime
from io import StringIO
import itertools
import re

Expand Down Expand Up @@ -1771,21 +1770,21 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx, future_stack):
"ignore:The previous implementation of stack is deprecated"
)
def test_unstack_odd_failure(self, future_stack):
data = """day,time,smoker,sum,len
Fri,Dinner,No,8.25,3.
Fri,Dinner,Yes,27.03,9
Fri,Lunch,No,3.0,1
Fri,Lunch,Yes,13.68,6
Sat,Dinner,No,139.63,45
Sat,Dinner,Yes,120.77,42
Sun,Dinner,No,180.57,57
Sun,Dinner,Yes,66.82,19
Thu,Dinner,No,3.0,1
Thu,Lunch,No,117.32,44
Thu,Lunch,Yes,51.51,17"""

df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])

mi = MultiIndex.from_arrays(
[
["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3,
["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2,
["No", "Yes"] * 4 + ["No", "No", "Yes"],
],
names=["day", "time", "smoker"],
)
df = DataFrame(
{
"sum": np.arange(11, dtype="float64"),
"len": np.arange(11, dtype="float64"),
},
index=mi,
)
# it works, #2100
result = df.unstack(2)

Expand Down
90 changes: 65 additions & 25 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
date,
datetime,
)
from io import StringIO

import numpy as np
import pytest
Expand Down Expand Up @@ -38,39 +37,80 @@ def store(group):
tm.assert_frame_equal(groups[0], expected_value)


def test_apply_issues():
def test_apply_index_date():
# GH 5788

s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""

df = pd.read_csv(
StringIO(s),
header=None,
names=["date", "time", "value"],
parse_dates=[["date", "time"]],
ts = [
"2011-05-16 00:00",
"2011-05-16 01:00",
"2011-05-16 02:00",
"2011-05-16 03:00",
"2011-05-17 02:00",
"2011-05-17 03:00",
"2011-05-17 04:00",
"2011-05-17 05:00",
"2011-05-18 02:00",
"2011-05-18 03:00",
"2011-05-18 04:00",
"2011-05-18 05:00",
]
df = DataFrame(
{
"value": [
1.40893,
1.40760,
1.40750,
1.40649,
1.40893,
1.40760,
1.40750,
1.40649,
1.40893,
1.40760,
1.40750,
1.40649,
],
},
index=Index(pd.to_datetime(ts), name="date_time"),
)
df = df.set_index("date_time")

expected = df.groupby(df.index.date).idxmax()
result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
tm.assert_frame_equal(result, expected)


def test_apply_index_date_object():
# GH 5789
# don't auto coerce dates
df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"])
ts = [
"2011-05-16 00:00",
"2011-05-16 01:00",
"2011-05-16 02:00",
"2011-05-16 03:00",
"2011-05-17 02:00",
"2011-05-17 03:00",
"2011-05-17 04:00",
"2011-05-17 05:00",
"2011-05-18 02:00",
"2011-05-18 03:00",
"2011-05-18 04:00",
"2011-05-18 05:00",
]
df = DataFrame([row.split() for row in ts], columns=["date", "time"])
df["value"] = [
1.40893,
1.40760,
1.40750,
1.40649,
1.40893,
1.40760,
1.40750,
1.40649,
1.40893,
1.40760,
1.40750,
1.40649,
]
exp_idx = Index(
["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date"
)
expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import builtins
import datetime as dt
from io import StringIO
from string import ascii_lowercase

import numpy as np
Expand Down Expand Up @@ -589,13 +588,18 @@ def test_min_empty_string_dtype(func):


def test_max_nan_bug():
raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

with tm.assert_produces_warning(UserWarning, match="Could not infer format"):
df = pd.read_csv(StringIO(raw), parse_dates=[0])
df = DataFrame(
{
"Unnamed: 0": ["-04-23", "-05-06", "-05-07"],
"Date": [
"2013-04-23 00:00:00",
"2013-05-06 00:00:00",
"2013-05-07 00:00:00",
],
"app": Series([np.nan, np.nan, "OE"]),
"File": ["log080001.log", "log.log", "xlsx"],
}
)
gb = df.groupby("Date")
r = gb[["File"]].max()
e = gb["File"].max().to_frame()
Expand Down
27 changes: 19 additions & 8 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
datetime,
timedelta,
)
from io import StringIO

import numpy as np
import pytest
Expand Down Expand Up @@ -607,14 +606,26 @@ def test_frame_datetime64_handling_groupby(self):

def test_groupby_multi_timezone(self):
# combining multiple / different timezones yields UTC
df = DataFrame(
{
"value": range(5),
"date": [
"2000-01-28 16:47:00",
"2000-01-29 16:48:00",
"2000-01-30 16:49:00",
"2000-01-31 16:50:00",
"2000-01-01 16:50:00",
],
"tz": [
"America/Chicago",
"America/Chicago",
"America/Los_Angeles",
"America/Chicago",
"America/New_York",
],
}
)

data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
result = df.groupby("tz", group_keys=False).date.apply(
lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
)
Expand Down
40 changes: 22 additions & 18 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
""" test with the .transform """
from io import StringIO

import numpy as np
import pytest

Expand Down Expand Up @@ -337,22 +335,28 @@ def test_transform_datetime_to_numeric():

def test_transform_casting():
# 13046
data = """
idx A ID3 DATETIME
0 B-028 b76cd912ff "2014-10-08 13:43:27"
1 B-054 4a57ed0b02 "2014-10-08 14:26:19"
2 B-076 1a682034f8 "2014-10-08 14:29:01"
3 B-023 b76cd912ff "2014-10-08 18:39:34"
4 B-023 f88g8d7sds "2014-10-08 18:40:18"
5 B-033 b76cd912ff "2014-10-08 18:44:30"
6 B-032 b76cd912ff "2014-10-08 18:46:00"
7 B-037 b76cd912ff "2014-10-08 18:52:15"
8 B-046 db959faf02 "2014-10-08 18:59:59"
9 B-053 b76cd912ff "2014-10-08 19:17:48"
10 B-065 b76cd912ff "2014-10-08 19:21:38"
"""
df = pd.read_csv(
StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"]
times = [
"13:43:27",
"14:26:19",
"14:29:01",
"18:39:34",
"18:40:18",
"18:44:30",
"18:46:00",
"18:52:15",
"18:59:59",
"19:17:48",
"19:21:38",
]
df = DataFrame(
{
"A": [f"B-{i}" for i in range(11)],
"ID3": np.take(
["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1]
),
"DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
},
index=pd.RangeIndex(11, name="idx"),
)

result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())
Expand Down
19 changes: 15 additions & 4 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections import defaultdict
from datetime import datetime
from io import StringIO
import math
import operator
import re
Expand Down Expand Up @@ -1174,13 +1173,21 @@ def test_groupby(self):
def test_equals_op_multiindex(self, mi, expected):
# GH9785
# test comparisons of multiindex
df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
df = DataFrame(
[3, 6],
columns=["c"],
index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
)

result = df.index == mi
tm.assert_numpy_array_equal(result, expected)

def test_equals_op_multiindex_identify(self):
df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
df = DataFrame(
[3, 6],
columns=["c"],
index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
)

result = df.index == df.index
expected = np.array([True, True])
Expand All @@ -1194,7 +1201,11 @@ def test_equals_op_multiindex_identify(self):
],
)
def test_equals_op_mismatched_multiindex_raises(self, index):
df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
df = DataFrame(
[3, 6],
columns=["c"],
index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
)

with pytest.raises(ValueError, match="Lengths must match"):
df.index == index
Expand Down
15 changes: 12 additions & 3 deletions pandas/tests/indexing/multiindex/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,10 +698,19 @@ def test_loc_mi_with_level1_named_0():
tm.assert_series_equal(result, expected)


def test_getitem_str_slice(datapath):
def test_getitem_str_slice():
# GH#15928
path = datapath("reshape", "merge", "data", "quotes2.csv")
df = pd.read_csv(path, parse_dates=["time"])
df = DataFrame(
[
["20160525 13:30:00.023", "MSFT", "51.95", "51.95"],
["20160525 13:30:00.048", "GOOG", "720.50", "720.93"],
["20160525 13:30:00.076", "AAPL", "98.55", "98.56"],
["20160525 13:30:00.131", "AAPL", "98.61", "98.62"],
["20160525 13:30:00.135", "MSFT", "51.92", "51.95"],
["20160525 13:30:00.135", "AAPL", "98.61", "98.62"],
],
columns="time,ticker,bid,ask".split(","),
)
df2 = df.set_index(["ticker", "time"]).sort_index()

res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0)
Expand Down
Loading

0 comments on commit 0287cde

Please sign in to comment.