TST: Remove unnecessary read_csv usage during testing (#55643)

* Remove unnecessary read_csv usage * Remove csvs * Evaluate splits * Typo
pandas-dev · Oct 30, 2023 · 0287cde · 0287cde
1 parent 73e085e
commit 0287cde
Show file tree

Hide file tree

Showing 18 changed files with 2,087 additions and 503 deletions.
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1,5 +1,4 @@
 from datetime import datetime
-from io import StringIO
 import itertools
 import re
 
@@ -1771,21 +1770,21 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx, future_stack):
         "ignore:The previous implementation of stack is deprecated"
     )
     def test_unstack_odd_failure(self, future_stack):
-        data = """day,time,smoker,sum,len
-Fri,Dinner,No,8.25,3.
-Fri,Dinner,Yes,27.03,9
-Fri,Lunch,No,3.0,1
-Fri,Lunch,Yes,13.68,6
-Sat,Dinner,No,139.63,45
-Sat,Dinner,Yes,120.77,42
-Sun,Dinner,No,180.57,57
-Sun,Dinner,Yes,66.82,19
-Thu,Dinner,No,3.0,1
-Thu,Lunch,No,117.32,44
-Thu,Lunch,Yes,51.51,17"""
-
-        df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])
-
+        mi = MultiIndex.from_arrays(
+            [
+                ["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3,
+                ["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2,
+                ["No", "Yes"] * 4 + ["No", "No", "Yes"],
+            ],
+            names=["day", "time", "smoker"],
+        )
+        df = DataFrame(
+            {
+                "sum": np.arange(11, dtype="float64"),
+                "len": np.arange(11, dtype="float64"),
+            },
+            index=mi,
+        )
         # it works, #2100
         result = df.unstack(2)
 

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -2,7 +2,6 @@
     date,
     datetime,
 )
-from io import StringIO
 
 import numpy as np
 import pytest
@@ -38,39 +37,80 @@ def store(group):
     tm.assert_frame_equal(groups[0], expected_value)
 
 
-def test_apply_issues():
+def test_apply_index_date():
     # GH 5788
-
-    s = """2011.05.16,00:00,1.40893
-2011.05.16,01:00,1.40760
-2011.05.16,02:00,1.40750
-2011.05.16,03:00,1.40649
-2011.05.17,02:00,1.40893
-2011.05.17,03:00,1.40760
-2011.05.17,04:00,1.40750
-2011.05.17,05:00,1.40649
-2011.05.18,02:00,1.40893
-2011.05.18,03:00,1.40760
-2011.05.18,04:00,1.40750
-2011.05.18,05:00,1.40649"""
-
-    df = pd.read_csv(
-        StringIO(s),
-        header=None,
-        names=["date", "time", "value"],
-        parse_dates=[["date", "time"]],
+    ts = [
+        "2011-05-16 00:00",
+        "2011-05-16 01:00",
+        "2011-05-16 02:00",
+        "2011-05-16 03:00",
+        "2011-05-17 02:00",
+        "2011-05-17 03:00",
+        "2011-05-17 04:00",
+        "2011-05-17 05:00",
+        "2011-05-18 02:00",
+        "2011-05-18 03:00",
+        "2011-05-18 04:00",
+        "2011-05-18 05:00",
+    ]
+    df = DataFrame(
+        {
+            "value": [
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+            ],
+        },
+        index=Index(pd.to_datetime(ts), name="date_time"),
     )
-    df = df.set_index("date_time")
-
     expected = df.groupby(df.index.date).idxmax()
     result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
     tm.assert_frame_equal(result, expected)
 
+
+def test_apply_index_date_object():
     # GH 5789
     # don't auto coerce dates
-    df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"])
+    ts = [
+        "2011-05-16 00:00",
+        "2011-05-16 01:00",
+        "2011-05-16 02:00",
+        "2011-05-16 03:00",
+        "2011-05-17 02:00",
+        "2011-05-17 03:00",
+        "2011-05-17 04:00",
+        "2011-05-17 05:00",
+        "2011-05-18 02:00",
+        "2011-05-18 03:00",
+        "2011-05-18 04:00",
+        "2011-05-18 05:00",
+    ]
+    df = DataFrame([row.split() for row in ts], columns=["date", "time"])
+    df["value"] = [
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+    ]
     exp_idx = Index(
-        ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
+        ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date"
     )
     expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
     msg = "DataFrameGroupBy.apply operated on the grouping columns"

diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
@@ -1,6 +1,5 @@
 import builtins
 import datetime as dt
-from io import StringIO
 from string import ascii_lowercase
 
 import numpy as np
@@ -589,13 +588,18 @@ def test_min_empty_string_dtype(func):
 
 
 def test_max_nan_bug():
-    raw = """,Date,app,File
--04-23,2013-04-23 00:00:00,,log080001.log
--05-06,2013-05-06 00:00:00,,log.log
--05-07,2013-05-07 00:00:00,OE,xlsx"""
-
-    with tm.assert_produces_warning(UserWarning, match="Could not infer format"):
-        df = pd.read_csv(StringIO(raw), parse_dates=[0])
+    df = DataFrame(
+        {
+            "Unnamed: 0": ["-04-23", "-05-06", "-05-07"],
+            "Date": [
+                "2013-04-23 00:00:00",
+                "2013-05-06 00:00:00",
+                "2013-05-07 00:00:00",
+            ],
+            "app": Series([np.nan, np.nan, "OE"]),
+            "File": ["log080001.log", "log.log", "xlsx"],
+        }
+    )
     gb = df.groupby("Date")
     r = gb[["File"]].max()
     e = gb["File"].max().to_frame()

diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
@@ -5,7 +5,6 @@
     datetime,
     timedelta,
 )
-from io import StringIO
 
 import numpy as np
 import pytest
@@ -607,14 +606,26 @@ def test_frame_datetime64_handling_groupby(self):
 
     def test_groupby_multi_timezone(self):
         # combining multiple / different timezones yields UTC
+        df = DataFrame(
+            {
+                "value": range(5),
+                "date": [
+                    "2000-01-28 16:47:00",
+                    "2000-01-29 16:48:00",
+                    "2000-01-30 16:49:00",
+                    "2000-01-31 16:50:00",
+                    "2000-01-01 16:50:00",
+                ],
+                "tz": [
+                    "America/Chicago",
+                    "America/Chicago",
+                    "America/Los_Angeles",
+                    "America/Chicago",
+                    "America/New_York",
+                ],
+            }
+        )
 
-        data = """0,2000-01-28 16:47:00,America/Chicago
-1,2000-01-29 16:48:00,America/Chicago
-2,2000-01-30 16:49:00,America/Los_Angeles
-3,2000-01-31 16:50:00,America/Chicago
-4,2000-01-01 16:50:00,America/New_York"""
-
-        df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
         result = df.groupby("tz", group_keys=False).date.apply(
             lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
         )

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -1,6 +1,4 @@
 """ test with the .transform """
-from io import StringIO
-
 import numpy as np
 import pytest
 
@@ -337,22 +335,28 @@ def test_transform_datetime_to_numeric():
 
 def test_transform_casting():
     # 13046
-    data = """
-    idx     A         ID3              DATETIME
-    0   B-028  b76cd912ff "2014-10-08 13:43:27"
-    1   B-054  4a57ed0b02 "2014-10-08 14:26:19"
-    2   B-076  1a682034f8 "2014-10-08 14:29:01"
-    3   B-023  b76cd912ff "2014-10-08 18:39:34"
-    4   B-023  f88g8d7sds "2014-10-08 18:40:18"
-    5   B-033  b76cd912ff "2014-10-08 18:44:30"
-    6   B-032  b76cd912ff "2014-10-08 18:46:00"
-    7   B-037  b76cd912ff "2014-10-08 18:52:15"
-    8   B-046  db959faf02 "2014-10-08 18:59:59"
-    9   B-053  b76cd912ff "2014-10-08 19:17:48"
-    10  B-065  b76cd912ff "2014-10-08 19:21:38"
-    """
-    df = pd.read_csv(
-        StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"]
+    times = [
+        "13:43:27",
+        "14:26:19",
+        "14:29:01",
+        "18:39:34",
+        "18:40:18",
+        "18:44:30",
+        "18:46:00",
+        "18:52:15",
+        "18:59:59",
+        "19:17:48",
+        "19:21:38",
+    ]
+    df = DataFrame(
+        {
+            "A": [f"B-{i}" for i in range(11)],
+            "ID3": np.take(
+                ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1]
+            ),
+            "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
+        },
+        index=pd.RangeIndex(11, name="idx"),
     )
 
     result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -1,6 +1,5 @@
 from collections import defaultdict
 from datetime import datetime
-from io import StringIO
 import math
 import operator
 import re
@@ -1174,13 +1173,21 @@ def test_groupby(self):
     def test_equals_op_multiindex(self, mi, expected):
         # GH9785
         # test comparisons of multiindex
-        df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
+        df = DataFrame(
+            [3, 6],
+            columns=["c"],
+            index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
+        )
 
         result = df.index == mi
         tm.assert_numpy_array_equal(result, expected)
 
     def test_equals_op_multiindex_identify(self):
-        df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
+        df = DataFrame(
+            [3, 6],
+            columns=["c"],
+            index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
+        )
 
         result = df.index == df.index
         expected = np.array([True, True])
@@ -1194,7 +1201,11 @@ def test_equals_op_multiindex_identify(self):
         ],
     )
     def test_equals_op_mismatched_multiindex_raises(self, index):
-        df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1])
+        df = DataFrame(
+            [3, 6],
+            columns=["c"],
+            index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]),
+        )
 
         with pytest.raises(ValueError, match="Lengths must match"):
             df.index == index

diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
@@ -698,10 +698,19 @@ def test_loc_mi_with_level1_named_0():
     tm.assert_series_equal(result, expected)
 
 
-def test_getitem_str_slice(datapath):
+def test_getitem_str_slice():
     # GH#15928
-    path = datapath("reshape", "merge", "data", "quotes2.csv")
-    df = pd.read_csv(path, parse_dates=["time"])
+    df = DataFrame(
+        [
+            ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"],
+            ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"],
+            ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"],
+            ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"],
+            ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"],
+            ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"],
+        ],
+        columns="time,ticker,bid,ask".split(","),
+    )
     df2 = df.set_index(["ticker", "time"]).sort_index()
 
     res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0)