diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 192f19c36b47d..3c78b0a9a60c8 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,7 +1,6 @@ from importlib import import_module import numpy as np -import pyarrow as pa import pandas as pd @@ -20,9 +19,9 @@ class Factorize: [True, False], [True, False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "object", "object_str", "datetime64[ns]", @@ -36,28 +35,24 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10**5 - string_index = tm.makeStringIndex(N) - string_arrow = None - if dtype == "string[pyarrow]": - try: - string_arrow = pd.array(string_index, dtype="string[pyarrow]") - except ImportError: - raise NotImplementedError - - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "object_str": string_index, - "object": pd.Index(np.arange(N), dtype="object"), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "Int64": pd.array(np.arange(N), dtype="Int64"), - "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string[pyarrow]": string_arrow, - }[dtype] + + if dtype in ["int64", "uint64", "Int64", "object"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype=dtype) + elif dtype == "boolean": + data = pd.array(np.random.randint(0, 2, N), dtype=dtype) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype == "object_str": + data = tm.makeStringIndex(N) + elif dtype == "string[pyarrow]": + data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]") + else: + raise NotImplementedError + if not unique: data = data.repeat(5) self.data = data @@ -74,9 +69,9 @@ class Duplicated: [True, False], ["first", "last", False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "string", "datetime64[ns]", "datetime64[ns, tz]", @@ -88,22 +83,20 @@ class Duplicated: def setup(self, unique, keep, dtype): N = 10**5 - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "string": tm.makeStringIndex(N), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "timestamp[ms][pyarrow]": pd.Index( - np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) - ), - "duration[s][pyarrow]": pd.Index( - np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) - ), - }[dtype] + if dtype in ["int64", "uint64"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype="float64") + elif dtype == "string": + data = tm.makeStringIndex(N) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]: + data = pd.Index(np.arange(N), dtype=dtype) + else: + raise NotImplementedError if not unique: data = 
data.repeat(5) self.idx = data @@ -181,21 +174,22 @@ class Quantile: params = [ [0, 0.5, 1], ["linear", "nearest", "lower", "higher", "midpoint"], - ["float", "int", "uint"], + ["float64", "int64", "uint64"], ] param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): N = 10**5 - data = { - "int": np.arange(N), - "uint": np.arange(N).astype(np.uint64), - "float": np.random.randn(N), - } - self.idx = pd.Series(data[dtype].repeat(5)) + if dtype in ["int64", "uint64"]: + data = np.arange(N, dtype=dtype) + elif dtype == "float64": + data = np.random.randn(N) + else: + raise NotImplementedError + self.ser = pd.Series(data.repeat(5)) def time_quantile(self, quantile, interpolation, dtype): - self.idx.quantile(quantile, interpolation=interpolation) + self.ser.quantile(quantile, interpolation=interpolation) class SortIntegerArray: diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 49543c166d047..d70ad144a3455 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -491,7 +491,7 @@ class BinaryOpsMultiIndex: param_names = ["func"] def setup(self, func): - array = date_range("20200101 00:00", "20200102 0:00", freq="S") + array = date_range("20200101 00:00", "20200102 0:00", freq="s") level_0_names = [str(i) for i in range(30)] index = pd.MultiIndex.from_product([level_0_names, array]) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 0229cf15fbfb8..aefc6e6d3c307 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -31,9 +31,9 @@ def time_from_float_array(self): class IntegerArray: def setup(self): N = 250_000 - self.values_integer = np.array([1, 0, 1, 0] * N) - self.data = np.array([1, 2, 3, 4] * N, dtype="int64") - self.mask = np.array([False, False, True, False] * N) + self.values_integer = np.tile(np.array([1, 0, 1, 0]), N) + self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N) + self.mask = np.tile(np.array([False, False, True, False]), N) def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 84d4a28d675d5..7e70db5681850 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -260,18 +260,16 @@ class CategoricalSlicing: def setup(self, index): N = 10**6 categories = ["a", "b", "c"] - values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": - self.data = pd.Categorical.from_codes(values, categories=categories) + codes = np.repeat([0, 1, 2], N) elif index == "monotonic_decr": - self.data = pd.Categorical.from_codes( - list(reversed(values)), categories=categories - ) + codes = np.repeat([2, 1, 0], N) elif index == "non_monotonic": - self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) + codes = np.tile([0, 1, 2], N) else: raise ValueError(f"Invalid index param: {index}") + self.data = pd.Categorical.from_codes(codes, categories=categories) self.scalar = 10000 self.list = list(range(10000)) self.cat_scalar = "b" diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e56fbf1d8c32f..f22a261041e17 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -439,9 +439,9 @@ def setup(self, inplace, dtype): N, M = 10000, 100 if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"): data = { - "datetime64[ns]": date_range("2011-01-01", 
freq="H", periods=N), + "datetime64[ns]": date_range("2011-01-01", freq="h", periods=N), "datetime64[ns, tz]": date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" + "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo" ), "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"), } @@ -640,7 +640,8 @@ def time_frame_nunique(self): class SeriesNuniqueWithNan: def setup(self): - self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + values = 100 * [np.nan] + list(range(100)) + self.ser = Series(np.tile(values, 10000), dtype=float) def time_series_nunique_nan(self): self.ser.nunique() @@ -649,7 +650,7 @@ def time_series_nunique_nan(self): class Duplicated: def setup(self): n = 1 << 20 - t = date_range("2015-01-01", freq="S", periods=(n // 64)) + t = date_range("2015-01-01", freq="s", periods=(n // 64)) xs = np.random.randn(n // 64).round(2) self.df = DataFrame( { diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 4993ffd2c47d0..c78819f75c52a 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -212,7 +212,7 @@ def run(dti): def time_datetime_to_period(self): @test_parallel(num_threads=2) def run(dti): - dti.to_period("S") + dti.to_period("s") run(self.dti) @@ -272,18 +272,20 @@ class ParallelReadCSV(BaseIO): def setup(self, dtype): rows = 10000 cols = 50 - data = { - "float": DataFrame(np.random.randn(rows, cols)), - "datetime": DataFrame( + if dtype == "float": + df = DataFrame(np.random.randn(rows, cols)) + elif dtype == "datetime": + df = DataFrame( np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) - ), - "object": DataFrame( + ) + elif dtype == "object": + df = DataFrame( "foo", index=range(rows), columns=["object%03d" for _ in range(5)] - ), - } + ) + else: + raise NotImplementedError self.fname = f"__test_{dtype}__.csv" - df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 54c240e84243a..5e3eb7c7d4dac 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -238,7 +238,7 @@ def time_series_nth(self, dtype): class DateAttributes: def setup(self): - rng = date_range("1/1/2000", "12/31/2005", freq="H") + rng = date_range("1/1/2000", "12/31/2005", freq="h") self.year, self.month, self.day = rng.year, rng.month, rng.day self.ts = Series(np.random.randn(len(rng)), index=rng) @@ -713,7 +713,7 @@ def setup(self, dtype, tie_method): if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: - data = np.array([1] * N, dtype=dtype) + data = np.ones(N, dtype=dtype) self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 2d8014570466e..7e33223260e0f 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -161,9 +161,7 @@ def setup(self, dtype): self.sorted = self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = ( - self.sorted[:half].append(self.sorted[:half]).sort_values() - ) + self.non_unique_sorted = self.sorted[:half].repeat(2) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 84d95a23bd446..8058b347383a7 100644 --- a/asv_bench/benchmarks/indexing.py +++ 
b/asv_bench/benchmarks/indexing.py @@ -232,7 +232,7 @@ def setup(self, index): N = 100000 indexes = { "int": Index(np.arange(N), dtype=np.int64), - "datetime": date_range("2011-01-01", freq="S", periods=N), + "datetime": date_range("2011-01-01", freq="s", periods=N), } index = indexes[index] self.s = Series(np.random.rand(N), index=index) @@ -465,7 +465,7 @@ def time_loc_row(self, unique_cols): class AssignTimeseriesIndex: def setup(self): N = 100000 - idx = date_range("1/1/2000", periods=N, freq="H") + idx = date_range("1/1/2000", periods=N, freq="h") self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx) def time_frame_assign_timeseries_index(self): @@ -515,6 +515,18 @@ def time_setitem_list(self): self.df[[100, 200, 300]] = 100 +class SetitemObjectDtype: + # GH#19299 + + def setup(self): + N = 1000 + cols = 500 + self.df = DataFrame(index=range(N), columns=range(cols), dtype=object) + + def time_setitem_object_dtype(self): + self.df.loc[0, 1] = 1.0 + + class ChainIndexing: params = [None, "warn"] param_names = ["mode"] diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 6585a4be78dc6..fd3d0f0b9cf2e 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -71,14 +71,12 @@ def setup(self, engine_and_dtype, index_type, unique, N): if unique: arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) elif index_type == "monotonic_decr": if unique: arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype)[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) else: assert index_type == "non_monotonic" if unique: @@ -86,7 +84,7 @@ def setup(self, engine_and_dtype, index_type, unique, N): arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) self.data = engine(arr) # code belows avoids populating the mapping etc. while timing. 
@@ -115,30 +113,29 @@ class MaskedNumericEngineIndexing: def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype + dtype = dtype.lower() if index_type == "monotonic_incr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower()) + arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) elif index_type == "monotonic_decr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower())[::-1] + arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower())[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) else: assert index_type == "non_monotonic" if unique: - arr = np.zeros(N * 3, dtype=dtype.lower()) - arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower()) - arr[N:] = np.arange(N * 2, dtype=dtype.lower()) + arr = np.zeros(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) mask[-1] = True diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 476ff14dcc92a..805b0c807452c 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -164,7 +164,7 @@ def time_unique_date_strings(self, cache, count): class ToDatetimeISO8601: def setup(self): - rng = date_range(start="1/1/2000", periods=20000, freq="H") + rng = date_range(start="1/1/2000", periods=20000, freq="h") self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() self.strings_tz_space = [ @@ -276,7 +276,7 @@ def time_dup_string_tzoffset_dates(self, cache): # GH 43901 class ToDatetimeInferDatetimeFormat: def setup(self): - rng = date_range(start="1/1/2000", periods=100000, freq="H") + rng = date_range(start="1/1/2000", periods=100000, freq="h") self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() def time_infer_datetime_format(self): diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c5e3e80571e30..1826291034dee 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -89,7 +89,7 @@ class ToCSVDatetimeIndex(BaseIO): fname = "__test__.csv" def setup(self): - rng = date_range("2000", periods=100_000, freq="S") + rng = date_range("2000", periods=100_000, freq="s") self.data = DataFrame({"a": 1}, index=rng) def time_frame_date_formatting_index(self): @@ -102,7 +102,7 @@ def time_frame_date_no_format_index(self): class ToCSVPeriod(BaseIO): fname = "__test__.csv" - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -110,7 +110,7 @@ def setup(self, nobs, freq): self.data = DataFrame(rng) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_formatting_default(self, nobs, freq): @@ -130,7 +130,7 @@ def time_frame_period_formatting(self, nobs, freq): class ToCSVPeriodIndex(BaseIO): fname = "__test__.csv" - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -138,7 +138,7 @@ def 
setup(self, nobs, freq): self.data = DataFrame({"a": 1}, index=rng) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_formatting_index(self, nobs, freq): @@ -253,7 +253,7 @@ class ReadCSVConcatDatetime(StringIORewind): iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): - rng = date_range("1/1/2000", periods=50000, freq="S") + rng = date_range("1/1/2000", periods=50000, freq="s") self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist())) def time_read_csv(self): diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index c77c6b6f5727c..f8d81b0f6a699 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -25,7 +25,7 @@ def _generate_dataframe(): df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) df["object"] = tm.makeStringIndex(N) return df diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index f3e417e717609..195aaa158e178 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -122,7 +122,7 @@ def setup(self, format): self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) self.df["object"] = tm.makeStringIndex(N) self.df.to_hdf(self.fname, "df", format=format) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index bebf6ee993aba..8a2e3fa87eb37 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -26,7 +26,7 @@ def setup(self, orient, index): N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -48,7 +48,7 @@ def setup(self, index): N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -108,7 +108,7 @@ class ToJSON(BaseIO): def setup(self, orient, frame): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) @@ -191,7 +191,7 @@ class ToJSONISO(BaseIO): def setup(self, orient): N = 10**5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") self.df = DataFrame( @@ -214,7 +214,7 @@ class ToJSONLines(BaseIO): def setup(self): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index c71cdcdcc5c59..54631d9236887 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -20,7 +20,7 @@ def setup(self): self.df = 
DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) self.df["object"] = tm.makeStringIndex(N) self.df.to_pickle(self.fname) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 300b9c778f1f8..750bcf4ccee5c 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -23,7 +23,7 @@ def setup(self, convert_dates): self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) self.df["object"] = tm.makeStringIndex(self.N) self.df["int8_"] = np.random.randint( diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 04ac47a892a22..23824c2c748df 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -213,7 +213,7 @@ class JoinNonUnique: # GH 6329 def setup(self): date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="min") - daily_dates = date_index.to_period("D").to_timestamp("S", "S") + daily_dates = date_index.to_period("D").to_timestamp("s", "s") self.fracofday = date_index.values - daily_dates.values self.fracofday = self.fracofday.astype("timedelta64[ns]") self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000 diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 501fe198d41d8..ccd86cae06d58 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -45,7 +45,7 @@ def time_from_ints_daily(self, freq, is_offset): class DataFramePeriodColumn: def setup(self): - self.rng = period_range(start="1/1/1990", freq="S", periods=20000) + self.rng = period_range(start="1/1/1990", freq="s", periods=20000) self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f52f7a4bef37a..79cf8f9cd2048 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -64,7 +64,7 @@ def setup(self, dtype): N = 10**6 data = { "int": np.random.randint(1, 10, N), - "datetime": date_range("2000-01-01", freq="S", periods=N), + "datetime": date_range("2000-01-01", freq="s", periods=N), } self.s = Series(data[dtype]) if dtype == "datetime": @@ -92,7 +92,7 @@ class Fillna: def setup(self, dtype): N = 10**6 if dtype == "datetime64[ns]": - data = date_range("2000-01-01", freq="S", periods=N) + data = date_range("2000-01-01", freq="s", periods=N) na_value = NaT elif dtype in ("float64", "Float64"): data = np.random.randn(N) @@ -317,7 +317,7 @@ def setup(self, func, N, dtype): if func == "argmax" and dtype in {"Int64", "boolean"}: # Skip argmax for nullable int since this doesn't work yet (GH-24382) raise NotImplementedError - self.s = Series([1] * N, dtype=dtype) + self.s = Series(np.ones(N), dtype=dtype) self.func = getattr(self.s, func) def time_func(self, func, N, dtype): diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 1652fcf8d48da..89bda81ccf08c 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -20,6 +20,39 @@ def time_op(self, op, dtype, axis): self.df_func(axis=axis) +class FrameMixedDtypesOps: + params = [ops, [0, 1, None]] + param_names = ["op", "axis"] + + def setup(self, op, axis): + if op in ("sum", "skew", "kurt", 
"prod", "sem", "var") or ( + (op, axis) + in ( + ("mean", 1), + ("mean", None), + ("median", 1), + ("median", None), + ("std", 1), + ) + ): + # Skipping cases where datetime aggregations are not implemented + raise NotImplementedError + + N = 1_000_000 + df = pd.DataFrame( + { + "f": np.random.normal(0.0, 1.0, N), + "i": np.random.randint(0, N, N), + "ts": pd.date_range(start="1/1/2000", periods=N, freq="h"), + } + ) + + self.df_func = getattr(df, op) + + def time_op(self, op, axis): + self.df_func(axis=axis) + + class FrameMultiIndexOps: params = [ops] param_names = ["op"] diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 39cc82e1bdf79..47f25b331ab9b 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -53,7 +53,7 @@ def time_frame_datetime_formatting_custom(self, nobs): class PeriodStrftime: timeout = 1500 - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -67,7 +67,7 @@ def setup(self, nobs, freq): self.data.set_index("i", inplace=True) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_to_str(self, nobs, freq): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 8c78a9c1723df..8e1deb99a66a4 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -27,7 +27,7 @@ def setup(self, index_type): N = 100000 dtidxes = { "dst": date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ), "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), @@ -72,13 +72,13 @@ class TzLocalize: def setup(self, tz): dst_rng = date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ) - self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S") + self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="s") self.index = self.index.append(dst_rng) self.index = self.index.append(dst_rng) self.index = self.index.append( - date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S") + date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="s") ) def time_infer_dst(self, tz): @@ -90,7 +90,7 @@ class ResetIndex: param_names = "tz" def setup(self, tz): - idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) + idx = date_range(start="1/1/2000", periods=1000, freq="h", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) def time_reset_datetimeindex(self, tz): @@ -255,7 +255,7 @@ def time_get_slice(self, monotonic): class Lookup: def setup(self): N = 1500000 - rng = date_range(start="1/1/2000", periods=N, freq="S") + rng = date_range(start="1/1/2000", periods=N, freq="s") self.ts = Series(1, index=rng) self.lookup_val = rng[N // 2] diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index eeb8a717262bc..94652e8586d77 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -39,9 +39,6 @@ dependencies: - odfpy>=1.4.1 - qtpy>=2.3.0 - openpyxl>=3.1.0 - # Doesn't install well with pyarrow - # https://github.com/pandas-dev/pandas/issues/55525 - # - pandas-gbq>=0.19.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 diff 
--git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 3f35a60a5aa35..bf47bfe3a83ec 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -40,9 +40,6 @@ dependencies: - odfpy>=1.4.1 - qtpy>=2.3.0 - openpyxl>=3.1.0 - # Doesn't install well with pyarrow - # https://github.com/pandas-dev/pandas/issues/55525 - # - pandas-gbq>=0.19.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b0e1f3c74b252..bd9e2059ef477 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -39,9 +39,6 @@ dependencies: - odfpy>=1.4.1 - qtpy>=2.3.0 - openpyxl>=3.1.0 - # Doesn't install well with pyarrow - # https://github.com/pandas-dev/pandas/issues/55525 - # - pandas-gbq>=0.19.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 8ea2a8a2f855a..aa8597978ecf7 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -41,7 +41,6 @@ dependencies: - odfpy=1.4.1 - qtpy=2.3.0 - openpyxl=3.1.0 - #- pandas-gbq=0.19.0 - psycopg2=2.9.6 - pyarrow=10.0.1 - pymysql=1.0.2 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 83becd419a821..cf4087a3e4670 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -39,9 +39,6 @@ dependencies: - odfpy>=1.4.1 - qtpy>=2.3.0 - openpyxl>=3.1.0 - # Doesn't install well with pyarrow - # https://github.com/pandas-dev/pandas/issues/55525 - # - pandas-gbq>=0.19.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index d53f91bc4b5ae..abe6145d077ed 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -39,9 +39,6 @@ dependencies: - odfpy>=1.4.1 - qtpy>=2.3.0 - openpyxl>=3.1.0 - # Doesn't install well with pyarrow - # https://github.com/pandas-dev/pandas/issues/55525 - # - pandas-gbq>=0.19.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 diff --git a/doc/source/conf.py b/doc/source/conf.py index 6b52b52ce5e13..3f35e88cf543a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -457,7 +457,6 @@ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), "matplotlib": ("https://matplotlib.org/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), "py": ("https://pylib.readthedocs.io/en/latest/", None), "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index dd3599bba8e54..ec024f36d78b1 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.4 v2.1.3 v2.1.2 v2.1.1 diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index ec441688fc91e..fb71ec60a22f0 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -727,10 +727,10 @@ be broadcast: or it can return False if broadcasting can not be done: -.. ipython:: python - :okwarning: +.. code-block:: ipython - np.array([1, 2, 3]) == np.array([1, 2]) + In [11]: np.array([1, 2, 3]) == np.array([1, 2]) + Out[11]: False Changes to boolean comparisons vs. 
None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 6413e16afd800..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_213: -What's new in 2.1.3 (November ??, 2023) +What's new in 2.1.3 (November 10, 2023) --------------------------------------- These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog @@ -14,7 +14,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) -- .. --------------------------------------------------------------------------- .. _whatsnew_213.bug_fixes: @@ -23,14 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) - -.. --------------------------------------------------------------------------- -.. _whatsnew_213.other: - -Other -~~~~~ -- -- +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`) .. --------------------------------------------------------------------------- .. _whatsnew_213.contributors: diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst new file mode 100644 index 0000000000000..04bbb0f806cbd --- /dev/null +++ b/doc/source/whatsnew/v2.1.4.rst @@ -0,0 +1,41 @@ +.. _whatsnew_214: + +What's new in 2.1.4 (December ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- .. _whatsnew_214.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- .. _whatsnew_214.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- .. _whatsnew_214.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- .. _whatsnew_214.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.3..v2.1.4|HEAD diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index efa4a52993a90..3203dc083faf6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -262,6 +262,7 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. 
(:issue:`54229`) @@ -304,6 +305,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` for objects indexed by a :class:`MultiIndex` (:issue:`55949`) - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`) - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) diff --git a/meson.build b/meson.build index 68018046c081f..0bc04c59d8716 100644 --- a/meson.build +++ b/meson.build @@ -7,7 +7,7 @@ project( meson_version: '>=1.2.1', default_options: [ 'buildtype=release', - 'c_std=c99' + 'c_std=c11' ] ) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f0dd16b6c75e4..0dc139781f58d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,4 +1,5 @@ cimport cython +from cpython.sequence cimport PySequence_GetItem import numpy as np @@ -77,7 +78,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): indexer = np.empty(len(values), dtype=np.uint8) for i in range(len(values)): - item = values[i] + item = PySequence_GetItem(values, i) indexer[i] = is_matching_na(item, val) else: @@ -405,7 +406,7 @@ cdef class IndexEngine: found_nas = set() for i in range(n): - val = values[i] + val = PySequence_GetItem(values, i) # GH#43870 # handle lookup for nas @@ -437,7 +438,7 @@ cdef class IndexEngine: d[val].append(i) for i in range(n_t): - val = targets[i] + val = PySequence_GetItem(targets, i) # ensure there are nas in values before looking for a matching na if check_na_values and checknull(val): @@ -488,22 +489,22 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval - if hi == 0 or (hi > 0 and val > values[hi]): + if hi == 0 or (hi > 0 and val > PySequence_GetItem(values, hi)): return len(values) while lo < hi: mid = (lo + hi) // 2 - pval = values[mid] + pval = PySequence_GetItem(values, mid) if val < pval: hi = mid elif val > pval: lo = mid + 1 else: - while mid > 0 and val == values[mid - 1]: + while mid > 0 and val == PySequence_GetItem(values, mid - 1): mid -= 1 return mid - if val <= values[mid]: + if val <= PySequence_GetItem(values, mid): return mid else: return mid + 1 @@ -591,7 +592,7 @@ cdef class DatetimeEngine(Int64Engine): loc = values.searchsorted(conv, side="left") - if loc == len(values) or values[loc] != conv: + if loc == len(values) or PySequence_GetItem(values, loc) != conv: raise KeyError(val) return loc @@ -962,7 +963,7 @@ cdef class SharedEngine: res = np.empty(N, dtype=np.intp) for i in range(N): - val = values[i] + val = PySequence_GetItem(values, i) try: loc = self.get_loc(val) # Because we are unique, loc should always be an integer @@ -996,7 +997,7 @@ cdef class SharedEngine: # See also IntervalIndex.get_indexer_pointwise for i in range(N): - val = targets[i] + val = PySequence_GetItem(targets, i) try: locs = self.get_loc(val) @@ -1176,9 +1177,9 @@ cdef class MaskedIndexEngine(IndexEngine): na_pos = [] for i in range(n): - val = values[i] + val = PySequence_GetItem(values, i) - if mask[i]: + if PySequence_GetItem(mask, i): na_pos.append(i) else: @@ -1188,9 +1189,9 @@ cdef class MaskedIndexEngine(IndexEngine): d[val].append(i) for i in range(n_t): - val = target_vals[i] + val = PySequence_GetItem(target_vals, i) - if 
target_mask[i]: + if PySequence_GetItem(target_mask, i): if na_pos: for na_idx in na_pos: # realloc if needed diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 15bd5a7379105..33f6f16381639 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -63,7 +63,11 @@ def is_string_array(values: np.ndarray, skipna: bool = ...): ... def is_float_array(values: np.ndarray, skipna: bool = ...): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... +def fast_multiget( + mapping: dict, + keys: np.ndarray, # object[:] + default=..., +) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... def map_infer( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7ec70c8700a0a..e8a69891c6093 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2520,6 +2520,7 @@ def maybe_convert_objects(ndarray[object] objects, ndarray[int64_t] ints ndarray[uint64_t] uints ndarray[uint8_t] bools + ndarray[uint8_t] mask Seen seen = Seen() object val _TSObject tsobj @@ -3088,7 +3089,7 @@ def to_object_array_tuples(rows: object) -> np.ndarray: @cython.wraparound(False) @cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: +def fast_multiget(dict mapping, object[:] keys, default=np.nan) -> np.ndarray: cdef: Py_ssize_t i, n = len(keys) object val diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 4ba7bce51ed64..aed0f4b082d4e 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -78,7 +78,7 @@ cpdef assert_almost_equal(a, b, robj : str, default None Specify right object name being compared, internally used to show appropriate assertion message. - index_values : ndarray, default None + index_values : Index | ndarray, default None Specify shared index values of objects being compared, internally used to show appropriate assertion message. 
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 5f14d46be8e70..8e49fcfb355fa 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -283,22 +283,37 @@ def _get_ilevel_values(index, level): right = cast(MultiIndex, right) for level in range(left.nlevels): - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) - lobj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=exact, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=lobj, - ) + try: + # try comparison on levels/codes to avoid densifying MultiIndex + assert_index_equal( + left.levels[level], + right.levels[level], + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=lobj, + ) + assert_numpy_array_equal(left.codes[level], right.codes[level]) + except AssertionError: + # cannot use get_level_values here because it can change dtype + llevel = _get_ilevel_values(left, level) + rlevel = _get_ilevel_values(right, level) + + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=lobj, + ) # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) @@ -576,6 +591,9 @@ def raise_assert_detail( {message}""" + if isinstance(index_values, Index): + index_values = np.array(index_values) + if isinstance(index_values, np.ndarray): msg += f"\n[index]: {pprint_thing(index_values)}" @@ -630,7 +648,7 @@ def assert_numpy_array_equal( obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message. - index_values : numpy.ndarray, default None + index_values : Index | numpy.ndarray, default None optional index (shared by both left and right), used in output. """ __tracebackhide__ = True @@ -701,7 +719,7 @@ def assert_extension_array_equal( The two arrays to compare. check_dtype : bool, default True Whether to check if the ExtensionArray dtypes are identical. - index_values : numpy.ndarray, default None + index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False Whether to compare number exactly. 
@@ -932,7 +950,7 @@ def assert_series_equal( left_values, right_values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) else: @@ -941,7 +959,7 @@ def assert_series_equal( right_values, check_dtype=check_dtype, obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) elif check_datetimelike_compat and ( needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) @@ -972,7 +990,7 @@ def assert_series_equal( atol=atol, check_dtype=bool(check_dtype), obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) elif isinstance(left.dtype, ExtensionDtype) and isinstance( right.dtype, ExtensionDtype @@ -983,7 +1001,7 @@ def assert_series_equal( rtol=rtol, atol=atol, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) elif is_extension_array_dtype_and_needs_i8_conversion( @@ -993,7 +1011,7 @@ def assert_series_equal( left._values, right._values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): @@ -1002,7 +1020,7 @@ def assert_series_equal( left._values, right._values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) else: @@ -1013,7 +1031,7 @@ def assert_series_equal( atol=atol, check_dtype=bool(check_dtype), obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) # metadata comparison diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index ea8cfb7cc144b..738442fab8c70 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -29,6 +29,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -184,6 +185,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index d125904ba83f8..8dcb2669aa663 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -13,9 +13,11 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 7814a77a1cdc5..72bfd6f2212f8 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self) -> IntervalDtype: # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary 
code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e30177a43f6b8..6efff2483e1ae 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2250,9 +2250,7 @@ def _sequence_to_dt64( ) return result, tz, None else: - # data comes back here as either i8 to denote UTC timestamps - # or M8[ns] to denote wall times - converted, inferred_tz = objects_to_datetime64ns( + converted, inferred_tz = objects_to_datetime64( data, dayfirst=dayfirst, yearfirst=yearfirst, @@ -2262,13 +2260,13 @@ def _sequence_to_dt64( copy = False if tz and inferred_tz: # two timezones: convert to intended from base UTC repr - assert converted.dtype == "i8" - # GH#42505 - # by convention, these are _already_ UTC, e.g + # GH#42505 by convention, these are _already_ UTC + assert converted.dtype == out_dtype, converted.dtype result = converted.view(out_dtype) elif inferred_tz: tz = inferred_tz + assert converted.dtype == out_dtype, converted.dtype result = converted.view(out_dtype) else: @@ -2360,7 +2358,7 @@ def _construct_from_dt64_naive( return result, copy -def objects_to_datetime64ns( +def objects_to_datetime64( data: np.ndarray, dayfirst, yearfirst, @@ -2388,10 +2386,11 @@ def objects_to_datetime64ns( Returns ------- result : ndarray - np.int64 dtype if returned values represent UTC timestamps - np.datetime64[ns] if returned values represent wall times + np.datetime64[out_unit] if returned values represent wall times or UTC + timestamps. object if mixed timezones inferred_tz : tzinfo or None + If not None, then the datetime64 values in `result` denote UTC timestamps. 
Raises ------ @@ -2414,11 +2413,8 @@ if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - # Return i8 values to denote unix timestamps - return result.view("i8"), tz_parsed + return result, tz_parsed elif result.dtype.kind == "M": - # returning M8[ns] denotes wall-times; since tz is None - # the distinction is a thin one return result, tz_parsed elif result.dtype == object: # GH#23675 when called via `pd.to_datetime`, returning an object-dtype diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 533b91e1f2cee..c3696be0579b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -42,6 +42,7 @@ from pandas._config import ( get_option, using_copy_on_write, + warn_copy_on_write, ) from pandas._config.config import _get_option @@ -2133,6 +2134,10 @@ def to_gbq( """ Write a DataFrame to a Google BigQuery table. + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.to_gbq`` instead. + This function requires the `pandas-gbq package <https://pandas-gbq.readthedocs.io>`__. @@ -4538,7 +4543,7 @@ def _clear_item_cache(self) -> None: def _get_item_cache(self, item: Hashable) -> Series: """Return the cached item, item represents a label indexer.""" - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): loc = self.columns.get_loc(item) return self._ixs(loc, axis=1) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f16eac428cde0..7918e43b48719 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12392,7 +12392,7 @@ def _inplace_method(self, other, op) -> Self: """ warn = True if not PYPY and warn_copy_on_write(): - if sys.getrefcount(self) <= 5: + if sys.getrefcount(self) <= 4: # we are probably in an inplace setitem context (e.g. df['a'] += 1) warn = False diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 06e6755079a22..fd0479e17d2bd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -12,7 +12,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import lib from pandas._libs.tslibs import OutOfBoundsDatetime @@ -966,7 +969,7 @@ def is_in_axis(key) -> bool: def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): # For the CoW case, we check the references to determine if the # series is part of the object try: diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index c13ec51ff3851..f2db4886a5590 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -102,7 +102,7 @@ def get_window_bounds( closed: str | None = None, step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: - if center: + if center or self.window_size == 0: offset = (self.window_size - 1) // 2 else: offset = 0 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b109ce25a3e73..13c039cef3f91 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -100,6 +100,15 @@ from pandas.api.extensions import ExtensionArray +COW_WARNING_GENERAL_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +You are mutating a Series or DataFrame object, and currently this mutation will +also have effect on other Series or DataFrame objects that share data with this +object. 
In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object +will never modify another. +""" + + COW_WARNING_SETITEM_MSG = """\ Setting a value on a view: behaviour will change in pandas 3.0. Currently, the mutation will also have effect on the object that shares data @@ -387,7 +396,14 @@ def setitem(self, indexer, value) -> Self: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if using_copy_on_write() and not self._has_no_reference(0): + if warn_copy_on_write() and not self._has_no_reference(0): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif using_copy_on_write() and not self._has_no_reference(0): # this method is only called if there is a single block -> hardcoded 0 # Split blocks to only copy the columns we want to modify if self.ndim == 2 and isinstance(indexer, tuple): @@ -1951,9 +1967,15 @@ def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: return type(self)(blk.copy(deep=False), self.index) array = blk.values[indexer] + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": + # boolean indexing always gives a copy with numpy + refs = None + else: + # TODO(CoW) in theory only need to track reference if new_array is a view + refs = blk.refs + bp = BlockPlacement(slice(0, len(array))) - # TODO(CoW) in theory only need to track reference if new_array is a view - block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + block = type(blk)(array, placement=bp, ndim=1, refs=refs) new_idx = self.index[indexer] return type(self)(block, new_idx) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f83a12b268b22..f8575b1b53908 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1863,9 +1863,11 @@ def get_result(self, copy: bool | None = True) -> DataFrame: right_indexer = cast("npt.NDArray[np.intp]", right_indexer) left_join_indexer = libjoin.ffill_indexer(left_indexer) right_join_indexer = libjoin.ffill_indexer(right_indexer) - else: + elif self.fill_method is None: left_join_indexer = left_indexer right_join_indexer = right_indexer + else: + raise ValueError("fill_method must be 'ffill' or None") result = self._reindex_and_concat( join_index, left_join_indexer, right_join_indexer, copy=copy diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 33ac5a169b08d..ea5e6e46f58ec 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -71,7 +71,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.datetimes import ( maybe_convert_dtype, - objects_to_datetime64ns, + objects_to_datetime64, tz_to_dtype, ) from pandas.core.construction import extract_array @@ -485,7 +485,7 @@ def _convert_listlike_datetimes( if format is not None and format != "mixed": return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) - result, tz_parsed = objects_to_datetime64ns( + result, tz_parsed = objects_to_datetime64( arg, dayfirst=dayfirst, yearfirst=yearfirst, @@ -499,7 +499,7 @@ def _convert_listlike_datetimes( # is in UTC dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed)) dt64_values = result.view(f"M8[{dtype.unit}]") - dta = DatetimeArray(dt64_values, dtype=dtype) + dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) return DatetimeIndex._simple_new(dta, name=name) return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/core/tools/timedeltas.py 
b/pandas/core/tools/timedeltas.py index 8909776d91369..df46837d4deb3 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -121,7 +121,7 @@ def to_timedelta( * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' - Must not be specified when `arg` context strings and ``errors="raise"``. + Must not be specified when `arg` contains strings and ``errors="raise"``. .. deprecated:: 2.2.0 Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..c451cd6c139ed 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 + check_dtype_backend(dtype_backend) with get_handle( diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index ee71f5af12d09..350002bf461ff 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -5,8 +5,10 @@ TYPE_CHECKING, Any, ) +import warnings from pandas.compat._optional import import_optional_dependency +from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: import google.auth @@ -43,6 +45,10 @@ def read_gbq( """ Load data from Google BigQuery. + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.read_gbq`` instead. + This function requires the `pandas-gbq package <https://pandas-gbq.readthedocs.io>`__. @@ -178,6 +184,13 @@ read_gbq( ... dialect="standard" ... ) # doctest: +SKIP """ + warnings.warn( + "read_gbq is deprecated and will be removed in a future version. " + "Please use pandas_gbq.read_gbq instead: " + "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq", + FutureWarning, + stacklevel=find_stack_level(), + ) pandas_gbq = _try_import() kwargs: dict[str, str | bool | int | None] = {} @@ -219,6 +232,13 @@ def to_gbq( progress_bar: bool = True, credentials: google.auth.credentials.Credentials | None = None, ) -> None: + warnings.warn( + "to_gbq is deprecated and will be removed in a future version. 
" + "Please use pandas_gbq.to_gbq instead: " + "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq", + FutureWarning, + stacklevel=find_stack_level(), + ) pandas_gbq = _try_import() pandas_gbq.to_gbq( dataframe, diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 35965c90ee7fb..a1d69deb6a21e 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -13,6 +13,7 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer import pandas as pd @@ -203,7 +204,13 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: # Ignore non-existent columns from dtype mapping # like other parsers do if isinstance(self.dtype, dict): - self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns} + self.dtype = { + k: pandas_dtype(v) + for k, v in self.dtype.items() + if k in frame.columns + } + else: + self.dtype = pandas_dtype(self.dtype) try: frame = frame.astype(self.dtype) except TypeError as e: diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 37ebd940f3646..05e6d2bcfb46a 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -10,6 +10,7 @@ from matplotlib.artist import setp import numpy as np +from pandas._libs import lib from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -113,26 +114,26 @@ def _plot( # type: ignore[override] else: return ax, bp - def _validate_color_args(self): - if "color" in self.kwds: - if self.colormap is not None: - warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", - stacklevel=find_stack_level(), - ) - self.color = self.kwds.pop("color") + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + return None - if isinstance(self.color, dict): - valid_keys = ["boxes", "whiskers", "medians", "caps"] - for key in self.color: - if key not in valid_keys: - raise ValueError( - f"color dict contains invalid key '{key}'. " - f"The key must be either {valid_keys}" - ) - else: - self.color = None + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'", + stacklevel=find_stack_level(), + ) + + if isinstance(color, dict): + valid_keys = ["boxes", "whiskers", "medians", "caps"] + for key in color: + if key not in valid_keys: + raise ValueError( + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" + ) + return color @cache_readonly def _color_attrs(self): @@ -182,16 +183,8 @@ def maybe_color_bp(self, bp) -> None: medians = self.color or self._medians_c caps = self.color or self._caps_c - # GH 30346, when users specifying those arguments explicitly, our defaults - # for these four kwargs should be overridden; if not, use Pandas settings - if not self.kwds.get("boxprops"): - setp(bp["boxes"], color=boxes, alpha=1) - if not self.kwds.get("whiskerprops"): - setp(bp["whiskers"], color=whiskers, alpha=1) - if not self.kwds.get("medianprops"): - setp(bp["medians"], color=medians, alpha=1) - if not self.kwds.get("capprops"): - setp(bp["caps"], color=caps, alpha=1) + color_tup = (boxes, whiskers, medians, caps) + maybe_color_bp(bp, color_tup=color_tup, **self.kwds) def _make_plot(self, fig: Figure) -> None: if self.subplots: @@ -204,7 +197,10 @@ def _make_plot(self, fig: Figure) -> None: else self.data ) - for i, (label, y) in enumerate(self._iter_data(data=data)): + # error: Argument "data" to "_iter_data" of "MPLPlot" has + # incompatible type "object"; expected "DataFrame | + # dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] ax = self._get_ax(i) kwds = self.kwds.copy() @@ -216,9 +212,9 @@ def _make_plot(self, fig: Figure) -> None: # When `by` is assigned, the ticklabels will become unique grouped # values, instead of label which is used as subtitle in this case. - ticklabels = [ - pprint_thing(col) for col in self.data.columns.levels[0] - ] + # error: "Index" has no attribute "levels"; maybe "nlevels"? + levels = self.data.columns.levels # type: ignore[attr-defined] + ticklabels = [pprint_thing(col) for col in levels[0]] else: ticklabels = [pprint_thing(label)] @@ -273,6 +269,19 @@ def result(self): return self._return_obj +def maybe_color_bp(bp, color_tup, **kwds) -> None: + # GH#30346, when users specifying those arguments explicitly, our defaults + # for these four kwargs should be overridden; if not, use Pandas settings + if not kwds.get("boxprops"): + setp(bp["boxes"], color=color_tup[0], alpha=1) + if not kwds.get("whiskerprops"): + setp(bp["whiskers"], color=color_tup[1], alpha=1) + if not kwds.get("medianprops"): + setp(bp["medians"], color=color_tup[2], alpha=1) + if not kwds.get("capprops"): + setp(bp["caps"], color=color_tup[3], alpha=1) + + def _grouped_plot_by_column( plotf, data, @@ -386,18 +395,6 @@ def _get_colors(): return result - def maybe_color_bp(bp, **kwds) -> None: - # GH 30346, when users specifying those arguments explicitly, our defaults - # for these four kwargs should be overridden; if not, use Pandas settings - if not kwds.get("boxprops"): - setp(bp["boxes"], color=colors[0], alpha=1) - if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=colors[1], alpha=1) - if not kwds.get("medianprops"): - setp(bp["medians"], color=colors[2], alpha=1) - if not kwds.get("capprops"): - setp(bp["caps"], color=colors[3], alpha=1) - def plot_group(keys, values, ax: Axes, **kwds): # GH 45465: xlabel/ylabel need to be popped out before plotting happens xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None) @@ -416,7 +413,7 @@ def plot_group(keys, values, ax: Axes, **kwds): _set_ticklabels( ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot ) - maybe_color_bp(bp, **kwds) + maybe_color_bp(bp, color_tup=colors, **kwds) # Return axes in multiplot case, maybe revisit later # 985 if return_type == "dict": diff --git 
a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index dd56cbabdba0e..0e240e56de2c6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -22,6 +22,7 @@ import matplotlib as mpl import numpy as np +from pandas._libs import lib from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -45,15 +46,13 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCDatetimeIndex, ABCIndex, ABCMultiIndex, ABCPeriodIndex, ABCSeries, ) -from pandas.core.dtypes.missing import ( - isna, - notna, -) +from pandas.core.dtypes.missing import isna import pandas.core.common as com from pandas.core.frame import DataFrame @@ -89,6 +88,7 @@ from pandas._typing import ( IndexLabel, + NDFrameT, PlottingOrientation, npt, ) @@ -128,6 +128,8 @@ def _kind(self) -> str: def orientation(self) -> str | None: return None + data: DataFrame + def __init__( self, data, @@ -157,12 +159,18 @@ def __init__( layout=None, include_bool: bool = False, column: IndexLabel | None = None, + *, + logx: bool | None | Literal["sym"] = False, + logy: bool | None | Literal["sym"] = False, + loglog: bool | None | Literal["sym"] = False, + mark_right: bool = True, + stacked: bool = False, + label: Hashable | None = None, + style=None, **kwds, ) -> None: import matplotlib.pyplot as plt - self.data = data - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): @@ -193,9 +201,11 @@ def __init__( self.kind = kind - self.subplots = self._validate_subplots_kwarg(subplots) + self.subplots = type(self)._validate_subplots_kwarg( + subplots, data, kind=self._kind + ) - self.sharex = self._validate_sharex(sharex, ax, by) + self.sharex = type(self)._validate_sharex(sharex, ax, by) self.sharey = sharey self.figsize = figsize self.layout = layout @@ -228,13 +238,13 @@ def __init__( self.legend_handles: list[Artist] = [] self.legend_labels: list[Hashable] = [] - self.logx = kwds.pop("logx", False) - self.logy = kwds.pop("logy", False) - self.loglog = kwds.pop("loglog", False) - self.label = kwds.pop("label", None) - self.style = kwds.pop("style", None) - self.mark_right = kwds.pop("mark_right", True) - self.stacked = kwds.pop("stacked", False) + self.logx = type(self)._validate_log_kwd("logx", logx) + self.logy = type(self)._validate_log_kwd("logy", logy) + self.loglog = type(self)._validate_log_kwd("loglog", loglog) + self.label = label + self.style = style + self.mark_right = mark_right + self.stacked = stacked # ax may be an Axes object or (if self.subplots) an ndarray of # Axes objects @@ -245,10 +255,11 @@ def __init__( # parse errorbar input if given xerr = kwds.pop("xerr", None) yerr = kwds.pop("yerr", None) - self.errors = { - kw: self._parse_errorbars(kw, err) - for kw, err in zip(["xerr", "yerr"], [xerr, yerr]) - } + nseries = self._get_nseries(data) + xerr, data = type(self)._parse_errorbars("xerr", xerr, data, nseries) + yerr, data = type(self)._parse_errorbars("yerr", yerr, data, nseries) + self.errors = {"xerr": xerr, "yerr": yerr} + self.data = data if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndex)): secondary_y = [secondary_y] @@ -268,10 +279,15 @@ def __init__( self.kwds = kwds - self._validate_color_args() + color = kwds.pop("color", lib.no_default) + self.color = self._validate_color_args(color, self.colormap) + assert "color" not in self.kwds + + self.data = 
self._ensure_frame(self.data) @final - def _validate_sharex(self, sharex: bool | None, ax, by) -> bool: + @staticmethod + def _validate_sharex(sharex: bool | None, ax, by) -> bool: if sharex is None: # if by is defined, subplots are used and sharex should be False if ax is None and by is None: # pylint: disable=simplifiable-if-statement @@ -284,9 +300,26 @@ def _validate_sharex(self, sharex: bool | None, ax, by) -> bool: raise TypeError("sharex must be a bool or None") return bool(sharex) + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + if ( + value is None + or isinstance(value, bool) + or (isinstance(value, str) and value == "sym") + ): + return value + raise ValueError( + f"keyword '{kwd}' should be bool, None, or 'sym', not '{value}'" + ) + @final + @staticmethod def _validate_subplots_kwarg( - self, subplots: bool | Sequence[Sequence[str]] + subplots: bool | Sequence[Sequence[str]], data: Series | DataFrame, kind: str ) -> bool | list[tuple[int, ...]]: """ Validate the subplots parameter @@ -323,18 +356,18 @@ def _validate_subplots_kwarg( "area", "pie", ) - if self._kind not in supported_kinds: + if kind not in supported_kinds: raise ValueError( "When subplots is an iterable, kind must be " - f"one of {', '.join(supported_kinds)}. Got {self._kind}." + f"one of {', '.join(supported_kinds)}. Got {kind}." ) - if isinstance(self.data, ABCSeries): + if isinstance(data, ABCSeries): raise NotImplementedError( "An iterable subplots for a Series is not supported." ) - columns = self.data.columns + columns = data.columns if isinstance(columns, ABCMultiIndex): raise NotImplementedError( "An iterable subplots for a DataFrame with a MultiIndex column " @@ -390,34 +423,31 @@ def _validate_subplots_kwarg( out.append((idx_loc,)) return out - def _validate_color_args(self): - if ( - "color" in self.kwds - and self.nseries == 1 - and self.kwds["color"] is not None - and not is_list_like(self.kwds["color"]) - ): + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + # It was not provided by the user + if "colors" in self.kwds and colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. " + "Using 'color'", + stacklevel=find_stack_level(), + ) + return None + if self.nseries == 1 and color is not None and not is_list_like(color): # support series.plot(color='green') - self.kwds["color"] = [self.kwds["color"]] + color = [color] - if ( - "color" in self.kwds - and isinstance(self.kwds["color"], tuple) - and self.nseries == 1 - and len(self.kwds["color"]) in (3, 4) - ): + if isinstance(color, tuple) and self.nseries == 1 and len(color) in (3, 4): # support RGB and RGBA tuples in series plot - self.kwds["color"] = [self.kwds["color"]] + color = [color] - if ( - "color" in self.kwds or "colors" in self.kwds - ) and self.colormap is not None: + if colormap is not None: warnings.warn( "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) - if "color" in self.kwds and self.style is not None: + if self.style is not None: if is_list_like(self.style): styles = self.style else: @@ -430,6 +460,7 @@ def _validate_color_args(self): "'color' keyword argument. Please use one or the " "other or pass 'style' without a color symbol" ) + return color @final @staticmethod @@ -442,18 +473,22 @@ def _iter_data( # typing. 
yield col, np.asarray(values.values) - @property - def nseries(self) -> int: + def _get_nseries(self, data: Series | DataFrame) -> int: # When `by` is explicitly assigned, grouped data size will be defined, and # this will determine number of subplots to have, aka `self.nseries` - if self.data.ndim == 1: + if data.ndim == 1: return 1 elif self.by is not None and self._kind == "hist": return len(self._grouped) elif self.by is not None and self._kind == "box": return len(self.columns) else: - return self.data.shape[1] + return data.shape[1] + + @final + @property + def nseries(self) -> int: + return self._get_nseries(self.data) @final def draw(self) -> None: @@ -552,14 +587,6 @@ def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: axes = flatten_axes(axes) - valid_log = {False, True, "sym", None} - input_log = {self.logx, self.logy, self.loglog} - if input_log - valid_log: - invalid_log = next(iter(input_log - valid_log)) - raise ValueError( - f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." - ) - if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] elif self.logx == "sym" or self.loglog == "sym": @@ -619,9 +646,7 @@ def _convert_to_ndarray(data): return data @final - def _compute_plot_data(self): - data = self.data - + def _ensure_frame(self, data) -> DataFrame: if isinstance(data, ABCSeries): label = self.label if label is None and data.name is None: @@ -634,6 +659,11 @@ def _compute_plot_data(self): elif self._kind in ("hist", "box"): cols = self.columns if self.by is None else self.columns + self.by data = data.loc[:, cols] + return data + + @final + def _compute_plot_data(self): + data = self.data # GH15079 reconstruct data if by is defined if self.by is not None: @@ -883,24 +913,26 @@ def plt(self): _need_to_set_index = False @final - def _get_xticks(self, convert_period: bool = False): + def _get_xticks(self): index = self.data.index is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") + # TODO: be stricter about x? + x: list[int] | np.ndarray if self.use_index: - if convert_period and isinstance(index, ABCPeriodIndex): - self.data = self.data.reindex(index=index.sort_values()) - x = self.data.index.to_timestamp()._mpl_repr() + if isinstance(index, ABCPeriodIndex): + # test_mixed_freq_irreg_period + x = index.to_timestamp()._mpl_repr() + # TODO: why do we need to do to_timestamp() here but not other + # places where we call mpl_repr? elif is_any_real_numeric_dtype(index.dtype): # Matplotlib supports numeric values or datetime objects as # xaxis values. Taking LBYL approach here, by the time # matplotlib raises exception when using non numeric/datetime # values for xaxis, several actions are already taken by plt. x = index._mpl_repr() - elif is_datetype: - self.data = self.data[notna(self.data.index)] - self.data = self.data.sort_index() - x = self.data.index._mpl_repr() + elif isinstance(index, ABCDatetimeIndex) or is_datetype: + x = index._mpl_repr() else: self._need_to_set_index = True x = list(range(len(index))) @@ -1050,15 +1082,22 @@ def _get_colors( ): if num_colors is None: num_colors = self.nseries - + if color_kwds == "color": + color = self.color + else: + color = self.kwds.get(color_kwds) return get_standard_colors( num_colors=num_colors, colormap=self.colormap, - color=self.kwds.get(color_kwds), + color=color, ) + # TODO: tighter typing for first return? 
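The _validate_log_kwd classmethod introduced above moves the logx/logy/loglog check from draw time (the old set-difference check in _axes_and_fig, removed further down) into construction, so an invalid value fails before any Axes are created. Sketch (requires matplotlib):

    import pandas as pd

    ser = pd.Series([1, 10, 100])
    ser.plot(logy=True)   # bool, None and "sym" are all accepted
    ser.plot(logy="sym")

    try:
        ser.plot(logy="log")  # rejected eagerly, at plot construction
    except ValueError as err:
        print(err)  # keyword 'logy' should be bool, None, or 'sym', not 'log'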
@final - def _parse_errorbars(self, label: str, err): + @staticmethod + def _parse_errorbars( + label: str, err, data: NDFrameT, nseries: int + ) -> tuple[Any, NDFrameT]: """ Look for error keyword arguments and return the actual errorbar data or return the error DataFrame/dict @@ -1078,7 +1117,7 @@ def _parse_errorbars(self, label: str, err): should be in a ``Mx2xN`` array. """ if err is None: - return None + return None, data def match_labels(data, e): e = e.reindex(data.index) @@ -1086,7 +1125,7 @@ def match_labels(data, e): # key-matched DataFrame if isinstance(err, ABCDataFrame): - err = match_labels(self.data, err) + err = match_labels(data, err) # key-matched dict elif isinstance(err, dict): pass @@ -1094,16 +1133,16 @@ def match_labels(data, e): # Series of error values elif isinstance(err, ABCSeries): # broadcast error series across data - err = match_labels(self.data, err) + err = match_labels(data, err) err = np.atleast_2d(err) - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) # errors are a column in the dataframe elif isinstance(err, str): - evalues = self.data[err].values - self.data = self.data[self.data.columns.drop(err)] + evalues = data[err].values + data = data[data.columns.drop(err)] err = np.atleast_2d(evalues) - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) elif is_list_like(err): if is_iterator(err): @@ -1115,40 +1154,40 @@ def match_labels(data, e): err_shape = err.shape # asymmetrical error bars - if isinstance(self.data, ABCSeries) and err_shape[0] == 2: + if isinstance(data, ABCSeries) and err_shape[0] == 2: err = np.expand_dims(err, 0) err_shape = err.shape - if err_shape[2] != len(self.data): + if err_shape[2] != len(data): raise ValueError( "Asymmetrical error bars should be provided " - f"with the shape (2, {len(self.data)})" + f"with the shape (2, {len(data)})" ) - elif isinstance(self.data, ABCDataFrame) and err.ndim == 3: + elif isinstance(data, ABCDataFrame) and err.ndim == 3: if ( - (err_shape[0] != self.nseries) + (err_shape[0] != nseries) or (err_shape[1] != 2) - or (err_shape[2] != len(self.data)) + or (err_shape[2] != len(data)) ): raise ValueError( "Asymmetrical error bars should be provided " - f"with the shape ({self.nseries}, 2, {len(self.data)})" + f"with the shape ({nseries}, 2, {len(data)})" ) # broadcast errors to each data series if len(err) == 1: - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) elif is_number(err): err = np.tile( [err], # pyright: ignore[reportGeneralTypeIssues] - (self.nseries, len(self.data)), + (nseries, len(data)), ) else: msg = f"No valid {label} detected" raise ValueError(msg) - return err + return err, data # pyright: ignore[reportGeneralTypeIssues] @final def _get_errorbars( @@ -1214,19 +1253,11 @@ def __init__(self, data, x, y, **kwargs) -> None: if is_integer(y) and not self.data.columns._holds_integer(): y = self.data.columns[y] - # Scatter plot allows to plot objects data - if self._kind == "hexbin": - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires y column to be numeric") - self.x = x self.y = y @final - @property - def nseries(self) -> int: + def _get_nseries(self, data: Series | DataFrame) -> int: return 1 @final @@ -1265,14 +1296,30 @@ class ScatterPlot(PlanePlot): def _kind(self) -> Literal["scatter"]: return "scatter" - def __init__(self, data, x, y, s=None, c=None, 
**kwargs) -> None: + def __init__( + self, + data, + x, + y, + s=None, + c=None, + *, + colorbar: bool | lib.NoDefault = lib.no_default, + norm=None, + **kwargs, + ) -> None: if s is None: # hide the matplotlib default for size, in case we want to change # the handling of this argument later s = 20 elif is_hashable(s) and s in data.columns: s = data[s] - super().__init__(data, x, y, s=s, **kwargs) + self.s = s + + self.colorbar = colorbar + self.norm = norm + + super().__init__(data, x, y, **kwargs) if is_integer(c) and not self.data.columns._holds_integer(): c = self.data.columns[c] self.c = c @@ -1287,7 +1334,50 @@ def _make_plot(self, fig: Figure): self.data[c].dtype, CategoricalDtype ) - color = self.kwds.pop("color", None) + color = self.color + c_values = self._get_c_values(color, color_by_categorical, c_is_column) + norm, cmap = self._get_norm_and_cmap(c_values, color_by_categorical) + cb = self._get_colorbar(c_values, c_is_column) + + if self.legend: + label = self.label + else: + label = None + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + norm=norm, + s=self.s, + **self.kwds, + ) + if cb: + cbar_label = c if c_is_column else "" + cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label) + if color_by_categorical: + n_cats = len(self.data[c].cat.categories) + cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats)) + cbar.ax.set_yticklabels(self.data[c].cat.categories) + + if label is not None: + self._append_legend_handles_labels( + # error: Argument 2 to "_append_legend_handles_labels" of + # "MPLPlot" has incompatible type "Hashable"; expected "str" + scatter, + label, # type: ignore[arg-type] # pyright: ignore[reportGeneralTypeIssues] + ) + + errors_x = self._get_errorbars(label=x, index=0, yerr=False) + errors_y = self._get_errorbars(label=y, index=0, xerr=False) + if len(errors_x) > 0 or len(errors_y) > 0: + err_kwds = dict(errors_x, **errors_y) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) + + def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): + c = self.c if c is not None and color is not None: raise TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: @@ -1300,7 +1390,10 @@ def _make_plot(self, fig: Figure): c_values = self.data[c].values else: c_values = c + return c_values + def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): + c = self.c if self.colormap is not None: cmap = mpl.colormaps.get_cmap(self.colormap) # cmap is only used if c_values are integers, otherwise UserWarning. @@ -1319,45 +1412,21 @@ def _make_plot(self, fig: Figure): cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) norm = colors.BoundaryNorm(bounds, cmap.N) + # TODO: warn that we are ignoring self.norm if user specified it? + # Doesn't happen in any tests 2023-11-09 else: - norm = self.kwds.pop("norm", None) + norm = self.norm + return norm, cmap + + def _get_colorbar(self, c_values, c_is_column: bool) -> bool: # plot colorbar if # 1. 
colormap is assigned, and # 2.`c` is a column containing only numeric values plot_colorbar = self.colormap or c_is_column - cb = self.kwds.pop("colorbar", is_numeric_dtype(c_values) and plot_colorbar) - - if self.legend and hasattr(self, "label"): - label = self.label - else: - label = None - scatter = ax.scatter( - data[x].values, - data[y].values, - c=c_values, - label=label, - cmap=cmap, - norm=norm, - **self.kwds, - ) - if cb: - cbar_label = c if c_is_column else "" - cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label) - if color_by_categorical: - cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats)) - cbar.ax.set_yticklabels(self.data[c].cat.categories) - - if label is not None: - self._append_legend_handles_labels(scatter, label) - else: - self.legend = False - - errors_x = self._get_errorbars(label=x, index=0, yerr=False) - errors_y = self._get_errorbars(label=y, index=0, xerr=False) - if len(errors_x) > 0 or len(errors_y) > 0: - err_kwds = dict(errors_x, **errors_y) - err_kwds["ecolor"] = scatter.get_facecolor()[0] - ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) + cb = self.colorbar + if cb is lib.no_default: + return is_numeric_dtype(c_values) and plot_colorbar + return cb class HexBinPlot(PlanePlot): @@ -1365,19 +1434,27 @@ class HexBinPlot(PlanePlot): def _kind(self) -> Literal["hexbin"]: return "hexbin" - def __init__(self, data, x, y, C=None, **kwargs) -> None: + def __init__(self, data, x, y, C=None, *, colorbar: bool = True, **kwargs) -> None: super().__init__(data, x, y, **kwargs) if is_integer(C) and not self.data.columns._holds_integer(): C = self.data.columns[C] self.C = C + self.colorbar = colorbar + + # Scatter plot allows to plot objects data + if len(self.data[self.x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[self.y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") + def _make_plot(self, fig: Figure) -> None: x, y, data, C = self.x, self.y, self.data, self.C ax = self.axes[0] # pandas uses colormap, matplotlib uses cmap. 
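With colorbar and norm now explicit ScatterPlot keywords (resolved in _get_colorbar and _get_norm_and_cmap) rather than values popped from self.kwds, the default is easy to state: a colorbar is drawn when a colormap is given or c is a numeric column, unless the user overrides it. Illustrative sketch (requires matplotlib; column names are made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"x": np.arange(5), "y": np.arange(5), "c": np.linspace(0, 1, 5)}
    )

    df.plot.scatter(x="x", y="y", c="c")                  # numeric c -> colorbar by default
    df.plot.scatter(x="x", y="y", c="c", colorbar=False)  # explicit keyword wins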
cmap = self.colormap or "BuGn" cmap = mpl.colormaps.get_cmap(cmap) - cb = self.kwds.pop("colorbar", True) + cb = self.colorbar if C is None: c_values = None @@ -1430,12 +1507,15 @@ def _make_plot(self, fig: Figure) -> None: plotf = self._ts_plot it = data.items() else: - x = self._get_xticks(convert_period=True) + x = self._get_xticks() # error: Incompatible types in assignment (expression has type # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has # type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]") plotf = self._plot # type: ignore[assignment] - it = self._iter_data(data=self.data) + # error: Incompatible types in assignment (expression has type + # "Iterator[tuple[Hashable, ndarray[Any, Any]]]", variable has + # type "Iterable[tuple[Hashable, Series]]") + it = self._iter_data(data=self.data) # type: ignore[assignment] stacking_id = self._get_stacking_id() is_errorbar = com.any_not_none(*self.errors.values()) @@ -1444,11 +1524,15 @@ def _make_plot(self, fig: Figure) -> None: for i, (label, y) in enumerate(it): ax = self._get_ax(i) kwds = self.kwds.copy() + if self.color is not None: + kwds["color"] = self.color style, kwds = self._apply_style_colors( colors, kwds, i, - label, # pyright: ignore[reportGeneralTypeIssues] + # error: Argument 4 to "_apply_style_colors" of "MPLPlot" has + # incompatible type "Hashable"; expected "str" + label, # type: ignore[arg-type] # pyright: ignore[reportGeneralTypeIssues] ) errors = self._get_errorbars(label=label, index=i) @@ -1968,13 +2052,25 @@ def __init__(self, data, kind=None, **kwargs) -> None: if (data < 0).any().any(): raise ValueError(f"{self._kind} plot doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) - self.grid = False - self.logy = False - self.logx = False - self.loglog = False - def _validate_color_args(self) -> None: - pass + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + super()._validate_log_kwd(kwd=kwd, value=value) + if value is not False: + warnings.warn( + f"PiePlot ignores the '{kwd}' keyword", + UserWarning, + stacklevel=find_stack_level(), + ) + return False + + def _validate_color_args(self, color, colormap) -> None: + # TODO: warn if color is passed and ignored? 
+ return None def _make_plot(self, fig: Figure) -> None: colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 59e0125322651..de4fd91541a9d 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -45,7 +45,10 @@ from pandas._typing import PlottingOrientation - from pandas import DataFrame + from pandas import ( + DataFrame, + Series, + ) class HistPlot(LinePlot): @@ -87,7 +90,7 @@ def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): bins = self._calculate_bins(self.data, bins) return bins - def _calculate_bins(self, data: DataFrame, bins) -> np.ndarray: + def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects(copy=False)._get_numeric_data() values = np.ravel(nd_values) @@ -131,10 +134,14 @@ def _make_plot(self, fig: Figure) -> None: else self.data ) - for i, (label, y) in enumerate(self._iter_data(data=data)): + # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible + # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] ax = self._get_ax(i) kwds = self.kwds.copy() + if self.color is not None: + kwds["color"] = self.color label = pprint_thing(label) label = self._mark_right_label(label, index=i) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 5516ecb9e2798..5efc0dd2cd4e3 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1453,7 +1453,7 @@ def test_apply_dtype(col): tm.assert_series_equal(result, expected) -def test_apply_mutating(using_array_manager, using_copy_on_write): +def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write): # GH#35462 case where applied func pins a new BlockManager to a row df = DataFrame({"a": range(100), "b": range(100, 200)}) df_orig = df.copy() @@ -1467,7 +1467,8 @@ def func(row): expected = df.copy() expected["a"] += 1 - result = df.apply(func, axis=1) + with tm.assert_cow_warning(warn_copy_on_write): + result = df.apply(func, axis=1) tm.assert_frame_equal(result, expected) if using_copy_on_write or using_array_manager: diff --git a/pandas/tests/copy_view/index/test_datetimeindex.py b/pandas/tests/copy_view/index/test_datetimeindex.py index f54beca4cc414..b023297c9549d 100644 --- a/pandas/tests/copy_view/index/test_datetimeindex.py +++ b/pandas/tests/copy_view/index/test_datetimeindex.py @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff --git a/pandas/tests/copy_view/index/test_index.py b/pandas/tests/copy_view/index/test_index.py index 6411e20a972e7..49d756cf32d34 100644 --- a/pandas/tests/copy_view/index/test_index.py +++ b/pandas/tests/copy_view/index/test_index.py @@ -19,11 +19,12 @@ def index_view(index_data=[1, 2]): return idx, view -def test_set_index_update_column(using_copy_on_write): +def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1}) df = df.set_index("a", drop=False) expected = df.index.copy(deep=True) - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: 
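The copy_view test updates above thread the warn_copy_on_write fixture through and wrap each in-place mutation in tm.assert_cow_warning. Outside the test suite, the same behavior can be observed through the option the fixture toggles (assuming the "warn" mode added for the 2.2 deprecation cycle):

    import warnings

    import pandas as pd

    pd.set_option("mode.copy_on_write", "warn")

    ser = pd.Series([1, 2])
    idx = pd.Index(ser)            # Index sharing the Series' buffer

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        ser.iloc[0] = 100          # mutating shared data emits the FutureWarning
    print(w[0].category.__name__)  # FutureWarning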
@@ -39,49 +40,53 @@ def test_set_index_drop_update_column(using_copy_on_write): tm.assert_index_equal(df.index, expected) -def test_set_index_series(using_copy_on_write): +def test_set_index_series(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df = df.set_index(ser) expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_assign_index_as_series(using_copy_on_write): +def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df.index = ser expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_assign_index_as_index(using_copy_on_write): +def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) rhs_index = Index(ser) df.index = rhs_index rhs_index = None # overwrite to clear reference expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_index_from_series(using_copy_on_write): +def test_index_from_series(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2]) idx = Index(ser) expected = idx.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(idx, expected) else: @@ -96,12 +101,13 @@ def test_index_from_series_copy(using_copy_on_write): assert np.shares_memory(get_array(ser), arr) -def test_index_from_index(using_copy_on_write): +def test_index_from_index(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2]) idx = Index(ser) idx = Index(idx) expected = idx.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(idx, expected) else: diff --git a/pandas/tests/copy_view/index/test_periodindex.py b/pandas/tests/copy_view/index/test_periodindex.py index 94bc3a66f0e2b..b80ce1d3d838f 100644 --- a/pandas/tests/copy_view/index/test_periodindex.py +++ b/pandas/tests/copy_view/index/test_periodindex.py @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff --git a/pandas/tests/copy_view/index/test_timedeltaindex.py b/pandas/tests/copy_view/index/test_timedeltaindex.py index a543e06cea328..5b9832093fded 100644 --- a/pandas/tests/copy_view/index/test_timedeltaindex.py +++ b/pandas/tests/copy_view/index/test_timedeltaindex.py @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 1f61598c40573..221bf8759875f 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -21,7 +21,7 @@ 
@pytest.mark.parametrize("dtype", [None, "int64"]) -def test_series_from_series(dtype, using_copy_on_write): +def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write): # Case: constructing a Series from another Series object follows CoW rules: # a new object is returned and thus mutations are not propagated ser = Series([1, 2, 3], name="name") @@ -43,7 +43,8 @@ def test_series_from_series(dtype, using_copy_on_write): assert not np.shares_memory(get_array(ser), get_array(result)) else: # mutating shallow copy does mutate original - result.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 assert ser.iloc[0] == 0 # and still shares memory assert np.shares_memory(get_array(ser), get_array(result)) @@ -57,11 +58,12 @@ def test_series_from_series(dtype, using_copy_on_write): assert result.iloc[0] == 1 else: # mutating original does mutate shallow copy - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 assert result.iloc[0] == 0 -def test_series_from_series_with_reindex(using_copy_on_write): +def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write): # Case: constructing a Series from another Series with specifying an index # that potentially requires a reindex of the values ser = Series([1, 2, 3], name="name") @@ -76,7 +78,8 @@ def test_series_from_series_with_reindex(using_copy_on_write): ]: result = Series(ser, index=index) assert np.shares_memory(ser.values, result.values) - result.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 if using_copy_on_write: assert ser.iloc[0] == 1 else: @@ -153,6 +156,7 @@ def test_series_from_index_different_dtypes(using_copy_on_write): assert ser._mgr._has_no_reference(0) +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("fastpath", [False, True]) @pytest.mark.parametrize("dtype", [None, "int64"]) @pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) @@ -186,7 +190,9 @@ def test_series_from_block_manager_different_dtype(using_copy_on_write): @pytest.mark.parametrize("use_mgr", [True, False]) @pytest.mark.parametrize("columns", [None, ["a"]]) -def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr): +def test_dataframe_constructor_mgr_or_df( + using_copy_on_write, warn_copy_on_write, columns, use_mgr +): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() @@ -201,7 +207,8 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr): new_df = DataFrame(data) assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) - new_df.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write and not use_mgr): + new_df.iloc[0] = 100 if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) @@ -215,7 +222,7 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr): @pytest.mark.parametrize("index", [None, [0, 1, 2]]) @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) def test_dataframe_from_dict_of_series( - request, using_copy_on_write, columns, index, dtype + request, using_copy_on_write, warn_copy_on_write, columns, index, dtype ): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules @@ -235,6 +242,7 @@ def test_dataframe_from_dict_of_series( assert np.shares_memory(get_array(result, "a"), get_array(s1)) # mutating the new dataframe doesn't mutate 
original + # TODO(CoW-warn) this should also warn result.iloc[0, 0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(s1)) @@ -248,7 +256,8 @@ def test_dataframe_from_dict_of_series( result = DataFrame( {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False ) - s1.iloc[0] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + s1.iloc[0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_frame_equal(result, expected) @@ -278,7 +287,9 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype): @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] ) -def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons): +def test_dataframe_from_series_or_index( + using_copy_on_write, warn_copy_on_write, data, dtype, cons +): obj = cons(data, dtype=dtype) obj_orig = obj.copy() df = DataFrame(obj, dtype=dtype) @@ -286,7 +297,9 @@ def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons): if using_copy_on_write: assert not df._mgr._has_no_reference(0) - df.iloc[0, 0] = data[-1] + # TODO(CoW-warn) should not warn for an index? + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = data[-1] if using_copy_on_write: tm.assert_equal(obj, obj_orig) @@ -341,7 +354,7 @@ def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): assert np.shares_memory(get_array(df, 0), arr) -def test_dataframe_from_records_with_dataframe(using_copy_on_write): +def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() with tm.assert_produces_warning(FutureWarning): @@ -349,7 +362,8 @@ def test_dataframe_from_records_with_dataframe(using_copy_on_write): if using_copy_on_write: assert not df._mgr._has_no_reference(0) assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - df2.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index ad55f9d561fe0..c4d5e9dbce72a 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -161,14 +161,13 @@ def test_subset_column_slice( subset.iloc[0, 0] = 0 assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(single_block): + subset.iloc[0, 0] = 0 else: # we only get a warning in case of a single block - # TODO(CoW-warn) should warn - warn = ( - SettingWithCopyWarning - if (single_block and not warn_copy_on_write) - else None - ) + warn = SettingWithCopyWarning if single_block else None with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): subset.iloc[0, 0] = 0 @@ -204,6 +203,7 @@ def test_subset_loc_rows_columns( column_indexer, using_array_manager, using_copy_on_write, + warn_copy_on_write, ): # Case: taking a subset of the rows+columns of a DataFrame using .loc # + afterwards modifying the subset @@ -219,16 +219,9 @@ def test_subset_loc_rows_columns( subset = df.loc[row_indexer, column_indexer] - # modifying the subset never modifies the parent - subset.iloc[0, 0] = 0 - - expected = DataFrame( - {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) - ) - 
tm.assert_frame_equal(subset, expected) # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) - if ( + mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) and ( @@ -239,7 +232,17 @@ def test_subset_loc_rows_columns( and not using_copy_on_write ) ) - ): + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -264,6 +267,7 @@ def test_subset_iloc_rows_columns( column_indexer, using_array_manager, using_copy_on_write, + warn_copy_on_write, ): # Case: taking a subset of the rows+columns of a DataFrame using .iloc # + afterwards modifying the subset @@ -279,16 +283,9 @@ def test_subset_iloc_rows_columns( subset = df.iloc[row_indexer, column_indexer] - # modifying the subset never modifies the parent - subset.iloc[0, 0] = 0 - - expected = DataFrame( - {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) - ) - tm.assert_frame_equal(subset, expected) # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) - if ( + mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) and ( @@ -299,7 +296,17 @@ def test_subset_iloc_rows_columns( and not using_copy_on_write ) ) - ): + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -573,7 +580,13 @@ def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write): "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) def test_subset_chained_getitem( - request, backend, method, dtype, using_copy_on_write, using_array_manager + request, + backend, + method, + dtype, + using_copy_on_write, + using_array_manager, + warn_copy_on_write, ): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour @@ -600,7 +613,9 @@ def test_subset_chained_getitem( # modify subset -> don't modify parent subset = method(df) - subset.iloc[0, 0] = 0 + + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + subset.iloc[0, 0] = 0 if using_copy_on_write or (not subset_is_view): tm.assert_frame_equal(df, df_orig) else: @@ -608,7 +623,8 @@ def test_subset_chained_getitem( # modify parent -> don't modify subset subset = method(df) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + df.iloc[0, 0] = 0 expected = DataFrame({"a": [1, 2], "b": [4, 5]}) if using_copy_on_write or not subset_is_view: tm.assert_frame_equal(subset, expected) @@ -619,10 +635,12 @@ def test_subset_chained_getitem( @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write): +def test_subset_chained_getitem_column( + backend, dtype, using_copy_on_write, warn_copy_on_write 
+): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour - _, DataFrame, Series = backend + dtype_backend, DataFrame, Series = backend df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) @@ -631,7 +649,8 @@ def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write): # modify subset -> don't modify parent subset = df[:]["a"][0:2] df._clear_item_cache() - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -640,7 +659,11 @@ def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write): # modify parent -> don't modify subset subset = df[:]["a"][0:2] df._clear_item_cache() - df.iloc[0, 0] = 0 + # TODO(CoW-warn) should also warn for mixed block and nullable dtypes + with tm.assert_cow_warning( + warn_copy_on_write and dtype == "int64" and dtype_backend == "numpy" + ): + df.iloc[0, 0] = 0 expected = Series([1, 2], name="a") if using_copy_on_write: tm.assert_series_equal(subset, expected) @@ -662,7 +685,9 @@ def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write): ], ids=["getitem", "iloc", "loc", "long-chain"], ) -def test_subset_chained_getitem_series(backend, method, using_copy_on_write): +def test_subset_chained_getitem_series( + backend, method, using_copy_on_write, warn_copy_on_write +): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour _, _, Series = backend @@ -671,7 +696,8 @@ def test_subset_chained_getitem_series(backend, method, using_copy_on_write): # modify subset -> don't modify parent subset = method(s) - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: tm.assert_series_equal(s, s_orig) else: @@ -679,7 +705,8 @@ def test_subset_chained_getitem_series(backend, method, using_copy_on_write): # modify parent -> don't modify subset subset = s.iloc[0:3].iloc[0:2] - s.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + s.iloc[0] = 0 expected = Series([1, 2], index=["a", "b"]) if using_copy_on_write: tm.assert_series_equal(subset, expected) @@ -687,14 +714,17 @@ def test_subset_chained_getitem_series(backend, method, using_copy_on_write): assert subset.iloc[0] == 0 -def test_subset_chained_single_block_row(using_copy_on_write, using_array_manager): +def test_subset_chained_single_block_row( + using_copy_on_write, using_array_manager, warn_copy_on_write +): # not parametrizing this for dtype backend, since this explicitly tests single block df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() # modify subset -> don't modify parent subset = df[:].iloc[0].iloc[0:2] - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write or using_array_manager: tm.assert_frame_equal(df, df_orig) else: @@ -702,7 +732,8 @@ def test_subset_chained_single_block_row(using_copy_on_write, using_array_manage # modify parent -> don't modify subset subset = df[:].iloc[0].iloc[0:2] - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 expected = Series([1, 4], index=["a", "b"], name=0) if using_copy_on_write or using_array_manager: tm.assert_series_equal(subset, expected) @@ -721,10 +752,10 @@ def test_subset_chained_single_block_row(using_copy_on_write, using_array_manage ], 
ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"], ) -def test_null_slice(backend, method, using_copy_on_write): +def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write): # Case: also all variants of indexing with a null slice (:) should return # new objects to ensure we correctly use CoW for the results - _, DataFrame, _ = backend + dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -734,7 +765,9 @@ def test_null_slice(backend, method, using_copy_on_write): assert df2 is not df # and those trigger CoW when mutated - df2.iloc[0, 0] = 0 + # TODO(CoW-warn) should also warn for nullable dtypes + with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): + df2.iloc[0, 0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -750,7 +783,7 @@ def test_null_slice(backend, method, using_copy_on_write): ], ids=["getitem", "loc", "iloc"], ) -def test_null_slice_series(backend, method, using_copy_on_write): +def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_write): _, _, Series = backend s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() @@ -761,7 +794,8 @@ def test_null_slice_series(backend, method, using_copy_on_write): assert s2 is not s # and those trigger CoW when mutated - s2.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + s2.iloc[0] = 0 if using_copy_on_write: tm.assert_series_equal(s, s_orig) else: @@ -775,7 +809,7 @@ def test_null_slice_series(backend, method, using_copy_on_write): # Series -- Indexing operations taking subset + modifying the subset/parent -def test_series_getitem_slice(backend, using_copy_on_write): +def test_series_getitem_slice(backend, using_copy_on_write, warn_copy_on_write): # Case: taking a slice of a Series + afterwards modifying the subset _, _, Series = backend s = Series([1, 2, 3], index=["a", "b", "c"]) @@ -784,7 +818,8 @@ def test_series_getitem_slice(backend, using_copy_on_write): subset = s[:] assert np.shares_memory(get_array(subset), get_array(s)) - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(subset), get_array(s)) @@ -806,7 +841,7 @@ def test_series_getitem_slice(backend, using_copy_on_write): ids=["slice", "mask", "array"], ) def test_series_subset_set_with_indexer( - backend, indexer_si, indexer, using_copy_on_write + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write ): # Case: setting values in a viewing Series with an indexer _, _, Series = backend @@ -822,9 +857,20 @@ def test_series_subset_set_with_indexer( and indexer.dtype.kind == "i" ): warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - indexer_si(subset)[indexer] = 0 + is_mask = ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype.kind == "b" + ) + if warn_copy_on_write: + # TODO(CoW-warn) should also warn for setting with mask + with tm.assert_cow_warning( + not is_mask, raise_on_extra_warnings=warn is not None + ): + indexer_si(subset)[indexer] = 0 + else: + with tm.assert_produces_warning(warn, match=msg): + indexer_si(subset)[indexer] = 0 expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) @@ -998,14 +1044,16 @@ def test_column_as_series_no_item_cache( s2 = method(df) is_iloc = "iloc" in request.node.name - if using_copy_on_write or is_iloc: + if using_copy_on_write or 
warn_copy_on_write or is_iloc: assert s1 is not s2 else: assert s1 is s2 - # TODO(CoW-warn) should warn - if using_copy_on_write or warn_copy_on_write or using_array_manager: + if using_copy_on_write or using_array_manager: s1.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + s1.iloc[0] = 0 else: warn = SettingWithCopyWarning if dtype_backend == "numpy" else None with pd.option_context("chained_assignment", "warn"): @@ -1057,10 +1105,11 @@ def test_dataframe_add_column_from_series(backend, using_copy_on_write): "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"] ) def test_set_value_copy_only_necessary_column( - using_copy_on_write, indexer_func, indexer, val, col + using_copy_on_write, warn_copy_on_write, indexer_func, indexer, val, col ): # When setting inplace, only copy column that is modified instead of the whole # block (by splitting the block) + single_block = isinstance(col[0], int) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col}) df_orig = df.copy() view = df[:] @@ -1071,7 +1120,12 @@ def test_set_value_copy_only_necessary_column( ): indexer_func(df)[indexer] = val else: - indexer_func(df)[indexer] = val + # TODO(CoW-warn) should also warn in the other cases + with tm.assert_cow_warning( + warn_copy_on_write + and not (indexer[0] == slice(None) or (not single_block and val == 100)) + ): + indexer_func(df)[indexer] = val if using_copy_on_write: assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 60ab21f48e910..73bb9b4a71741 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -526,14 +526,15 @@ def test_shift_rows_freq(using_copy_on_write): tm.assert_frame_equal(df2, df_orig) -def test_shift_columns(using_copy_on_write): +def test_shift_columns(using_copy_on_write, warn_copy_on_write): df = DataFrame( [[1, 2], [3, 4], [5, 6]], columns=date_range("2020-01-01", "2020-01-02") ) df2 = df.shift(periods=1, axis=1) assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory( get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") @@ -648,7 +649,7 @@ def test_align_with_series_copy_false(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) # Original is unchanged -def test_to_frame(using_copy_on_write): +def test_to_frame(using_copy_on_write, warn_copy_on_write): # Case: converting a Series to a DataFrame with to_frame ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -658,7 +659,8 @@ def test_to_frame(using_copy_on_write): # currently this always returns a "view" assert np.shares_memory(ser.values, get_array(df, 0)) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: # mutating df triggers a copy-on-write for that column @@ -672,7 +674,8 @@ def test_to_frame(using_copy_on_write): # modify original series -> don't modify dataframe df = ser[:].to_frame() - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, ser_orig.to_frame()) @@ -1139,7 +1142,7 @@ def test_sort_values(using_copy_on_write, obj, kwargs): "obj, kwargs", [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) -def test_sort_values_inplace(using_copy_on_write, obj, kwargs, 
using_array_manager): +def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_write): obj_orig = obj.copy() view = obj[:] obj.sort_values(inplace=True, **kwargs) @@ -1147,7 +1150,8 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manag assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) # mutating obj triggers a copy-on-write for the column / block - obj.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + obj.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) tm.assert_equal(view, obj_orig) @@ -1270,7 +1274,7 @@ def test_series_set_axis(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -def test_set_flags(using_copy_on_write): +def test_set_flags(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() ser2 = ser.set_flags(allows_duplicate_labels=False) @@ -1278,7 +1282,8 @@ def test_set_flags(using_copy_on_write): assert np.shares_memory(ser, ser2) # mutating ser triggers a copy-on-write for the column / block - ser2.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(ser2, ser) tm.assert_series_equal(ser, ser_orig) @@ -1351,7 +1356,7 @@ def test_droplevel(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_squeeze(using_copy_on_write): +def test_squeeze(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() series = df.squeeze() @@ -1360,7 +1365,8 @@ def test_squeeze(using_copy_on_write): assert np.shares_memory(series.values, get_array(df, "a")) # mutating squeezed df triggers a copy-on-write for that column/block - series.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + series.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(series.values, get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @@ -1370,7 +1376,7 @@ def test_squeeze(using_copy_on_write): assert df.loc[0, "a"] == 0 -def test_items(using_copy_on_write): +def test_items(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -1381,7 +1387,8 @@ def test_items(using_copy_on_write): assert np.shares_memory(get_array(ser, name), get_array(df, name)) # mutating df triggers a copy-on-write for that column / block - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(ser, name), get_array(df, name)) @@ -1568,14 +1575,15 @@ def test_iterrows(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_interpolate_creates_copy(using_copy_on_write): +def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write): # GH#51126 df = DataFrame({"a": [1.5, np.nan, 3]}) view = df[:] expected = df.copy() df.ffill(inplace=True) - df.iloc[0, 0] = 100.5 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100.5 if using_copy_on_write: tm.assert_frame_equal(view, expected) @@ -1665,12 +1673,10 @@ def test_get(using_copy_on_write, warn_copy_on_write, key): else: # for non-CoW it depends on whether we got a Series or DataFrame if it # is a view or copy or triggers a warning or not - # TODO(CoW) should warn - warn = ( - (None if warn_copy_on_write else SettingWithCopyWarning) - if isinstance(key, list) - else None - ) + if warn_copy_on_write: + warn = FutureWarning if isinstance(key, str) else None + else: + warn = 
SettingWithCopyWarning if isinstance(key, list) else None with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0] = 0 @@ -1702,11 +1708,10 @@ def test_xs( elif using_copy_on_write: assert result._mgr._has_no_reference(0) - # TODO(CoW) should warn in case of is_view - if using_copy_on_write or is_view: + if using_copy_on_write or (is_view and not warn_copy_on_write): result.iloc[0] = 0 elif warn_copy_on_write: - with tm.assert_cow_warning(single_block): + with tm.assert_cow_warning(single_block or axis == 1): result.iloc[0] = 0 else: with pd.option_context("chained_assignment", "warn"): @@ -1738,12 +1743,12 @@ def test_xs_multiindex( get_array(df, df.columns[0]), get_array(result, result.columns[0]) ) - # TODO(CoW) should warn - warn = ( - (None if warn_copy_on_write else SettingWithCopyWarning) - if not using_copy_on_write and not using_array_manager - else None - ) + if warn_copy_on_write: + warn = FutureWarning if level == 0 else None + elif not using_copy_on_write and not using_array_manager: + warn = SettingWithCopyWarning + else: + warn = None with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0, 0] = 0 @@ -1809,11 +1814,14 @@ def test_inplace_arithmetic_series(): tm.assert_numpy_array_equal(data, get_array(ser)) -def test_inplace_arithmetic_series_with_reference(using_copy_on_write): +def test_inplace_arithmetic_series_with_reference( + using_copy_on_write, warn_copy_on_write +): ser = Series([1, 2, 3]) ser_orig = ser.copy() view = ser[:] - ser *= 2 + with tm.assert_cow_warning(warn_copy_on_write): + ser *= 2 if using_copy_on_write: assert not np.shares_memory(get_array(ser), get_array(view)) tm.assert_series_equal(ser_orig, view) @@ -1855,7 +1863,7 @@ def test_transpose_ea_single_column(using_copy_on_write): assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) -def test_transform_frame(using_copy_on_write): +def test_transform_frame(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() @@ -1863,12 +1871,13 @@ def func(ser): ser.iloc[0] = 100 return ser - df.transform(func) + with tm.assert_cow_warning(warn_copy_on_write): + df.transform(func) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) -def test_transform_series(using_copy_on_write): +def test_transform_series(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -1876,6 +1885,7 @@ def func(ser): ser.iloc[0] = 100 return ser + # TODO(CoW-warn) should warn? 
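As an aside on the TODO above: under copy-on-write the parent Series comes out of transform unchanged even though func mutates its argument in place, which is why warn-mode arguably ought to warn here too. A minimal sketch of the pattern, assuming the pandas 2.x option spelling:

import pandas as pd

pd.set_option("mode.copy_on_write", True)
ser = pd.Series([1, 2, 3])

def func(x):
    x.iloc[0] = 100  # in-place mutation inside transform
    return x

ser.transform(func)
assert ser.iloc[0] == 1  # the parent Series is protected under CoW
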
ser.transform(func) if using_copy_on_write: tm.assert_series_equal(ser, ser_orig) @@ -1889,7 +1899,7 @@ def test_count_read_only_array(): tm.assert_series_equal(result, expected) -def test_series_view(using_copy_on_write): +def test_series_view(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -1898,7 +1908,8 @@ def test_series_view(using_copy_on_write): if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) - ser2.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 100 if using_copy_on_write: tm.assert_series_equal(ser_orig, ser) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index bc3b939734534..4e08e00dac2b2 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -66,6 +67,8 @@ def test_set_column_with_index(using_copy_on_write): assert not np.shares_memory(get_array(df, "d"), arr) +# TODO(CoW-warn) this should NOT warn +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_set_columns_with_dataframe(using_copy_on_write): # Case: setting a DataFrame as new columns copies that data # (with delayed copy with CoW) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index c975138837e6b..83e2c795b8b5e 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -405,7 +405,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:] = df tm.assert_frame_equal(df, orig) - df.iloc[:-1] = df.iloc[:-1] + df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) df.iloc[:] = df.values diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index ecd8d1e988fd8..8502f98df5962 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -392,13 +392,14 @@ def test_getitem_empty_frame_with_boolean(self): tm.assert_frame_equal(df, df2) def test_getitem_returns_view_when_column_is_unique_in_df( - self, using_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): # GH#45316 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) df_orig = df.copy() view = df["b"] - view.loc[:] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + view.loc[:] = 100 if using_copy_on_write: expected = df_orig else: diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 765671fdd7f25..93f391b16817c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -519,7 +519,7 @@ def test_loc_setitem_boolean_mask_allfalse(self): ) result = df.copy() - result.loc[result.b.isna(), "a"] = result.a + result.loc[result.b.isna(), "a"] = result.a.copy() tm.assert_frame_equal(result, df) def test_getitem_fancy_slice_integers_step(self): @@ -1285,7 +1285,7 @@ def test_iloc_setitem_nullable_2d_values(self): df.loc[:] = pd.core.arrays.NumpyExtensionArray(df.values[:, ::-1]) tm.assert_frame_equal(df, orig) - df.iloc[:] = df.iloc[:, :] + df.iloc[:] = df.iloc[:, :].copy() tm.assert_frame_equal(df, orig) def test_getitem_segfault_with_empty_like_object(self): @@ -1295,6 +1295,7 @@ def test_getitem_segfault_with_empty_like_object(self): # this produces the segfault df[[0]] + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") 
@pytest.mark.parametrize( "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] ) @@ -1460,6 +1461,8 @@ def test_loc_named_tuple_for_midx(self): ) tm.assert_frame_equal(result, expected) + # TODO(CoW-warn) shouldn't warn, but does because of item cache + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("indexer", [["a"], "a"]) @pytest.mark.parametrize("col", [{}, {"b": 1}]) def test_set_2d_casting_date_to_int(self, col, indexer): diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 772738ae460b9..5cd184d564b3d 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -36,7 +36,9 @@ def four_level_index_dataframe(): class TestXS: - def test_xs(self, float_frame, datetime_frame, using_copy_on_write): + def test_xs( + self, float_frame, datetime_frame, using_copy_on_write, warn_copy_on_write + ): float_frame_orig = float_frame.copy() idx = float_frame.index[5] xs = float_frame.xs(idx) @@ -66,7 +68,8 @@ def test_xs(self, float_frame, datetime_frame, using_copy_on_write): # view is returned if possible series = float_frame.xs("A", axis=1) - series[:] = 5 + with tm.assert_cow_warning(warn_copy_on_write): + series[:] = 5 if using_copy_on_write: # but with CoW the view shouldn't propagate mutations tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) @@ -119,7 +122,9 @@ def test_xs_keep_level(self): result = df.xs((2008, "sat"), level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view(self, using_array_manager, using_copy_on_write): + def test_xs_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent @@ -138,7 +143,9 @@ def test_xs_view(self, using_array_manager, using_copy_on_write): dm.xs(2)[:] = 20 assert not (dm.xs(2) == 20).any() else: - dm.xs(2)[:] = 20 + # TODO(CoW-warn) should this raise a specific warning about being chained? 
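For context on the chaining question: dm.xs(2) materializes a row view, so assigning through it is a two-step (chained) write. A sketch of the pattern and its single-step equivalent, assuming a single-block float frame where xs can return a view:

import numpy as np
import pandas as pd

dm = pd.DataFrame(np.zeros((5, 3)))
dm.xs(2)[:] = 20  # chained write through a row view; warn-mode flags it,
                  # and under full CoW the assignment is lost entirely
dm.loc[2] = 20    # direct single-step write, safe in every mode
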
+ with tm.assert_cow_warning(warn_copy_on_write): + dm.xs(2)[:] = 20 assert (dm.xs(2) == 20).all() @@ -394,14 +401,17 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view(self, using_array_manager, using_copy_on_write): + def test_xs_droplevel_false_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) # check that result still views the same data as df assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) - df.iloc[0, 0] = 2 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 2 if using_copy_on_write: # with copy on write the subset is never modified expected = DataFrame({"a": [1]}) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 23a9656193d2c..359e9122b0c0b 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -106,7 +106,7 @@ def test_corr_scipy_method(self, float_frame, method): pytest.importorskip("scipy") float_frame.loc[float_frame.index[:5], "A"] = np.nan float_frame.loc[float_frame.index[5:10], "B"] = np.nan - float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20] + float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy() correls = float_frame.corr(method=method) expected = float_frame["A"].corr(float_frame["C"], method=method) @@ -205,7 +205,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self, using_copy_on_write): + def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write): # Check that corr does not lead to incorrect entries in item_cache df = DataFrame({"A": range(10)}) @@ -223,7 +223,8 @@ def test_corr_item_cache(self, using_copy_on_write): # Check that the corr didn't break link between ser and df ser.values[0] = 99 assert df.loc[0, "A"] == 99 - assert df["A"] is ser + if not warn_copy_on_write: + assert df["A"] is ser assert df.values[0, 0] == 99 @pytest.mark.parametrize("length", [2, 20, 200, 2000]) diff --git a/pandas/tests/frame/methods/test_pop.py b/pandas/tests/frame/methods/test_pop.py index 617f0c3a27885..3eb058015cd3d 100644 --- a/pandas/tests/frame/methods/test_pop.py +++ b/pandas/tests/frame/methods/test_pop.py @@ -9,7 +9,7 @@ class TestDataFramePop: - def test_pop(self, float_frame): + def test_pop(self, float_frame, warn_copy_on_write): float_frame.columns.name = "baz" float_frame.pop("A") @@ -23,7 +23,8 @@ def test_pop(self, float_frame): # gh-10912: inplace ops cause caching issue a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) b = a.pop("B") - b += 1 + with tm.assert_cow_warning(warn_copy_on_write): + b += 1 # original frame expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 1f4771f797ff9..637fc6270b78d 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -110,6 +110,8 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) 
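Related to test_pop above: the popped column is gone from the frame, but its values can still live in a block shared with the remaining columns, which is why the in-place += warns in warn-mode. A sketch under that assumption:

import pandas as pd

a = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
b = a.pop("B")  # b may still share its consolidated block with "A" and "C"
b += 1          # warn-mode: FutureWarning about setting a value on a view
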
tm.assert_series_equal(rs, xp) + # TODO(CoW-warn) should not need to warn + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_axis(self, interp_method, request, using_array_manager): # axis interpolation, method = interp_method diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 9d90111be6075..471b9eaf936ad 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -50,7 +50,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert _last_df is not None and not _last_df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(request, using_copy_on_write): +def test_to_dict_of_blocks_item_cache(request, using_copy_on_write, warn_copy_on_write): if using_copy_on_write: request.applymarker(pytest.mark.xfail(reason="CoW - not yet implemented")) # Calling to_dict_of_blocks should not poison item_cache @@ -68,6 +68,11 @@ def test_to_dict_of_blocks_item_cache(request, using_copy_on_write): # this currently still updates df, so this test fails ser.values[0] = "foo" assert df.loc[0, "b"] == "a" + elif warn_copy_on_write: + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + # with warning mode, the item cache is disabled + assert df["b"] is not ser else: # Check that the to_dict_of_blocks didn't break link between ser and df ser.values[0] = "foo" diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index f30db91f82b60..4136d641ef67f 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -147,7 +147,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): index=pd.MultiIndex( levels=[ pd.Index(["Anne", "Beth", "John"]), - pd.Index(["Louise", "Smith", nulls_fixture]), + pd.Index(["Louise", "Smith", np.nan]), ], codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 04821d0865887..2b392ddcfb44d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -320,7 +320,11 @@ def test_attrs_deepcopy(self): @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( - self, allows_duplicate_labels, frame_or_series, using_copy_on_write + self, + allows_duplicate_labels, + frame_or_series, + using_copy_on_write, + warn_copy_on_write, ): obj = DataFrame({"A": [1, 2]}) key = (0, 0) @@ -348,13 +352,15 @@ def test_set_flags( else: assert np.may_share_memory(obj["A"].values, result["A"].values) - result.iloc[key] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[key] = 0 if using_copy_on_write: assert obj.iloc[key] == 1 else: assert obj.iloc[key] == 0 # set back to 1 for test below - result.iloc[key] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[key] = 1 # Now we do copy. 
result = obj.set_flags( diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index cb21ac6b83ee9..f54db07824daf 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -90,8 +90,10 @@ def test_preserve_getitem(self): assert df.loc[[0]].flags.allows_duplicate_labels is False assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False - def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write): - if not using_copy_on_write: + def test_ndframe_getitem_caching_issue( + self, request, using_copy_on_write, warn_copy_on_write + ): + if not (using_copy_on_write or warn_copy_on_write): request.applymarker(pytest.mark.xfail(reason="Unclear behavior.")) # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4a8a0851d2e42..de8ceb30b565b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -41,7 +41,9 @@ def test_repr(): assert result == expected -def test_groupby_std_datetimelike(): +# TODO(CoW-warn) this should NOT warn +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") +def test_groupby_std_datetimelike(warn_copy_on_write): # GH#48481 tdi = pd.timedelta_range("1 Day", periods=10000) ser = Series(tdi) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8c2b95ba631ee..01768582299eb 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -297,6 +297,23 @@ def test_grouper_creation_bug3(self): expected = ser.groupby(level="one").sum() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", [False, True]) + def test_grouper_returning_tuples(self, func): + # GH 22257 , both with dict and with callable + df = DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + mapping = dict(zip(range(4), [("C", 5), ("D", 6)] * 2)) + + if func: + gb = df.groupby(by=lambda idx: mapping[idx], sort=False) + else: + gb = df.groupby(by=mapping, sort=False) + + name, expected = next(iter(gb)) + assert name == ("C", 5) + result = gb.get_group(name) + + tm.assert_frame_equal(result, expected) + def test_grouper_column_and_index(self): # GH 14327 diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 5bc76340badaf..4af6b5ca1a6a7 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -12,7 +12,9 @@ class TestPeriodIndex: - def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): + def test_getitem_periodindex_duplicates_string_slice( + self, using_copy_on_write, warn_copy_on_write + ): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) @@ -21,7 +23,8 @@ def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) - result[:] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + result[:] = 1 if using_copy_on_write: tm.assert_series_equal(ts, original) else: diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 
1fe3a1275d8d6..9bf7c601e4db0 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -43,7 +43,7 @@ def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view -def test_cache_updating(using_copy_on_write): +def test_cache_updating(using_copy_on_write, warn_copy_on_write): # 5216 # make sure that we don't try to set a dead cache a = np.random.default_rng(2).random((10, 3)) @@ -60,7 +60,9 @@ def test_cache_updating(using_copy_on_write): df.loc[0]["z"].iloc[0] = 1.0 assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"] else: - df.loc[0]["z"].iloc[0] = 1.0 + # TODO(CoW-warn) should raise custom warning message about chaining? + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[0]["z"].iloc[0] = 1.0 result = df.loc[(0, 0), "z"] assert result == 1 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 081da385ebcc3..9cf11b4602cb2 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -122,7 +122,10 @@ def test_getitem_partial_column_select(self): # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view @td.skip_array_manager_invalid_test def test_partial_set( - self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write + self, + multiindex_year_month_day_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, ): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data @@ -137,7 +140,9 @@ def test_partial_set( df["A"].loc[2000, 4] = 1 df.loc[(2000, 4), "A"] = 1 else: - df["A"].loc[2000, 4] = 1 + # TODO(CoW-warn) should raise custom warning message about chaining? + with tm.assert_cow_warning(warn_copy_on_write): + df["A"].loc[2000, 4] = 1 exp.iloc[65:85, 0] = 1 tm.assert_frame_equal(df, exp) @@ -151,7 +156,9 @@ def test_partial_set( df["A"].iloc[14] = 5 df["A"].iloc[14] == exp["A"].iloc[14] else: - df["A"].iloc[14] = 5 + # TODO(CoW-warn) should raise custom warning message about chaining? 
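The chained pattern these TODOs keep flagging, sketched on a flat frame (the MultiIndex fixture adds nothing to the mechanics): the first form does a getitem followed by a setitem on the intermediate object, the second writes in one step.

import pandas as pd

df = pd.DataFrame({"A": range(5), "B": range(5)})
df["A"].iloc[2] = 99  # chained: warns in warn-mode, no-op under full CoW
df.loc[2, "A"] = 99   # single-step .loc write, never chained
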
+ with tm.assert_cow_warning(warn_copy_on_write): + df["A"].iloc[14] = 5 assert df["A"].iloc[14] == 5 @pytest.mark.parametrize("dtype", [int, float]) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index ec787a475575d..3237c8f52797a 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -390,6 +390,7 @@ def test_loc_getitem_tuple_plus_columns( expected = df.loc[2000, 1, 6][["A", "B", "C"]] tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_loc_getitem_setitem_slice_integers(self, frame_or_series): index = MultiIndex( levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] @@ -421,7 +422,7 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) def test_set_column_scalar_with_loc( - self, multiindex_dataframe_random_data, using_copy_on_write + self, multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] @@ -431,7 +432,8 @@ def test_set_column_scalar_with_loc( frame_original = frame.copy() col = frame["B"] - col[subset] = 97 + with tm.assert_cow_warning(warn_copy_on_write): + col[subset] = 97 if using_copy_on_write: # chained setitem doesn't work with CoW tm.assert_frame_equal(frame, frame_original) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 6a78b2243f07e..cef3dca054758 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -739,6 +739,7 @@ def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_dat expected = s.reindex(s.index[5:]) tm.assert_series_equal(result, expected) + s = ymd["A"].copy() exp = ymd["A"].copy() s[5:] = 0 exp.iloc[5:] = 0 diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 6d70a6c59aa6b..21aab6652a300 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -32,7 +32,9 @@ def random_text(nobs=100): class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): + def test_slice_consolidate_invalidate_item_cache( + self, using_copy_on_write, warn_copy_on_write + ): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): # #3970 @@ -49,7 +51,9 @@ def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.17 else: - df["bb"].iloc[0] = 0.17 + # TODO(CoW-warn) custom warning message + with tm.assert_cow_warning(warn_copy_on_write): + df["bb"].iloc[0] = 0.17 df._clear_item_cache() if not using_copy_on_write: tm.assert_almost_equal(df["bb"][0], 0.17) @@ -74,7 +78,9 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self, using_copy_on_write): + def test_setitem_cache_updating_slices( + self, using_copy_on_write, warn_copy_on_write + ): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -102,7 +108,9 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write): with tm.raises_chained_assignment_error(): 
out[row["C"]][six:eix] = v else: - out[row["C"]][six:eix] = v + # TODO(CoW-warn) custom warning message + with tm.assert_cow_warning(warn_copy_on_write): + out[row["C"]][six:eix] = v if not using_copy_on_write: tm.assert_frame_equal(out, expected) @@ -113,17 +121,23 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write): out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): - out.loc[six:eix, row["C"]] += row["D"] + # TODO(CoW-warn) should not warn + with tm.assert_produces_warning( + FutureWarning if warn_copy_on_write else None + ): + out.loc[six:eix, row["C"]] += row["D"] tm.assert_frame_equal(out, expected) tm.assert_series_equal(out["A"], expected["A"]) - def test_altering_series_clears_parent_cache(self, using_copy_on_write): + def test_altering_series_clears_parent_cache( + self, using_copy_on_write, warn_copy_on_write + ): # GH #33675 df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) ser = df["A"] - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: assert "A" not in df._item_cache else: assert "A" in df._item_cache @@ -138,7 +152,7 @@ def test_altering_series_clears_parent_cache(self, using_copy_on_write): class TestChaining: - def test_setitem_chained_setfault(self, using_copy_on_write): + def test_setitem_chained_setfault(self, using_copy_on_write, warn_copy_on_write): # GH6026 data = ["right", "left", "left", "left", "right", "left", "timeout"] mdata = ["right", "left", "left", "left", "right", "left", "none"] @@ -150,6 +164,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": data})) else: + # TODO(CoW-warn) should warn + # with tm.assert_cow_warning(warn_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata})) @@ -161,6 +177,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": data})) else: + # TODO(CoW-warn) should warn + # with tm.assert_cow_warning(warn_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata})) @@ -172,6 +190,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, df_original) else: + # TODO(CoW-warn) should warn + # with tm.assert_cow_warning(warn_copy_on_write): df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) @@ -183,7 +203,9 @@ def test_setitem_chained_setfault(self, using_copy_on_write): df["A"].iloc[0] = np.nan expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) else: - df["A"].iloc[0] = np.nan + # TODO(CoW-warn) custom warning message + with tm.assert_cow_warning(warn_copy_on_write): + df["A"].iloc[0] = np.nan expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) result = df.head() tm.assert_frame_equal(result, expected) @@ -193,7 +215,8 @@ def test_setitem_chained_setfault(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df.A.iloc[0] = np.nan else: - df.A.iloc[0] = np.nan + with tm.assert_cow_warning(warn_copy_on_write): + df.A.iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) @@ -636,7 +659,9 @@ def test_cache_updating2(self, using_copy_on_write): expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - def test_iloc_setitem_chained_assignment(self, 
using_copy_on_write): + def test_iloc_setitem_chained_assignment( + self, using_copy_on_write, warn_copy_on_write + ): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -648,7 +673,9 @@ def test_iloc_setitem_chained_assignment(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.13 else: - df["bb"].iloc[0] = 0.13 + # TODO(CoW-warn) custom warning message + with tm.assert_cow_warning(warn_copy_on_write): + df["bb"].iloc[0] = 0.13 # GH#3970 this lookup used to break the chained setting to 0.15 df.iloc[ck] @@ -657,7 +684,9 @@ def test_iloc_setitem_chained_assignment(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.15 else: - df["bb"].iloc[0] = 0.15 + # TODO(CoW-warn) custom warning message + with tm.assert_cow_warning(warn_copy_on_write): + df["bb"].iloc[0] = 0.15 if not using_copy_on_write: assert df["bb"].iloc[0] == 0.15 diff --git a/pandas/tests/indexing/test_iat.py b/pandas/tests/indexing/test_iat.py index 4497c16efdfda..b4c4b81ac9cfb 100644 --- a/pandas/tests/indexing/test_iat.py +++ b/pandas/tests/indexing/test_iat.py @@ -5,6 +5,7 @@ Series, period_range, ) +import pandas._testing as tm def test_iat(float_frame): @@ -30,7 +31,9 @@ def test_iat_getitem_series_with_period_index(): assert expected == result -def test_iat_setitem_item_cache_cleared(indexer_ial, using_copy_on_write): +def test_iat_setitem_item_cache_cleared( + indexer_ial, using_copy_on_write, warn_copy_on_write +): # GH#45684 data = {"x": np.arange(8, dtype=np.int64), "y": np.int64(0)} df = DataFrame(data).copy() @@ -38,9 +41,12 @@ def test_iat_setitem_item_cache_cleared(indexer_ial, using_copy_on_write): # previously this iat setting would split the block and fail to clear # the item_cache. - indexer_ial(df)[7, 0] = 9999 + with tm.assert_cow_warning(warn_copy_on_write and indexer_ial is tm.iloc): + indexer_ial(df)[7, 0] = 9999 - indexer_ial(df)[7, 1] = 1234 + # TODO(CoW-warn) should also warn for iat? 
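The asymmetry this TODO records, sketched with the same frame as the test; the warning-mode spelling below is the pandas 2.2 one and is an assumption of the sketch:

import numpy as np
import pandas as pd

pd.set_option("mode.copy_on_write", "warn")
df = pd.DataFrame({"x": np.arange(8, dtype=np.int64), "y": np.int64(0)})
ser = df["y"]         # holds a reference into df's single int64 block
df.iloc[7, 0] = 9999  # warns: goes through the iloc setitem path
df.iat[7, 1] = 1234   # currently silent: scalar fast path, hence the TODO
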
+ with tm.assert_cow_warning(warn_copy_on_write and indexer_ial is tm.iloc): + indexer_ial(df)[7, 1] = 1234 assert df.iat[7, 1] == 1234 if not using_copy_on_write: diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 558ad7ded5619..5c0c1b42ca963 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -424,6 +424,8 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) + # TODO(CoW-warn) this should NOT warn + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_iloc_setitem(self): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), @@ -835,7 +837,9 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_copy_on_write): + def test_identity_slice_returns_new_object( + self, using_copy_on_write, warn_copy_on_write + ): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] @@ -846,6 +850,8 @@ def test_identity_slice_returns_new_object(self, using_copy_on_write): # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW + # TODO(CoW-warn) this should warn + # with tm.assert_cow_warning(warn_copy_on_write): original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() @@ -857,7 +863,8 @@ def test_identity_slice_returns_new_object(self, using_copy_on_write): assert sliced_series is not original_series # should also be a shallow copy - original_series[:3] = [7, 8, 9] + with tm.assert_cow_warning(warn_copy_on_write): + original_series[:3] = [7, 8, 9] if using_copy_on_write: # shallow copy not updated (CoW) assert all(sliced_series[:3] == [1, 2, 3]) @@ -1221,7 +1228,9 @@ def test_iloc_setitem_multicolumn_to_datetime(self): class TestILocErrors: # NB: this test should work for _any_ Series we can pass as # series_with_simple_index - def test_iloc_float_raises(self, series_with_simple_index, frame_or_series): + def test_iloc_float_raises( + self, series_with_simple_index, frame_or_series, warn_copy_on_write + ): # GH#4892 # float_indexers should raise exceptions # on appropriate Index types & accessors @@ -1238,7 +1247,10 @@ def test_iloc_float_raises(self, series_with_simple_index, frame_or_series): obj.iloc[3.0] with pytest.raises(IndexError, match=_slice_iloc_msg): - obj.iloc[3.0] = 0 + with tm.assert_cow_warning( + warn_copy_on_write and frame_or_series is DataFrame + ): + obj.iloc[3.0] = 0 def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): with pytest.raises(IndexingError, match="Too many indexers"): @@ -1401,7 +1413,7 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self, using_copy_on_write): + def test_iloc(self, using_copy_on_write, warn_copy_on_write): ser = Series( np.random.default_rng(2).standard_normal(10), index=list(range(0, 20, 2)) ) @@ -1420,7 +1432,8 @@ def test_iloc(self, using_copy_on_write): # test slice is a view with tm.assert_produces_warning(None): # GH#45324 make sure we aren't giving a spurious FutureWarning - result[:] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result[:] = 0 if using_copy_on_write: tm.assert_series_equal(ser, ser_original) else: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index dfbf30d06e82c..bdbbcabcaab0e 100644 --- 
a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -515,7 +515,7 @@ def test_multi_assign_broadcasting_rhs(self): for col in ["A", "B"]: expected.loc[mask, col] = df["D"] - df.loc[df["A"] == 0, ["A", "B"]] = df["D"] + df.loc[df["A"] == 0, ["A", "B"]] = df["D"].copy() tm.assert_frame_equal(df, expected) def test_setitem_list(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 240ce71c46093..35c50d6c705d1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1081,7 +1081,9 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_copy_on_write): + def test_identity_slice_returns_new_object( + self, using_copy_on_write, warn_copy_on_write + ): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) @@ -1095,6 +1097,8 @@ def test_identity_slice_returns_new_object(self, using_copy_on_write): # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW + # TODO(CoW-warn) should warn + # with tm.assert_cow_warning(warn_copy_on_write): original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() @@ -1103,7 +1107,7 @@ def test_identity_slice_returns_new_object(self, using_copy_on_write): # These should not return copies df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: assert df[0] is not df.loc[:, 0] else: assert df[0] is df.loc[:, 0] @@ -1114,7 +1118,8 @@ def test_identity_slice_returns_new_object(self, using_copy_on_write): assert sliced_series is not original_series assert original_series[:] is not original_series - original_series[:3] = [7, 8, 9] + with tm.assert_cow_warning(warn_copy_on_write): + original_series[:3] = [7, 8, 9] if using_copy_on_write: assert all(sliced_series[:3] == [1, 2, 3]) else: @@ -2627,7 +2632,9 @@ def test_loc_setitem_boolean_and_column(self, float_frame): expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(float_frame, expected) - def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): + def test_loc_setitem_ndframe_values_alignment( + self, using_copy_on_write, warn_copy_on_write + ): # GH#45501 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df.loc[[False, False, True], ["a"]] = DataFrame( @@ -2650,7 +2657,8 @@ def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() ser = df["a"] - ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) + with tm.assert_cow_warning(warn_copy_on_write): + ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -2660,21 +2668,21 @@ def test_loc_indexer_empty_broadcast(self): # GH#51450 df = DataFrame({"a": [], "b": []}, dtype=object) expected = df.copy() - df.loc[np.array([], dtype=np.bool_), ["a"]] = df["a"] + df.loc[np.array([], dtype=np.bool_), ["a"]] = df["a"].copy() tm.assert_frame_equal(df, expected) def test_loc_indexer_all_false_broadcast(self): # GH#51450 df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object) expected = df.copy() - df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"] + df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"].copy() tm.assert_frame_equal(df, 
expected) def test_loc_indexer_length_one(self): # GH#51435 df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object) expected = DataFrame({"a": ["y"], "b": ["y"]}, dtype=object) - df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"] + df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"].copy() tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index d4004ade02318..3d04cc764563f 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -260,6 +260,7 @@ def test_partial_setting(self): with pytest.raises(IndexError, match=msg): s.iat[3] = 5.0 + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_partial_setting_frame(self, using_array_manager): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 5b95eb65ff00a..29e3dc0aebe95 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -246,6 +246,7 @@ def test_at_with_tuple_index_get(): assert series.at[(1, 2)] == 1 +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_at_with_tuple_index_set(): # GH 26989 # DataFrame.at setter works with Index of tuples @@ -276,6 +277,7 @@ def test_multiindex_at_get(self): assert series.at[1, 3] == 1 assert series.loc[1, 3] == 1 + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_multiindex_at_set(self): # GH 26989 # DataFrame.at and DataFrame.loc setter works with MultiIndex diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3abbd14c20e16..0c28db245de31 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -34,6 +34,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_override_set_noconvert_columns(): @@ -119,7 +120,6 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -128,11 +128,17 @@ def test_1000_sep(all_parsers): """ expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep="|", thousands=",") + return + result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 @@ -161,7 +167,6 @@ def test_csv_mixed_type(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -174,6 +179,13 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): 2,2,3,4 3,3,4,5 """ + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + return + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) expected = 
DataFrame(columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -212,7 +224,6 @@ def test_read_csv_dataframe(all_parsers, csv1): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) def test_read_nrows(all_parsers, nrows): # see gh-10476 @@ -230,11 +241,16 @@ def test_read_nrows(all_parsers, nrows): ) parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + return + result = parser.read_csv(StringIO(data), nrows=nrows) tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D @@ -247,6 +263,8 @@ def test_read_nrows_bad(all_parsers, nrows): """ msg = r"'nrows' must be an integer >=0" parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), nrows=nrows) @@ -261,7 +279,7 @@ def test_nrows_skipfooter_errors(all_parsers): parser.read_csv(StringIO(data), skipfooter=1, nrows=5) -@xfail_pyarrow +@skip_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -277,7 +295,6 @@ def test_missing_trailing_delimiters(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -289,6 +306,18 @@ def test_skip_initial_space(all_parsers): ) parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + return + result = parser.read_csv( StringIO(data), names=list(range(33)), @@ -338,7 +367,7 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -370,7 +399,7 @@ def test_escapechar(all_parsers): tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) -@xfail_pyarrow +@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers @@ -381,7 +410,7 @@ def test_ignore_leading_whitespace(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -404,7 +433,7 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -437,7 +466,6 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -467,6 +495,12 @@ def test_trailing_spaces(all_parsers, kwargs, expected): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers + if parser.engine == 
"pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + return + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) @@ -488,7 +522,6 @@ def test_read_filepath_or_buffer(all_parsers): parser.read_csv(filepath_or_buffer=b"input") -@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -501,6 +534,15 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): b\n""" expected = DataFrame({"MyColumn": list("abab")}) + + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) + return + result = parser.read_csv( StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace ) @@ -552,7 +594,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -568,7 +610,7 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators @pytest.mark.parametrize( "data,expected", [ @@ -666,7 +708,7 @@ def test_read_csv_and_table_sys_setprofile(all_parsers, read_func): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers @@ -677,7 +719,7 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -688,7 +730,6 @@ def test_first_row_bom_unquoted(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -698,11 +739,20 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): ) csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False + ) + return + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) tm.assert_frame_equal(df, ref[:nrows]) -@xfail_pyarrow +@skip_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -731,11 +781,16 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) -@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True) + return result = parser.read_table(f, delim_whitespace=True) expected = DataFrame({"a": [1, 4], "b": [-2, 5], 
"c": [-3, 6]}) tm.assert_frame_equal(result, expected) @@ -798,7 +853,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" @@ -811,7 +866,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 parser = all_parsers @@ -839,7 +894,7 @@ def test_malformed_second_line(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_short_single_line(all_parsers): # GH 47566 parser = all_parsers @@ -850,7 +905,7 @@ def test_short_single_line(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements def test_short_multi_line(all_parsers): # GH 47566 parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_data_list.py b/pandas/tests/io/parser/common/test_data_list.py index 3b0ff9e08d349..5c798316e2cea 100644 --- a/pandas/tests/io/parser/common/test_data_list.py +++ b/pandas/tests/io/parser/common/test_data_list.py @@ -16,10 +16,10 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@skip_pyarrow def test_read_data_list(all_parsers): parser = all_parsers kwargs = {"index_col": 0} diff --git a/pandas/tests/io/parser/common/test_decimal.py b/pandas/tests/io/parser/common/test_decimal.py index b8a68c138eeff..4ceca037f589a 100644 --- a/pandas/tests/io/parser/common/test_decimal.py +++ b/pandas/tests/io/parser/common/test_decimal.py @@ -13,10 +13,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -42,6 +39,14 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): parser = all_parsers expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + return + result = parser.read_csv( StringIO(data), sep="|", thousands=thousands, decimal=decimal ) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 5d5814e880f8b..a6e68cb984ef4 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -27,6 +27,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.network @@ -214,8 +215,14 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): # see gh-10728, gh-10548 parser = all_parsers + if parser.engine == "pyarrow" and "comment" in kwargs: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + if parser.engine == "pyarrow" and "\r" not in data: - mark = pytest.mark.xfail(reason="The 
'comment' option is not supported") + mark = pytest.mark.xfail(reason="Mismatched exception type/message") request.applymarker(mark) if expected is None: @@ -356,7 +363,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed -@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_memory_map_compression(all_parsers, compression): """ Support memory map for compressed files. @@ -369,19 +375,32 @@ def test_memory_map_compression(all_parsers, compression): with tm.ensure_clean() as path: expected.to_csv(path, index=False, compression=compression) - tm.assert_frame_equal( - parser.read_csv(path, memory_map=True, compression=compression), - expected, - ) + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression=compression) + return + + result = parser.read_csv(path, memory_map=True, compression=compression) + + tm.assert_frame_equal( + result, + expected, + ) -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_context_manager(all_parsers, datapath): # make sure that opened files are closed parser = all_parsers path = datapath("io", "data", "csv", "iris.csv") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: @@ -392,12 +411,17 @@ def test_context_manager(all_parsers, datapath): assert reader.handles.handle.closed -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_context_manageri_user_provided(all_parsers, datapath): # make sure that user-provided handles are not closed parser = all_parsers with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path: + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: @@ -408,7 +432,7 @@ def test_context_manageri_user_provided(all_parsers, datapath): assert not reader.handles.handle.closed -@xfail_pyarrow # ParserError: Empty CSV file +@skip_pyarrow # ParserError: Empty CSV file def test_file_descriptor_leak(all_parsers, using_copy_on_write): # GH 31488 parser = all_parsers @@ -417,7 +441,6 @@ def test_file_descriptor_leak(all_parsers, using_copy_on_write): parser.read_csv(path) -@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers @@ -426,5 +449,11 @@ def test_memory_map(all_parsers, csv_dir_path): {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} ) + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(mmap_file, memory_map=True) + return + result = parser.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 63ad3bcb249ea..4b23774ee2d5b 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ 
b/pandas/tests/io/parser/common/test_float.py @@ -16,9 +16,10 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow # ParserError: CSV parse error: Empty CSV file or block +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -50,7 +51,7 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): # GH#38753 diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 7df14043f478c..038c684c90c9e 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -20,6 +20,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.parametrize( @@ -108,7 +109,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -142,7 +143,7 @@ def test_multi_index_no_level_names_implicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "data,expected,header", [ @@ -164,7 +165,7 @@ def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.columns are different def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -207,7 +208,7 @@ def test_read_duplicate_index_explicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -235,7 +236,7 @@ def test_read_duplicate_index_implicit(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -263,7 +264,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -275,7 +276,7 @@ def test_empty_with_index(all_parsers): # CSV parse error: Empty CSV file or block: cannot infer number of columns -@xfail_pyarrow +@skip_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 data = "x,y,z" @@ -289,7 +290,7 @@ def test_empty_with_multi_index(all_parsers): # CSV parse error: Empty CSV file or block: cannot infer number of columns -@xfail_pyarrow +@skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_inf.py b/pandas/tests/io/parser/common/test_inf.py index e1dc87ed0071e..74596b178d35d 100644 --- a/pandas/tests/io/parser/common/test_inf.py +++ b/pandas/tests/io/parser/common/test_inf.py @@ -20,7 +20,7 @@ xfail_pyarrow = 
pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.index are different @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -44,7 +44,7 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.index are different @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 086b43be59823..a3167346c64ef 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -18,6 +18,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_int_conversion(all_parsers): @@ -126,10 +127,8 @@ def test_int64_min_issues(all_parsers): tm.assert_frame_equal(result, expected) -# ValueError: The 'converters' option is not supported with the 'pyarrow' engine -@xfail_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) -def test_int64_overflow(all_parsers, conv): +def test_int64_overflow(all_parsers, conv, request): data = """ID 00013007854817840016671868 00013007854817840016749251 @@ -143,6 +142,10 @@ def test_int64_overflow(all_parsers, conv): if conv is None: # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="parses to float64") + request.applymarker(mark) + result = parser.read_csv(StringIO(data)) expected = DataFrame( [ @@ -161,17 +164,23 @@ def test_int64_overflow(all_parsers, conv): # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. 
- msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" + msg = "|".join( + [ + "Python int too large to convert to C long", + "long too big to convert", + "int too big to convert", + ] ) + err = OverflowError + if parser.engine == "pyarrow": + err = ValueError + msg = "The 'converters' option is not supported with the 'pyarrow' engine" - with pytest.raises(OverflowError, match=msg): + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), converters={"ID": conv}) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -185,7 +194,7 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py index 26619857bd231..a521c84aa007d 100644 --- a/pandas/tests/io/parser/common/test_iterator.py +++ b/pandas/tests/io/parser/common/test_iterator.py @@ -15,10 +15,8 @@ pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow # ValueError: The 'iterator' option is not supported def test_iterator(all_parsers): # see gh-6607 data = """index,A,B,C,D @@ -33,6 +31,13 @@ def test_iterator(all_parsers): kwargs = {"index_col": 0} expected = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, **kwargs) + return + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: first_chunk = reader.read(3) tm.assert_frame_equal(first_chunk, expected[:3]) @@ -41,7 +46,6 @@ def test_iterator(all_parsers): tm.assert_frame_equal(last_chunk, expected[3:]) -@xfail_pyarrow # ValueError: The 'iterator' option is not supported def test_iterator2(all_parsers): parser = all_parsers data = """A,B,C @@ -50,6 +54,12 @@ def test_iterator2(all_parsers): baz,7,8,9 """ + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True) + return + with parser.read_csv(StringIO(data), iterator=True) as reader: result = list(reader) @@ -61,7 +71,6 @@ def test_iterator2(all_parsers): tm.assert_frame_equal(result[0], expected) -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_iterator_stop_on_chunksize(all_parsers): # gh-3967: stopping iteration when chunksize is specified parser = all_parsers @@ -70,6 +79,11 @@ def test_iterator_stop_on_chunksize(all_parsers): bar,4,5,6 baz,7,8,9 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=1) + return with parser.read_csv(StringIO(data), chunksize=1) as reader: result = list(reader) @@ -83,7 +97,6 @@ def test_iterator_stop_on_chunksize(all_parsers): 
tm.assert_frame_equal(concat(result), expected) -@xfail_pyarrow # AssertionError: Regex pattern did not match @pytest.mark.parametrize( "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}] ) @@ -92,6 +105,12 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs): parser = all_parsers data = "a\n1\n2" + if parser.engine == "pyarrow": + msg = ( + "The '(chunksize|iterator)' option is not supported with the " + "'pyarrow' engine" + ) + with pytest.raises(ValueError, match=msg): with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: pass diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 52ddb38192a6b..f3794c056a256 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -22,6 +22,7 @@ import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_empty_decimal_marker(all_parsers): @@ -63,7 +64,6 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) -@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_malformed(all_parsers): # see gh-6607 parser = all_parsers @@ -74,11 +74,14 @@ def test_malformed(all_parsers): 2,3,4 """ msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): + err = ParserError + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + err = ValueError + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), header=1, comment="#") -@xfail_pyarrow # ValueError: The 'iterator' option is not supported @pytest.mark.parametrize("nrows", [5, 3, None]) def test_malformed_chunks(all_parsers, nrows): data = """ignore @@ -90,6 +93,20 @@ def test_malformed_chunks(all_parsers, nrows): 2,3,4 """ parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=1, + comment="#", + iterator=True, + chunksize=1, + skiprows=[2], + ) + return + msg = "Expected 3 fields in line 6, saw 5" with parser.read_csv( StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] @@ -123,7 +140,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -239,19 +256,21 @@ def test_null_byte_char(request, all_parsers): parser.read_csv(StringIO(data), names=names) -# ValueError: the 'pyarrow' engine does not support sep=None with delim_whitespace=False -@xfail_pyarrow @pytest.mark.filterwarnings("always::ResourceWarning") def test_open_file(request, all_parsers): # GH 39024 parser = all_parsers + + msg = "Could not determine delimiter" + err = csv.Error if parser.engine == "c": - request.applymarker( - pytest.mark.xfail( - reason=f"{parser.engine} engine does not support sep=None " - f"with delim_whitespace=False" - ) + msg = "the 'c' engine does not support sep=None with delim_whitespace=False" + err = ValueError + elif parser.engine == "pyarrow": + msg = ( + "the 'pyarrow' engine does not support sep=None with 
delim_whitespace=False" ) + err = ValueError with tm.ensure_clean() as path: file = Path(path) @@ -259,7 +278,7 @@ def test_open_file(request, all_parsers): with tm.assert_produces_warning(None): # should not trigger a ResourceWarning - with pytest.raises(csv.Error, match="Could not determine delimiter"): + with pytest.raises(err, match=msg): parser.read_csv(file, sep=None, encoding_errors="replace") diff --git a/pandas/tests/io/parser/common/test_verbose.py b/pandas/tests/io/parser/common/test_verbose.py index bcfb9cd4032ad..14deba8b40b22 100644 --- a/pandas/tests/io/parser/common/test_verbose.py +++ b/pandas/tests/io/parser/common/test_verbose.py @@ -6,10 +6,7 @@ import pytest -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -@xfail_pyarrow # ValueError: The 'verbose' option is not supported def test_verbose_read(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -22,6 +19,12 @@ def test_verbose_read(all_parsers, capsys): one,1,2,3 two,1,2,3""" + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), verbose=True) + return + # Engines are verbose in different ways. parser.read_csv(StringIO(data), verbose=True) captured = capsys.readouterr() @@ -33,7 +36,6 @@ def test_verbose_read(all_parsers, capsys): assert captured.out == "Filled 3 NA values in column a\n" -@xfail_pyarrow # ValueError: The 'verbose' option is not supported def test_verbose_read2(all_parsers, capsys): parser = all_parsers data = """a,b,c,d @@ -46,6 +48,12 @@ def test_verbose_read2(all_parsers, capsys): seven,1,2,3 eight,1,2,3""" + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), verbose=True, index_col=0) + return + parser.read_csv(StringIO(data), verbose=True, index_col=0) captured = capsys.readouterr() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 471f525e229e5..eb7835bb27372 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -282,6 +282,8 @@ def numeric_decimal(request): def pyarrow_xfail(request): """ Fixture that xfails a test if the engine is pyarrow. + + Use if failure is do to unsupported keywords or inconsistent results. """ if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") @@ -293,3 +295,21 @@ def pyarrow_xfail(request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") request.applymarker(mark) + + +@pytest.fixture +def pyarrow_skip(request): + """ + Fixture that skips a test if the engine is pyarrow. 
diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index c7586bd9334ef..f4aff14a5ce32 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -27,7 +27,7 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize( "dtype", [ @@ -76,7 +76,7 @@ def test_categorical_dtype_single(all_parsers, dtype, request): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -95,7 +95,7 @@ def test_categorical_dtype_unsorted(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -114,7 +114,7 @@ def test_categorical_dtype_missing(all_parsers): tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch): # see gh-18186 @@ -146,8 +146,6 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): tm.assert_frame_equal(actual, expected) -# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine -@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -160,6 +158,13 @@ DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), ] + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) + return + with parser.read_csv( StringIO(data), dtype={"b": "category"}, chunksize=2 ) as actuals: @@ -167,8 +172,6 @@ tm.assert_frame_equal(actual, expected) -# ValueError: The 'chunksize' option is not supported with the 'pyarrow' engine -@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -186,6 +189,13 @@ ), ] dtype = CategoricalDtype(cats) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + return + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: for actual, expected in zip(actuals,
expecteds): tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 3f3d340ab2e08..32b4b1dedc3cb 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -73,7 +73,6 @@ def test_dtype_per_column(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.usefixtures("pyarrow_xfail") def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -87,7 +86,6 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@pytest.mark.usefixtures("pyarrow_xfail") def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -96,22 +94,31 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): 2001,,11 2001,106380451,67""" - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) + if parser.engine == "c": + msg = "Integer column has NA values" + elif parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + else: + msg = "Unable to convert column DOY" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) -@pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b 1.1,2.2 1.2,2.3""" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + return + # Dtype spec ignored if converted specified. 
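# For the c and python engines, the call below is asserted to raise
# ParserWarning ("Both a converter and dtype were specified for column a
# - only the converter will be used.") and the converter's string values
# win over the requested "i8" dtype.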
result = parser.read_csv_check_warnings( ParserWarning, diff --git a/pandas/tests/io/parser/dtypes/test_empty.py b/pandas/tests/io/parser/dtypes/test_empty.py index 8759c52485533..f34385b190c5f 100644 --- a/pandas/tests/io/parser/dtypes/test_empty.py +++ b/pandas/tests/io/parser/dtypes/test_empty.py @@ -17,10 +17,10 @@ ) import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -30,7 +30,7 @@ def test_dtype_all_columns_empty(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -43,7 +43,7 @@ def test_empty_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -58,7 +58,7 @@ def test_empty_with_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -75,7 +75,7 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): parser = all_parsers @@ -88,7 +88,7 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -101,7 +101,7 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -171,7 +171,7 @@ def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): ), ], ) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_dtype(all_parsers, dtype, expected): # see gh-14712 parser = all_parsers diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 9e1200c142d6b..3580c040688d8 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -24,6 +24,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_bytes_io_input(all_parsers): @@ -37,7 +38,7 @@ def test_bytes_io_input(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_read_csv_unicode(all_parsers): parser = all_parsers data = 
BytesIO("\u0141aski, Jan;1".encode()) @@ -47,7 +48,7 @@ def test_read_csv_unicode(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -237,7 +238,7 @@ def test_parse_encoded_special_characters(encoding): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: The 'memory_map' option is not supported @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) def test_encoding_memory_map(all_parsers, encoding): # GH40986 @@ -255,7 +256,7 @@ def test_encoding_memory_map(all_parsers, encoding): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_chunk_splits_multibyte_char(all_parsers): """ Chunk splits a multibyte character with memory_map=True @@ -275,7 +276,7 @@ def test_chunk_splits_multibyte_char(all_parsers): tm.assert_frame_equal(dfr, df) -@xfail_pyarrow +@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_readcsv_memmap_utf8(all_parsers): """ GH 43787 diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 2edb389a0c830..f55f8497f318c 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -23,6 +23,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow # TypeError: an integer is required @@ -79,7 +80,7 @@ def test_bool_header_arg(all_parsers, header): parser.read_csv(StringIO(data), header=header) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame are different def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -183,7 +184,7 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("_TestTuple", ["first", "second"]) -@xfail_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -231,7 +232,7 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -278,7 +279,7 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -419,7 +420,7 @@ def test_header_names_backward_compat(all_parsers, data, header, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block: cannot infer +@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -561,7 +562,7 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Expected 2 columns, got 3 +@skip_pyarrow # CSV parse error: Expected 2 columns, got 3 def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): # GH#38453 parser = all_parsers @@ -622,7 +623,7 @@ def test_read_csv_multi_header_length_check(all_parsers): parser.read_csv(StringIO(case), header=[0, 2]) -@xfail_pyarrow # CSV parse error: 
Expected 3 columns, got 2 +@skip_pyarrow # CSV parse error: Expected 3 columns, got 2 def test_header_none_and_implicit_index(all_parsers): # GH#22144 parser = all_parsers diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index b938b129ac38d..ba15d061b2deb 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -20,6 +20,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.parametrize("with_header", [True, False]) @@ -76,7 +77,7 @@ def test_index_col_is_true(all_parsers): parser.read_csv(StringIO(data), index_col=True) -@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -94,7 +95,7 @@ def test_infer_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "index_col,kwargs", [ @@ -143,7 +144,7 @@ def test_index_col_empty_data(all_parsers, index_col, kwargs): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -317,7 +318,7 @@ def test_multiindex_columns_index_col_with_data(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Empty CSV file or block +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_infer_types_boolean_sum(all_parsers): # GH#44079 parser = all_parsers diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 7d148ae6c5a27..1d245f81f027c 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -18,7 +18,7 @@ ) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_basic(all_parsers): parser = all_parsers @@ -29,7 +29,7 @@ def test_basic(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -50,7 +50,7 @@ def test_basic_names_raise(all_parsers): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index @pytest.mark.parametrize( "data,expected", [ @@ -118,7 +118,7 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" @@ -141,7 +141,7 @@ def test_mangled_unnamed_placeholders(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_mangle_dupe_cols_already_exists(all_parsers): # GH#14704 parser = all_parsers @@ -155,7 +155,7 @@ def test_mangle_dupe_cols_already_exists(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): # GH#14704 parser = all_parsers diff --git 
a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 59dae1eaa7e6c..437a5fb5e9f09 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -21,6 +21,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_string_nas(all_parsers): @@ -398,7 +399,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Expected 8 columns, got 5: +@skip_pyarrow # CSV parse error: Expected 8 columns, got 5: def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -630,7 +631,7 @@ def test_nan_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # Failed: DID NOT RAISE def test_bool_and_nan_to_bool(all_parsers): # GH#42808 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 47e654fc606af..70d9171fa3c22 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -41,6 +41,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow @@ -786,7 +787,7 @@ def test_nat_parse(all_parsers): tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -806,7 +807,7 @@ def test_csv_custom_parser(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -2101,7 +2102,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): tm.assert_index_equal(expected, res) -@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index a677d9caa4b19..0a1ba0252f106 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -18,6 +18,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.parametrize( @@ -31,7 +32,7 @@ ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'), ], ) -@xfail_pyarrow # ParserError: CSV parse error: Empty CSV file or block +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block def test_bad_quote_char(all_parsers, kwargs, msg): data = "1,2,3" parser = all_parsers diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9146af3f969e6..47c3739c979a3 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -67,7 +67,7 @@ def test_deep_skip_rows(all_parsers): tm.assert_frame_equal(result, condensed_result) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame are different def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -225,7 +225,7 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # 
AssertionError: DataFrame are different def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index bcb1c6af80df6..042c3814ef72a 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -17,6 +17,7 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @xfail_pyarrow # TypeError: expected bytes, int found @@ -38,7 +39,7 @@ def test_usecols_with_parse_dates(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns +@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 parser = all_parsers diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 7a620768040a7..055be81d2996d 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -30,6 +30,7 @@ ) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning" @@ -148,7 +149,7 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@xfail_pyarrow # CSV parse error in one case, AttributeError in another +@skip_pyarrow # CSV parse error in one case, AttributeError in another @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -191,7 +192,7 @@ def test_usecols_index_col_conflict2(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -337,7 +338,7 @@ def test_callable_usecols(all_parsers, usecols, expected): # ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def test_incomplete_first_row(all_parsers, usecols): # see gh-6710 @@ -350,7 +351,7 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # CSV parse error: Expected 3 columns, got 4 +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py new file mode 100644 index 0000000000000..b2b212ceb2c41 --- /dev/null +++ b/pandas/tests/io/test_gbq.py @@ -0,0 +1,14 @@ +import pandas as pd +import pandas._testing as tm + + +def test_read_gbq_deprecated(): + with tm.assert_produces_warning(FutureWarning): + with tm.external_error_raised(Exception): + pd.read_gbq("fake") + + +def test_to_gbq_deprecated(): + with tm.assert_produces_warning(FutureWarning): + with tm.external_error_raised(Exception): + pd.DataFrame(range(1)).to_gbq("fake") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 2ef1f065f603d..f1fc1174416ca 100644 --- 
a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -333,10 +333,14 @@ def test_invalid_logscale(self, input_param): # GH: 24867 df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) - msg = "Boolean, None and 'sym' are valid options, 'sm' is given." + msg = f"keyword '{input_param}' should be bool, None, or 'sym', not 'sm'" with pytest.raises(ValueError, match=msg): df.plot(**{input_param: "sm"}) + msg = f"PiePlot ignores the '{input_param}' keyword" + with tm.assert_produces_warning(UserWarning, match=msg): + df.plot.pie(subplots=True, **{input_param: True}) + def test_xcompat(self): df = tm.makeTimeDataFrame() ax = df.plot(x_compat=True) @@ -1362,16 +1366,27 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" def test_unordered_ts(self): + # GH#2609, GH#55906 + index = [date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)] + values = [3.0, 2.0, 1.0] df = DataFrame( - np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + np.array(values), + index=index, columns=["test"], ) ax = df.plot() xticks = ax.lines[0].get_xdata() - assert xticks[0] < xticks[1] + tm.assert_numpy_array_equal(xticks, np.array(index, dtype=object)) ydata = ax.lines[0].get_ydata() - tm.assert_numpy_array_equal(ydata, np.array([1.0, 2.0, 3.0])) + tm.assert_numpy_array_equal(ydata, np.array(values)) + + # even though we don't sort the data before passing it to matplotlib, + # the ticks are sorted + xticks = ax.xaxis.get_ticklabels() + xlocs = [x.get_position()[0] for x in xticks] + assert pd.Index(xlocs).is_monotonic_increasing + xlabels = [x.get_text() for x in xticks] + assert pd.to_datetime(xlabels, format="%Y-%m-%d").is_monotonic_increasing @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds) def test_kind_both_ways(self, kind): diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index cfb4e92fb45cd..abd61026b4e37 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -209,3 +211,11 @@ def test_elements_not_in_by_but_in_df(self): msg = r"\{'h'\} not found in left columns" with pytest.raises(KeyError, match=msg): merge_ordered(left, right, on="E", left_by=["G", "h"]) + + @pytest.mark.parametrize("invalid_method", ["linear", "carrot"]) + def test_ffill_validate_fill_method(self, left, right, invalid_method): + # GH 55884 + with pytest.raises( + ValueError, match=re.escape("fill_method must be 'ffill' or None") + ): + merge_ordered(left, right, on="key", fill_method=invalid_method) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index a52d87b1a0457..0e850c2d20e72 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -239,7 +239,7 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, [None, None]]] = 2 -def test_slice(string_series, object_series, using_copy_on_write): +def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_write): original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] @@ -256,7 +256,8 @@ def test_slice(string_series, object_series, using_copy_on_write): # Test return view. 
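# A positional slice of a numpy-backed Series is a view on its parent, so
# the write below reaches string_series unless copy-on-write is active;
# the interim warn-copy-on-write mode turns that write-through into a
# FutureWarning, which tm.assert_cow_warning asserts only when the
# warn_copy_on_write fixture is true.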
sl = string_series[10:20] - sl[:] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + sl[:] = 0 if using_copy_on_write: # Doesn't modify parent (CoW) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index ea439fb5a3263..23dbe85075916 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -10,7 +10,7 @@ class TestCopy: @pytest.mark.parametrize("deep", ["default", None, False, True]) - def test_copy(self, deep, using_copy_on_write): + def test_copy(self, deep, using_copy_on_write, warn_copy_on_write): ser = Series(np.arange(10), dtype="float64") # default deep is True @@ -27,7 +27,8 @@ def test_copy(self, deep, using_copy_on_write): else: assert not np.may_share_memory(ser.values, ser2.values) - ser2[::2] = np.nan + with tm.assert_cow_warning(warn_copy_on_write and deep is False): + ser2[::2] = np.nan if deep is not False or using_copy_on_write: # Did not modify original Series diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index b2d5d1ee090ac..a8bc44dfd9ca4 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -93,7 +93,7 @@ def test_corr_rank(self): # kendall and spearman A = tm.makeTimeSeries() B = tm.makeTimeSeries() - A[-5:] = A[:5] + A[-5:] = A[:5].copy() result = A.corr(B, method="kendall") expected = stats.kendalltau(A, B)[0] tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index 11dc6d5c57162..8325cc884ebcb 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -7,14 +7,17 @@ class TestGetNumericData: - def test_get_numeric_data_preserve_dtype(self, using_copy_on_write): + def test_get_numeric_data_preserve_dtype( + self, using_copy_on_write, warn_copy_on_write + ): # get the numeric data obj = Series([1, 2, 3]) result = obj._get_numeric_data() tm.assert_series_equal(result, obj) # returned object is a shallow copy - result.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 if using_copy_on_write: assert obj.iloc[0] == 1 else: diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 41b4aa97c56f3..fe5bd33e15f15 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -161,11 +161,6 @@ def test_seaborn(): seaborn.stripplot(x="day", y="total_bill", data=tips) -def test_pandas_gbq(): - # Older versions import from non-public, non-existent pandas funcs - pytest.importorskip("pandas_gbq", minversion="0.10.0") - - def test_pandas_datareader(): pytest.importorskip("pandas_datareader") diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index 2c1f987e8101a..8fed09d88612d 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -37,7 +37,6 @@ dependencies: - numexpr>=2.7.3 - openpyxl>=3.0.7 - odfpy>=1.4.1 - - pandas-gbq>=0.15.0 - psycopg2>=2.8.6 - pyarrow<11, >=7.0.0 - pymysql>=1.0.2 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index 9856d230c3de1..c74ad3d17a4a9 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -61,7 +61,7 @@ timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = 
['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] -gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] +gcp = ['gcsfs>=2021.07.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] @@ -94,7 +94,6 @@ all = ['beautifulsoup4>=5.9.3', 'numexpr>=2.7.3', 'odfpy>=1.4.1', 'openpyxl>=3.0.7', - 'pandas-gbq>=0.15.0', 'psycopg2>=2.8.6', 'pyarrow>=7.0.0', 'pymysql>=1.0.2', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index f3772d9e20a1b..e560dd50c41d4 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -37,7 +37,6 @@ dependencies: - numexpr>=2.7.3 - openpyxl>=3.0.7 - odfpy>=1.4.1 - - pandas-gbq>=0.15.0 - psycopg2 - pyarrow<11, >=7.0.0 - pymysql>=1.0.2
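The copy-on-write changes above (test_slice, test_copy, test_get_numeric_data) all apply one recipe. The condensed sketch below restates it in isolation, assuming the using_copy_on_write and warn_copy_on_write fixtures from pandas' top-level conftest and the tm.assert_cow_warning helper used in those hunks; the test name and data are invented:

import numpy as np

from pandas import Series
import pandas._testing as tm


def test_view_write_recipe(using_copy_on_write, warn_copy_on_write):
    ser = Series(np.arange(5, dtype="float64"))
    view = ser[1:3]  # positional slice: a view on ser's underlying buffer

    # only the interim warn-CoW mode warns about writing through a view
    with tm.assert_cow_warning(warn_copy_on_write):
        view[:] = 0.0

    if using_copy_on_write:
        assert ser.iloc[1] == 1.0  # CoW: the parent is left untouched
    else:
        assert ser.iloc[1] == 0.0  # legacy: the write propagates to the parent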