diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 33b6e7a8c2340..4c38324280528 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -73,6 +73,14 @@ jobs: env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" + - name: "Copy-on-Write 3.10 (warnings)" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "warn" + - name: "Copy-on-Write 3.9 (warnings)" + env_file: actions-39.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "warn" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 3c78b0a9a60c8..933e8fbc175d8 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -4,8 +4,6 @@ import pandas as pd -from .pandas_vb_common import tm - for imp in ["pandas.util", "pandas.tools.hashing"]: try: hashing = import_module(imp) @@ -47,9 +45,12 @@ def setup(self, unique, sort, dtype): elif dtype == "datetime64[ns, tz]": data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") elif dtype == "object_str": - data = tm.makeStringIndex(N) + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) elif dtype == "string[pyarrow]": - data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]") + data = pd.array( + pd.Index([f"i-{i}" for i in range(N)], dtype=object), + dtype="string[pyarrow]", + ) else: raise NotImplementedError @@ -88,7 +89,7 @@ def setup(self, unique, keep, dtype): elif dtype == "float64": data = pd.Index(np.random.randn(N), dtype="float64") elif dtype == "string": - data = tm.makeStringIndex(N) + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) elif dtype == "datetime64[ns]": data = pd.date_range("2011-01-01", freq="h", periods=N) elif dtype == "datetime64[ns, tz]": @@ -136,7 +137,9 @@ def setup_cache(self): df = pd.DataFrame( { "strings": pd.Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N)) + pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=N) + ) ), "floats": np.random.randn(N), "ints": np.arange(N), diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 92797425b2c30..2b3d32fb579dc 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -8,8 +8,6 @@ date_range, ) -from ..pandas_vb_common import tm - class IsIn: params = [ @@ -60,7 +58,9 @@ def setup(self, dtype): elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: - self.series = Series(tm.makeStringIndex(N), dtype=dtype) + self.series = Series( + Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype + ) except ImportError: raise NotImplementedError self.values = list(self.series[:2]) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 2db00cc7f2ad9..77c9faf3d3a87 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -9,8 +9,6 @@ date_range, ) -from .pandas_vb_common import tm - def no_change(arr): return arr @@ -115,7 +113,7 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor: def setup(self): N = 10**4 - self.iterables = [tm.makeStringIndex(N), range(20)] + self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)] def 
time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index c33043c0eddc1..7f3429b5e3882 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -3,7 +3,10 @@ import numpy as np import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm from pandas.api.types import ( is_extension_array_dtype, @@ -73,8 +76,8 @@ class SelectDtypes: def setup(self, dtype): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = Index([f"i-{i}" for i in range(K)], dtype=object) def create_df(data): return DataFrame(data, index=self.index, columns=self.columns) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 7092a679b8cf0..f938f7eb0d951 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -12,8 +12,6 @@ date_range, ) -from .pandas_vb_common import tm - try: from pandas.tseries.offsets import ( Hour, @@ -30,8 +28,8 @@ class FromDicts: def setup(self): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object) frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) self.data = frame.to_dict() self.dict_list = frame.to_dict(orient="records") diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index f22a261041e17..a283afd1f0f1e 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, MultiIndex, NaT, Series, @@ -14,8 +15,6 @@ timedelta_range, ) -from .pandas_vb_common import tm - class AsType: params = [ @@ -703,8 +702,12 @@ def setup(self, monotonic): K = 10 df = DataFrame( { - "key1": tm.makeStringIndex(N).values.repeat(K), - "key2": tm.makeStringIndex(N).values.repeat(K), + "key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat( + K + ), + "key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat( + K + ), "value": np.random.randn(N * K), } ) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index c78819f75c52a..a0c4189c72d0e 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, Series, date_range, factorize, @@ -12,8 +13,6 @@ ) from pandas.core.algorithms import take_nd -from .pandas_vb_common import tm - try: from pandas import ( rolling_kurt, @@ -34,7 +33,6 @@ except ImportError: from pandas import algos - from .pandas_vb_common import BaseIO # isort:skip @@ -305,7 +303,7 @@ class ParallelFactorize: param_names = ["threads"] def setup(self, threads): - strings = tm.makeStringIndex(100000) + strings = Index([f"i-{i}" for i in range(100000)], dtype=object) @test_parallel(num_threads=threads) def parallel(): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 202ba6e981b70..abffa1f702b9c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -17,8 +17,6 @@ to_timedelta, ) -from .pandas_vb_common import tm - method_blocklist = { "object": { "diff", @@ -167,10 +165,14 @@ def setup_cache(self): "int64_small": 
Series(np.random.randint(0, 100, size=size)), "int64_large": Series(np.random.randint(0, 10000, size=size)), "object_small": Series( - tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)) + Index([f"i-{i}" for i in range(100)], dtype=object).take( + np.random.randint(0, 100, size=size) + ) ), "object_large": Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)) + Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=size) + ) ), } return data @@ -912,7 +914,7 @@ def setup(self): n1 = 400 n2 = 250 index = MultiIndex( - levels=[np.arange(n1), tm.makeStringIndex(n2)], + levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)], codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1], names=["lev1", "lev2"], ) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 7e33223260e0f..637b1b40f59a3 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -12,8 +12,6 @@ date_range, ) -from .pandas_vb_common import tm - class SetOperations: params = ( @@ -30,7 +28,7 @@ def setup(self, index_structure, dtype, method): date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) ea_int_left = Index(np.arange(N), dtype="Int64") - str_left = tm.makeStringIndex(N) + str_left = Index([f"i-{i}" for i in range(N)], dtype=object) data = { "datetime": dates_left, @@ -155,7 +153,12 @@ class Indexing: def setup(self, dtype): N = 10**6 - self.idx = getattr(tm, f"make{dtype}Index")(N) + if dtype == "String": + self.idx = Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "Float": + self.idx = Index(np.arange(N), dtype=np.float64) + elif dtype == "Int": + self.idx = Index(np.arange(N), dtype=np.int64) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 4961722c0e9cd..9ad1f5b31016d 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -22,8 +22,6 @@ period_range, ) -from .pandas_vb_common import tm - class NumericSeriesIndexing: params = [ @@ -124,7 +122,7 @@ class NonNumericSeriesIndexing: def setup(self, index, index_structure): N = 10**6 if index == "string": - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) elif index == "datetime": index = date_range("1900", periods=N, freq="s") elif index == "period": @@ -156,8 +154,8 @@ def time_getitem_list_like(self, index, index_structure): class DataFrameStringIndexing: def setup(self): - index = tm.makeStringIndex(1000) - columns = tm.makeStringIndex(30) + index = Index([f"i-{i}" for i in range(1000)], dtype=object) + columns = Index([f"i-{i}" for i in range(30)], dtype=object) with warnings.catch_warnings(record=True): self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) self.idx_scalar = index[100] diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 805b0c807452c..d5c58033c1157 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -9,6 +9,7 @@ import numpy as np from pandas import ( + Index, NaT, Series, date_range, @@ -17,10 +18,7 @@ to_timedelta, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib class ToNumeric: @@ -31,7 +29,7 @@ def setup(self, errors): N = 10000 self.float = 
Series(np.random.randn(N)) self.numstr = self.float.astype("str") - self.str = Series(tm.makeStringIndex(N)) + self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) def time_from_float(self, errors): to_numeric(self.float, errors=errors) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index a45315f63d62e..9ac83db4f85b9 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,6 +10,7 @@ from pandas import ( Categorical, DataFrame, + Index, concat, date_range, period_range, @@ -17,10 +18,7 @@ to_datetime, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ToCSV(BaseIO): @@ -288,7 +286,7 @@ class ReadCSVSkipRows(BaseIO): def setup(self, skiprows, engine): N = 20000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) df = DataFrame( { "float1": np.random.randn(N), diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index f8d81b0f6a699..902a61be901bd 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -12,12 +12,11 @@ from pandas import ( DataFrame, ExcelWriter, + Index, date_range, read_excel, ) -from ..pandas_vb_common import tm - def _generate_dataframe(): N = 2000 @@ -27,7 +26,7 @@ def _generate_dataframe(): columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="h"), ) - df["object"] = tm.makeStringIndex(N) + df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) return df diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 195aaa158e178..acf0ec4b9d359 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -3,20 +3,18 @@ from pandas import ( DataFrame, HDFStore, + Index, date_range, read_hdf, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class HDFStoreDataFrame(BaseIO): def setup(self): N = 25000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame( {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index ) @@ -124,7 +122,7 @@ def setup(self, format): columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) self.df.to_hdf(self.fname, "df", format=format) # Numeric df diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 8a2e3fa87eb37..bcbfcdea42dd9 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, + Index, concat, date_range, json_normalize, @@ -11,10 +12,7 @@ timedelta_range, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ReadJSON(BaseIO): @@ -114,7 +112,7 @@ def setup(self, orient, frame): ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( @@ -220,7 +218,7 @@ def setup(self): ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = 
np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 54631d9236887..4787b57b54756 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -2,14 +2,12 @@ from pandas import ( DataFrame, + Index, date_range, read_pickle, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Pickle(BaseIO): @@ -22,7 +20,7 @@ def setup(self): columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) self.df.to_pickle(self.fname) def time_read_pickle(self): diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 6f893ee72d918..e87cc4aaa80c7 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -5,13 +5,12 @@ from pandas import ( DataFrame, + Index, date_range, read_sql_query, read_sql_table, ) -from ..pandas_vb_common import tm - class SQL: params = ["sqlalchemy", "sqlite"] @@ -35,7 +34,7 @@ def setup(self, connection): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -84,7 +83,7 @@ def setup(self, connection, dtype): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -113,7 +112,7 @@ def setup(self): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -159,7 +158,7 @@ def setup(self, dtype): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 750bcf4ccee5c..ff33ededdfed9 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -2,14 +2,12 @@ from pandas import ( DataFrame, + Index, date_range, read_stata, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Stata(BaseIO): @@ -25,7 +23,7 @@ def setup(self, convert_dates): columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(self.N) + self.df["object"] = Index([f"i-{i}" for i in range(self.N)], dtype=object) self.df["int8_"] = np.random.randint( np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N ) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 23824c2c748df..6f494562103c2 100644 --- 
a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -14,8 +14,6 @@ merge_asof, ) -from .pandas_vb_common import tm - try: from pandas import merge_ordered except ImportError: @@ -28,7 +26,7 @@ class Concat: def setup(self, axis): N = 1000 - s = Series(N, index=tm.makeStringIndex(N)) + s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object)) self.series = [s[i:-i] for i in range(1, 10)] * 50 self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 df = DataFrame( @@ -94,7 +92,7 @@ def setup(self, dtype, structure, axis, sort): elif dtype in ("int64", "Int64", "int64[pyarrow]"): vals = np.arange(N, dtype=np.int64) elif dtype in ("string[python]", "string[pyarrow]"): - vals = tm.makeStringIndex(N) + vals = Index([f"i-{i}" for i in range(N)], dtype=object) else: raise NotImplementedError @@ -122,8 +120,8 @@ class Join: param_names = ["sort"] def setup(self, sort): - level1 = tm.makeStringIndex(10).values - level2 = tm.makeStringIndex(1000).values + level1 = Index([f"i-{i}" for i in range(10)], dtype=object).values + level2 = Index([f"i-{i}" for i in range(1000)], dtype=object).values codes1 = np.arange(10).repeat(1000) codes2 = np.tile(np.arange(1000), 10) index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) @@ -231,8 +229,8 @@ class Merge: def setup(self, sort): N = 10000 - indices = tm.makeStringIndex(N).values - indices2 = tm.makeStringIndex(N).values + indices = Index([f"i-{i}" for i in range(N)], dtype=object).values + indices2 = Index([f"i-{i}" for i in range(N)], dtype=object).values key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) self.left = DataFrame( @@ -400,7 +398,7 @@ def time_merge_on_cat_idx(self): class MergeOrdered: def setup(self): - groups = tm.makeStringIndex(10).values + groups = Index([f"i-{i}" for i in range(10)], dtype=object).values self.left = DataFrame( { "group": groups.repeat(5000), diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index f041499c9c622..3419163bcfe09 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -15,13 +15,11 @@ from pandas import ( NA, + Index, NaT, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib try: from pandas.util import cache_readonly @@ -61,8 +59,8 @@ class FastZip: def setup(self): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) col_array = np.vstack([key1, key2, np.random.randn(N * K)]) col_array2 = col_array.copy() col_array2[:, :10000] = np.nan diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 87dcdb16fa647..54788d41d83fe 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -5,6 +5,7 @@ from pandas import ( NA, DataFrame, + Index, MultiIndex, RangeIndex, Series, @@ -12,8 +13,6 @@ date_range, ) -from .pandas_vb_common import tm - class GetLoc: def setup(self): @@ -144,7 +143,11 @@ def time_is_monotonic(self): class Duplicated: def setup(self): n, k = 200, 5000 - levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] + levels = [ + np.arange(n), + Index([f"i-{i}" for i in range(n)], dtype=object).values, + 1000 + np.arange(n), + ] codes = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, 
codes=codes) @@ -249,7 +252,7 @@ def setup(self, index_structure, dtype, method, sort): level2 = range(N // 1000) int_left = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_left = MultiIndex.from_product([level1, level2]) level2 = range(N // 1000) @@ -293,7 +296,7 @@ def setup(self, dtype): level2[0] = NA ea_int_left = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_left = MultiIndex.from_product([level1, level2]) data = { @@ -354,7 +357,7 @@ def setup(self, dtype): level2 = range(N // 1000) int_midx = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_midx = MultiIndex.from_product([level1, level2]) data = { @@ -411,7 +414,7 @@ def setup(self, dtype): elif dtype == "int64": level2 = range(N2) elif dtype == "string": - level2 = tm.makeStringIndex(N2) + level2 = Index([f"i-{i}" for i in range(N2)], dtype=object) else: raise NotImplementedError diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 1c5e6050db275..3d22bfce7e2b2 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -9,8 +9,6 @@ period_range, ) -from .pandas_vb_common import tm - class Reindex: def setup(self): @@ -23,8 +21,8 @@ def setup(self): ) N = 5000 K = 200 - level1 = tm.makeStringIndex(N).values.repeat(K) - level2 = np.tile(tm.makeStringIndex(K).values, N) + level1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + level2 = np.tile(Index([f"i-{i}" for i in range(K)], dtype=object).values, N) index = MultiIndex.from_arrays([level1, level2]) self.s = Series(np.random.randn(N * K), index=index) self.s_subset = self.s[::2] @@ -93,8 +91,8 @@ class DropDuplicates: def setup(self, inplace): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) self.df = DataFrame( {"key1": key1, "key2": key2, "value": np.random.randn(N * K)} ) @@ -102,7 +100,9 @@ def setup(self, inplace): self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + self.s_str = Series( + np.tile(Index([f"i-{i}" for i in range(1000)], dtype=object).values, 10) + ) N = 1000000 K = 10000 @@ -133,7 +133,7 @@ class Align: # blog "pandas escaped the zoo" def setup(self): n = 50000 - indices = tm.makeStringIndex(n) + indices = Index([f"i-{i}" for i in range(n)], dtype=object) subsample_size = 40000 self.x = Series(np.random.randn(n), indices) self.y = Series( diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 79cf8f9cd2048..b021af4694d7d 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -10,8 +10,6 @@ date_range, ) -from .pandas_vb_common import tm - class SeriesConstructor: def setup(self): @@ -253,7 +251,7 @@ def time_mode(self, N): class Dir: def setup(self): - self.s = Series(index=tm.makeStringIndex(10000)) + self.s = Series(index=Index([f"i-{i}" for i in range(10000)], dtype=object)) def time_dir_strings(self): 
dir(self.s) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 6682a60f42997..1f4a104255057 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -6,12 +6,11 @@ NA, Categorical, DataFrame, + Index, Series, ) from pandas.arrays import StringArray -from .pandas_vb_common import tm - class Dtypes: params = ["str", "string[python]", "string[pyarrow]"] @@ -19,7 +18,9 @@ class Dtypes: def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) + self.s = Series( + Index([f"i-{i}" for i in range(10**5)], dtype=object), dtype=dtype + ) except ImportError: raise NotImplementedError @@ -172,7 +173,7 @@ class Repeat: def setup(self, repeats): N = 10**5 - self.s = Series(tm.makeStringIndex(N)) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] @@ -187,13 +188,20 @@ class Cat: def setup(self, other_cols, sep, na_rep, na_frac): N = 10**5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) - self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)).where( + mask_gen() + ) if other_cols == 0: # str.cat self-concatenates only for others=None self.others = None else: self.others = DataFrame( - {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)} + { + i: Index([f"i-{i}" for i in range(N)], dtype=object).where( + mask_gen() + ) + for i in range(other_cols) + } ) def time_cat(self, other_cols, sep, na_rep, na_frac): @@ -254,7 +262,7 @@ def time_get_dummies(self, dtype): class Encode: def setup(self): - self.ser = Series(tm.makeStringIndex()) + self.ser = Series(Index([f"i-{i}" for i in range(10_000)], dtype=object)) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 77ce303dc1bfe..0cbf211305d12 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -24,6 +24,7 @@ Bug fixes - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d252c19a95d4a..dce776755ad7e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,81 +14,6 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.calamine: - -Calamine engine for :func:`read_excel` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``calamine`` engine was added to :func:`read_excel`. -It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine <https://crates.io/crates/calamine>`__. 
-This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). - -There are two advantages of this engine: - -1. Calamine is often faster than other engines; some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. - But 'openpyxl' and 'pyxlsb' are faster when reading a few rows from large files because of lazy iteration over rows. -2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb', which is the only other engine in pandas that can read ``.xlsb`` files. - -.. code-block:: python - - pd.read_excel("path_to_file.xlsb", engine="calamine") - - -For more, see :ref:`io.calamine` in the user guide on IO tools. - -.. _whatsnew_220.enhancements.struct_accessor: - -Series.struct accessor for PyArrow structured data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``Series.struct`` accessor provides attributes and methods for processing -data with ``struct[pyarrow]`` dtype Series. For example, -:meth:`Series.struct.explode` converts PyArrow structured data to a pandas -DataFrame. (:issue:`54938`) - -.. ipython:: python - - import pyarrow as pa - series = pd.Series( - [ - {"project": "pandas", "version": "2.2.0"}, - {"project": "numpy", "version": "1.25.2"}, - {"project": "pyarrow", "version": "13.0.0"}, - ], - dtype=pd.ArrowDtype( - pa.struct([ - ("project", pa.string()), - ("version", pa.string()), - ]) - ), - ) - series.struct.explode() - -.. _whatsnew_220.enhancements.list_accessor: - -Series.list accessor for PyArrow list data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``Series.list`` accessor provides attributes and methods for processing -data with ``list[pyarrow]`` dtype Series. For example, -:meth:`Series.list.__getitem__` allows indexing pyarrow lists in -a Series. (:issue:`55323`) - -.. ipython:: python - - import pyarrow as pa - series = pd.Series( - [ - [1, 2, 3], - [4, 5], - [6], - ], - dtype=pd.ArrowDtype( - pa.list_(pa.int64()) - ), - ) - series.list[0] - .. _whatsnew_220.enhancements.adbc_support: ADBC Driver support in to_sql and read_sql @@ -180,6 +105,81 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driver Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_ documentation. +.. _whatsnew_220.enhancements.struct_accessor: + +Series.struct accessor for PyArrow structured data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.struct`` accessor provides attributes and methods for processing +data with ``struct[pyarrow]`` dtype Series. For example, +:meth:`Series.struct.explode` converts PyArrow structured data to a pandas +DataFrame. (:issue:`54938`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + {"project": "pandas", "version": "2.2.0"}, + {"project": "numpy", "version": "1.25.2"}, + {"project": "pyarrow", "version": "13.0.0"}, + ], + dtype=pd.ArrowDtype( + pa.struct([ + ("project", pa.string()), + ("version", pa.string()), + ]) + ), + ) + series.struct.explode() + +.. _whatsnew_220.enhancements.list_accessor: + +Series.list accessor for PyArrow list data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.list`` accessor provides attributes and methods for processing +data with ``list[pyarrow]`` dtype Series. For example, +:meth:`Series.list.__getitem__` allows indexing pyarrow lists in +a Series. (:issue:`55323`) + +.. 
ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + [1, 2, 3], + [4, 5], + [6], + ], + dtype=pd.ArrowDtype( + pa.list_(pa.int64()) + ), + ) + series.list[0] + +.. _whatsnew_220.enhancements.calamine: + +Calamine engine for :func:`read_excel` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``calamine`` engine was added to :func:`read_excel`. +It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine <https://crates.io/crates/calamine>`__. +This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). + +There are two advantages of this engine: + +1. Calamine is often faster than other engines; some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. + But 'openpyxl' and 'pyxlsb' are faster when reading a few rows from large files because of lazy iteration over rows. +2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb', which is the only other engine in pandas that can read ``.xlsb`` files. + +.. code-block:: python + + pd.read_excel("path_to_file.xlsb", engine="calamine") + + +For more, see :ref:`io.calamine` in the user guide on IO tools. + .. _whatsnew_220.enhancements.other: Other enhancements @@ -315,7 +315,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- +- ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) - .. --------------------------------------------------------------------------- @@ -362,6 +362,8 @@ Other Deprecations - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. 
Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) +- Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) +- Deprecated :meth:`Series.view`, use ``astype`` instead to change the dtype (:issue:`20251`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) @@ -457,6 +459,7 @@ Datetimelike - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) - Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) @@ -496,8 +499,8 @@ Conversion Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) +- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) -- Interval ^^^^^^^^ @@ -530,6 +533,7 @@ I/O - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. 
This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) - Bug in :meth:`pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index a0d56efc14bd9..3a9a805a9ec45 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -364,6 +364,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } out->hour = (*substr - '0'); + bestunit = NPY_FR_h; ++substr; --sublen; /* Second digit optional */ @@ -425,6 +426,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* First digit required */ out->min = (*substr - '0'); + bestunit = NPY_FR_m; ++substr; --sublen; /* Second digit optional if there was a separator */ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index a74fb2bf48bc4..14ee29d24800e 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -370,11 +370,6 @@ def getCols(k) -> str: return string.ascii_uppercase[:k] -# make index -def makeStringIndex(k: int = 10, name=None) -> Index: - return Index(rands_array(nchars=10, size=k), name=name) - - def makeCategoricalIndex( k: int = 10, n: int = 3, name=None, **kwargs ) -> CategoricalIndex: @@ -385,14 +380,6 @@ def makeCategoricalIndex( ) -def makeBoolIndex(k: int = 10, name=None) -> Index: - if k == 1: - return Index([True], name=name) - elif k == 2: - return Index([False, True], name=name) - return Index([False, True] + [False] * (k - 2), name=name) - - def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index: dtype = pandas_dtype(dtype) assert isinstance(dtype, np.dtype) @@ -457,14 +444,13 @@ def makePeriodIndex(k: int = 10, name=None, **kwargs) -> PeriodIndex: def makeObjectSeries(name=None) -> Series: - data = makeStringIndex(_N) - data = Index(data, dtype=object) - index = makeStringIndex(_N) - return Series(data, index=index, name=name) + data = [f"foo_{i}" for i in range(_N)] + index = Index([f"bar_{i}" for i in range(_N)]) + return Series(data, index=index, name=name, dtype=object) def getSeriesData() -> dict[str, Series]: - index = makeStringIndex(_N) + index = Index([f"foo_{i}" for i in range(_N)]) return { c: Series(np.random.default_rng(i).standard_normal(_N), index=index) for i, c in enumerate(getCols(_K)) @@ -496,23 +482,6 @@ def makeDataFrame() -> DataFrame: return DataFrame(data) -def getMixedTypeDict(): - index = Index(["a", "b", "c", "d", "e"]) - - data = { - "A": [0.0, 1.0, 2.0, 3.0, 4.0], - "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": bdate_range("1/1/2009", periods=5), - } - - return index, data - - -def makeMixedDataFrame() -> DataFrame: - return 
DataFrame(getMixedTypeDict()[1]) - - def makeCustomIndex( nentries, nlevels, @@ -566,7 +535,7 @@ def makeCustomIndex( idx_func_dict: dict[str, Callable[..., Index]] = { "i": makeIntIndex, "f": makeFloatIndex, - "s": makeStringIndex, + "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), "dt": makeDateIndex, "td": makeTimedeltaIndex, "p": makePeriodIndex, @@ -1040,7 +1009,6 @@ def shares_memory(left, right) -> bool: "get_dtype", "getitem", "get_locales", - "getMixedTypeDict", "get_finest_unit", "get_obj", "get_op_from_name", @@ -1049,7 +1017,6 @@ def shares_memory(left, right) -> bool: "iat", "iloc", "loc", - "makeBoolIndex", "makeCategoricalIndex", "makeCustomDataframe", "makeCustomIndex", @@ -1057,12 +1024,10 @@ def shares_memory(left, right) -> bool: "makeDateIndex", "makeFloatIndex", "makeIntIndex", - "makeMixedDataFrame", "makeNumericIndex", "makeObjectSeries", "makePeriodIndex", "makeRangeIndex", - "makeStringIndex", "makeTimeDataFrame", "makeTimedeltaIndex", "makeTimeSeries", diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 0d8fb7bfce33a..6cad71b3dfd18 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import ( is_bool, + is_float_dtype, is_integer_dtype, is_number, is_numeric_dtype, @@ -426,6 +427,8 @@ def assert_is_valid_plot_return_object(objs) -> None: from matplotlib.axes import Axes if isinstance(objs, (Series, np.ndarray)): + if isinstance(objs, Series): + objs = objs._values for el in objs.ravel(): msg = ( "one of 'objs' is not a matplotlib Axes instance, " @@ -713,7 +716,7 @@ def assert_extension_array_equal( index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. Only takes effect for float dtypes. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 @@ -782,7 +785,10 @@ def assert_extension_array_equal( left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) - if check_exact: + if check_exact or ( + (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) + or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) + ): assert_numpy_array_equal( left_valid, right_valid, obj=obj, index_values=index_values ) @@ -836,7 +842,7 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. Only takes effect for float dtypes. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -929,8 +935,10 @@ def assert_series_equal( pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - - if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + if check_exact or ( + (is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)) + or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) + ): left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -1093,7 +1101,7 @@ def assert_frame_equal( Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. 
check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. Only takes effect for float dtypes. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True diff --git a/pandas/conftest.py b/pandas/conftest.py index 350871c3085c1..3205b6657439f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -610,7 +610,7 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": tm.makeStringIndex(100), + "string": Index([f"pandas_{i}" for i in range(100)]), "datetime": tm.makeDateIndex(100), "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), "period": tm.makePeriodIndex(100), @@ -626,7 +626,7 @@ def _create_mi_with_dt64tz_level(): "uint64": tm.makeUIntIndex(100, dtype="uint64"), "float32": tm.makeFloatIndex(100, dtype="float32"), "float64": tm.makeFloatIndex(100, dtype="float64"), - "bool-object": tm.makeBoolIndex(10).astype(object), + "bool-object": Index([True, False] * 5, dtype=object), "bool-dtype": Index(np.random.default_rng(2).standard_normal(10) < 0), "complex64": tm.makeNumericIndex(100, dtype="float64").astype("complex64"), "complex128": tm.makeNumericIndex(100, dtype="float64").astype("complex128"), @@ -641,10 +641,12 @@ def _create_mi_with_dt64tz_level(): "nullable_uint": Index(np.arange(100), dtype="UInt16"), "nullable_float": Index(np.arange(100), dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), - "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")), + "string-python": Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") + ), } if has_pyarrow: - idx = Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")) + idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3b79882d3c762..bb3cc3a03760f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1172,11 +1172,17 @@ def apply_with_numba(self) -> dict[int, Any]: ) from pandas.core._numba.extensions import set_numba_data + index = self.obj.index + if index.dtype == "string": + index = index.astype(object) + + columns = self.obj.columns + if columns.dtype == "string": + columns = columns.astype(object) + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - with set_numba_data(self.obj.index) as index, set_numba_data( - self.columns - ) as columns: + with set_numba_data(index) as index, set_numba_data(columns) as columns: res = dict(nb_func(self.values, columns, index)) return res diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 792fa4fdc24a5..320f028f4484c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -261,6 +261,8 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi try to cast to the specified dtype (e.g. 
convert back to bool/int or could be an astype of float64->float32 """ + if isinstance(result, ABCSeries): + result = result._values do_round = False if isinstance(dtype, str): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4e95a973a3c1..56001fabfdc9d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -101,6 +101,7 @@ SettingWithCopyWarning, _chained_assignment_method_msg, _chained_assignment_warning_method_msg, + _check_cacher, ) from pandas.util._decorators import ( deprecate_nonkeyword_arguments, @@ -7195,7 +7196,7 @@ def fillna( elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + if isinstance(self, ABCSeries) and _check_cacher(self): # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 ref_count += 1 if ctr <= ref_count: @@ -7477,7 +7478,7 @@ def ffill( elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + if isinstance(self, ABCSeries) and _check_cacher(self): # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 ref_count += 1 if ctr <= ref_count: @@ -7660,7 +7661,7 @@ def bfill( elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + if isinstance(self, ABCSeries) and _check_cacher(self): # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 ref_count += 1 if ctr <= ref_count: @@ -7826,12 +7827,12 @@ def replace( elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + if isinstance(self, ABCSeries) and _check_cacher(self): # in non-CoW mode, chained Series access will populate the # `_item_cache` which results in an increased ref count not below # the threshold, while we still need to warn. 
We detect this case # of a Series derived from a DataFrame through the presence of - # `_cacher` + # checking the `_cacher` ref_count += 1 if ctr <= ref_count: warnings.warn( @@ -8267,7 +8268,7 @@ def interpolate( elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + if isinstance(self, ABCSeries) and _check_cacher(self): # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 ref_count += 1 if ctr <= ref_count: @@ -8902,6 +8903,18 @@ def clip( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) axis = nv.validate_clip_with_axis(axis, (), kwargs) if axis is not None: @@ -10802,6 +10815,19 @@ def where( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace, axis, level) @@ -10868,14 +10894,27 @@ def mask( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) cond = common.apply_if_callable(cond, self) + other = common.apply_if_callable(other, self) # see gh-21891 if not hasattr(cond, "__invert__"): cond = np.array(cond) - return self.where( + return self._where( ~cond, other=other, inplace=inplace, @@ -12450,7 +12489,7 @@ def _inplace_method(self, other, op) -> Self: """ warn = True if not PYPY and warn_copy_on_write(): - if sys.getrefcount(self) <= 4: + if sys.getrefcount(self) <= REF_COUNT + 2: # we are probably in an inplace setitem context (e.g. df['a'] += 1) warn = False diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c39fbfe6b6d33..eba4f72b5eb8f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -309,6 +309,7 @@ def _add_margins( row_names = result.index.names # check the result column and leave floats + for dtype in set(result.dtypes): if isinstance(dtype, ExtensionDtype): # Can hold NA already diff --git a/pandas/core/series.py b/pandas/core/series.py index a9679f22f9933..3f7ac75e623a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -45,6 +45,7 @@ _chained_assignment_method_msg, _chained_assignment_msg, _chained_assignment_warning_method_msg, + _check_cacher, ) from pandas.util._decorators import ( Appender, @@ -847,6 +848,11 @@ def ravel(self, order: str = "C") -> ArrayLike: """ Return the flattened underlying data as an ndarray or ExtensionArray. + .. deprecated:: 2.2.0 + Series.ravel is deprecated. 
The underlying array is already 1D, so + ravel is not necessary. Use :meth:`to_numpy` for conversion to a numpy + array instead. + Returns ------- numpy.ndarray or ExtensionArray @@ -859,9 +865,16 @@ def ravel(self, order: str = "C") -> ArrayLike: Examples -------- >>> s = pd.Series([1, 2, 3]) - >>> s.ravel() + >>> s.ravel() # doctest: +SKIP array([1, 2, 3]) """ + warnings.warn( + "Series.ravel is deprecated. The underlying array is already 1D, so " + "ravel is not necessary. Use `to_numpy()` for conversion to a numpy " + "array instead.", + FutureWarning, + stacklevel=2, + ) arr = self._values.ravel(order=order) if isinstance(arr, np.ndarray) and using_copy_on_write(): arr.flags.writeable = False @@ -877,6 +890,10 @@ def view(self, dtype: Dtype | None = None) -> Series: """ Create a new view of the Series. + .. deprecated:: 2.2.0 + ``Series.view`` is deprecated and will be removed in a future version. + Use :meth:`Series.astype` as an alternative to change the dtype. + This function will return a new Series with a view of the same underlying values in memory, optionally reinterpreted with a new data type. The new data type must preserve the same size in bytes as to not @@ -907,38 +924,14 @@ def view(self, dtype: Dtype | None = None) -> Series: Examples -------- - >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') - >>> s - 0 -2 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 - - The 8 bit signed integer representation of `-1` is `0b11111111`, but - the same bytes represent 255 if read as an 8 bit unsigned integer: - - >>> us = s.view('uint8') - >>> us - 0 254 - 1 255 - 2 0 - 3 1 - 4 2 - dtype: uint8 - - The views share the same underlying values: - - >>> us[0] = 128 - >>> s - 0 -128 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 + Use ``astype`` to change the dtype instead. """ + warnings.warn( + "Series.view is deprecated and will be removed in a future version. " + "Use ``astype`` as an alternative to change the dtype.", + FutureWarning, + stacklevel=2, + ) # self.array instead of self._values so we piggyback on NumpyExtensionArray # implementation res_values = self.array.view(dtype) @@ -3564,7 +3557,7 @@ def update(self, other: Series | Sequence | Mapping) -> None: elif not PYPY and not using_copy_on_write(): ctr = sys.getrefcount(self) ref_count = REF_COUNT - if hasattr(self, "_cacher"): + if _check_cacher(self): # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 ref_count += 1 if ctr <= ref_count: diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 82b4746aa57a5..db659713c6f16 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,6 +15,7 @@ is_datetime64_ns_dtype, is_numeric_dtype, ) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import common @@ -118,6 +119,8 @@ def _calculate_deltas( np.ndarray Diff of the times divided by the half-life """ + if isinstance(times, ABCSeries): + times = times._values _times = np.asarray(times.view(np.int64), dtype=np.float64) # TODO: generalize to non-nano? 
_halflife = float(Timedelta(halflife).as_unit("ns")._value) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index e2aa9010dc109..c89e4aa2cac0f 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -516,6 +516,24 @@ class ChainedAssignmentError(Warning): ) +def _check_cacher(obj): + # This is a mess, selection paths that return a view set the _cacher attribute + # on the Series; most of them also set _item_cache which adds 1 to our relevant + # reference count, but iloc does not, so we have to check if we are actually + # in the item cache + if hasattr(obj, "_cacher"): + parent = obj._cacher[1]() + # parent could be dead + if parent is None: + return False + if hasattr(parent, "_item_cache"): + if obj._cacher[0] in parent._item_cache: + # Check if we are actually the item from item_cache, iloc creates a + # new object + return obj is parent._item_cache[obj._cacher[0]] + return False + + class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e17fcea0aae71..9c56089560507 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -32,7 +32,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -from pandas.core.dtypes.common import ensure_str +from pandas.core.dtypes.common import ( + ensure_str, + is_string_dtype, +) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ABCIndex @@ -1249,7 +1252,7 @@ def _try_convert_data( if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex): # Fall through for conversion later on return data, True - elif data.dtype == "object": + elif is_string_dtype(data.dtype): # try float try: data = data.astype("float64") @@ -1301,6 +1304,10 @@ def _try_convert_to_date(self, data): return data, False new_data = data + + if new_data.dtype == "string": + new_data = new_data.astype(object) + if new_data.dtype == "object": try: new_data = data.astype("int64") diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9e0e3686e4aa2..50611197ad7dd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,6 @@ is_bool_dtype, is_complex_dtype, is_list_like, - is_object_dtype, is_string_dtype, needs_i8_conversion, ) @@ -2647,7 +2646,7 @@ class DataIndexableCol(DataCol): is_data_indexable = True def validate_names(self) -> None: - if not is_object_dtype(Index(self.values).dtype): + if not is_string_dtype(Index(self.values).dtype): # TODO: should the message here be more specifically non-str? 
raise ValueError("cannot have non-object label DataIndexableCol") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 70294e8a62cca..1eb8f531dc62a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -429,9 +429,9 @@ def parse_dates_safe( d["year"] = date_index._data.year d["month"] = date_index._data.month if days: - days_in_ns = dates.view(np.int64) - to_datetime( + days_in_ns = dates._values.view(np.int64) - to_datetime( d["year"], format="%Y" - ).view(np.int64) + )._values.view(np.int64) d["days"] = days_in_ns // NS_PER_DAY elif infer_dtype(dates, skipna=False) == "datetime": diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 9f8611dd4b08b..ae5b3eef4e4fe 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -152,7 +152,7 @@ def test_transform_axis_1_raises(): # TODO(CoW-warn) should not need to warn @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") -def test_apply_modify_traceback(): +def test_apply_modify_traceback(warn_copy_on_write): data = DataFrame( { "A": [ @@ -214,7 +214,8 @@ def transform2(row): msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): - data.apply(transform, axis=1) + with tm.assert_cow_warning(warn_copy_on_write): + data.apply(transform, axis=1) @pytest.mark.parametrize( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index ee239568d057d..85d7baee1bdf5 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -24,6 +24,22 @@ def test_numba_vs_python_noop(float_frame, apply_axis): tm.assert_frame_equal(result, expected) +def test_numba_vs_python_string_index(): + # GH#56189 + pytest.importorskip("pyarrow") + df = DataFrame( + 1, + index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + func = lambda x: x + result = df.apply(func, engine="numba", axis=0) + expected = df.apply(func, engine="python", axis=0) + tm.assert_frame_equal( + result, expected, check_column_type=False, check_index_type=False + ) + + def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, @@ -88,7 +104,8 @@ def test_numba_unsupported_dtypes(apply_axis): df["c"] = df["c"].astype("double[pyarrow]") with pytest.raises( - ValueError, match="Column b must have a numeric dtype. Found 'object' instead" + ValueError, + match="Column b must have a numeric dtype. 
Found 'object|string' instead", ): df.apply(f, engine="numba", axis=apply_axis) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f89711c0edee7..c3c4a8b4fc6c0 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -916,7 +916,7 @@ def test_add_frames(self, first, second, expected): # TODO: This came from series.test.test_operators, needs cleanup def test_series_frame_radd_bug(self, fixed_now_ts): # GH#353 - vals = Series(tm.makeStringIndex()) + vals = Series([str(i) for i in range(5)]) result = "foo_" + vals expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 33953900c2006..6b36f447eb7d5 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -299,7 +299,7 @@ def test_iadd_string(self): @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") def test_add(self): - index = tm.makeStringIndex(100) + index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) tm.assert_index_equal(index + index, expected) tm.assert_index_equal(index + index.tolist(), expected) @@ -313,7 +313,7 @@ def test_add(self): tm.assert_index_equal("1" + index, expected) def test_sub_fail(self, using_infer_string): - index = tm.makeStringIndex(100) + index = pd.Index([str(i) for i in range(10)]) if using_infer_string: import pyarrow as pa diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 62a6a3374e612..e5d0aafb80b35 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -116,7 +116,8 @@ def test_series_to_numpy(using_copy_on_write): @pytest.mark.parametrize("order", ["F", "C"]) def test_ravel_read_only(using_copy_on_write, order): ser = Series([1, 2, 3]) - arr = ser.ravel(order=order) + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + arr = ser.ravel(order=order) if using_copy_on_write: assert arr.flags.writeable is False assert np.shares_memory(get_array(ser), arr) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py new file mode 100644 index 0000000000000..55781370a5a59 --- /dev/null +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -0,0 +1,63 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_methods_iloc_warn(using_copy_on_write): + if not using_copy_on_write: + df = DataFrame({"a": [1, 2, 3], "b": 1}) + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].replace(1, 5, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].fillna(1, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].interpolate(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].ffill(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].bfill(inplace=True) + + +@pytest.mark.parametrize( + "func, args", + [ + ("replace", (1, 5)), + ("fillna", (1,)), + ("interpolate", ()), + ("bfill", ()), + ("ffill", ()), + ], +) +def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + ser = df.iloc[:, 0] + # TODO(CoW-warn) should warn about updating a view + getattr(ser, func)(*args, inplace=True) + + # parent that holds 
item_cache is dead, so don't increase ref count + ser = df.copy()["a"] + getattr(ser, func)(*args, inplace=True) + + df = df.copy() + + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(0, inplace=True) + + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(0, inplace=True) + + df = df.copy() + df["a"] # populate the item_cache + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(0, inplace=True) + else: + with tm.assert_cow_warning(match="A value"): + df["a"].fillna(0, inplace=True) diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 6a27a0633a4aa..13ddd479c7c67 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -1,6 +1,9 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + option_context, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -81,3 +84,14 @@ def test_clip_chained_inplace(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].clip(1, 2, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[["a"]].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[df["a"] > 1].clip(1, 2, inplace=True) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 89384a4ef6ba6..7d5c485958039 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -297,7 +297,6 @@ def test_dataframe_from_series_or_index( if using_copy_on_write: assert not df._mgr._has_no_reference(0) - # TODO(CoW-warn) should not warn for an index? with tm.assert_cow_warning(warn_copy_on_write): df.iloc[0, 0] = data[-1] if using_copy_on_write: diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 2e623f885b648..72b7aea3709c0 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -913,7 +913,6 @@ def test_del_frame(backend, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() - # TODO(CoW-warn) false positive, this should not warn? 
with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): df.loc[0, "b"] = 200 assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6416dd50daddc..806842dcab57a 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1551,6 +1551,17 @@ def test_chained_where_mask(using_copy_on_write, func): with tm.raises_chained_assignment_error(): getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(df["a"] > 2, 5, inplace=True) def test_asfreq_noop(using_copy_on_write): @@ -1934,7 +1945,8 @@ def test_series_view(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() - ser2 = ser.view() + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser2 = ser.view() assert np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index c01eac746455c..9430ba2c478ae 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -56,8 +56,8 @@ def test_downcast_booleans(): ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) - expected = ser - tm.assert_series_equal(result, expected) + expected = ser.values + tm.assert_numpy_array_equal(result, expected) def test_downcast_conversion_no_nan(any_real_numpy_dtype): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b9407c7197f20..a354e5767f37f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -7,6 +7,7 @@ from pandas._typing import Dtype from pandas.core.dtypes.common import is_bool_dtype +from pandas.core.dtypes.dtypes import NumpyEADtype from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd @@ -331,7 +332,7 @@ def test_fillna_length_mismatch(self, data_missing): data_missing.fillna(data_missing.take([1])) # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool] - _combine_le_expected_dtype: Dtype = np.dtype(bool) + _combine_le_expected_dtype: Dtype = NumpyEADtype("bool") def test_combine_le(self, data_repeated): # GH 20825 @@ -341,16 +342,20 @@ def test_combine_le(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= val for a in list(orig_data1)], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= val for a 
in list(orig_data1)], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 4576a86ad27cd..6b2bf211ab748 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -771,7 +771,8 @@ def test_where_datetimelike_noop(self, dtype): # GH#45135, analogue to GH#44181 for Period don't raise on no-op # For td64/dt64/dt64tz we already don't raise, but also are # checking that we don't unnecessarily upcast to object. - ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) df = ser.to_frame() mask = np.array([False, False, False]) diff --git a/pandas/tests/frame/methods/test_first_valid_index.py b/pandas/tests/frame/methods/test_first_valid_index.py index a448768f4173d..2e27f1aa71700 100644 --- a/pandas/tests/frame/methods/test_first_valid_index.py +++ b/pandas/tests/frame/methods/test_first_valid_index.py @@ -6,9 +6,10 @@ from pandas import ( DataFrame, + Index, Series, + date_range, ) -import pandas._testing as tm class TestFirstValidIndex: @@ -44,11 +45,12 @@ def test_first_last_valid_frame(self, data, idx, expected_first, expected_last): assert expected_first == df.first_valid_index() assert expected_last == df.last_valid_index() - @pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) - def test_first_last_valid(self, index_func): - N = 30 - index = index_func(N) - mat = np.random.default_rng(2).standard_normal(N) + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)], + ) + def test_first_last_valid(self, index): + mat = np.random.default_rng(2).standard_normal(len(index)) mat[:5] = np.nan mat[-5:] = np.nan @@ -60,10 +62,12 @@ def test_first_last_valid(self, index_func): assert ser.first_valid_index() == frame.index[5] assert ser.last_valid_index() == frame.index[-6] - @pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) - def test_first_last_valid_all_nan(self, index_func): + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)], + ) + def test_first_last_valid_all_nan(self, index): # GH#17400: no valid entries - index = index_func(30) frame = DataFrame(np.nan, columns=["foo"], index=index) assert frame.last_valid_index() is None diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 93a2f8d80019c..d0caa071fae1c 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -6,9 +6,11 @@ from pandas import ( DataFrame, DatetimeIndex, + Index, IntervalIndex, Series, Timestamp, + bdate_range, date_range, timedelta_range, ) @@ -108,9 +110,17 @@ def test_transpose_float(self, float_frame): else: assert value == frame[col][idx] + def test_transpose_mixed(self): # mixed type - index, data = tm.getMixedTypeDict() - mixed = DataFrame(data, index=index) + mixed = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + }, + index=Index(["a", "b", "c", "d", "e"], dtype=object), + ) mixed_T = mixed.T for col, s in mixed_T.items(): diff --git a/pandas/tests/frame/test_constructors.py 
b/pandas/tests/frame/test_constructors.py index bf17b61b0e3f3..2cca87adc5189 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -784,7 +784,7 @@ def test_constructor_dict_cast(self): def test_constructor_dict_cast2(self): # can't cast to float test_data = { - "A": dict(zip(range(20), tm.makeStringIndex(20))), + "A": dict(zip(range(20), [f"word_{i}" for i in range(20)])), "B": dict(zip(range(15), np.random.default_rng(2).standard_normal(15))), } with pytest.raises(ValueError, match="could not convert string"): @@ -1101,8 +1101,8 @@ def test_constructor_maskedarray_nonfloat(self): mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) - assert 1 == frame["A"].view("i8")[1] - assert 2 == frame["C"].view("i8")[2] + assert 1 == frame["A"].astype("i8")[1] + assert 2 == frame["C"].astype("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 98962b3003b6d..eed48b9db116b 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -165,7 +165,7 @@ def test_repr_mixed_big(self): biggie = DataFrame( { "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), + "B": [str(i) for i in range(200)], }, index=range(200), ) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 68746b9e9a803..866e9e203ffe3 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -31,11 +31,6 @@ # - Callable: pass the constructed value with attrs set to this. _all_methods = [ - ( - pd.Series, - (np.array([0], dtype="float64")), - operator.methodcaller("view", "int64"), - ), (pd.Series, ([0],), operator.methodcaller("take", [])), (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), (pd.Series, ([0],), operator.methodcaller("repeat", 2)), diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 25534865b3486..bf572609f3d37 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -216,7 +216,7 @@ def test_cummax_i8_at_implementation_bound(): # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT # for int64 dtype GH#46382 ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) + df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")}) gb = df.groupby("A") res = gb.cummax() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 3c1a35c984031..c401762dace23 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -19,6 +19,7 @@ Series, Timestamp, date_range, + period_range, ) import pandas._testing as tm from pandas.core.groupby.grouper import Grouping @@ -174,23 +175,21 @@ class TestGrouping: @pytest.mark.parametrize( "index", [ - tm.makeFloatIndex, - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(np.arange(5)), + Index(np.arange(5, dtype=float)), + date_range("2020-01-01", periods=5), + period_range("2020-01-01", periods=5), ], ) - @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_grouper_index_types(self, index): # related GH5375 # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), 
columns=list("AB")) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"), index=index) - df.index = index(len(df)) df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) - df.index = list(reversed(df.index.tolist())) + df.index = df.index[::-1] df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) def test_grouper_multilevel_freq(self): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index b79bed32e7fa4..b8b34d365d051 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -726,7 +726,7 @@ def test_groupby_groups_periods(self): def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view("M8[ns]") + df[1] = df[1].astype("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index a69248cf038f8..6c6d9022b1af3 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -257,7 +257,7 @@ def test_duplicated(idx_dup, keep, expected): def test_duplicated_hashtable_impl(keep, monkeypatch): # GH 9125 n, k = 6, 10 - levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + levels = [np.arange(n), [str(i) for i in range(n)], 1000 + np.arange(n)] codes = [np.random.default_rng(2).choice(n, k * n) for _ in levels] with monkeypatch.context() as m: m.setattr(libindex, "_SIZE_CUTOFF", 50) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 873c4e3e60f4c..5508153322adb 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -566,7 +566,7 @@ def test_loc_setitem_single_column_slice(): tm.assert_frame_equal(df, expected) -def test_loc_nan_multiindex(): +def test_loc_nan_multiindex(using_infer_string): # GH 5286 tups = [ ("Good Things", "C", np.nan), @@ -586,8 +586,12 @@ def test_loc_nan_multiindex(): result = df.loc["Good Things"].loc["C"] expected = DataFrame( np.ones((1, 4)), - index=Index([np.nan], dtype="object", name="u3"), - columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + index=Index( + [np.nan], + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + name="u3", + ), + columns=Index(["d1", "d2", "d3", "d4"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 7504c984794e8..d78694018749c 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -13,6 +13,7 @@ CategoricalIndex, DataFrame, DatetimeIndex, + Index, MultiIndex, Series, Timestamp, @@ -70,7 +71,11 @@ def test_at_setitem_item_cache_cleared(self): df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame({"x": [4], "cost": 789}, index=[0]) + expected = DataFrame( + {"x": [4], "cost": 789}, + index=[0], + columns=Index(["x", "cost"], dtype=object), + ) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. 
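The test changes above swap ``Series.view("M8[ns]")`` for either a numpy-level view of the underlying array or a plain ``astype``. A minimal sketch of the two patterns (assuming pandas 2.2; ``_values`` is pandas-internal API, so ``astype`` is the public-facing option), which agree for int64 nanosecond data:

import pandas as pd

ser = pd.Series([1351036800000000000, 1351036800000000000])

dt_arr = ser._values.view("M8[ns]")  # reinterpret the underlying int64 ndarray
dt_ser = ser.astype("M8[ns]")        # value-based cast; same result for ns ints
assert (dt_ser.to_numpy() == dt_arr).all()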
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 0432c8856e5c5..6f0ef0b357269 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -273,7 +273,7 @@ def test_slicing_doc_examples(self): tm.assert_frame_equal(result, expected) result = df.iloc[2:4, :].dtypes - expected = Series(["category", "int64"], ["cats", "values"]) + expected = Series(["category", "int64"], ["cats", "values"], dtype=object) tm.assert_series_equal(result, expected) result = df.loc["h":"j", "cats"] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 21aab6652a300..bf0975a803dce 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -339,7 +339,9 @@ def test_detect_chained_assignment_object_dtype( self, using_array_manager, using_copy_on_write, warn_copy_on_write ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) - df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame( + {"A": Series(["aaa", "bbb", "ccc"], dtype=object), "B": [1, 2, 3]} + ) df_original = df.copy() if not using_copy_on_write and not warn_copy_on_write: diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index c743166030048..03691ca318037 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -111,7 +113,7 @@ def _assert_setitem_index_conversion( "val,exp_dtype", [("x", object), (5, IndexError), (1.1, object)] ) def test_setitem_index_object(self, val, exp_dtype): - obj = pd.Series([1, 2, 3, 4], index=list("abcd")) + obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object if exp_dtype is IndexError: @@ -122,7 +124,7 @@ def test_setitem_index_object(self, val, exp_dtype): with tm.assert_produces_warning(FutureWarning, match=warn_msg): temp[5] = 5 else: - exp_index = pd.Index(list("abcd") + [val]) + exp_index = pd.Index(list("abcd") + [val], dtype=object) self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( @@ -195,10 +197,10 @@ def _assert_insert_conversion(self, original, value, expected, expected_dtype): ], ) def test_insert_index_object(self, insert, coerced_val, coerced_dtype): - obj = pd.Index(list("abcd")) + obj = pd.Index(list("abcd"), dtype=object) assert obj.dtype == object - exp = pd.Index(["a", coerced_val, "b", "c", "d"]) + exp = pd.Index(["a", coerced_val, "b", "c", "d"], dtype=object) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) @pytest.mark.parametrize( @@ -397,7 +399,7 @@ def _run_test(self, obj, fill_val, klass, exp_dtype): ) def test_where_object(self, index_or_series, fill_val, exp_dtype): klass = index_or_series - obj = klass(list("abcd")) + obj = klass(list("abcd"), dtype=object) assert obj.dtype == object self._run_test(obj, fill_val, klass, exp_dtype) @@ -559,10 +561,10 @@ def _assert_fillna_conversion(self, original, value, expected, expected_dtype): ) def test_fillna_object(self, index_or_series, fill_val, fill_dtype): klass = index_or_series - obj = klass(["a", np.nan, "c", "d"]) + obj = klass(["a", np.nan, "c", "d"], dtype=object) assert obj.dtype == object - exp = klass(["a", fill_val, "c", 
"d"]) + exp = klass(["a", fill_val, "c", "d"], dtype=object) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.parametrize( @@ -824,6 +826,8 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer + # Expected needs adjustment for the infer string option, seems to work as expecetd + @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") @@ -870,13 +874,18 @@ def test_replace_series(self, how, to_key, from_key, replacer): @pytest.mark.parametrize( "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], indirect=True ) - def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): + def test_replace_series_datetime_tz( + self, how, to_key, from_key, replacer, using_infer_string + ): index = pd.Index([3, 4], name="xyz") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key + if using_infer_string and to_key == "object": + assert exp.dtype == "string" + else: + assert exp.dtype == to_key msg = "Downcasting behavior in `replace`" warn = FutureWarning if exp.dtype != object else None diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index c9fbf95751dfe..cf9966145afce 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -6,6 +6,9 @@ Index, RangeIndex, Series, + date_range, + period_range, + timedelta_range, ) import pandas._testing as tm @@ -39,22 +42,21 @@ def check(self, result, original, indexer, getitem): tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(list("abcde"), dtype="category"), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) - def test_scalar_non_numeric(self, index_func, frame_or_series, indexer_sl): + def test_scalar_non_numeric(self, index, frame_or_series, indexer_sl): # GH 4892 # float_indexers should raise exceptions # on appropriate Index types & accessors - i = index_func(5) - s = gen_obj(frame_or_series, i) + s = gen_obj(frame_or_series, index) # getting with pytest.raises(KeyError, match="^3.0$"): @@ -75,19 +77,18 @@ def test_scalar_non_numeric(self, index_func, frame_or_series, indexer_sl): assert 3.0 not in s2.axes[-1] @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(list("abcde"), dtype="category"), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) - def test_scalar_non_numeric_series_fallback(self, index_func): + def test_scalar_non_numeric_series_fallback(self, index): # fallsback to position selection, series only - i = index_func(5) - s = Series(np.arange(len(i)), index=i) + s = Series(np.arange(len(index)), index=index) msg = "Series.__getitem__ treating keys as positions is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -214,21 +215,20 @@ def test_scalar_float(self, frame_or_series): 
self.check(result, s, 3, False) @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde"), dtype=object), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) @pytest.mark.parametrize("idx", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - def test_slice_non_numeric(self, index_func, idx, frame_or_series, indexer_sli): + def test_slice_non_numeric(self, index, idx, frame_or_series, indexer_sli): # GH 4892 # float_indexers should raise exceptions # on appropriate Index types & accessors - index = index_func(5) s = gen_obj(frame_or_series, index) # getitem diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index cbcbf3396363a..baf2cdae43fe4 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -100,9 +100,8 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage # we retain the object dtype. frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() - orig_vals = df.values indexer(df)[key, 0] = cat - expected = DataFrame({0: cat.astype(object), 1: range(3)}) + expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)}) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) @@ -232,7 +231,10 @@ def test_iloc_exceeds_bounds(self): dfl = DataFrame( np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB") ) - tm.assert_frame_equal(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[])) + tm.assert_frame_equal( + dfl.iloc[:, 2:3], + DataFrame(index=dfl.index, columns=Index([], dtype=dfl.columns.dtype)), + ) tm.assert_frame_equal(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) tm.assert_frame_equal(dfl.iloc[4:6], dfl.iloc[[4]]) @@ -451,12 +453,16 @@ def test_iloc_setitem(self): def test_iloc_setitem_axis_argument(self): # GH45032 df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 10], [7, "d", 11], [5, 5, 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=0)[2] = 5 tm.assert_frame_equal(df, expected) df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 5], [7, "d", 5], [8, "e", 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=1)[2] = 5 tm.assert_frame_equal(df, expected) @@ -615,7 +621,7 @@ def test_iloc_getitem_labelled_frame(self): assert result == exp # out-of-bounds exception - msg = "index 5 is out of bounds for axis 0 with size 4" + msg = "index 5 is out of bounds for axis 0 with size 4|index out of bounds" with pytest.raises(IndexError, match=msg): df.iloc[10, 5] @@ -1313,7 +1319,9 @@ def test_iloc_setitem_dtypes_duplicate_columns( self, dtypes, init_value, expected_value ): # GH#22035 - df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) + df = DataFrame( + [[init_value, "str", "str2"]], columns=["a", "b", "b"], dtype=object + ) # with the enforcement of GH#45333 in 2.0, this sets values inplace, # so we retain object dtype @@ -1360,7 +1368,10 @@ def test_frame_iloc_getitem_callable(self): def test_frame_iloc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return location res = 
df.copy() diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index bdbbcabcaab0e..d6ec7ac3e4185 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -189,7 +191,7 @@ def test_setitem_dtype_upcast(self): ): df.loc[0, "c"] = "foo" expected = DataFrame( - [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] + {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} ) tm.assert_frame_equal(df, expected) @@ -284,18 +286,27 @@ def test_dups_fancy_indexing_not_in_order(self): with pytest.raises(KeyError, match="not in index"): df.loc[rows] - def test_dups_fancy_indexing_only_missing_label(self): + def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): # List containing only missing label dfnu = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=list("AABCD") ) - with pytest.raises( - KeyError, - match=re.escape( - "\"None of [Index(['E'], dtype='object')] are in the [index]\"" - ), - ): - dfnu.loc[["E"]] + if using_infer_string: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] + else: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='object')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")]) def test_dups_fancy_indexing_missing_label(self, vals): @@ -451,6 +462,9 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't multiply arrow strings" + ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -553,7 +567,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - def test_astype_assignment(self): + def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") @@ -567,8 +581,9 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -577,7 +592,8 @@ def test_astype_assignment(self): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -585,8 +601,9 @@ def test_astype_assignment(self): expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + if not using_infer_string: + expected["B"] = expected["B"].astype(object) + expected["C"] = expected["C"].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): @@ -673,6 +690,7 @@ def test_loc_setitem_fullindex_views(self): 
df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 96fd3f4e6fca0..8459cc5a30130 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -469,7 +471,7 @@ def test_loc_to_fail2(self): msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" with pytest.raises(KeyError, match=msg): - s.loc[["4"]] + s.loc[Index(["4"], dtype=object)] s.loc[-1] = 3 with pytest.raises(KeyError, match="not in index"): @@ -781,7 +783,9 @@ def test_loc_setitem_empty_frame(self): # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame({"A": sera, "B": serb}).reindex(index=index) + expected = DataFrame( + {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) + ).reindex(index=index) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -979,7 +983,7 @@ def test_setitem_new_key_tz(self, indexer_sl): to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=["foo", "bar"]) + expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1254,6 +1258,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 @@ -1439,7 +1444,7 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_loc_setitem_single_row_categorical(self): + def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) @@ -1449,7 +1454,9 @@ def test_loc_setitem_single_row_categorical(self): df.loc[:, "Alpha"] = categories result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha").astype(object) + expected = Series(categories, index=df.index, name="Alpha").astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) # double-check that the non-loc setting retains categoricalness @@ -1615,7 +1622,7 @@ def test_loc_getitem_index_namedtuple(self): result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 - def test_loc_setitem_single_column_mixed(self): + def test_loc_setitem_single_column_mixed(self, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=["a", "b", "c", "d", "e"], @@ -1623,7 +1630,10 @@ def test_loc_setitem_single_column_mixed(self): ) df["str"] = "qux" df.loc[df.index[::2], "str"] = np.nan - expected = np.array([np.nan, "qux", np.nan, "qux", np.nan], dtype=object) + expected = Series( + [np.nan, "qux", np.nan, "qux", 
np.nan], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ).values tm.assert_almost_equal(df["str"].values, expected) def test_loc_setitem_cast2(self): @@ -2016,11 +2026,15 @@ def test_loc_setitem_empty_series_str_idx(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=["foo"])) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) ser.loc["bar"] = 3 - tm.assert_series_equal(ser, Series([1, 3], index=["foo", "bar"])) + tm.assert_series_equal( + ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) + ) ser.loc[3] = 4 - tm.assert_series_equal(ser, Series([1, 3, 4], index=["foo", "bar", 3])) + tm.assert_series_equal( + ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) + ) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -2050,7 +2064,11 @@ def test_loc_setitem_datetime_keys_cast(self, conv): df.loc[conv(dt1), "one"] = 100 df.loc[conv(dt2), "one"] = 200 - expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) + expected = DataFrame( + {"one": [100.0, 200.0]}, + index=[dt1, dt2], + columns=Index(["one"], dtype=object), + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_categorical_column_retains_dtype(self, ordered): @@ -2168,11 +2186,11 @@ def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser._values - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) def test_loc_setitem_ea_not_full_column(self): # GH#39163 @@ -2262,7 +2280,10 @@ def test_frame_loc_getitem_callable_labels(self): def test_frame_loc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return label res = df.copy() diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 3d04cc764563f..2f9018112c03b 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -28,7 +28,9 @@ def test_empty_frame_setitem_index_name_retained(self): df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="df_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -39,7 +41,9 @@ def test_empty_frame_setitem_index_name_inherited(self): series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="series_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -92,7 +96,9 @@ def test_partial_set_empty_frame2(self): # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="object") + ) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -110,7 +116,9 @@ 
def test_partial_set_empty_frame2(self): tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -127,7 +135,9 @@ def test_partial_set_empty_frame4(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -200,10 +210,10 @@ def test_partial_set_empty_frame_empty_copy_assignment(self): df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=["a"]) + expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) tm.assert_frame_equal(df, expected) - def test_partial_set_empty_frame_empty_consistencies(self): + def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): # GH#6171 # consistency on empty frames df = DataFrame(columns=["x", "y"]) @@ -213,7 +223,15 @@ def test_partial_set_empty_frame_empty_consistencies(self): df = DataFrame(columns=["x", "y"]) df["x"] = ["1", "2"] - expected = DataFrame({"x": ["1", "2"], "y": [np.nan, np.nan]}, dtype=object) + expected = DataFrame( + { + "x": Series( + ["1", "2"], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ), + "y": Series([np.nan, np.nan], dtype=object), + } + ) tm.assert_frame_equal(df, expected) df = DataFrame(columns=["x", "y"]) @@ -618,7 +636,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( [ ( period_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -626,7 +644,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( ), ( date_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -634,7 +652,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( ), ( pd.timedelta_range(start="1 day", periods=20), - ["2000-01-04", "2000-01-08"], + Index(["2000-01-04", "2000-01-08"], dtype=object), ( r"None of \[Index\(\['2000-01-04', '2000-01-08'\], " r"dtype='object'\)\] are in the \[index\]" diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index abbdb77efad0e..caa2da1b6123b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -14,6 +14,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -637,6 +639,9 @@ def test_dtype_backend_and_dtype(self, read_ext): ) tm.assert_frame_equal(result, df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="infer_string takes precedence" + ) def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 22cd0621fd4c4..507d7ed4bf9d0 100644 --- 
a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -709,7 +709,7 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, path): + def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # # Test writing Interval without labels. @@ -719,7 +719,9 @@ def test_to_excel_interval_no_labels(self, path): expected = df.copy() df["new"] = pd.cut(df[0], 10) - expected["new"] = pd.cut(expected[0], 10).astype(str) + expected["new"] = pd.cut(expected[0], 10).astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: @@ -1213,10 +1215,9 @@ def test_render_as_column_name(self, path): def test_true_and_false_value_options(self, path): # see gh-13347 - df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - msg = "Downcasting behavior in `replace`" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = df.replace({"foo": True, "bar": False}) + df = DataFrame([["foo", "bar"]], columns=["col1", "col2"], dtype=object) + with option_context("future.no_silent_downcasting", True): + expected = df.replace({"foo": True, "bar": False}).astype("bool") df.to_excel(path) read_frame = pd.read_excel( diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index edac82193d1c8..f5738b83a8e64 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -25,7 +25,6 @@ read_csv, reset_option, ) -import pandas._testing as tm from pandas.io.formats import printing import pandas.io.formats.format as fmt @@ -834,19 +833,21 @@ def test_to_string_buffer_all_unicode(self): buf.getvalue() @pytest.mark.parametrize( - "index", + "index_scalar", [ - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, + "a" * 10, + 1, + Timestamp(2020, 1, 1), + pd.Period("2020-01-01"), ], ) @pytest.mark.parametrize("h", [10, 20]) @pytest.mark.parametrize("w", [10, 20]) - def test_to_string_truncate_indices(self, index, h, w): + def test_to_string_truncate_indices(self, index_scalar, h, w): with option_context("display.expand_frame_repr", False): - df = DataFrame(index=index(h), columns=tm.makeStringIndex(w)) + df = DataFrame( + index=[index_scalar] * h, columns=[str(i) * 10 for i in range(w)] + ) with option_context("display.max_rows", 15): if h == 20: assert has_vertically_truncated_repr(df) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a7648cf1c471a..605e5e182d8cc 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -59,7 +59,7 @@ def biggie_df_fixture(request): df = DataFrame( { "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), + "B": Index([f"{i}?!" for i in range(200)]), }, index=np.arange(200), ) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index e607b6eb454a1..02c20b0e25477 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -804,7 +804,7 @@ def test_to_string(self): biggie = DataFrame( { "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), + "B": Index([f"{i}?!" 
for i in range(200)]), }, ) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 410c20bb22d1e..ff7d34c85c015 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -93,27 +93,31 @@ def test_read_unsupported_compression_type(): pd.read_json(path, compression="unsupported") +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_to_json_compression( - compression_only, read_infer, to_infer, compression_to_extension + compression_only, read_infer, to_infer, compression_to_extension, infer_string ): - # see gh-15008 - compression = compression_only + with pd.option_context("future.infer_string", infer_string): + # see gh-15008 + compression = compression_only - # We'll complete file extension subsequently. - filename = "test." - filename += compression_to_extension[compression] + # We'll complete file extension subsequently. + filename = "test." + filename += compression_to_extension[compression] - df = pd.DataFrame({"A": [1]}) + df = pd.DataFrame({"A": [1]}) - to_compression = "infer" if to_infer else compression - read_compression = "infer" if read_infer else compression + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression - with tm.ensure_clean(filename) as path: - df.to_json(path, compression=to_compression) - result = pd.read_json(path, compression=read_compression) - tm.assert_frame_equal(result, df) + with tm.ensure_clean(filename) as path: + df.to_json(path, compression=to_compression) + result = pd.read_json(path, compression=read_compression) + tm.assert_frame_equal(result, df) def test_to_json_compression_mode(compression): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7312facc44c26..411cc90ba41a7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -168,7 +168,7 @@ def test_frame_non_unique_columns(self, orient, data): # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000 + expected.iloc[:, 0] = expected.iloc[:, 0].astype(np.int64) // 1000000 elif orient == "split": expected = df expected.columns = ["x", "x.1"] diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 32b4b1dedc3cb..0deafda750904 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -531,6 +531,9 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) +# pyarrow engine failing: +# https://github.com/pandas-dev/pandas/issues/56136 +@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 50cf7d737eb99..89a0162af1c54 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -397,7 +397,14 @@ def check_col(key, name, size): store.append("df_new", df_new) # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index("C") + df = DataFrame( + { + 
"A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": pd.Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": date_range("20130101", periods=5), + } + ).set_index("C") store.append("ss", df["B"], min_itemsize={"index": 4}) tm.assert_series_equal(store.select("ss"), df["B"]) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index c109b72e9c239..59a05dc9ea546 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -196,32 +196,27 @@ def test_put_mixed_type(setup_path): tm.assert_frame_equal(expected, df) +@pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( - "format, index", + "index", [ - ["table", tm.makeFloatIndex], - ["table", tm.makeStringIndex], - ["table", tm.makeIntIndex], - ["table", tm.makeDateIndex], - ["fixed", tm.makeFloatIndex], - ["fixed", tm.makeStringIndex], - ["fixed", tm.makeIntIndex], - ["fixed", tm.makeDateIndex], - ["table", tm.makePeriodIndex], # GH#7796 - ["fixed", tm.makePeriodIndex], + Index([str(i) for i in range(10)]), + Index(np.arange(10, dtype=float)), + Index(np.arange(10)), + date_range("2020-01-01", periods=10), + pd.period_range("2020-01-01", periods=10), ], ) -@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_store_index_types(setup_path, format, index): # GH5386 # test storing various index types with ensure_clean_store(setup_path) as store: df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") + np.random.default_rng(2).standard_normal((10, 2)), + columns=list("AB"), + index=index, ) - df.index = index(len(df)) - _maybe_remove(store, "df") store.put("df", df, format=format) tm.assert_frame_equal(df, store["df"]) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 6c24843f18d0d..e05e1e96eb11f 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -51,7 +51,8 @@ def roundtrip(key, obj, **kwargs): def test_long_strings(setup_path): # GH6166 - df = DataFrame({"a": tm.makeStringIndex(10)}, index=tm.makeStringIndex(10)) + data = ["a" * 50] * 10 + df = DataFrame({"a": data}, index=data) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=["a"]) @@ -65,7 +66,7 @@ def test_api(tmp_path, setup_path): # API issue when to_hdf doesn't accept append AND format args path = tmp_path / setup_path - df = tm.makeDataFrame() + df = DataFrame(range(20)) df.iloc[:10].to_hdf(path, key="df", append=True, format="table") df.iloc[10:].to_hdf(path, key="df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) @@ -79,7 +80,7 @@ def test_api(tmp_path, setup_path): def test_api_append(tmp_path, setup_path): path = tmp_path / setup_path - df = tm.makeDataFrame() + df = DataFrame(range(20)) df.iloc[:10].to_hdf(path, key="df", append=True) df.iloc[10:].to_hdf(path, key="df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) @@ -93,7 +94,7 @@ def test_api_append(tmp_path, setup_path): def test_api_2(tmp_path, setup_path): path = tmp_path / setup_path - df = tm.makeDataFrame() + df = DataFrame(range(20)) df.to_hdf(path, key="df", append=False, format="fixed") tm.assert_frame_equal(read_hdf(path, "df"), df) @@ -107,7 +108,7 @@ def test_api_2(tmp_path, setup_path): tm.assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() 
+ df = DataFrame(range(20)) _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=True, format="table") @@ -531,3 +532,18 @@ def test_round_trip_equals(tmp_path, setup_path): tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) + + +def test_infer_string_columns(tmp_path, setup_path): + # GH# + pytest.importorskip("pyarrow") + path = tmp_path / setup_path + with pd.option_context("future.infer_string", True): + df = DataFrame(1, columns=list("ABCD"), index=list(range(10))).set_index( + ["A", "B"] + ) + expected = df.copy() + df.to_hdf(path, key="df", format="table") + + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 96c160ab40bd8..5b16ea9ee6b09 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -323,7 +323,14 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): path = tmp_path / setup_path # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index("C") + df = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": date_range("20130101", periods=5), + } + ).set_index("C") df.to_hdf(path, key="ss3", format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") @@ -956,10 +963,6 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): tm.makeTimedeltaIndex, tm.makePeriodIndex, ] - types_should_run = [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - ] for index in types_should_fail: df = DataFrame( @@ -970,14 +973,18 @@ def test_to_hdf_with_object_column_names(tmp_path, setup_path): with pytest.raises(ValueError, match=msg): df.to_hdf(path, key="df", format="table", data_columns=True) - for index in types_should_run: - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=index(2) - ) - path = tmp_path / setup_path - df.to_hdf(path, key="df", format="table", data_columns=True) - result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") - assert len(result) + +@pytest.mark.parametrize("dtype", [None, "category"]) +def test_to_hdf_with_object_column_names_should_run(tmp_path, setup_path, dtype): + # GH9057 + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + columns=Index(["a", "b"], dtype=dtype), + ) + path = tmp_path / setup_path + df.to_hdf(path, key="df", format="table", data_columns=True) + result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") + assert len(result) def test_hdfstore_strides(setup_path): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 45d0a839b9e1a..d7c69ff17749c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1908,7 +1908,7 @@ def test_api_timedelta(conn, request): name="foo", ) else: - expected = df["foo"].view("int64") + expected = df["foo"].astype("int64") tm.assert_series_equal(result["foo"], expected) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 88655483800ee..e4456b0a78e06 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -32,6 +32,7 @@ ArrowStringArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -2004,7 
+2005,9 @@ def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): tm.assert_frame_equal(df_lxml, df_etree) -def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): +def test_read_xml_nullable_dtypes( + parser, string_storage, dtype_backend, using_infer_string +): # GH#50500 data = """ @@ -2032,7 +2035,12 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): """ - if string_storage == "python": + if using_infer_string: + pa = pytest.importorskip("pyarrow") + string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) + string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) + + elif string_storage == "python": string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 9c4ae92224148..303f8550c5a80 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -33,13 +33,13 @@ def get_objs(): indexes = [ - tm.makeBoolIndex(10, name="a"), + Index([True, False] * 5, name="a"), tm.makeIntIndex(10, name="a"), tm.makeFloatIndex(10, name="a"), tm.makeDateIndex(10, name="a"), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), tm.makePeriodIndex(10, name="a"), - tm.makeStringIndex(10, name="a"), + Index([str(i) for i in range(10)], name="a"), ] arr = np.random.default_rng(2).standard_normal(10) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 1dc25cb9d4c1e..3d9098917a12d 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -87,19 +87,17 @@ def f(df): @pytest.mark.parametrize( - "func", + "index", [ - tm.makeIntIndex, - tm.makeStringIndex, - tm.makeFloatIndex, - (lambda m: tm.makeCustomIndex(m, 2)), + Index([1, 2]), + Index(["a", "b"]), + Index([1.1, 2.2]), + pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]]), ], ) -def test_fails_on_no_datetime_index(func): - n = 2 - index = func(n) +def test_fails_on_no_datetime_index(index): name = type(index).__name__ - df = DataFrame({"a": np.random.default_rng(2).standard_normal(n)}, index=index) + df = DataFrame({"a": range(len(index))}, index=index) msg = ( "Only valid with DatetimeIndex, TimedeltaIndex " diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index c4c83e2046b76..f07c223bf0de2 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -11,6 +11,7 @@ MultiIndex, Series, Timestamp, + bdate_range, concat, merge, ) @@ -57,8 +58,13 @@ def df2(self): @pytest.fixture def target_source(self): - index, data = tm.getMixedTypeDict() - target = DataFrame(data, index=index) + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + target = DataFrame(data, index=Index(["a", "b", "c", "d", "e"], dtype=object)) # Join on string value diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index f5d78fbd44812..269d3a2b7078e 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -188,7 +188,7 @@ def test_merge_multiple_cols_with_mixed_cols_index(self): def test_compress_group_combinations(self): # ~ 40000000 possible unique groups - key1 = 
tm.makeStringIndex(10000) + key1 = [str(i) for i in range(10000)] key1 = np.tile(key1, 2) key2 = key1[::-1] diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 0201e5d9af2ee..91314a497b1fb 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -457,6 +457,13 @@ def test_constructor_str_infer_reso(self): assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000") assert ts.unit == "ns" + # GH#56208 minute reso through the ISO8601 path with tz offset + ts = Timestamp("2020-01-01 00:00+00:00") + assert ts.unit == "s" + + ts = Timestamp("2020-01-01 00+00:00") + assert ts.unit == "s" + class TestTimestampConstructors: def test_weekday_but_no_day_raises(self): diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 795b2eab82aca..1cbb7c7982802 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -51,9 +51,9 @@ def test_combine_first(self): tm.assert_series_equal(combined[1::2], series_copy[1::2]) # mixed types - index = tm.makeStringIndex(20) + index = pd.Index([str(i) for i in range(20)]) floats = Series(np.random.default_rng(2).standard_normal(20), index=index) - strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object) + strings = Series([str(i) for i in range(10)], index=index[::2], dtype=object) combined = strings.combine_first(floats) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index f86f6069a2ef3..7b14e1289abf0 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -14,6 +14,7 @@ Index, MultiIndex, Series, + bdate_range, isna, timedelta_range, ) @@ -154,8 +155,13 @@ def test_list_raises(string_series): string_series.map([lambda x: x]) -def test_map(datetime_series): - index, data = tm.getMixedTypeDict() +def test_map(): + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } source = Series(data["B"], index=data["C"]) target = Series(data["C"][:4], index=data["D"][:4]) @@ -171,10 +177,14 @@ def test_map(datetime_series): for k, v in merged.items(): assert v == source[target[k]] + +def test_map_datetime(datetime_series): # function result = datetime_series.map(lambda x: x * 2) tm.assert_series_equal(result, datetime_series * 2) + +def test_map_category(): # GH 10324 a = Series([1, 2, 3, 4]) b = Series(["even", "odd", "even", "odd"], dtype="category") @@ -185,6 +195,8 @@ def test_map(datetime_series): exp = Series(["odd", "even", "odd", np.nan]) tm.assert_series_equal(a.map(c), exp) + +def test_map_category_numeric(): a = Series(["a", "b", "c", "d"]) b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) @@ -194,6 +206,8 @@ def test_map(datetime_series): exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(c), exp) + +def test_map_category_string(): a = Series(["a", "b", "c", "d"]) b = Series( ["B", "C", "D", "E"], diff --git a/pandas/tests/series/methods/test_view.py b/pandas/tests/series/methods/test_view.py index a9b29c9329193..7e0ac372cd443 100644 --- a/pandas/tests/series/methods/test_view.py +++ b/pandas/tests/series/methods/test_view.py @@ -9,6 +9,10 @@ ) import pandas._testing as tm +pytestmark = 
pytest.mark.filterwarnings( + "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 +) + class TestView: def test_view_i8_to_datetimelike(self): diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index a39b3ff7e6f2b..9d69321ff7dbb 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -68,7 +68,7 @@ def test_tab_completion_with_categorical(self): @pytest.mark.parametrize( "index", [ - tm.makeStringIndex(10), + Index([str(i) for i in range(10)]), tm.makeCategoricalIndex(10), Index(["foo", "bar", "baz"] * 2), tm.makeDateIndex(10), @@ -140,7 +140,9 @@ def test_ndarray_compat_like_func(self): def test_ndarray_compat_ravel(self): # ravel s = Series(np.random.default_rng(2).standard_normal(10)) - tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) + with tm.assert_produces_warning(FutureWarning, match="ravel is deprecated"): + result = s.ravel(order="F") + tm.assert_almost_equal(result, s.values.ravel(order="F")) def test_empty_method(self): s_empty = Series(dtype=object) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 84c612c43da29..f77bff049124b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -572,7 +572,10 @@ def test_constructor_maskedarray(self): data[1] = 1 result = Series(data, index=index) expected = Series([0, 1, 2], index=index, dtype=int) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? + # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype=bool) result = Series(data) @@ -589,7 +592,10 @@ def test_constructor_maskedarray(self): data[1] = True result = Series(data, index=index) expected = Series([True, True, False], index=index, dtype=bool) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? 
+ # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype="M8[ns]") result = Series(data) @@ -969,7 +975,7 @@ def test_constructor_dtype_datetime64_10(self): # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).view(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a6e63dfd5f409..e5302ec9833f1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -976,7 +976,7 @@ def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1): # Anything but object and we get all-False shortcut dta = date_range("2013-01-01", periods=3)._values - arr = Series(dta.view("i8")).view(dtype1)._values + arr = Series(dta.view("i8")).array.view(dtype1) comps = arr.view("i8").astype(dtype) @@ -1663,19 +1663,18 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: @pytest.mark.parametrize( - "htable, tm_dtype", + "htable, data", [ - (ht.PyObjectHashTable, "String"), - (ht.StringHashTable, "String"), - (ht.Float64HashTable, "Float"), - (ht.Int64HashTable, "Int"), - (ht.UInt64HashTable, "UInt"), + (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), + (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), + (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), ], ) - def test_hashtable_unique(self, htable, tm_dtype, writable): + def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, "make" + tm_dtype + "Index") - s = Series(maker(1000)) + s = Series(data) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1703,19 +1702,18 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): tm.assert_numpy_array_equal(reconstr, s_duplicated.values) @pytest.mark.parametrize( - "htable, tm_dtype", + "htable, data", [ - (ht.PyObjectHashTable, "String"), - (ht.StringHashTable, "String"), - (ht.Float64HashTable, "Float"), - (ht.Int64HashTable, "Int"), - (ht.UInt64HashTable, "UInt"), + (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), + (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), + (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), ], ) - def test_hashtable_factorize(self, htable, tm_dtype, writable): + def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - maker = getattr(tm, "make" + tm_dtype + "Index") - s = Series(maker(1000)) + s = Series(data) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a0062d2b6dd44..632d9783c7f81 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1114,12 +1114,16 @@ def test_nanmean(self, unit): expected = dti[1] for obj in [dti, DatetimeArray(dti), Series(dti)]: + if isinstance(obj, Series): + obj = obj._values result = nanops.nanmean(obj) assert result == expected dti2 = dti.insert(1, pd.NaT) for obj in [dti2, DatetimeArray(dti2), Series(dti2)]: + if 
isinstance(obj, Series): + obj = obj._values result = nanops.nanmean(obj) assert result == expected diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ee209f74eb4e0..508e03f617376 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2118,7 +2118,13 @@ def test_float_to_datetime_raise_near_bounds(self): expected = (should_succeed * oneday_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="D", errors=error_mode) - tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) + # Cast to `np.float64` so that `rtol` and inexact checking kick in + # (`check_exact` doesn't take place for integer dtypes) + tm.assert_almost_equal( + result1.astype(np.int64).astype(np.float64), + expected.astype(np.float64), + rtol=1e-10, + ) # just out of bounds should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float) should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 75528a8b99c4d..5d22896d9d055 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -401,7 +401,7 @@ def test_invalid_index_types_unicode(): msg = "Unknown datetime string format" with pytest.raises(ValueError, match=msg): - frequencies.infer_freq(tm.makeStringIndex(10)) + frequencies.infer_freq(Index(["ZqgszYBfuL"])) def test_string_datetime_like_compat(): diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 0c93ee453bb1c..a074898f6046d 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -211,7 +211,10 @@ def test_assert_frame_equal_extension_dtype_mismatch(): "\\[right\\]: int[32|64]" ) - tm.assert_frame_equal(left, right, check_dtype=False) + # TODO: this shouldn't raise (or should raise a better error message) + # https://github.com/pandas-dev/pandas/issues/56131 + with pytest.raises(AssertionError, match="classes are different"): + tm.assert_frame_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) @@ -236,11 +239,18 @@ def test_assert_frame_equal_interval_dtype_mismatch(): tm.assert_frame_equal(left, right, check_dtype=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_frame_equal_ignore_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = DataFrame({"a": [1, 2, 3]}, dtype="Int32") + tm.assert_frame_equal(left, right, check_dtype=False) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") +def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") - right = DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + right = DataFrame({"a": [1, 2, 3]}, dtype="int64") tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index ffc5e105d6f2f..f722f619bc456 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ 
b/pandas/tests/util/test_assert_series_equal.py @@ -290,7 +290,10 @@ def test_assert_series_equal_extension_dtype_mismatch(): \\[left\\]: Int64 \\[right\\]: int[32|64]""" - tm.assert_series_equal(left, right, check_dtype=False) + # TODO: this shouldn't raise (or should raise a better error message) + # https://github.com/pandas-dev/pandas/issues/56131 + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(left, right, check_dtype=False) with pytest.raises(AssertionError, match=msg): tm.assert_series_equal(left, right, check_dtype=True) @@ -362,11 +365,18 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s3, s1, check_exact=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_series_equal_ignore_extension_dtype_mismatch(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") - right = Series([1, 2, 3], dtype=right_dtype) + right = Series([1, 2, 3], dtype="Int32") + tm.assert_series_equal(left, right, check_dtype=False) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/56131") +def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = Series([1, 2, 3], dtype="Int64") + right = Series([1, 2, 3], dtype="int64") tm.assert_series_equal(left, right, check_dtype=False) @@ -437,3 +447,12 @@ def test_check_dtype_false_different_reso(dtype): with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(ser_s, ser_ms, check_dtype=False) + + +@pytest.mark.parametrize("dtype", ["Int64", "int64"]) +def test_large_unequal_ints(dtype): + # https://github.com/pandas-dev/pandas/issues/55882 + left = Series([1577840521123000], dtype=dtype) + right = Series([1577840521123543], dtype=dtype) + with pytest.raises(AssertionError, match="Series are different"): + tm.assert_series_equal(left, right) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 00dc184a0ac4d..5f1d905aa4a46 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -136,7 +136,14 @@ def test_multiindex_objects(): DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(), DataFrame(np.full((10, 4), np.nan)), - tm.makeMixedDataFrame(), + DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": pd.date_range("20130101", periods=5), + } + ), tm.makeTimeDataFrame(), tm.makeTimeSeries(), Series(tm.makePeriodIndex()), @@ -162,7 +169,14 @@ def test_hash_pandas_object(obj, index): Series([True, False, True]), DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(np.full((10, 4), np.nan)), - tm.makeMixedDataFrame(), + DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": pd.date_range("20130101", periods=5), + } + ), tm.makeTimeDataFrame(), tm.makeTimeSeries(), Series(tm.makePeriodIndex()), @@ -328,9 +342,9 @@ def test_alternate_encoding(index): @pytest.mark.parametrize("l_add", [0, 1]) def test_same_len_hash_collisions(l_exp, l_add): length = 2 ** (l_exp + 8) + l_add - s = tm.makeStringIndex(length).to_numpy() + idx = np.array([str(i) for i in range(length)], dtype=object) - result = hash_array(s, "utf8") + result = 
hash_array(idx, "utf8") assert not result[0] == result[1] diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 33858e10afd75..fe2da210c6fe9 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -70,7 +70,9 @@ def tests_skip_nuisance(step): def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) - with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"): + with pytest.raises( + DataError, match="Cannot aggregate non-numeric type: object|string" + ): # GH#42738, enforced in 2.0 r.sum() diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 4fd33d54ef846..400bf10817ab8 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1181,7 +1181,9 @@ def test_pairwise_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - expected = df.groupby("A").apply(lambda x: getattr(x.ewm(com=1.0), method)()) + expected = df.groupby("A")[["B"]].apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) tm.assert_frame_equal(result, expected) def test_times(self, times_frame): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 5bde4c21cfab5..26f1311e950ef 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -51,6 +51,7 @@ "_chained_assignment_msg", "_chained_assignment_method_msg", "_chained_assignment_warning_method_msg", + "_check_cacher", "_version_meson", # The numba extensions need this to mock the iloc object "_iLocIndexer",