From 6dc4a6598172d75f7dfd383af75b1c9efb0f1c11 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 18 Sep 2023 09:00:44 -1000
Subject: [PATCH 001/131] STY: Fix precommit failure (#55186)

---
 pandas/_libs/tslibs/timedeltas.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 2178e972b4d08..2f6fa35cae070 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1942,7 +1942,7 @@ class Timedelta(_Timedelta):
         ----------
         freq : str
             Frequency string indicating the rounding resolution.
-            It uses the same units as class contructor :class:`~pandas.Timedelta`.
+            It uses the same units as class constructor :class:`~pandas.Timedelta`.
 
         Returns
         -------
@@ -1970,7 +1970,7 @@ class Timedelta(_Timedelta):
         ----------
         freq : str
             Frequency string indicating the flooring resolution.
-            It uses the same units as class contructor :class:`~pandas.Timedelta`.
+            It uses the same units as class constructor :class:`~pandas.Timedelta`.
 
         Examples
         --------
@@ -1990,7 +1990,7 @@ class Timedelta(_Timedelta):
         ----------
         freq : str
             Frequency string indicating the ceiling resolution.
-            It uses the same units as class contructor :class:`~pandas.Timedelta`.
+            It uses the same units as class constructor :class:`~pandas.Timedelta`.
 
         Examples
         --------

From 025e6ae2e9de9036c0b64f9f6571f00888b9f403 Mon Sep 17 00:00:00 2001
From: Rajat Subhra Mukherjee
Date: Tue, 19 Sep 2023 01:12:38 +0530
Subject: [PATCH 002/131] DOC: added deprecation message in docs for DataFrame.rolling (#55177)

* added deprecation message in docs

* requested changes

---
 pandas/core/window/rolling.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index becbba703f92c..72e94d049a9de 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -937,6 +937,11 @@ class Window(BaseWindow):
 
         For `Series` this parameter is unused and defaults to 0.
 
+        .. deprecated:: 2.1.0
+
+            The axis keyword is deprecated. For ``axis=1``,
+            transpose the DataFrame first instead.
+
     closed : str, default None
         If ``'right'``, the first point in the window is excluded
         from calculations.

From 68268458ae3091875841b750c9763e64de835e2f Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Mon, 18 Sep 2023 22:19:01 +0200
Subject: [PATCH 003/131] DOC: correct an example in whatsnew v0.14.0.rst (#55182)

---
 doc/source/whatsnew/v0.14.0.rst | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index 9c537b3a48c74..33c3de577c063 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -349,9 +349,15 @@ More consistent behavior for some groupby methods:
 
 - groupby head and tail respect column selection:
 
-  .. ipython:: python
+  .. code-block:: ipython
+
+     In [19]: g[['B']].head(1)
+     Out[19]:
+        B
+     0  2
+     2  6
 
-     g[['B']].head(1)
+     [2 rows x 1 columns]
 
 - groupby ``nth`` now reduces by default; filtering can be achieved by passing ``as_index=False``. With an optional ``dropna`` argument to ignore NaN. See :ref:`the docs <groupby.nth>`.
 
From 6b3e66b53d10e19e83b51f14a66a335d5ff3394b Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Mon, 18 Sep 2023 16:37:33 -0500 Subject: [PATCH 004/131] ENH: add `ExtensionArray._explode` method; adjust pyarrow extension for use of new interface (#54834) * add ExtensionArray._explode method; adjust pyarrow extension for use * black * add to whatsnew 2.1.0 * pre-commit fix * add _explode to docs * Update pandas/core/arrays/arrow/array.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * switch whatsnew files * adjust docstring * fix docstring --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/reference/extensions.rst | 1 + doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 4 +++ pandas/core/arrays/base.py | 36 +++++++++++++++++++++ pandas/core/series.py | 7 ++-- pandas/tests/series/methods/test_explode.py | 10 ++++++ 6 files changed, 54 insertions(+), 5 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index e177e2b1d87d5..83f830bb11198 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -34,6 +34,7 @@ objects. api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._explode api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 55a3419e95703..0fc4afc95a2ce 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -73,6 +73,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a329c37c77449..e67b7035822cc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1609,6 +1609,10 @@ def _explode(self): """ See Series.explode.__doc__. """ + # child class explode method supports only list types; return + # default implementation for non list types. + if not pa.types.is_list(self.dtype.pyarrow_dtype): + return super()._explode() values = self counts = pa.compute.list_value_length(values._pa_array) counts = counts.fill_null(1).to_numpy() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f3bb7323c7d5f..933944dbd4632 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -142,6 +142,7 @@ class ExtensionArray: view _accumulate _concat_same_type + _explode _formatter _from_factorized _from_sequence @@ -1924,6 +1925,41 @@ def _hash_pandas_object( values, encoding=encoding, hash_key=hash_key, categorize=categorize ) + def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]: + """ + Transform each element of list-like to a row. + + For arrays that do not contain list-like elements the default + implementation of this method just returns a copy and an array + of ones (unchanged index). + + Returns + ------- + ExtensionArray + Array with the exploded values. + np.ndarray[uint64] + The original lengths of each list-like for determining the + resulting index. 
+
+        See Also
+        --------
+        Series.explode : The method on the ``Series`` object that this
+            extension array method is meant to support.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> a = pd.array([[1, 2, 3], [4], [5, 6]],
+        ...              dtype=pd.ArrowDtype(pa.list_(pa.int64())))
+        >>> a._explode()
+        (<ArrowExtensionArray>
+        [1, 2, 3, 4, 5, 6]
+        Length: 6, dtype: int64[pyarrow], array([3, 1, 2], dtype=int32))
+        """
+        values = self.copy()
+        counts = np.ones(shape=(len(self),), dtype=np.uint64)
+        return values, counts
+
     def tolist(self) -> list:
         """
         Return a list of the values.
diff --git a/pandas/core/series.py b/pandas/core/series.py
index e0e27581ef7e2..78ec1554198df 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -76,10 +76,7 @@
     pandas_dtype,
     validate_all_hashable,
 )
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    ExtensionDtype,
-)
+from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame
 from pandas.core.dtypes.inference import is_hashable
 from pandas.core.dtypes.missing import (
@@ -4390,7 +4387,7 @@ def explode(self, ignore_index: bool = False) -> Series:
         3    4
         dtype: object
         """
-        if isinstance(self.dtype, ArrowDtype) and self.dtype.type == list:
+        if isinstance(self.dtype, ExtensionDtype):
             values, counts = self._values._explode()
         elif len(self) and is_object_dtype(self.dtype):
             values, counts = reshape.explode(np.asarray(self._values))
diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py
index c8a9eb6f89fde..5a0188585ef30 100644
--- a/pandas/tests/series/methods/test_explode.py
+++ b/pandas/tests/series/methods/test_explode.py
@@ -163,3 +163,13 @@ def test_explode_pyarrow_list_type(ignore_index):
         dtype=pd.ArrowDtype(pa.int64()),
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("ignore_index", [True, False])
+def test_explode_pyarrow_non_list_type(ignore_index):
+    pa = pytest.importorskip("pyarrow")
+    data = [1, 2, 3]
+    ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64()))
+    result = ser.explode(ignore_index=ignore_index)
+    expected = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2])
+    tm.assert_series_equal(result, expected)

From ebca6dfb8f159ddc00081a463a6d9eda0fc96d8c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 18 Sep 2023 12:02:54 -1000
Subject: [PATCH 005/131] CI: Fix no BLAS error in 32 bit build (#55185)

---
 .github/workflows/unit-tests.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 98c6226b14075..139cfcde95b2c 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -230,11 +230,13 @@ jobs:
           git -c user.email="you@example.com" merge --no-commit my_ref_name
         fi
     - name: Build environment and Run Tests
+      # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
      run: |
        /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
        . ~/virtualenvs/pandas-dev/bin/activate
        python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
-       python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+       python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
+       python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
        python -m pip install --no-cache-dir --no-build-isolation -e .
        python -m pip list --no-cache-dir
        export PANDAS_CI=1

From df7c0b772c775a65ae5fa2dae3ac192060ea2fae Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Mon, 18 Sep 2023 19:47:10 -0400
Subject: [PATCH 006/131] Replace classes with pytest in test_sql (#55074)

* initial working test

* passing mixin class removal

* converted non-sqlalchemy tests

* large refactor

* sqlite class conversion

* checkpoint

* sqlitefallback conversion

* fixup tests

* no more test classes

* factory func

* most code cleanups

* removed breakpoint; passing tests

* fixes

* fix when missing SQLAlchemy

* more fixups when no SQLAlchemy

* fixups

* xfail -> skip

* sqlite fixture use transaction for cleanup

* verbose test for hangs

* try skipping sqlite-sqlalchemy-memory on rollback test

* sqlite sqlalchemy memory cleanup

* revert verbose logging in tests

* mark all db tests

* try single_cpu

* skip more engine tests that can hang

* try no pandasSQL without transaction

* more skip

* try verbose

* transaction skips

* remove verbose CI

* CI verbose

* no more hanging

* reverted CI files

* type ignore

* cleanup skips

* remove marks

* mark fixtures

* mark postgres fixtures

---
 pandas/tests/io/test_sql.py | 3772 +++++++++++++++++------------------
 1 file changed, 1880 insertions(+), 1892 deletions(-)

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 1abe0ad55a864..f015c9efe7122 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -397,6 +397,54 @@ def test_frame3():
     return DataFrame(data, columns=columns)
 
 
+def get_all_views(conn):
+    if isinstance(conn, sqlite3.Connection):
+        c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'")
+        return [view[0] for view in c.fetchall()]
+    else:
+        from sqlalchemy import inspect
+
+        return inspect(conn).get_view_names()
+
+
+def get_all_tables(conn):
+    if isinstance(conn, sqlite3.Connection):
+        c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
+        return [table[0] for table in c.fetchall()]
+    else:
+        from sqlalchemy import inspect
+
+        return inspect(conn).get_table_names()
+
+
+def drop_table(
+    table_name: str,
+    conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection,
+):
+    if isinstance(conn, sqlite3.Connection):
+        conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}")
+        conn.commit()
+    else:
+        with conn.begin() as con:
+            sql.SQLDatabase(con).drop_table(table_name)
+
+
+def drop_view(
+    view_name: str,
+    conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection,
+):
+    if isinstance(conn, sqlite3.Connection):
+        conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}")
+        conn.commit()
+    else:
+        quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier(
+            view_name
+        )
+        stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}")
+        with conn.begin() as con:
+            con.execute(stmt)  # type: ignore[union-attr]
+
+
@pytest.fixture def mysql_pymysql_engine(iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") @@ -416,10 +464,10 @@ def mysql_pymysql_engine(iris_path, types_data): if not insp.has_table("iris_view"): create_and_load_iris_view(engine) yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -445,10 +493,10 @@ def postgresql_psycopg2_engine(iris_path, types_data): if not insp.has_table("iris_view"): create_and_load_iris_view(engine) yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -481,6 +529,10 @@ def sqlite_engine(sqlite_str, iris_path, types_data): create_and_load_types(engine, types_data, "sqlite") yield engine + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -528,32 +580,60 @@ def sqlite_buildin(): @pytest.fixture -def sqlite_buildin_iris(sqlite_buildin, iris_path): +def sqlite_sqlalchemy_memory_engine(iris_path, types_data): + sqlalchemy = pytest.importorskip("sqlalchemy") + engine = sqlalchemy.create_engine("sqlite:///:memory:") + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") + + yield engine + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) + + +@pytest.fixture +def sqlite_buildin_iris(sqlite_buildin, iris_path, types_data): create_and_load_iris_sqlite3(sqlite_buildin, iris_path) + + for entry in types_data: + entry.pop("DateColWithTz") + types_data = [tuple(entry.values()) for entry in types_data] + + create_and_load_types_sqlite3(sqlite_buildin, types_data) return sqlite_buildin mysql_connectable = [ - "mysql_pymysql_engine", - "mysql_pymysql_conn", + pytest.param("mysql_pymysql_engine", marks=pytest.mark.db), + pytest.param("mysql_pymysql_conn", marks=pytest.mark.db), ] postgresql_connectable = [ - "postgresql_psycopg2_engine", - "postgresql_psycopg2_conn", + pytest.param("postgresql_psycopg2_engine", marks=pytest.mark.db), + pytest.param("postgresql_psycopg2_conn", marks=pytest.mark.db), ] sqlite_connectable = [ - "sqlite_engine", - "sqlite_conn", - "sqlite_str", + pytest.param("sqlite_engine", marks=pytest.mark.db), + pytest.param("sqlite_conn", marks=pytest.mark.db), + pytest.param("sqlite_str", marks=pytest.mark.db), ] sqlite_iris_connectable = [ - "sqlite_iris_engine", - "sqlite_iris_conn", - "sqlite_iris_str", + pytest.param("sqlite_iris_engine", marks=pytest.mark.db), + pytest.param("sqlite_iris_conn", marks=pytest.mark.db), + pytest.param("sqlite_iris_str", marks=pytest.mark.db), ] sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable @@ -562,12 +642,17 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): mysql_connectable + postgresql_connectable + sqlite_iris_connectable ) 
-all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] +all_connectable = sqlalchemy_connectable + [ + "sqlite_buildin", + "sqlite_sqlalchemy_memory_engine", +] -all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] +all_connectable_iris = sqlalchemy_connectable_iris + [ + "sqlite_buildin_iris", + "sqlite_sqlalchemy_memory_engine", +] -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql(conn, test_frame1, request): # GH 51086 if conn is sqlite_engine @@ -575,7 +660,14 @@ def test_dataframe_to_sql(conn, test_frame1, request): test_frame1.to_sql(name="test", con=conn, if_exists="append", index=False) -@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_dataframe_to_sql_empty(conn, test_frame1, request): + # GH 51086 if conn is sqlite_engine + conn = request.getfixturevalue(conn) + empty_df = test_frame1.iloc[:0] + empty_df.to_sql(name="test", con=conn, if_exists="append", index=False) + + @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): # GH 52046 @@ -596,7 +688,6 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): # GH 52046 @@ -612,7 +703,6 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): @@ -623,7 +713,6 @@ def test_to_sql(conn, method, test_frame1, request): assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @@ -635,7 +724,6 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): assert count_rows(conn, "test_frame") == num_row_coef * len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -648,7 +736,6 @@ def test_to_sql_exist_fail(conn, test_frame1, request): pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query(conn, request): conn = request.getfixturevalue(conn) @@ -661,7 +748,6 @@ def test_read_iris_query(conn, request): assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): conn = request.getfixturevalue(conn) @@ -674,7 +760,6 @@ def test_read_iris_query_chunksize(conn, request): assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): conn = request.getfixturevalue(conn) @@ -696,7 +781,6 @@ def test_read_iris_query_expression_with_parameter(conn, request): autoload_con.dispose() -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): for db, query in 
sql_strings["read_parameters"].items(): @@ -709,7 +793,6 @@ def test_read_iris_query_string_with_parameter(conn, request, sql_strings): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table(conn, request): # GH 51015 if conn = sqlite_iris_str @@ -720,7 +803,6 @@ def test_read_iris_table(conn, request): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table_chunksize(conn, request): conn = request.getfixturevalue(conn) @@ -730,7 +812,6 @@ def test_read_iris_table_chunksize(conn, request): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable) def test_to_sql_callable(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -749,26 +830,38 @@ def sample(pd_table, conn, keys, data_iter): assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db -@pytest.mark.parametrize("conn", mysql_connectable) +@pytest.mark.parametrize("conn", all_connectable_iris) def test_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) + ) + conn = request.getfixturevalue(conn) df = sql.read_sql_table("types", conn) assert issubclass(df.FloatCol.dtype.type, np.floating) assert issubclass(df.IntCol.dtype.type, np.integer) - # MySQL has no real BOOL type (it's an alias for TINYINT) - assert issubclass(df.BoolCol.dtype.type, np.integer) + # MySQL/sqlite has no real BOOL type + if "postgresql" in conn_name: + assert issubclass(df.BoolCol.dtype.type, np.bool_) + else: + assert issubclass(df.BoolCol.dtype.type, np.integer) # Int column with NA values stays as float assert issubclass(df.IntColWithNull.dtype.type, np.floating) # Bool column with NA = int column with NA values => becomes float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + if "postgresql" in conn_name: + assert issubclass(df.BoolColWithNull.dtype.type, object) + else: + assert issubclass(df.BoolColWithNull.dtype.type, np.floating) -@pytest.mark.db @pytest.mark.parametrize("conn", mysql_connectable) def test_read_procedure(conn, request): conn = request.getfixturevalue(conn) @@ -806,7 +899,6 @@ def test_read_procedure(conn, request): tm.assert_frame_equal(df, res2) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) @pytest.mark.parametrize("expected_count", [2, "Success!"]) def test_copy_from_callable_insertion_method(conn, expected_count, request): @@ -846,7 +938,6 @@ def psql_insert_copy(table, conn, keys, data_iter): tm.assert_frame_equal(result, expected) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_insertion_method_on_conflict_do_nothing(conn, request): # GH 15988: Example in to_sql docstring @@ -905,7 +996,6 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring @@ -959,7 +1049,6 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_read_view_postgres(conn, request): # GH 52969 @@ -1041,217 +1130,65 @@ def 
test_execute_deprecated(sqlite_buildin_iris): sql.execute("select * from iris", sqlite_buildin_iris) -class MixInBase: - def teardown_method(self): - # if setup fails, there may not be a connection to close. - if hasattr(self, "conn"): - self.conn.close() - # use a fresh connection to ensure we can drop all tables. - try: - conn = self.connect() - except (sqlalchemy.exc.OperationalError, sqlite3.OperationalError): - pass - else: - with conn: - for view in self._get_all_views(conn): - self.drop_view(view, conn) - for tbl in self._get_all_tables(conn): - self.drop_table(tbl, conn) - - -class SQLiteMixIn(MixInBase): - def connect(self): - return sqlite3.connect(":memory:") - - def drop_table(self, table_name, conn): - conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() - - def _get_all_tables(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") - return [table[0] for table in c.fetchall()] - - def drop_view(self, view_name, conn): - conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") - conn.commit() - - def _get_all_views(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") - return [view[0] for view in c.fetchall()] - - -class SQLAlchemyMixIn(MixInBase): - @classmethod - def teardown_class(cls): - cls.engine.dispose() - - def connect(self): - return self.engine.connect() - - def drop_table(self, table_name, conn): - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - sql.SQLDatabase(conn).drop_table(table_name) - - def _get_all_tables(self, conn): - from sqlalchemy import inspect - - return inspect(conn).get_table_names() - - def drop_view(self, view_name, conn): - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - conn.exec_driver_sql(f"DROP VIEW IF EXISTS {quoted_view}") - - def _get_all_views(self, conn): - from sqlalchemy import inspect - - return inspect(conn).get_view_names() - - -class PandasSQLTest: - """ - Base class with common private methods for SQLAlchemy and fallback cases. 
- - """ - - def load_iris_data(self, iris_path): - self.drop_view("iris_view", self.conn) - self.drop_table("iris", self.conn) - if isinstance(self.conn, sqlite3.Connection): - create_and_load_iris_sqlite3(self.conn, iris_path) - else: - create_and_load_iris(self.conn, iris_path, self.flavor) - - def load_types_data(self, types_data): - if self.flavor != "postgresql": - for entry in types_data: - entry.pop("DateColWithTz") - if isinstance(self.conn, sqlite3.Connection): - types_data = [tuple(entry.values()) for entry in types_data] - create_and_load_types_sqlite3(self.conn, types_data) - else: - create_and_load_types(self.conn, types_data, self.flavor) - - def _read_sql_iris_parameter(self, sql_strings): - query = sql_strings["read_parameters"][self.flavor] - params = ("Iris-setosa", 5.1) - with self.pandasSQL.run_transaction(): - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_named_parameter(self, sql_strings): - query = sql_strings["read_named_parameters"][self.flavor] - params = {"name": "Iris-setosa", "length": 5.1} - with self.pandasSQL.run_transaction(): - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_no_parameter_with_percent(self, sql_strings): - query = sql_strings["read_no_parameters_with_percent"][self.flavor] - with self.pandasSQL.run_transaction(): - iris_frame = self.pandasSQL.read_query(query, params=None) - check_iris_frame(iris_frame) - - def _to_sql_empty(self, test_frame1): - self.drop_table("test_frame1", self.conn) - assert self.pandasSQL.to_sql(test_frame1.iloc[:0], "test_frame1") == 0 - - def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): - """`to_sql` with the `engine` param""" - # mostly copied from this class's `_to_sql()` method - self.drop_table("test_frame1", self.conn) - - assert ( - self.pandasSQL.to_sql( - test_frame1, "test_frame1", engine=engine, **engine_kwargs - ) - == 4 - ) - assert self.pandasSQL.has_table("test_frame1") - - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame1") - assert num_rows == num_entries +@pytest.fixture +def flavor(): + def func(conn_name): + if "postgresql" in conn_name: + return "postgresql" + elif "sqlite" in conn_name: + return "sqlite" + elif "mysql" in conn_name: + return "mysql" - # Nuke table - self.drop_table("test_frame1", self.conn) + raise ValueError(f"unsupported connection: {conn_name}") - def _roundtrip(self, test_frame1): - self.drop_table("test_frame_roundtrip", self.conn) - assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - with self.pandasSQL.run_transaction(): - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + return func - result.set_index("level_0", inplace=True) - # result.index.astype(int) - result.index.name = None +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_parameters"][flavor(conn_name)] + params = ("Iris-setosa", 5.1) + pandasSQL = pandasSQL_builder(conn) - tm.assert_frame_equal(result, test_frame1) + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) - def _execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - iris_results = self.pandasSQL.execute("SELECT * FROM iris") - row = 
iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def _to_sql_save_index(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] - ) - assert self.pandasSQL.to_sql(df, "test_to_sql_saves_index") == 2 - ix_cols = self._get_index_columns("test_to_sql_saves_index") - assert ix_cols == [["A"]] - - def _transaction_test(self): - with self.pandasSQL.run_transaction() as trans: - stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(self.pandasSQL, SQLiteDatabase): - trans.execute(stmt) - else: - from sqlalchemy import text +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_named_parameter(conn, request, sql_strings, flavor): + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_named_parameters"][flavor(conn_name)] + params = {"name": "Iris-setosa", "length": 5.1} + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) - stmt = text(stmt) - trans.execute(stmt) - class DummyException(Exception): - pass +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, flavor): + if "mysql" in conn or "postgresql" in conn: + request.node.add_marker(pytest.mark.xfail(reason="broken test")) - # Make sure when transaction is rolled back, no rows get inserted - ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" - if isinstance(self.pandasSQL, SQLDatabase): - from sqlalchemy import text + conn_name = conn + conn = request.getfixturevalue(conn) - ins_sql = text(ins_sql) - try: - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - raise DummyException("error") - except DummyException: - # ignore raised exception - pass - with self.pandasSQL.run_transaction(): - res = self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res) == 0 - - # Make sure when transaction is committed, rows do get inserted - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res2) == 1 + query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)] + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=None) + check_iris_frame(iris_frame) # ----------------------------------------------------------------------------- # -- Testing the public API -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_read_sql_view(conn, request): conn = request.getfixturevalue(conn) @@ -1259,7 +1196,6 @@ def test_api_read_sql_view(conn, request): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_read_sql_with_chunksize_no_result(conn, request): conn = request.getfixturevalue(conn) @@ -1269,7 +1205,6 @@ def test_api_read_sql_with_chunksize_no_result(conn, request): tm.assert_frame_equal(concat(with_batch), without_batch) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1281,7 +1216,6 @@ def test_api_to_sql(conn, request, test_frame1): assert sql.has_table("test_frame1", conn) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_fail(conn, request, test_frame1): 
conn = request.getfixturevalue(conn) @@ -1297,7 +1231,6 @@ def test_api_to_sql_fail(conn, request, test_frame1): sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_replace(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1316,7 +1249,6 @@ def test_api_to_sql_replace(conn, request, test_frame1): assert num_rows == num_entries -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_append(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1336,7 +1268,6 @@ def test_api_to_sql_append(conn, request, test_frame1): assert num_rows == num_entries -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_type_mapping(conn, request, test_frame3): conn = request.getfixturevalue(conn) @@ -1350,7 +1281,6 @@ def test_api_to_sql_type_mapping(conn, request, test_frame3): tm.assert_frame_equal(test_frame3, result) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_series(conn, request): conn = request.getfixturevalue(conn) @@ -1364,7 +1294,6 @@ def test_api_to_sql_series(conn, request): tm.assert_frame_equal(s.to_frame(), s2) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1383,7 +1312,6 @@ def test_api_roundtrip(conn, request, test_frame1): tm.assert_frame_equal(result, test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip_chunksize(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1402,7 +1330,6 @@ def test_api_roundtrip_chunksize(conn, request, test_frame1): tm.assert_frame_equal(result, test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_execute_sql(conn, request): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done @@ -1413,13 +1340,9 @@ def test_api_execute_sql(conn, request): tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) -@pytest.mark.db -@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("conn", all_connectable_iris) def test_api_date_parsing(conn, request): conn_name = conn - if conn_name in {"sqlite_buildin", "sqlite_str"}: - pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - conn = request.getfixturevalue(conn) # Test date parsing in read_sql # No Parsing @@ -1473,8 +1396,7 @@ def test_api_date_parsing(conn, request): ] -@pytest.mark.db -@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("conn", all_connectable_iris) @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) @pytest.mark.parametrize( "read_sql, text, mode", @@ -1493,10 +1415,11 @@ def test_api_custom_dateparsing_error( conn, request, read_sql, text, mode, error, types_data_frame ): conn_name = conn - if conn_name in {"sqlite_buildin", "sqlite_str"}: - pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - conn = request.getfixturevalue(conn) + if text == "types" and conn_name == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail(reason="failing combination of arguments") + ) expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) @@ -1516,14 +1439,9 @@ def test_api_custom_dateparsing_error( tm.assert_frame_equal(result, expected) -@pytest.mark.db -@pytest.mark.parametrize("conn", all_connectable) 
+@pytest.mark.parametrize("conn", all_connectable_iris) def test_api_date_and_index(conn, request): # Test case where same column appears in parse_date and index_col - conn_name = conn - if conn_name in {"sqlite_buildin", "sqlite_str"}: - pytest.skip("types tables not created in sqlite_buildin or sqlite_str fixture") - conn = request.getfixturevalue(conn) df = sql.read_sql_query( "SELECT * FROM types", @@ -1536,7 +1454,6 @@ def test_api_date_and_index(conn, request): assert issubclass(df.IntDateCol.dtype.type, np.datetime64) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_timedelta(conn, request): # see #6921 @@ -1553,7 +1470,6 @@ def test_api_timedelta(conn, request): tm.assert_series_equal(result["foo"], df["foo"].view("int64")) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_complex_raises(conn, request): conn = request.getfixturevalue(conn) @@ -1563,7 +1479,6 @@ def test_api_complex_raises(conn, request): assert df.to_sql("test_complex", con=conn) is None -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize( "index_name,index_label,expected", @@ -1596,13 +1511,14 @@ def test_api_to_sql_index_label(conn, request, index_name, index_label, expected assert frame.columns[0] == expected -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_index_label_multiindex(conn, request): conn_name = conn if "mysql" in conn_name: request.node.add_marker( - pytest.mark.xfail(reason="MySQL can fail using TEXT without length as key") + pytest.mark.xfail( + reason="MySQL can fail using TEXT without length as key", strict=False + ) ) conn = request.getfixturevalue(conn) @@ -1665,7 +1581,6 @@ def test_api_to_sql_index_label_multiindex(conn, request): ) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_multiindex_roundtrip(conn, request): conn = request.getfixturevalue(conn) @@ -1686,7 +1601,6 @@ def test_api_multiindex_roundtrip(conn, request): tm.assert_frame_equal(df, result, check_index_type=True) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize( "dtype", @@ -1719,7 +1633,6 @@ def test_api_dtype_argument(conn, request, dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_integer_col_names(conn, request): conn = request.getfixturevalue(conn) @@ -1727,7 +1640,6 @@ def test_api_integer_col_names(conn, request): sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema(conn, request, test_frame1): conn = request.getfixturevalue(conn) @@ -1735,7 +1647,6 @@ def test_api_get_schema(conn, request, test_frame1): assert "CREATE" in create_sql -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_with_schema(conn, request, test_frame1): # GH28486 @@ -1744,7 +1655,6 @@ def test_api_get_schema_with_schema(conn, request, test_frame1): assert "CREATE TABLE pypi." 
in create_sql -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_dtypes(conn, request): conn_name = conn @@ -1762,7 +1672,6 @@ def test_api_get_schema_dtypes(conn, request): assert "INTEGER" in create_sql -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_keys(conn, request, test_frame1): conn_name = conn @@ -1785,7 +1694,6 @@ def test_api_get_schema_keys(conn, request, test_frame1): assert constraint_sentence in create_sql -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_chunksize_read(conn, request): conn_name = conn @@ -1831,7 +1739,6 @@ def test_api_chunksize_read(conn, request): tm.assert_frame_equal(res1, res3) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): # GH8624 @@ -1856,7 +1763,6 @@ def test_api_categorical(conn, request): tm.assert_frame_equal(res, df) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_unicode_column_name(conn, request): # GH 11431 @@ -1869,7 +1775,6 @@ def test_api_unicode_column_name(conn, request): df.to_sql(name="test_unicode", con=conn, index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_escaped_table_name(conn, request): # GH 13206 @@ -1891,7 +1796,6 @@ def test_api_escaped_table_name(conn, request): tm.assert_frame_equal(res, df) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 @@ -1911,1102 +1815,1352 @@ def test_api_read_sql_duplicate_columns(conn, request): tm.assert_frame_equal(result, expected) -class _TestSQLApi(PandasSQLTest): - """ - Base class to test the public API. - - From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode - (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific - tests for the different sql flavours are included in `_TestSQLAlchemy`. - - Notes: - flavor can always be passed even in SQLAlchemy mode, - should be correctly ignored. 
+@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_columns(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) - we don't use drop_table because that isn't part of the public api + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) - """ + cols = ["A", "B"] - flavor = "sqlite" - mode: str + result = sql.read_sql_table("test_frame", conn, columns=cols) + assert result.columns.tolist() == cols - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.load_test_data_and_sql() - def load_test_data_and_sql(self): - create_and_load_iris_view(self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_index_col(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") -class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): - """ - Test the public API as it would be used directly + result = sql.read_sql_table("test_frame", conn, index_col="index") + assert result.index.names == ["index"] - Tests for `read_sql_table` are included here, as this is specific for the - sqlalchemy mode. + result = sql.read_sql_table("test_frame", conn, index_col=["A", "B"]) + assert result.index.names == ["A", "B"] - """ + result = sql.read_sql_table( + "test_frame", conn, index_col=["A", "B"], columns=["C", "D"] + ) + assert result.index.names == ["A", "B"] + assert result.columns.tolist() == ["C", "D"] - flavor = "sqlite" - mode = "sqlalchemy" - @classmethod - def setup_class(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_delegate(conn, request): + if conn == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) + ) - def test_read_table_columns(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) + conn = request.getfixturevalue(conn) + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - cols = ["A", "B"] - result = sql.read_sql_table("test_frame", self.conn, columns=cols) - assert result.columns.tolist() == cols + iris_frame1 = sql.read_sql_table("iris", conn) + iris_frame2 = sql.read_sql("iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - def test_read_table_index_col(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) - result = sql.read_sql_table("test_frame", self.conn, index_col="index") - assert result.index.names == ["index"] +def test_not_reflect_all_tables(sqlite_conn): + conn = sqlite_conn + from sqlalchemy import text + from sqlalchemy.engine import Engine - result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) - assert result.index.names == ["A", "B"] + # create invalid table + query_list = [ + text("CREATE TABLE invalid (x 
INTEGER, y UNKNOWN);"), + text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), + ] - result = sql.read_sql_table( - "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] - ) - assert result.index.names == ["A", "B"] - assert result.columns.tolist() == ["C", "D"] + for query in query_list: + if isinstance(conn, Engine): + with conn.connect() as conn: + with conn.begin(): + conn.execute(query) + else: + with conn.begin(): + conn.execute(query) - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) + with tm.assert_produces_warning(None): + sql.read_sql_table("other_table", conn) + sql.read_sql_query("SELECT * FROM other_table", conn) - iris_frame1 = sql.read_sql_table("iris", self.conn) - iris_frame2 = sql.read_sql("iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) - def test_not_reflect_all_tables(self): - from sqlalchemy import text - from sqlalchemy.engine import Engine +@pytest.mark.parametrize("conn", all_connectable) +def test_warning_case_insensitive_table_name(conn, request, test_frame1): + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Does not raise warning")) - # create invalid table - query_list = [ - text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"), - text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), - ] - for query in query_list: - if isinstance(self.conn, Engine): - with self.conn.connect() as conn: - with conn.begin(): - conn.execute(query) - else: - with self.conn.begin(): - self.conn.execute(query) + conn = request.getfixturevalue(conn) + # see gh-7815 + with tm.assert_produces_warning( + UserWarning, + match=( + r"The provided table name 'TABLE1' is not found exactly as such in " + r"the database after writing the table, possibly due to case " + r"sensitivity issues. Consider using lower case table names." + ), + ): + sql.SQLDatabase(conn).check_case_sensitive("TABLE1", "") - with tm.assert_produces_warning(None): - sql.read_sql_table("other_table", self.conn) - sql.read_sql_query("SELECT * FROM other_table", self.conn) + # Test that the warning is certainly NOT triggered in a normal case. + with tm.assert_produces_warning(None): + test_frame1.to_sql(name="CaseSensitive", con=conn) - def test_warning_case_insensitive_table_name(self, test_frame1): - # see gh-7815 - with tm.assert_produces_warning( - UserWarning, - match=( - r"The provided table name 'TABLE1' is not found exactly as such in " - r"the database after writing the table, possibly due to case " - r"sensitivity issues. Consider using lower case table names." - ), - ): - sql.SQLDatabase(self.conn).check_case_sensitive("TABLE1", "") - # Test that the warning is certainly NOT triggered in a normal case. 
- with tm.assert_produces_warning(None): - test_frame1.to_sql(name="CaseSensitive", con=self.conn) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_sqlalchemy_type_mapping(conn, request): + conn = request.getfixturevalue(conn) + from sqlalchemy import TIMESTAMP - def _get_index_columns(self, tbl_name): - from sqlalchemy.engine import reflection + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + db = sql.SQLDatabase(conn) + table = sql.SQLTable("test_type", db, frame=df) + # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones + assert isinstance(table.table.c["time"].type, TIMESTAMP) - insp = reflection.Inspector.from_engine(self.conn) - ixs = insp.get_indexes("test_index_saved") - ixs = [i["column_names"] for i in ixs] - return ixs - def test_sqlalchemy_type_mapping(self): - from sqlalchemy import TIMESTAMP +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "integer, expected", + [ + ("int8", "SMALLINT"), + ("Int8", "SMALLINT"), + ("uint8", "SMALLINT"), + ("UInt8", "SMALLINT"), + ("int16", "SMALLINT"), + ("Int16", "SMALLINT"), + ("uint16", "INTEGER"), + ("UInt16", "INTEGER"), + ("int32", "INTEGER"), + ("Int32", "INTEGER"), + ("uint32", "BIGINT"), + ("UInt32", "BIGINT"), + ("int64", "BIGINT"), + ("Int64", "BIGINT"), + (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), + ], +) +def test_sqlalchemy_integer_mapping(conn, request, integer, expected): + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + conn = request.getfixturevalue(conn) + df = DataFrame([0, 1], columns=["a"], dtype=integer) + db = sql.SQLDatabase(conn) + table = sql.SQLTable("test_type", db, frame=df) - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLDatabase(self.conn) - table = sql.SQLTable("test_type", db, frame=df) - # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c["time"].type, TIMESTAMP) + result = str(table.table.c.a.type) + assert result == expected - @pytest.mark.parametrize( - "integer, expected", - [ - ("int8", "SMALLINT"), - ("Int8", "SMALLINT"), - ("uint8", "SMALLINT"), - ("UInt8", "SMALLINT"), - ("int16", "SMALLINT"), - ("Int16", "SMALLINT"), - ("uint16", "INTEGER"), - ("UInt16", "INTEGER"), - ("int32", "INTEGER"), - ("Int32", "INTEGER"), - ("uint32", "BIGINT"), - ("UInt32", "BIGINT"), - ("int64", "BIGINT"), - ("Int64", "BIGINT"), - (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), - ], - ) - def test_sqlalchemy_integer_mapping(self, integer, expected): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) - table = sql.SQLTable("test_type", db, frame=df) - - result = str(table.table.c.a.type) - assert result == expected - - @pytest.mark.parametrize("integer", ["uint64", "UInt64"]) - def test_sqlalchemy_integer_overload_mapping(self, integer): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) - with pytest.raises( - ValueError, match="Unsigned 64 bit integer datatype is not supported" - ): - sql.SQLTable("test_type", db, frame=df) - - def test_database_uri_string(self, test_frame1): - # 
Test read_sql and .to_sql method with a database URI (GH10654) - # db_uri = 'sqlite:///:memory:' # raises - # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near - # "iris": syntax error [SQL: 'iris'] - with tm.ensure_clean() as name: - db_uri = "sqlite:///" + name - table = "iris" - test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) - test_frame2 = sql.read_sql(table, db_uri) - test_frame3 = sql.read_sql_table(table, db_uri) - query = "SELECT * FROM iris" - test_frame4 = sql.read_sql_query(query, db_uri) - tm.assert_frame_equal(test_frame1, test_frame2) - tm.assert_frame_equal(test_frame1, test_frame3) - tm.assert_frame_equal(test_frame1, test_frame4) - - @td.skip_if_installed("pg8000") - def test_pg8000_sqlalchemy_passthrough_error(self): - # using driver that will not be installed on CI to trigger error - # in sqlalchemy.create_engine -> test passing of this error to user - db_uri = "postgresql+pg8000://user:pass@host/dbname" - with pytest.raises(ImportError, match="pg8000"): - sql.read_sql("select * from table", db_uri) - - def test_query_by_text_obj(self): - # WIP : GH10846 - from sqlalchemy import text +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize("integer", ["uint64", "UInt64"]) +def test_sqlalchemy_integer_overload_mapping(conn, request, integer): + conn = request.getfixturevalue(conn) + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + df = DataFrame([0, 1], columns=["a"], dtype=integer) + db = sql.SQLDatabase(conn) + with pytest.raises( + ValueError, match="Unsigned 64 bit integer datatype is not supported" + ): + sql.SQLTable("test_type", db, frame=df) + + +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +@pytest.mark.parametrize("conn", all_connectable) +def test_database_uri_string(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + # Test read_sql and .to_sql method with a database URI (GH10654) + # db_uri = 'sqlite:///:memory:' # raises + # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near + # "iris": syntax error [SQL: 'iris'] + with tm.ensure_clean() as name: + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) + test_frame2 = sql.read_sql(table, db_uri) + test_frame3 = sql.read_sql_table(table, db_uri) + query = "SELECT * FROM iris" + test_frame4 = sql.read_sql_query(query, db_uri) + tm.assert_frame_equal(test_frame1, test_frame2) + tm.assert_frame_equal(test_frame1, test_frame3) + tm.assert_frame_equal(test_frame1, test_frame4) + + +@td.skip_if_installed("pg8000") +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +@pytest.mark.parametrize("conn", all_connectable) +def test_pg8000_sqlalchemy_passthrough_error(conn, request): + conn = request.getfixturevalue(conn) + # using driver that will not be installed on CI to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with pytest.raises(ImportError, match="pg8000"): + sql.read_sql("select * from table", db_uri) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_text_obj(conn, request): + # WIP : GH10846 + conn_name = conn + conn = request.getfixturevalue(conn) + from sqlalchemy import text + + if "postgres" in conn_name: + name_text = text('select * from iris where "Name"=:name') + else: name_text = text("select * from iris where name=:name") 
- iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-versicolor"} - - def test_query_by_select_obj(self): - # WIP : GH10846 - from sqlalchemy import ( - bindparam, - select, - ) + iris_df = sql.read_sql(name_text, conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} - iris = iris_table_metadata(self.flavor) - name_select = select(iris).where(iris.c.Name == bindparam("name")) - iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-setosa"} - def test_column_with_percentage(self): - # GH 37157 - df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) - df.to_sql(name="test_column_percentage", con=self.conn, index=False) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_select_obj(conn, request): + conn = request.getfixturevalue(conn) + # WIP : GH10846 + from sqlalchemy import ( + bindparam, + select, + ) - res = sql.read_sql_table("test_column_percentage", self.conn) + iris = iris_table_metadata("postgres") + name_select = select(iris).where(iris.c.Name == bindparam("name")) + iris_df = sql.read_sql(name_select, conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} - tm.assert_frame_equal(res, df) +@pytest.mark.parametrize("conn", all_connectable) +def test_column_with_percentage(conn, request): + # GH 37157 + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) -class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): - """ - Test the public sqlite connection fallback API + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) + df.to_sql(name="test_column_percentage", con=conn, index=False) - """ + res = sql.read_sql_table("test_column_percentage", conn) - flavor = "sqlite" - mode = "fallback" + tm.assert_frame_equal(res, df) - def connect(self, database=":memory:"): - return sqlite3.connect(database) - def test_sql_open_close(self, test_frame3): - # Test if the IO in the database still work if the connection closed - # between the writing and reading (as in many real situations). +def test_sql_open_close(test_frame3): + # Test if the IO in the database still work if the connection closed + # between the writing and reading (as in many real situations). 
- with tm.ensure_clean() as name: - with closing(self.connect(name)) as conn: - assert ( - sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) - == 4 - ) + with tm.ensure_clean() as name: + with closing(sqlite3.connect(name)) as conn: + assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 - with closing(self.connect(name)) as conn: - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) + with closing(sqlite3.connect(name)) as conn: + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) - tm.assert_frame_equal(test_frame3, result) + tm.assert_frame_equal(test_frame3, result) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_string_import_error(self): - conn = "mysql://root@localhost/pandas" - msg = "Using URI string without sqlalchemy installed" - with pytest.raises(ImportError, match=msg): - sql.read_sql("SELECT * FROM iris", conn) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed( - self, - ): - class MockSqliteConnection: - def __init__(self, *args, **kwargs) -> None: - self.conn = sqlite3.Connection(*args, **kwargs) - - def __getattr__(self, name): - return getattr(self.conn, name) - - def close(self): - self.conn.close() - - with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): - sql.read_sql("SELECT 1", conn) - - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) - - msg = "Execution failed on sql 'iris': near \"iris\": syntax error" - with pytest.raises(sql.DatabaseError, match=msg): - sql.read_sql("iris", self.conn) - - def test_get_schema2(self, test_frame1): - # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(test_frame1, "test") - assert "CREATE" in create_sql - - def _get_sqlite_column_type(self, schema, column): - for col in schema.split("\n"): - if col.split()[0].strip('"') == column: - return col.split()[1] - raise ValueError(f"Column {column} not found") - - def test_sqlite_type_mapping(self): - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLiteDatabase(self.conn) - table = sql.SQLiteTable("test_type", db, frame=df) - schema = table.sql_schema() - assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" +@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +def test_con_string_import_error(): + conn = "mysql://root@localhost/pandas" + msg = "Using URI string without sqlalchemy installed" + with pytest.raises(ImportError, match=msg): + sql.read_sql("SELECT * FROM iris", conn) -# ----------------------------------------------------------------------------- -# -- Database flavor specific tests +@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed(): + class MockSqliteConnection: + def __init__(self, *args, **kwargs) -> None: + self.conn = sqlite3.Connection(*args, **kwargs) + def __getattr__(self, name): + return getattr(self.conn, name) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") 
-class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): - """ - Base class for testing the sqlalchemy backend. + def close(self): + self.conn.close() - Subclasses for specific database types are created below. Tests that - deviate for each flavor are overwritten there. + with contextlib.closing(MockSqliteConnection(":memory:")) as conn: + with tm.assert_produces_warning(UserWarning): + sql.read_sql("SELECT 1", conn) - """ - flavor: str +def test_sqlite_read_sql_delegate(sqlite_buildin_iris): + conn = sqlite_buildin_iris + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - @classmethod - def setup_class(cls): - cls.setup_driver() - cls.setup_engine() + msg = "Execution failed on sql 'iris': near \"iris\": syntax error" + with pytest.raises(sql.DatabaseError, match=msg): + sql.read_sql("iris", conn) - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - try: - self.conn = self.engine.connect() - self.pandasSQL = sql.SQLDatabase(self.conn) - except sqlalchemy.exc.OperationalError: - pytest.skip(f"Can't connect to {self.flavor} server") - self.load_iris_data(iris_path) - self.load_types_data(types_data) - @classmethod - def setup_driver(cls): - raise NotImplementedError() +def test_get_schema2(test_frame1): + # without providing a connection object (available for backwards comp) + create_sql = sql.get_schema(test_frame1, "test") + assert "CREATE" in create_sql - @classmethod - def setup_engine(cls): - raise NotImplementedError() - def test_read_sql_parameter(self, sql_strings): - self._read_sql_iris_parameter(sql_strings) +def test_sqlite_type_mapping(sqlite_buildin): + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + conn = sqlite_buildin + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + db = sql.SQLiteDatabase(conn) + table = sql.SQLiteTable("test_type", db, frame=df) + schema = table.sql_schema() + for col in schema.split("\n"): + if col.split()[0].strip('"') == "time": + assert col.split()[1] == "TIMESTAMP" - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) +# ----------------------------------------------------------------------------- +# -- Database flavor specific tests - def test_create_table(self): - from sqlalchemy import inspect - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_create_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") + conn = request.getfixturevalue(conn) - # Cleanup - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("temp_frame") + from sqlalchemy import inspect - def test_drop_table(self): - from sqlalchemy import inspect + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 
2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - pandasSQL = sql.SQLDatabase(temp_conn) + insp = inspect(conn) + assert insp.has_table("temp_frame") + + # Cleanup + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("temp_frame") + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_drop_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import inspect + + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + pandasSQL = sql.SQLDatabase(conn) + with pandasSQL.run_transaction(): assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") + insp = inspect(conn) + assert insp.has_table("temp_frame") + with pandasSQL.run_transaction(): pandasSQL.drop_table("temp_frame") - try: - insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior - except AttributeError: - pass - assert not insp.has_table("temp_frame") + try: + insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior + except AttributeError: + pass + assert not insp.has_table("temp_frame") - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) - def test_execute_sql(self): - self._execute_sql() +@pytest.mark.parametrize("conn", all_connectable) +def test_roundtrip(conn, request, test_frame1): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - def test_read_table(self): - iris_frame = sql.read_sql_table("iris", con=self.conn) - check_iris_frame(iris_frame) + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 + result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - def test_read_table_columns(self): - iris_frame = sql.read_sql_table( - "iris", con=self.conn, columns=["SepalLength", "SepalLength"] - ) - tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + result.set_index("level_0", inplace=True) + # result.index.astype(int) - def test_read_table_absent_raises(self): - msg = "Table this_doesnt_exist not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("this_doesnt_exist", con=self.conn) + result.index.name = None - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) + tm.assert_frame_equal(result, test_frame1) - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) - assert issubclass(df.BoolCol.dtype.type, np.bool_) - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) - # Bool column with NA values becomes object - assert issubclass(df.BoolColWithNull.dtype.type, object) +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_execute_sql(conn, request): + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_results = pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def test_bigint(self): - # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={"i64": [2**62]}) - assert df.to_sql(name="test_bigint", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_bigint", self.conn) - - 
tm.assert_frame_equal(df, result) - - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) - - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - # MySQL SHOULD be converted. - assert issubclass(df.DateCol.dtype.type, np.datetime64) - - def test_datetime_with_timezone(self, request): - # edge case that converts postgresql datetime with time zone types - # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok - # but should be more natural, so coerce to datetime64[ns] for now - - def check(col): - # check that a column is either datetime64[ns] - # or datetime64[ns, UTC] - if lib.is_np_dtype(col.dtype, "M"): - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - assert col[0] == Timestamp("2000-01-01 08:00:00") - - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - assert col[1] == Timestamp("2000-06-01 07:00:00") - - elif isinstance(col.dtype, DatetimeTZDtype): - assert str(col.dt.tz) == "UTC" - - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - # GH 6415 - expected_data = [ - Timestamp("2000-01-01 08:00:00", tz="UTC"), - Timestamp("2000-06-01 07:00:00", tz="UTC"), - ] - expected = Series(expected_data, name=col.name) - tm.assert_series_equal(col, expected) - else: - raise AssertionError( - f"DateCol loaded with incorrect type -> {col.dtype}" - ) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table("iris", con=conn) + check_iris_frame(iris_frame) - # GH11216 - df = read_sql_query("select * from types", self.conn) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) - # this is parsed on Travis (linux), but not on macosx for some reason - # even with the same versions of psycopg2 & sqlalchemy, possibly a - # Postgresql server version difference - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table_columns(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table( + "iris", con=conn, columns=["SepalLength", "SepalLength"] + ) + tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) - df = read_sql_query( - "select * from types", self.conn, parse_dates=["DateColWithTz"] - ) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - check(df.DateColWithTz) - - df = concat( - list(read_sql_query("select * from types", self.conn, chunksize=1)), - ignore_index=True, - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - expected = sql.read_sql_table("types", self.conn) - col = expected.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) - - # xref #7139 - # this might or might not be converted depending on the postgres driver - df = sql.read_sql_table("types", self.conn) - check(df.DateColWithTz) - - def test_datetime_with_timezone_roundtrip(self): - # GH 9086 - # Write datetimetz data to a db and 
read it back - # For dbs that support timestamps with timezones, should get back UTC - # otherwise naive data should be returned - expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_read_table_absent_raises(conn, request): + conn = request.getfixturevalue(conn) + msg = "Table this_doesnt_exist not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("this_doesnt_exist", con=conn) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_sqlalchemy_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "mysql" in conn_name or "sqlite" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="boolean dtype not inferred properly") ) - assert expected.to_sql(name="test_datetime_tz", con=self.conn, index=False) == 3 - if self.flavor == "postgresql": - # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC - expected["A"] = expected["A"].dt.tz_convert("UTC") - else: - # Otherwise, timestamps are returned as local, naive - expected["A"] = expected["A"].dt.tz_localize(None) + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) - result = sql.read_sql_table("test_datetime_tz", self.conn) - tm.assert_frame_equal(result, expected) + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + assert issubclass(df.BoolCol.dtype.type, np.bool_) - result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) - if self.flavor == "sqlite": - # read_sql_query does not return datetime type like read_sql_table - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, expected) + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + # Bool column with NA values becomes object + assert issubclass(df.BoolColWithNull.dtype.type, object) - def test_out_of_bounds_datetime(self): - # GH 26761 - data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) - assert data.to_sql(name="test_datetime_obb", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_datetime_obb", self.conn) - expected = DataFrame([pd.NaT], columns=["date"]) - tm.assert_frame_equal(result, expected) - def test_naive_datetimeindex_roundtrip(self): - # GH 23510 - # Ensure that a naive DatetimeIndex isn't converted to UTC - dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) - expected = DataFrame({"nums": range(5)}, index=dates) - assert ( - expected.to_sql(name="foo_table", con=self.conn, index_label="info_date") - == 5 - ) - result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") - # result index with gain a name from a set_index operation; expected - tm.assert_frame_equal(result, expected, check_names=False) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_bigint(conn, request): + # int64 should be converted to BigInteger, GH7433 + conn = request.getfixturevalue(conn) + df = DataFrame(data={"i64": [2**62]}) + assert df.to_sql(name="test_bigint", con=conn, index=False) == 1 + result = sql.read_sql_table("test_bigint", conn) - def test_date_parsing(self): - # No Parsing - df = sql.read_sql_table("types", self.conn) - expected_type = object if self.flavor == "sqlite" else np.datetime64 - assert 
issubclass(df.DateCol.dtype.type, expected_type) + tm.assert_frame_equal(df, result) - df = sql.read_sql_table("types", self.conn, parse_dates=["DateCol"]) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table( - "types", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_default_date_load(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "sqlite" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="sqlite does not read date properly") ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table( - "types", - self.conn, - parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) + + assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types", self.conn, parse_dates=["IntDateCol"]) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types", self.conn, parse_dates={"IntDateCol": "s"}) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_datetime_with_timezone(conn, request): + # edge case that converts postgresql datetime with time zone types + # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok + # but should be more natural, so coerce to datetime64[ns] for now + + def check(col): + # check that a column is either datetime64[ns] + # or datetime64[ns, UTC] + if lib.is_np_dtype(col.dtype, "M"): + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + assert col[0] == Timestamp("2000-01-01 08:00:00") + + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + assert col[1] == Timestamp("2000-06-01 07:00:00") + + elif isinstance(col.dtype, DatetimeTZDtype): + assert str(col.dt.tz) == "UTC" + + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + # GH 6415 + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] + expected = Series(expected_data, name=col.name) + tm.assert_series_equal(col, expected) - df = sql.read_sql_table( - "types", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} + else: + raise AssertionError(f"DateCol loaded with incorrect type -> {col.dtype}") + + # GH11216 + conn = request.getfixturevalue(conn) + df = read_sql_query("select * from types", conn) + if not hasattr(df, "DateColWithTz"): + request.node.add_marker( + pytest.mark.xfail(reason="no column with datetime with time zone") ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - def test_datetime(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + # this is parsed on Travis (linux), but not on macosx for some reason + # even with the same versions of psycopg2 & sqlalchemy, possibly a + # Postgresql server version difference + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + + df = read_sql_query("select * from types", conn, parse_dates=["DateColWithTz"]) + if not hasattr(df, "DateColWithTz"): + request.node.add_marker( + pytest.mark.xfail(reason="no column with datetime with time zone") ) - assert 
df.to_sql(name="test_datetime", con=self.conn) == 3 + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + assert str(col.dt.tz) == "UTC" + check(df.DateColWithTz) + + df = concat( + list(read_sql_query("select * from types", conn, chunksize=1)), + ignore_index=True, + ) + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + assert str(col.dt.tz) == "UTC" + expected = sql.read_sql_table("types", conn) + col = expected.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) + + # xref #7139 + # this might or might not be converted depending on the postgres driver + df = sql.read_sql_table("types", conn) + check(df.DateColWithTz) - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) - result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - result = result.drop("index", axis=1) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_with_timezone_roundtrip(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + # GH 9086 + # Write datetimetz data to a db and read it back + # For dbs that support timestamps with timezones, should get back UTC + # otherwise naive data should be returned + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 - def test_datetime_NaT(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} - ) - df.loc[1, "A"] = np.nan - assert df.to_sql(name="test_datetime", con=self.conn, index=False) == 3 + if "postgresql" in conn_name: + # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC + expected["A"] = expected["A"].dt.tz_convert("UTC") + else: + # Otherwise, timestamps are returned as local, naive + expected["A"] = expected["A"].dt.tz_localize(None) - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) - tm.assert_frame_equal(result, df) + result = sql.read_sql_table("test_datetime_tz", conn) + tm.assert_frame_equal(result, expected) - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) - - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_table("test_date", self.conn) - result = res["a"] - expected = to_datetime(df["a"]) - # comes back as datetime64 - tm.assert_series_equal(result, expected) - - def test_datetime_time(self, sqlite_buildin): - # test support for datetime.time - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_table("test_time", self.conn) - tm.assert_frame_equal(res, df) - - # GH8341 - # first, use the fallback to have the sqlite adapter put in place - sqlite_conn = sqlite_buildin - assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 - res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) - ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) # check if adapter is in place - # then test if sqlalchemy is unaffected by the sqlite adapter - assert sql.to_sql(df, "test_time3", self.conn, index=False) == 2 - if self.flavor == "sqlite": - res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) - ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) - res = sql.read_sql_table("test_time3", self.conn) - tm.assert_frame_equal(df, res) - - def test_mixed_dtype_insert(self): - # see GH6509 - s1 = Series(2**25 + 1, dtype=np.int32) - s2 = Series(0.0, dtype=np.float32) - df = DataFrame({"s1": s1, "s2": s2}) - - # write and read again - assert df.to_sql(name="test_read_write", con=self.conn, index=False) == 1 - df2 = sql.read_sql_table("test_read_write", self.conn) - - tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) - - def test_nan_numeric(self): - # NaNs in numeric float column - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 - - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", conn) + if "sqlite" in conn_name: + # read_sql_query does not return datetime type like read_sql_table + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, expected) - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) - def test_nan_fullcolumn(self): - # full NaN column (numeric float column) - df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) - assert df.to_sql(name="test_nan", 
con=self.conn, index=False) == 3

+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_out_of_bounds_datetime(conn, request):
+    # GH 26761
+    conn = request.getfixturevalue(conn)
+    data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0])
+    assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1
+    result = sql.read_sql_table("test_datetime_obb", conn)
+    expected = DataFrame([pd.NaT], columns=["date"])
+    tm.assert_frame_equal(result, expected)

-        # with read_table
-        result = sql.read_sql_table("test_nan", self.conn)
-        tm.assert_frame_equal(result, df)

-        # with read_sql -> not type info from table -> stays None
-        df["B"] = df["B"].astype("object")
-        df["B"] = None
-        result = sql.read_sql_query("SELECT * FROM test_nan", self.conn)
-        tm.assert_frame_equal(result, df)

+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_naive_datetimeindex_roundtrip(conn, request):
+    # GH 23510
+    # Ensure that a naive DatetimeIndex isn't converted to UTC
+    conn = request.getfixturevalue(conn)
+    dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None)
+    expected = DataFrame({"nums": range(5)}, index=dates)
+    assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5
+    result = sql.read_sql_table("foo_table", conn, index_col="info_date")
+    # the result index will gain a name from the set_index operation; the
+    # expected index has no name, so ignore names in the comparison
+    tm.assert_frame_equal(result, expected, check_names=False)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_date_parsing(conn, request):
+    # No Parsing
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_table("types", conn)
+    expected_type = object if "sqlite" in conn_name else np.datetime64
+    assert issubclass(df.DateCol.dtype.type, expected_type)
+
+    df = sql.read_sql_table("types", conn, parse_dates=["DateCol"])
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"})
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)

-    def test_nan_string(self):
-        # NaNs in string column
-        df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]})
-        assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3
+    df = sql.read_sql_table(
+        "types",
+        conn,
+        parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}},
+    )
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates=["IntDateCol"])
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": "s"})
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": {"unit": "s"}})
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)

-        # NaNs are coming back as None
-        df.loc[2, "B"] = None

-        # with read_table
-        result = sql.read_sql_table("test_nan", self.conn)
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame(
+        {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)}
+    )
+    assert df.to_sql(name="test_datetime", con=conn) == 3
+
+    # with read_table -> type information from schema used
+    result = sql.read_sql_table("test_datetime", conn)
+    result = result.drop("index", axis=1)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql -> no type information -> sqlite has no native
+    result = 
sql.read_sql_query("SELECT * FROM test_datetime", conn) + result = result.drop("index", axis=1) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, df) + else: tm.assert_frame_equal(result, df) - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_NaT(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + assert df.to_sql(name="test_datetime", con=conn, index=False) == 3 + + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", conn) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") + tm.assert_frame_equal(result, df) + else: tm.assert_frame_equal(result, df) - def _get_index_columns(self, tbl_name): + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_date(conn, request): + # test support for datetime.date + conn = request.getfixturevalue(conn) + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_table("test_date", conn) + result = res["a"] + expected = to_datetime(df["a"]) + # comes back as datetime64 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_time(conn, request, sqlite_buildin): + # test support for datetime.time + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_table("test_time", conn) + tm.assert_frame_equal(res, df) + + # GH8341 + # first, use the fallback to have the sqlite adapter put in place + sqlite_conn = sqlite_buildin + assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 + res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) + ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) # check if adapter is in place + # then test if sqlalchemy is unaffected by the sqlite adapter + assert sql.to_sql(df, "test_time3", conn, index=False) == 2 + if "sqlite" in conn_name: + res = sql.read_sql_query("SELECT * FROM test_time3", conn) + ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) + res = sql.read_sql_table("test_time3", conn) + tm.assert_frame_equal(df, res) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_mixed_dtype_insert(conn, request): + # see GH6509 + conn = request.getfixturevalue(conn) + s1 = Series(2**25 + 1, dtype=np.int32) + s2 = Series(0.0, dtype=np.float32) + df = DataFrame({"s1": s1, "s2": s2}) + + # write and read again + assert df.to_sql(name="test_read_write", con=conn, index=False) == 1 + df2 = sql.read_sql_table("test_read_write", conn) + + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_numeric(conn, request): + # NaNs in numeric float column + conn = 
request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_nan_fullcolumn(conn, request):
+    # full NaN column (numeric float column)
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql -> no type info from table -> stays None
+    df["B"] = df["B"].astype("object")
+    df["B"] = None
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_nan_string(conn, request):
+    # NaNs in string column
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # NaNs are coming back as None
+    df.loc[2, "B"] = None
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_to_sql_save_index(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame.from_records(
+        [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"]
+    )
+
+    pandasSQL = pandasSQL_builder(conn)
+    tbl_name = "test_to_sql_saves_index"
+    with pandasSQL.run_transaction():
+        assert pandasSQL.to_sql(df, tbl_name) == 2
+
+    if conn_name in {"sqlite_buildin", "sqlite_str"}:
+        ixs = sql.read_sql_query(
+            "SELECT * FROM sqlite_master WHERE type = 'index' "
+            f"AND tbl_name = '{tbl_name}'",
+            conn,
+        )
+        ix_cols = []
+        for ix_name in ixs.name:
+            ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", conn)
+            ix_cols.append(ix_info.name.tolist())
+    else:
         from sqlalchemy import inspect

-        insp = inspect(self.conn)
+        insp = inspect(conn)
         ixs = insp.get_indexes(tbl_name)
-        ixs = [i["column_names"] for i in ixs]
-        return ixs
+        ix_cols = [i["column_names"] for i in ixs]
+
+    assert ix_cols == [["A"]]

-    def test_to_sql_save_index(self):
-        self._to_sql_save_index()

-    def test_transactions(self):
-        self._transaction_test()
+@pytest.mark.parametrize("conn", all_connectable)
+def test_transactions(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)

-    def test_get_schema_create_table(self, test_frame3):
-        # Use a dataframe without a bool column, since MySQL converts bool to
-        # TINYINT (which read_sql_table returns as an int and causes a dtype
-        # mismatch)
+    stmt = "CREATE TABLE test_trans (A INT, B TEXT)"
+    pandasSQL = pandasSQL_builder(conn)
+    if conn_name != "sqlite_buildin":
         from sqlalchemy import text
-        from sqlalchemy.engine import Engine

-        tbl = "test_get_schema_create_table"
-        create_sql = sql.get_schema(test_frame3, tbl, con=self.conn)
-        blank_test_df = test_frame3.iloc[:0]
+        stmt = text(stmt)

-        self.drop_table(tbl, self.conn)
-        create_sql = text(create_sql)
-        if isinstance(self.conn, Engine):
-            with 
self.conn.connect() as conn: - with conn.begin(): - conn.execute(create_sql) + with pandasSQL.run_transaction() as trans: + trans.execute(stmt) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_transaction_rollback(conn, request): + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction() as trans: + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if isinstance(pandasSQL, SQLiteDatabase): + trans.execute(stmt) else: - with self.conn.begin(): - self.conn.execute(create_sql) - returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) - self.drop_table(tbl, self.conn) - - def test_dtype(self): - from sqlalchemy import ( - TEXT, - String, - ) - from sqlalchemy.schema import MetaData - - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": TEXT}) == 2 - meta = MetaData() - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test2"].columns["B"].type - assert isinstance(sqltype, TEXT) - msg = "The type of B is not a SQLAlchemy type" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": str}) + from sqlalchemy import text - # GH9083 - assert ( - df.to_sql(name="dtype_test3", con=self.conn, dtype={"B": String(10)}) == 2 - ) - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test3"].columns["B"].type - assert isinstance(sqltype, String) - assert sqltype.length == 10 - - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype=TEXT) == 2 - meta.reflect(bind=self.conn) - sqltypea = meta.tables["single_dtype_test"].columns["A"].type - sqltypeb = meta.tables["single_dtype_test"].columns["B"].type - assert isinstance(sqltypea, TEXT) - assert isinstance(sqltypeb, TEXT) - - def test_notna_dtype(self): - from sqlalchemy import ( - Boolean, - DateTime, - Float, - Integer, + stmt = text(stmt) + trans.execute(stmt) + + class DummyException(Exception): + pass + + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + if isinstance(pandasSQL, SQLDatabase): + from sqlalchemy import text + + ins_sql = text(ins_sql) + try: + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + with pandasSQL.run_transaction(): + res = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 + + # Make sure when transaction is committed, rows do get inserted + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + res2 = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_get_schema_create_table(conn, request, test_frame3): + # Use a dataframe without a bool column, since MySQL converts bool to + # TINYINT (which read_sql_table returns as an int and causes a dtype + # mismatch) + if conn == "sqlite_str": + request.node.add_marker( + pytest.mark.xfail(reason="test does not support sqlite_str fixture") ) - from sqlalchemy.schema import MetaData - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": Series([1, None], dtype="object"), - "Float": Series([1.1, None]), + conn = request.getfixturevalue(conn) + + 
from sqlalchemy import text + from sqlalchemy.engine import Engine + + tbl = "test_get_schema_create_table" + create_sql = sql.get_schema(test_frame3, tbl, con=conn) + blank_test_df = test_frame3.iloc[:0] + + create_sql = text(create_sql) + if isinstance(conn, Engine): + with conn.connect() as newcon: + with newcon.begin(): + newcon.execute(create_sql) + else: + conn.execute(create_sql) + returned_df = sql.read_sql_table(tbl, conn) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + TEXT, + String, + ) + from sqlalchemy.schema import MetaData + + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": TEXT}) == 2 + meta = MetaData() + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test2"].columns["B"].type + assert isinstance(sqltype, TEXT) + msg = "The type of B is not a SQLAlchemy type" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": str}) + + # GH9083 + assert df.to_sql(name="dtype_test3", con=conn, dtype={"B": String(10)}) == 2 + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test3"].columns["B"].type + assert isinstance(sqltype, String) + assert sqltype.length == 10 + + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype=TEXT) == 2 + meta.reflect(bind=conn) + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type + assert isinstance(sqltypea, TEXT) + assert isinstance(sqltypeb, TEXT) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_notna_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn_name = conn + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Boolean, + DateTime, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) + + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 + _ = sql.read_sql_table(tbl, conn) + meta = MetaData() + meta.reflect(bind=conn) + my_type = Integer if "mysql" in conn_name else Boolean + col_dict = meta.tables[tbl].columns + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, DateTime) + assert isinstance(col_dict["Int"].type, Integer) + assert isinstance(col_dict["Float"].type, Float) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_double_precision(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + BigInteger, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + V = 1.23456789101112131415 + + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), } - df = DataFrame(cols) - - tbl = "notna_dtype_test" - assert 
df.to_sql(name=tbl, con=self.conn) == 2 - _ = sql.read_sql_table(tbl, self.conn) - meta = MetaData() - meta.reflect(bind=self.conn) - my_type = Integer if self.flavor == "mysql" else Boolean - col_dict = meta.tables[tbl].columns - assert isinstance(col_dict["Bool"].type, my_type) - assert isinstance(col_dict["Date"].type, DateTime) - assert isinstance(col_dict["Int"].type, Integer) - assert isinstance(col_dict["Float"].type, Float) - - def test_double_precision(self): - from sqlalchemy import ( - BigInteger, - Float, - Integer, + ) + + assert ( + df.to_sql( + name="test_dtypes", + con=conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": Float(precision=23)}, ) - from sqlalchemy.schema import MetaData + == 1 + ) + res = sql.read_sql_table("test_dtypes", conn) - V = 1.23456789101112131415 + # check precision of float64 + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) - df = DataFrame( - { - "f32": Series([V], dtype="float32"), - "f64": Series([V], dtype="float64"), - "f64_as_f32": Series([V], dtype="float64"), - "i32": Series([5], dtype="int32"), - "i64": Series([5], dtype="int64"), - } - ) + # check sql types + meta = MetaData() + meta.reflect(bind=conn) + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert isinstance(col_dict["f32"].type, Float) + assert isinstance(col_dict["f64"].type, Float) + assert isinstance(col_dict["i32"].type, Integer) + assert isinstance(col_dict["i64"].type, BigInteger) - assert ( - df.to_sql( - name="test_dtypes", - con=self.conn, - index=False, - if_exists="replace", - dtype={"f64_as_f32": Float(precision=23)}, - ) - == 1 - ) - res = sql.read_sql_table("test_dtypes", self.conn) - - # check precision of float64 - assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) - - # check sql types - meta = MetaData() - meta.reflect(bind=self.conn) - col_dict = meta.tables["test_dtypes"].columns - assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) - assert isinstance(col_dict["f32"].type, Float) - assert isinstance(col_dict["f64"].type, Float) - assert isinstance(col_dict["i32"].type, Integer) - assert isinstance(col_dict["i64"].type, BigInteger) - - def test_connectable_issue_example(self): - # This tests the example raised in issue - # https://github.com/pandas-dev/pandas/issues/10104 - from sqlalchemy.engine import Engine - def test_select(connection): - query = "SELECT test_foo_data FROM test_foo_data" - return sql.read_sql_query(query, con=connection) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_connectable_issue_example(conn, request): + conn = request.getfixturevalue(conn) - def test_append(connection, data): - data.to_sql(name="test_foo_data", con=connection, if_exists="append") + # This tests the example raised in issue + # https://github.com/pandas-dev/pandas/issues/10104 + from sqlalchemy.engine import Engine - def test_connectable(conn): - # https://github.com/sqlalchemy/sqlalchemy/commit/ - # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 - foo_data = test_select(conn) - test_append(conn, foo_data) + def test_select(connection): + query = "SELECT test_foo_data FROM test_foo_data" + return sql.read_sql_query(query, con=connection) - def main(connectable): - if isinstance(connectable, Engine): - with connectable.connect() as conn: - with conn.begin(): - test_connectable(conn) - else: - test_connectable(connectable) + def test_append(connection, data): + 
data.to_sql(name="test_foo_data", con=connection, if_exists="append") - assert ( - DataFrame({"test_foo_data": [0, 1, 2]}).to_sql( - name="test_foo_data", con=self.conn - ) - == 3 - ) - main(self.conn) - - @pytest.mark.parametrize( - "input", - [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], - ) - def test_to_sql_with_negative_npinf(self, input, request): - # GH 34431 - - df = DataFrame(input) - - if self.flavor == "mysql": - # GH 36465 - # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error - # for pymysql version >= 0.10 - # TODO(GH#36465): remove this version check after GH 36465 is fixed - pymysql = pytest.importorskip("pymysql") - - if ( - Version(pymysql.__version__) < Version("1.0.3") - and "infe0" in df.columns - ): - mark = pytest.mark.xfail(reason="GH 36465") - request.node.add_marker(mark) - - msg = "inf cannot be used with MySQL" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="foobar", con=self.conn, index=False) + def test_connectable(conn): + # https://github.com/sqlalchemy/sqlalchemy/commit/ + # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 + foo_data = test_select(conn) + test_append(conn, foo_data) + + def main(connectable): + if isinstance(connectable, Engine): + with connectable.connect() as conn: + with conn.begin(): + test_connectable(conn) else: - assert df.to_sql(name="foobar", con=self.conn, index=False) == 1 - res = sql.read_sql_table("foobar", self.conn) - tm.assert_equal(df, res) - - def test_temporary_table(self): - from sqlalchemy import ( - Column, - Integer, - Unicode, - select, - ) - from sqlalchemy.orm import ( - Session, - declarative_base, - ) + test_connectable(connectable) + + assert ( + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql(name="test_foo_data", con=conn) + == 3 + ) + main(conn) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "input", + [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], +) +def test_to_sql_with_negative_npinf(conn, request, input): + # GH 34431 + + df = DataFrame(input) + conn_name = conn + conn = request.getfixturevalue(conn) + + if "mysql" in conn_name: + # GH 36465 + # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error + # for pymysql version >= 0.10 + # TODO(GH#36465): remove this version check after GH 36465 is fixed + pymysql = pytest.importorskip("pymysql") - test_data = "Hello, World!" 
- expected = DataFrame({"spam": [test_data]}) - Base = declarative_base() - - class Temporary(Base): - __tablename__ = "temp_test" - __table_args__ = {"prefixes": ["TEMPORARY"]} - id = Column(Integer, primary_key=True) - spam = Column(Unicode(30), nullable=False) - - with Session(self.conn) as session: - with session.begin(): - conn = session.connection() - Temporary.__table__.create(conn) - session.add(Temporary(spam=test_data)) - session.flush() - df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) - tm.assert_frame_equal(df, expected) - - # -- SQL Engine tests (in the base class for now) - def test_invalid_engine(self, test_frame1): - msg = "engine must be one of 'auto', 'sqlalchemy'" + if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: + mark = pytest.mark.xfail(reason="GH 36465") + request.node.add_marker(mark) + + msg = "inf cannot be used with MySQL" with pytest.raises(ValueError, match=msg): - self._to_sql_with_sql_engine(test_frame1, "bad_engine") + df.to_sql(name="foobar", con=conn, index=False) + else: + assert df.to_sql(name="foobar", con=conn, index=False) == 1 + res = sql.read_sql_table("foobar", conn) + tm.assert_equal(df, res) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_temporary_table(conn, request): + if conn == "sqlite_str": + pytest.skip("test does not work with str connection") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Column, + Integer, + Unicode, + select, + ) + from sqlalchemy.orm import ( + Session, + declarative_base, + ) + + test_data = "Hello, World!" + expected = DataFrame({"spam": [test_data]}) + Base = declarative_base() + + class Temporary(Base): + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} + id = Column(Integer, primary_key=True) + spam = Column(Unicode(30), nullable=False) + + with Session(conn) as session: + with session.begin(): + conn = session.connection() + Temporary.__table__.create(conn) + session.add(Temporary(spam=test_data)) + session.flush() + df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_invalid_engine(conn, request, test_frame1): + if conn == "sqlite_buildin": + request.node.add_marker( + pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine") + ) + + conn = request.getfixturevalue(conn) + msg = "engine must be one of 'auto', 'sqlalchemy'" + pandasSQL = pandasSQL_builder(conn) + with pytest.raises(ValueError, match=msg): + pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") + + +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_with_sql_engine(conn, request, test_frame1): + """`to_sql` with the `engine` param""" + # mostly copied from this class's `_to_sql()` method + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 + assert pandasSQL.has_table("test_frame1") - def test_options_sqlalchemy(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "sqlalchemy"): - self._to_sql_with_sql_engine(test_frame1) + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_options_sqlalchemy(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + 
with pd.option_context("io.sql.engine", "sqlalchemy"): + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") - def test_options_auto(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "auto"): - self._to_sql_with_sql_engine(test_frame1) + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", all_connectable) +def test_options_auto(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + with pd.option_context("io.sql.engine", "auto"): + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries - def test_options_get_engine(self): + +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +def test_options_get_engine(): + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - with pd.option_context("io.sql.engine", "sqlalchemy"): - assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + with pd.option_context("io.sql.engine", "auto"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - with pd.option_context("io.sql.engine", "auto"): - assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - def test_get_engine_auto_error_message(self): - # Expect different error messages from get_engine(engine="auto") - # if engines aren't installed vs. are installed but bad version - pass - # TODO(GH#36893) fill this in when we add more engines - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype_backend(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)( - f"Select * from {table}", self.conn, dtype_backend=dtype_backend +def test_get_engine_auto_error_message(): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. 
are installed but bad version + pass + # TODO(GH#36893) fill this in when we add more engines + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype_backend( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)( + f"Select * from {table}", conn, dtype_backend=dtype_backend + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + f"Select * from {table}", + con=conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) +def test_read_sql_dtype_backend_table( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + if "sqlite" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason=( + "SQLite actually returns proper boolean values via " + "read_sql_table, but before pytest refactor was skipped" + ) ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) + ) + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + table, + conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) +def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data): + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + getattr(pd, func)(table, conn, dtype_backend="numpy") - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - f"Select * from {table}", - con=self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)(table, self.conn, dtype_backend=dtype_backend) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - table, - self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) - def test_read_sql_invalid_dtype_backend_table(self, func): - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - msg = ( - "dtype_backend numpy is invalid, only 'numpy_nullable' and " - "'pyarrow' are allowed." - ) - with pytest.raises(ValueError, match=msg): - getattr(pd, func)(table, self.conn, dtype_backend="numpy") +@pytest.fixture +def dtype_backend_data() -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) - def dtype_backend_data(self) -> DataFrame: - return DataFrame( - { - "a": Series([1, np.nan, 3], dtype="Int64"), - "b": Series([1, 2, 3], dtype="Int64"), - "c": Series([1.5, np.nan, 2.5], dtype="Float64"), - "d": Series([1.5, 2.0, 2.5], dtype="Float64"), - "e": [True, False, None], - "f": [True, False, True], - "g": ["a", "b", "c"], - "h": ["a", "b", None], - } - ) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: +@pytest.fixture +def dtype_backend_expected(): + def func(storage, dtype_backend, conn_name): string_array: StringArray | ArrowStringArray string_array_na: StringArray | ArrowStringArray if storage == "python": @@ -3041,557 +3195,396 @@ def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: for col in df.columns } ) - return df - - def test_chunksize_empty_dtypes(self): - # GH#50245 - dtypes = {"a": "int64", "b": "object"} - df = DataFrame(columns=["a", "b"]).astype(dtypes) - expected = df.copy() - df.to_sql(name="test", con=self.conn, index=False, if_exists="replace") - - for result in read_sql_query( - "SELECT * FROM test", - self.conn, - dtype=dtypes, - chunksize=1, - ): - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype(self, func, dtype_backend): - # 
GH#50797 - table = "test" - df = DataFrame({"a": [1, 2, 3], "b": 5}) - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - result = getattr(pd, func)( - f"Select * from {table}", - self.conn, - dtype={"a": np.float64}, - dtype_backend=dtype_backend, - ) - expected = DataFrame( - { - "a": Series([1, 2, 3], dtype=np.float64), - "b": Series( - [5, 5, 5], - dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", - ), - } - ) - tm.assert_frame_equal(result, expected) - -class TestSQLiteAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an in-memory sqlite database. - - """ - - flavor = "sqlite" - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") - - @classmethod - def setup_driver(cls): - # sqlite3 is built-in - cls.driver = None + if "mysql" in conn_name or "sqlite" in conn_name: + if dtype_backend == "numpy_nullable": + df = df.astype({"e": "Int64", "f": "Int64"}) + else: + df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - def test_keyword_deprecation(self): - # GH 54397 - msg = ( - "Starting with pandas version 3.0 all arguments of to_sql except for the " - "arguments 'name' and 'con' will be keyword-only." - ) - df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) - df.to_sql("example", self.conn) + return df - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", self.conn, None, if_exists="replace") + return func - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) +@pytest.mark.parametrize("conn", all_connectable) +def test_chunksize_empty_dtypes(conn, request): + # GH#50245 + conn = request.getfixturevalue(conn) + dtypes = {"a": "int64", "b": "object"} + df = DataFrame(columns=["a", "b"]).astype(dtypes) + expected = df.copy() + df.to_sql(name="test", con=conn, index=False, if_exists="replace") - # sqlite has no boolean type, so integer type is returned - assert issubclass(df.BoolCol.dtype.type, np.integer) + for result in read_sql_query( + "SELECT * FROM test", + conn, + dtype=dtypes, + chunksize=1, + ): + tm.assert_frame_equal(result, expected) - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) - # Non-native Bool column with NA values stays as float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype(conn, request, func, dtype_backend): + # GH#50797 + conn = request.getfixturevalue(conn) + table = "test" + df = DataFrame({"a": [1, 2, 3], "b": 5}) + df.to_sql(name=table, con=conn, index=False, if_exists="replace") - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) + result = getattr(pd, func)( + f"Select * from {table}", + conn, + dtype={"a": np.float64}, + dtype_backend=dtype_backend, + ) + expected = DataFrame( + { + "a": Series([1, 2, 3], dtype=np.float64), + "b": Series( + [5, 5, 5], + dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", + ), + } + ) + tm.assert_frame_equal(result, expected) - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - assert not issubclass(df.DateCol.dtype.type, np.datetime64) - def 
test_bigint_warning(self): - # test no warning for BIGINT (to support int64) is raised (GH7433) - df = DataFrame({"a": [1, 2]}, dtype="int64") - assert df.to_sql(name="test_bigintwarning", con=self.conn, index=False) == 2 +def test_keyword_deprecation(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH 54397 + msg = ( + "Starting with pandas version 3.0 all arguments of to_sql except for the " + "arguments 'name' and 'con' will be keyword-only." + ) + df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + df.to_sql("example", conn) - with tm.assert_produces_warning(None): - sql.read_sql_table("test_bigintwarning", self.conn) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_sql("example", conn, None, if_exists="replace") - def test_valueerror_exception(self): - df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) - with pytest.raises(ValueError, match="Empty table name specified"): - df.to_sql(name="", con=self.conn, if_exists="replace", index=False) - def test_row_object_is_named_tuple(self): - # GH 40682 - # Test for the is_named_tuple() function - # Placed here due to its usage of sqlalchemy +def test_bigint_warning(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # test no warning for BIGINT (to support int64) is raised (GH7433) + df = DataFrame({"a": [1, 2]}, dtype="int64") + assert df.to_sql(name="test_bigintwarning", con=conn, index=False) == 2 - from sqlalchemy import ( - Column, - Integer, - String, - ) - from sqlalchemy.orm import ( - declarative_base, - sessionmaker, - ) + with tm.assert_produces_warning(None): + sql.read_sql_table("test_bigintwarning", conn) - BaseModel = declarative_base() - - class Test(BaseModel): - __tablename__ = "test_frame" - id = Column(Integer, primary_key=True) - string_column = Column(String(50)) - - with self.conn.begin(): - BaseModel.metadata.create_all(self.conn) - Session = sessionmaker(bind=self.conn) - with Session() as session: - df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) - assert ( - df.to_sql( - name="test_frame", con=self.conn, index=False, if_exists="replace" - ) - == 2 - ) - session.commit() - test_query = session.query(Test.id, Test.string_column) - df = DataFrame(test_query) - assert list(df.columns) == ["id", "string_column"] +def test_valueerror_exception(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + with pytest.raises(ValueError, match="Empty table name specified"): + df.to_sql(name="", con=conn, if_exists="replace", index=False) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - return df +def test_row_object_is_named_tuple(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH 40682 + # Test for the is_named_tuple() function + # Placed here due to its usage of sqlalchemy - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func): - # GH#50048 Not supported for sqlite - pass + from sqlalchemy import ( + Column, + Integer, + String, + ) + from sqlalchemy.orm import ( + declarative_base, + sessionmaker, + ) - def test_read_sql_string_inference(self): - # GH#54430 - pytest.importorskip("pyarrow") - 
table = "test" - df = DataFrame({"a": ["x", "y"]}) - df.to_sql(table, con=self.conn, index=False, if_exists="replace") + BaseModel = declarative_base() - with pd.option_context("future.infer_string", True): - result = read_sql_table(table, self.conn) + class Test(BaseModel): + __tablename__ = "test_frame" + id = Column(Integer, primary_key=True) + string_column = Column(String(50)) - dtype = "string[pyarrow_numpy]" - expected = DataFrame( - {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + with conn.begin(): + BaseModel.metadata.create_all(conn) + Session = sessionmaker(bind=conn) + with Session() as session: + df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) + assert ( + df.to_sql(name="test_frame", con=conn, index=False, if_exists="replace") + == 2 ) + session.commit() + test_query = session.query(Test.id, Test.string_column) + df = DataFrame(test_query) - tm.assert_frame_equal(result, expected) - - def test_roundtripping_datetimes(self): - # GH#54877 - df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") - df.to_sql("test", self.conn, if_exists="replace", index=False) - result = pd.read_sql("select * from test", self.conn).iloc[0, 0] - assert result == "2020-12-31 12:00:00.000000" - - -@pytest.mark.db -class TestMySQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an MySQL database. + assert list(df.columns) == ["id", "string_column"] - """ - flavor = "mysql" - port = 3306 +def test_read_sql_string_inference(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH#54430 + pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, con=conn, index=False, if_exists="replace") - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"mysql+{cls.driver}://root@localhost:{cls.port}/pandas", - connect_args=cls.connect_args, - ) + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, conn) - @classmethod - def setup_driver(cls): - pymysql = pytest.importorskip("pymysql") - cls.driver = "pymysql" - cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) - def test_default_type_conversion(self): - pass + tm.assert_frame_equal(result, expected) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - return df +def test_roundtripping_datetimes(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" @pytest.mark.db -class TestPostgreSQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an PostgreSQL database. 
- - """ - - flavor = "postgresql" - port = 5432 - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"postgresql+{cls.driver}://postgres:postgres@localhost:{cls.port}/pandas" - ) - - @classmethod - def setup_driver(cls): - pytest.importorskip("psycopg2") - cls.driver = "psycopg2" - - def test_schema_support(self): - from sqlalchemy.engine import Engine - - # only test this for postgresql (schema's not supported in - # mysql/sqlite) - df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) - - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") - - # write dataframe to different schema's - assert df.to_sql(name="test_schema_public", con=self.conn, index=False) == 2 - assert ( - df.to_sql( - name="test_schema_public_explicit", - con=self.conn, - index=False, - schema="public", - ) - == 2 - ) - assert ( - df.to_sql( - name="test_schema_other", con=self.conn, index=False, schema="other" - ) - == 2 +def test_psycopg2_schema_support(postgresql_psycopg2_engine): + conn = postgresql_psycopg2_engine + + # only test this for postgresql (schema's not supported in + # mysql/sqlite) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + + # create a schema + with conn.connect() as con: + with con.begin(): + con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") + + # write dataframe to different schema's + assert df.to_sql(name="test_schema_public", con=conn, index=False) == 2 + assert ( + df.to_sql( + name="test_schema_public_explicit", + con=conn, + index=False, + schema="public", ) + == 2 + ) + assert ( + df.to_sql(name="test_schema_other", con=conn, index=False, schema="other") == 2 + ) - # read dataframes back in - res1 = sql.read_sql_table("test_schema_public", self.conn) - tm.assert_frame_equal(df, res1) - res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) - tm.assert_frame_equal(df, res2) - res3 = sql.read_sql_table( - "test_schema_public_explicit", self.conn, schema="public" - ) - tm.assert_frame_equal(df, res3) - res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(df, res4) - msg = "Table test_schema_other not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("test_schema_other", self.conn, schema="public") + # read dataframes back in + res1 = sql.read_sql_table("test_schema_public", conn) + tm.assert_frame_equal(df, res1) + res2 = sql.read_sql_table("test_schema_public_explicit", conn) + tm.assert_frame_equal(df, res2) + res3 = sql.read_sql_table("test_schema_public_explicit", conn, schema="public") + tm.assert_frame_equal(df, res3) + res4 = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(df, res4) + msg = "Table test_schema_other not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("test_schema_other", conn, schema="public") - # different if_exists options + # different if_exists options - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") + # create a schema + with conn.connect() as con: + with con.begin(): + con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") - # write dataframe with different if_exists options - assert ( - df.to_sql( - 
name="test_schema_other", con=self.conn, schema="other", index=False - ) - == 2 - ) + # write dataframe with different if_exists options + assert ( + df.to_sql(name="test_schema_other", con=conn, schema="other", index=False) == 2 + ) + df.to_sql( + name="test_schema_other", + con=conn, + schema="other", + index=False, + if_exists="replace", + ) + assert ( df.to_sql( name="test_schema_other", - con=self.conn, + con=conn, schema="other", index=False, - if_exists="replace", - ) - assert ( - df.to_sql( - name="test_schema_other", - con=self.conn, - schema="other", - index=False, - if_exists="append", - ) - == 2 + if_exists="append", ) - res = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - - # specifying schema in user-provided meta - - # The schema won't be applied on another Connection - # because of transactional schemas - if isinstance(self.conn, Engine): - engine2 = self.connect() - pdsql = sql.SQLDatabase(engine2, schema="other") - assert pdsql.to_sql(df, "test_schema_other2", index=False) == 2 - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") - == 2 - ) - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") - == 2 - ) - res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") - res2 = pdsql.read_table("test_schema_other2") - tm.assert_frame_equal(res1, res2) + == 2 + ) + res = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - def test_self_join_date_columns(self): - # GH 44421 - from sqlalchemy.engine import Engine - from sqlalchemy.sql import text - create_table = text( - """ - CREATE TABLE person - ( - id serial constraint person_pkey primary key, - created_dt timestamp with time zone - ); +@pytest.mark.db +def test_self_join_date_columns(postgresql_psycopg2_engine): + # GH 44421 + conn = postgresql_psycopg2_engine + from sqlalchemy.sql import text - INSERT INTO person - VALUES (1, '2021-01-01T00:00:00Z'); + create_table = text( """ - ) - if isinstance(self.conn, Engine): - with self.conn.connect() as con: - with con.begin(): - con.execute(create_table) - else: - with self.conn.begin(): - self.conn.execute(create_table) + CREATE TABLE person + ( + id serial constraint person_pkey primary key, + created_dt timestamp with time zone + ); - sql_query = ( - 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' - ) - result = pd.read_sql(sql_query, self.conn) - expected = DataFrame( - [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 - ) - tm.assert_frame_equal(result, expected) + INSERT INTO person + VALUES (1, '2021-01-01T00:00:00Z'); + """ + ) + with conn.connect() as con: + with con.begin(): + con.execute(create_table) - # Cleanup - with sql.SQLDatabase(self.conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("person") + sql_query = ( + 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' + ) + result = pd.read_sql(sql_query, conn) + expected = DataFrame( + [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 + ) + tm.assert_frame_equal(result, expected) + # Cleanup + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("person") -# ----------------------------------------------------------------------------- -# -- Test Sqlite / MySQL fallback +def test_create_and_drop_table(sqlite_sqlalchemy_memory_engine): 
+ conn = sqlite_sqlalchemy_memory_engine + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + pandasSQL = sql.SQLDatabase(conn) -class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): - """ - Test the fallback mode against an in-memory sqlite database. + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - """ + assert pandasSQL.has_table("drop_test_frame") - flavor = "sqlite" + with pandasSQL.run_transaction(): + pandasSQL.drop_table("drop_test_frame") - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.pandasSQL = sql.SQLiteDatabase(self.conn) + assert not pandasSQL.has_table("drop_test_frame") - def test_read_sql_parameter(self, sql_strings): - self._read_sql_iris_parameter(sql_strings) - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) +def test_sqlite_datetime_date(sqlite_buildin): + conn = sqlite_buildin + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_date", conn) + # comes back as strings + tm.assert_frame_equal(res, df.astype(str)) - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) - def test_create_and_drop_table(self): - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) +@pytest.mark.parametrize("tz_aware", [False, True]) +def test_sqlite_datetime_time(tz_aware, sqlite_buildin): + conn = sqlite_buildin + # test support for datetime.time, GH #8341 + if not tz_aware: + tz_times = [time(9, 0, 0), time(9, 1, 30)] + else: + tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") + tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) - assert self.pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 + df = DataFrame(tz_times, columns=["a"]) - assert self.pandasSQL.has_table("drop_test_frame") + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_time", conn) + # comes back as strings + expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(res, expected) - self.pandasSQL.drop_table("drop_test_frame") - assert not self.pandasSQL.has_table("drop_test_frame") +def get_sqlite_column_type(conn, table, column): + recs = conn.execute(f"PRAGMA table_info({table})") + for cid, name, ctype, not_null, default, pk in recs: + if name == column: + return ctype + raise ValueError(f"Table {table}, column {column} not found") - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) - def test_execute_sql(self): - self._execute_sql() +def test_sqlite_test_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": "STRING"}) == 2 - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_date", self.conn) - if self.flavor == "sqlite": - # comes back as strings - tm.assert_frame_equal(res, df.astype(str)) - elif self.flavor == "mysql": - 
tm.assert_frame_equal(res, df) + # sqlite stores Boolean values as INTEGER + assert get_sqlite_column_type(conn, "dtype_test", "B") == "INTEGER" - @pytest.mark.parametrize("tz_aware", [False, True]) - def test_datetime_time(self, tz_aware): - # test support for datetime.time, GH #8341 - if not tz_aware: - tz_times = [time(9, 0, 0), time(9, 1, 30)] - else: - tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") - tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + assert get_sqlite_column_type(conn, "dtype_test2", "B") == "STRING" + msg = r"B \(\) not a string" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": bool}) - df = DataFrame(tz_times, columns=["a"]) + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype="STRING") == 2 + assert get_sqlite_column_type(conn, "single_dtype_test", "A") == "STRING" + assert get_sqlite_column_type(conn, "single_dtype_test", "B") == "STRING" - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_time", self.conn) - if self.flavor == "sqlite": - # comes back as strings - expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(res, expected) - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SELECT * FROM sqlite_master WHERE type = 'index' " - f"AND tbl_name = '{tbl_name}'", - self.conn, - ) - ix_cols = [] - for ix_name in ixs.name: - ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) - ix_cols.append(ix_info.name.tolist()) - return ix_cols - - def test_to_sql_save_index(self): - self._to_sql_save_index() - - def test_transactions(self): - self._transaction_test() - - def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute(f"PRAGMA table_info({table})") - for cid, name, ctype, not_null, default, pk in recs: - if name == column: - return ctype - raise ValueError(f"Table {table}, column {column} not found") - - def test_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": "STRING"}) == 2 - - # sqlite stores Boolean values as INTEGER - assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" - - assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" - msg = r"B \(\) not a string" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": bool}) - - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype="STRING") == 2 - assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" - assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" - - def test_notna_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": Series([1, None], dtype="object"), - "Float": Series([1.1, None]), - } - df = DataFrame(cols) +def test_sqlite_notna_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) - tbl = "notna_dtype_test" - assert df.to_sql(name=tbl, 
con=self.conn) == 2 + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 - assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" - assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Float") == "REAL" + assert get_sqlite_column_type(conn, tbl, "Bool") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Date") == "TIMESTAMP" + assert get_sqlite_column_type(conn, tbl, "Int") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Float") == "REAL" - def test_illegal_names(self): - # For sqlite, these should work fine - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - msg = "Empty table or column name specified" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="", con=self.conn) - - for ndx, weird_name in enumerate( - [ - "test_weird_name]", - "test_weird_name[", - "test_weird_name`", - 'test_weird_name"', - "test_weird_name'", - "_b.test_weird_name_01-30", - '"_b.test_weird_name_01-30"', - "99beginswithnumber", - "12345", - "\xe9", - ] - ): - assert df.to_sql(name=weird_name, con=self.conn) == 2 - sql.table_exists(weird_name, self.conn) +def test_sqlite_illegal_names(sqlite_buildin): + # For sqlite, these should work fine + conn = sqlite_buildin + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + + msg = "Empty table or column name specified" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="", con=conn) + + for ndx, weird_name in enumerate( + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): + assert df.to_sql(name=weird_name, con=conn) == 2 + sql.table_exists(weird_name, conn) - df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = f"test_weird_col_name{ndx:d}" - assert df2.to_sql(name=c_tbl, con=self.conn) == 2 - sql.table_exists(c_tbl, self.conn) + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = f"test_weird_col_name{ndx:d}" + assert df2.to_sql(name=c_tbl, con=conn) == 2 + sql.table_exists(c_tbl, conn) # ----------------------------------------------------------------------------- @@ -3630,227 +3623,222 @@ def tquery(query, con=None): return None if res is None else list(res) -class TestXSQLite: - def drop_table(self, table_name, conn): - cur = conn.cursor() - cur.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() +def test_xsqlite_basic(sqlite_buildin): + frame = tm.makeTimeDataFrame() + assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 30 + result = sql.read_sql("select * from test_table", sqlite_buildin) - def test_basic(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - assert ( - sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 30 - ) - result = sql.read_sql("select * from test_table", sqlite_buildin) + # HACK! Change this once indexes are handled properly. + result.index = frame.index - # HACK! Change this once indexes are handled properly. 
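    # The 30 asserted above is not arbitrary: ``tm.makeTimeDataFrame()``
    # builds, by default, 30 business-day rows over four float columns, so
    # the reported write count equals the frame length. A sketch of the input:
    #
    #     frame = tm.makeTimeDataFrame()
    #     frame.shape        # (30, 4), columns ['A', 'B', 'C', 'D']
    #     frame.index.freq   # <BusinessDay>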
- result.index = frame.index + expected = frame + tm.assert_frame_equal(result, frame) - expected = frame - tm.assert_frame_equal(result, frame) + frame["txt"] = ["a"] * len(frame) + frame2 = frame.copy() + new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 + frame2["Idx"] = new_idx.copy() + assert sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) == 30 + result = sql.read_sql("select * from test_table2", sqlite_buildin, index_col="Idx") + expected = frame.copy() + expected.index = new_idx + expected.index.name = "Idx" + tm.assert_frame_equal(expected, result) - frame["txt"] = ["a"] * len(frame) - frame2 = frame.copy() - new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 - frame2["Idx"] = new_idx.copy() - assert ( - sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) - == 30 - ) - result = sql.read_sql( - "select * from test_table2", sqlite_buildin, index_col="Idx" - ) - expected = frame.copy() - expected.index = new_idx - expected.index.name = "Idx" - tm.assert_frame_equal(expected, result) - - def test_write_row_by_row(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - frame.iloc[0, 0] = np.nan - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) - ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" - for _, row in frame.iterrows(): - fmt_sql = format_query(ins, *row) - tquery(fmt_sql, con=sqlite_buildin) +def test_xsqlite_write_row_by_row(sqlite_buildin): + frame = tm.makeTimeDataFrame() + frame.iloc[0, 0] = np.nan + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) - sqlite_buildin.commit() + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for _, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + tquery(fmt_sql, con=sqlite_buildin) - result = sql.read_sql("select * from test", con=sqlite_buildin) - result.index = frame.index - tm.assert_frame_equal(result, frame, rtol=1e-3) + sqlite_buildin.commit() - def test_execute(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) - ins = "INSERT INTO test VALUES (?, ?, ?, ?)" - - row = frame.iloc[0] - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute(ins, tuple(row)) - sqlite_buildin.commit() - - result = sql.read_sql("select * from test", sqlite_buildin) - result.index = frame.index[:1] - tm.assert_frame_equal(result, frame[:1]) - - def test_schema(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - lines = create_sql.splitlines() - for line in lines: - tokens = line.split(" ") - if len(tokens) == 2 and tokens[0] == "A": - assert tokens[1] == "DATETIME" - - create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) - lines = create_sql.splitlines() - assert 'PRIMARY KEY ("A", "B")' in create_sql - cur = sqlite_buildin.cursor() - cur.execute(create_sql) + result = sql.read_sql("select * from test", con=sqlite_buildin) + result.index = frame.index + tm.assert_frame_equal(result, frame, rtol=1e-3) - def test_execute_fail(self, sqlite_buildin): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - PRIMARY KEY (a, b) - ); - """ - cur = sqlite_buildin.cursor() + +def test_xsqlite_execute(sqlite_buildin): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + ins = "INSERT INTO test 
VALUES (?, ?, ?, ?)" + + row = frame.iloc[0] + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute(ins, tuple(row)) + sqlite_buildin.commit() + + result = sql.read_sql("select * from test", sqlite_buildin) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + +def test_xsqlite_schema(sqlite_buildin): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + lines = create_sql.splitlines() + for line in lines: + tokens = line.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" + + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) + lines = create_sql.splitlines() + assert 'PRIMARY KEY ("A", "B")' in create_sql + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + +def test_xsqlite_execute_fail(sqlite_buildin): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') + + with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + + +def test_xsqlite_execute_closed_connection(): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + with contextlib.closing(sqlite3.connect(":memory:")) as conn: + cur = conn.cursor() cur.execute(create_sql) - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + with sql.pandasSQL_builder(conn) as pandas_sql: pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') - with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + msg = "Cannot operate on a closed database." + with pytest.raises(sqlite3.ProgrammingError, match=msg): + tquery("select * from test", con=conn) - def test_execute_closed_connection(self): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - PRIMARY KEY (a, b) - ); - """ - with contextlib.closing(sqlite3.connect(":memory:")) as conn: - cur = conn.cursor() - cur.execute(create_sql) - - with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - - msg = "Cannot operate on a closed database." 
- with pytest.raises(sqlite3.ProgrammingError, match=msg): - tquery("select * from test", con=conn) - - def test_keyword_as_column_names(self, sqlite_buildin): - df = DataFrame({"From": np.ones(5)}) - assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 - - def test_onecolumn_of_integer(self, sqlite_buildin): - # GH 3628 - # a column_of_integers dataframe should transfer well to sql - - mono_df = DataFrame([1, 2], columns=["c0"]) - assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 - # computing the sum via sql - con_x = sqlite_buildin - the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) - # it should not fail, and gives 3 ( Issue #3628 ) - assert the_sum == 3 - - result = sql.read_sql("select * from mono_df", con_x) - tm.assert_frame_equal(result, mono_df) - - def test_if_exists(self, sqlite_buildin): - df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) - df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) - table_name = "table_if_exists" - sql_select = f"SELECT * FROM {table_name}" - - msg = "'notvalidvalue' is not valid for if_exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="notvalidvalue", - ) - self.drop_table(table_name, sqlite_buildin) - # test if_exists='fail' +def test_xsqlite_keyword_as_column_names(sqlite_buildin): + df = DataFrame({"From": np.ones(5)}) + assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 + + +def test_xsqlite_onecolumn_of_integer(sqlite_buildin): + # GH 3628 + # a column_of_integers dataframe should transfer well to sql + + mono_df = DataFrame([1, 2], columns=["c0"]) + assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 + # computing the sum via sql + con_x = sqlite_buildin + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) + # it should not fail, and gives 3 ( Issue #3628 ) + assert the_sum == 3 + + result = sql.read_sql("select * from mono_df", con_x) + tm.assert_frame_equal(result, mono_df) + + +def test_xsqlite_if_exists(sqlite_buildin): + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" + sql_select = f"SELECT * FROM {table_name}" + + msg = "'notvalidvalue' is not valid for if_exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( - frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="notvalidvalue", ) - msg = "Table 'table_if_exists' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - ) - # test if_exists='replace' + drop_table(table_name, sqlite_buildin) + + # test if_exists='fail' + sql.to_sql( + frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + ) + msg = "Table 'table_if_exists' already exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( frame=df_if_exists_1, con=sqlite_buildin, name=table_name, + if_exists="fail", + ) + # test if_exists='replace' + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + 
frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, if_exists="replace", index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="replace", - index=False, - ) - == 3 - ) - assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] - self.drop_table(table_name, sqlite_buildin) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] + drop_table(table_name, sqlite_buildin) - # test if_exists='append' - assert ( - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - index=False, - ) - == 2 + # test if_exists='append' + assert ( + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="fail", + index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="append", - index=False, - ) - == 3 + == 2 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, + if_exists="append", + index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [ - (1, "A"), - (2, "B"), - (3, "C"), - (4, "D"), - (5, "E"), - ] - self.drop_table(table_name, sqlite_buildin) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] + drop_table(table_name, sqlite_buildin) From 835233180aff90d8ee4aa65a95f6daded84a6cfd Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Tue, 19 Sep 2023 11:40:15 +0100 Subject: [PATCH 007/131] TST: Test GroupBy.__getitem__ with a column from grouper (#55193) --- pandas/tests/groupby/test_grouping.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index d05b60fd56b5f..9e899bf453548 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -131,6 +131,20 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "func", [lambda x: x.sum(), lambda x: x.agg(lambda y: y.sum())] + ) + def test_getitem_from_grouper(self, func): + # GH 50383 + df = DataFrame({"a": [1, 1, 2], "b": 3, "c": 4, "d": 5}) + gb = df.groupby(["a", "b"])[["a", "c"]] + + idx = MultiIndex.from_tuples([(1, 3), (2, 3)], names=["a", "b"]) + expected = DataFrame({"a": [2, 2], "c": [8, 4]}, index=idx) + result = func(gb) + + tm.assert_frame_equal(result, expected) + def test_indices_grouped_by_tuple_with_lambda(self): # GH 36158 df = DataFrame( From 6a65cfdccef0974bf5291c0a6c5e237968d1f034 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 19 Sep 2023 11:33:34 -0400 Subject: [PATCH 008/131] DEPS: Bump to 1.26 proper for Python 3.12 (#55175) * DEPS: Bump to 1.26 proper for Python 3.12 * Update unit-tests.yml * Update unit-tests.yml * revert hack --- pyproject.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7807a6cc6368d..4e1c77413efda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,7 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when 
our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - # TODO: This needs to be updated when the official numpy 1.26 comes out - "numpy>=1.26.0b1; python_version>='3.12'", + "numpy>=1.26.0; python_version>='3.12'", "versioneer[toml]" ] @@ -32,8 +31,7 @@ requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", "numpy>=1.23.2; python_version=='3.11'", - # TODO: This needs to be updated when the official numpy 1.26 comes out - "numpy>=1.26.0b1; python_version>='3.12'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" From 20949e287d2688ef3e0e76bb39143db42fcbb813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 19 Sep 2023 11:46:13 -0400 Subject: [PATCH 009/131] CLN: do not import Axes/Artist/Figure from matplotlib.pyplot (#55192) CLN: do not import Axes/Artist from matplotlib.pyplot --- pandas/_testing/asserters.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 0591394f5d9ed..d4e7e196dc2d4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -416,7 +416,8 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: def assert_is_valid_plot_return_object(objs) -> None: - import matplotlib.pyplot as plt + from matplotlib.artist import Artist + from matplotlib.axes import Axes if isinstance(objs, (Series, np.ndarray)): for el in objs.ravel(): @@ -424,14 +425,14 @@ def assert_is_valid_plot_return_object(objs) -> None: "one of 'objs' is not a matplotlib Axes instance, " f"type encountered {repr(type(el).__name__)}" ) - assert isinstance(el, (plt.Axes, dict)), msg + assert isinstance(el, (Axes, dict)), msg else: msg = ( "objs is neither an ndarray of Artist instances nor a single " "ArtistArtist instance, tuple, or dict, 'objs' is a " f"{repr(type(objs).__name__)}" ) - assert isinstance(objs, (plt.Artist, tuple, dict)), msg + assert isinstance(objs, (Artist, tuple, dict)), msg def assert_is_sorted(seq) -> None: From 696d21b5fea1c768ee833c3f67db321f9d329921 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 19 Sep 2023 11:47:06 -0400 Subject: [PATCH 010/131] TYP: Allow None in Period.strftime (#55190) * TYP: Allow None in strftime * Change pyi * Add description to docstring --- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index c85865fea8fd0..a4aecd2ce0a09 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -90,7 +90,7 @@ class Period(PeriodMixin): def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: Frequency = ...) -> Period: ... - def strftime(self, fmt: str) -> str: ... + def strftime(self, fmt: str | None) -> str: ... 
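    # With ``fmt=None`` the format is now inferred from the Period's
    # frequency; illustrative only, not part of the stub:
    #
    #     >>> Period("2023-01", freq="M").strftime(None)
    #     '2023-01'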
def to_timestamp( self, freq: str | BaseOffset | None = ..., diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c37e9cd7ef1f3..21937ab2519e4 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2511,11 +2511,12 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt: str) -> str: + def strftime(self, fmt: str | None) -> str: r""" Returns a formatted string representation of the :class:`Period`. - ``fmt`` must be a string containing one or several directives. + ``fmt`` must be ``None`` or a string containing one or several directives. + When ``None``, the format will be determined from the frequency of the Period. The method recognizes the same directives as the :func:`time.strftime` function of the standard Python distribution, as well as the specific additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. From ef5c01673510341d9045d492027c46ccaa246f13 Mon Sep 17 00:00:00 2001 From: kgmuzungu Date: Tue, 19 Sep 2023 17:50:04 +0200 Subject: [PATCH 011/131] DOC: add alternative to docstring of deprecated Series.bool() method (#55168) * Fix for series boolean documentation * one blank line removed. docstring error * Update pandas/core/generic.py fixing text formating Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/generic.py text formatting fixed Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/generic.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 271ad40a98272..d6508cc43a7c8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1526,7 +1526,8 @@ def bool(self) -> bool_t: .. deprecated:: 2.1.0 - bool is deprecated and will be removed in future version of pandas + bool is deprecated and will be removed in future version of pandas. + For ``Series`` use ``pandas.Series.item``. This must be a boolean scalar value, either True or False. 
It will raise a ValueError if the Series or DataFrame does not have exactly 1 element, or that @@ -1556,6 +1557,14 @@ def bool(self) -> bool_t: True >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP False + + This is an alternative method and will only work + for single element objects with a boolean value: + + >>> pd.Series([True]).item() # doctest: +SKIP + True + >>> pd.Series([False]).item() # doctest: +SKIP + False """ warnings.warn( From cb5b2e6f5280df2d26b0a9550828c4bdf1098053 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 19 Sep 2023 10:21:30 -0700 Subject: [PATCH 012/131] BUG: RecursionError in loc.setitem (#55201) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/cast.py | 18 ++++++++++++++++++ pandas/core/internals/blocks.py | 6 ++++++ pandas/tests/indexing/test_loc.py | 8 ++++++++ 4 files changed, 33 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0fc4afc95a2ce..24bed22b3a3fe 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -266,6 +266,7 @@ Interval Indexing ^^^^^^^^ - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) +- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) - Missing diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1d5db123068e2..74e785be06356 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -68,6 +68,7 @@ PeriodDtype, ) from pandas.core.dtypes.generic import ( + ABCExtensionArray, ABCIndex, ABCSeries, ) @@ -1772,6 +1773,23 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: return casted raise LossySetitemError + elif isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + # Anything other than integer we cannot hold raise LossySetitemError if ( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 66b01dfb59f7f..30f6507d02484 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -460,6 +460,12 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and will receive the same block """ new_dtype = find_result_type(self.values.dtype, other) + if new_dtype == self.dtype: + # GH#52927 avoid RecursionError + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) # In a future version of pandas, the default will be that # setting `nan` into an integer series won't raise. 
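The lossiness rule the new ``np_can_hold_element`` branch enforces can be sketched directly from the hunk above: the categorical is cast with ``astype``, and the cast is accepted only if the round-trip compares equal element-wise.

    import numpy as np
    from pandas import Categorical

    cat = Categorical([2, 2], categories=[1, 2])
    casted = cat.astype(np.int64)   # plain ndarray([2, 2])
    (cat == casted).all()           # True, so the setitem may proceed;
                                    # any mismatch raises LossySetitemError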
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d0b6adfda0241..a2693c85e507f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1657,6 +1657,14 @@ def test_loc_setitem_range_key(self, frame_or_series): expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index) tm.assert_equal(obj, expected) + def test_loc_setitem_numpy_frame_categorical_value(self): + # GH#52927 + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2]) + + expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + tm.assert_frame_equal(df, expected) + class TestLocWithEllipsis: @pytest.fixture(params=[tm.loc, tm.iloc]) From f1bf7d4b30a162d7b52417c918e3cf1cc6c14863 Mon Sep 17 00:00:00 2001 From: AG <98327736+ggold7046@users.noreply.github.com> Date: Wed, 20 Sep 2023 00:18:49 +0530 Subject: [PATCH 013/131] Modified doc/make.py to run sphinx-build -b linkcheck (#54265) * Modified doc/make.py to run sphinx-build -b linkcheck * Update make.py * Update make.py * Update make.py * Update make.py * Update make.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update doc/make.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update make.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update make.py * Update doc/make.py Co-authored-by: Philip Meier * Update doc/make.py Co-authored-by: Philip Meier * Update make.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: Philip Meier --- doc/make.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/make.py b/doc/make.py index 9db4ea406bc1f..23b18d69acc04 100755 --- a/doc/make.py +++ b/doc/make.py @@ -123,14 +123,14 @@ def _sphinx_build(self, kind: str): Parameters ---------- - kind : {'html', 'latex'} + kind : {'html', 'latex', 'linkcheck'} Examples -------- >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ - if kind not in ("html", "latex"): - raise ValueError(f"kind must be html or latex, not {kind}") + if kind not in ("html", "latex", "linkcheck"): + raise ValueError(f"kind must be html, latex or linkcheck, not {kind}") cmd = ["sphinx-build", "-b", kind] if self.num_jobs: @@ -288,6 +288,12 @@ def zip_html(self): os.chdir(dirname) self._run_os("zip", zip_fname, "-r", "-q", *fnames) + def linkcheck(self): + """ + Check for broken links in the documentation. + """ + return self._sphinx_build("linkcheck") + def main(): cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] From 6d58277feef818bb488dbd437f71aaf0bfb5e412 Mon Sep 17 00:00:00 2001 From: Sergey Zakharov Date: Wed, 20 Sep 2023 00:04:56 +0400 Subject: [PATCH 014/131] Fix NameError in 'Evaluation order matters' (#55198) * Fix NameError in 'Evaluation order matters' This code in 'Indexing and selecting data' of doc wouldn't execute * Update indexing.rst Another 'optiion_context' without 'pd.' 
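A minimal reproduction of the failure being fixed: the snippets execute without a ``from pandas import *``, so the bare ``option_context`` name raises ``NameError``, while the prefixed form runs and demonstrates the chained-assignment warning as intended.

    import pandas as pd

    dfb = pd.DataFrame({"a": ["one", "two"], "c": [1, 2]})
    with pd.option_context("mode.chained_assignment", "warn"):
        dfb[dfb["a"].str.startswith("o")]["c"] = 42  # SettingWithCopyWarning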
--- doc/source/user_guide/indexing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 52bc43f52b1d3..7541cf3c8af3c 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1837,7 +1837,7 @@ This however is operating on a copy and will not work. :okwarning: :okexcept: - with option_context('mode.chained_assignment','warn'): + with pd.option_context('mode.chained_assignment','warn'): dfb[dfb['a'].str.startswith('o')]['c'] = 42 A chained assignment can also crop up in setting in a mixed dtype frame. @@ -1879,7 +1879,7 @@ Last, the subsequent example will **not** work at all, and so should be avoided: :okwarning: :okexcept: - with option_context('mode.chained_assignment','raise'): + with pd.option_context('mode.chained_assignment','raise'): dfd.loc[0]['a'] = 1111 .. warning:: From 5d8cacda9dd086e340c0cb39e546070d6d313472 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 20 Sep 2023 00:25:36 +0200 Subject: [PATCH 015/131] Revert "DEPR: Deprecate returning a DataFrame in SeriesApply.apply_standard (#55189) * Revert "DEPR: Deprecate returning a DataFrame in SeriesApply.apply_standard (#52123)" This reverts commit fe415f55 * Fix tests * Add whatsnew * Add whatsnew * Add test --- doc/source/user_guide/cookbook.rst | 10 +++---- doc/source/user_guide/groupby.rst | 13 +++++++++ doc/source/whatsnew/v0.10.0.rst | 29 +++++++------------- doc/source/whatsnew/v2.1.0.rst | 1 - doc/source/whatsnew/v2.1.1.rst | 2 +- pandas/core/apply.py | 8 ------ pandas/core/series.py | 5 ---- pandas/tests/apply/test_series_apply.py | 36 ++++++++++++------------- 8 files changed, 45 insertions(+), 59 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 002e88533ab93..2d2c0a4db4df6 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -794,12 +794,12 @@ Apply index=["I", "II", "III"], ) - def make_df(ser): - new_vals = [pd.Series(value, name=name) for name, value in ser.items()] - return pd.DataFrame(new_vals) - - df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()}) + def SeriesFromSubList(aList): + return pd.Series(aList) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 5dd14e243fbb3..4be62090ec645 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1213,6 +1213,19 @@ The dimension of the returned result can also change: grouped.apply(f) +``apply`` on a Series can operate on a returned value from the applied function +that is itself a series, and possibly upcast the result to a DataFrame: + +.. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) + Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the apply function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. 
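The regression that motivated this revert (GH#55189) boils down to unpacking ragged nested data; a minimal sketch, using made-up values:

.. code-block:: python

    import pandas as pd

    ser = pd.Series([[1, 2, 3], [4, 5, 6, 7]])

    # Each returned Series becomes a row; shorter rows are padded with NaN,
    # so the result is upcast to a DataFrame rather than kept as a Series.
    result = ser.apply(pd.Series)
    assert result.shape == (2, 4)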
diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 422efc1b36946..be50c34d7d14c 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -261,26 +261,15 @@ Convenience methods ``ffill`` and ``bfill`` have been added: function, that is itself a series, and possibly upcast the result to a DataFrame - .. code-block:: python - - >>> def f(x): - ... return pd.Series([x, x ** 2], index=["x", "x^2"]) - >>> - >>> s = pd.Series(np.random.rand(5)) - >>> s - 0 0.340445 - 1 0.984729 - 2 0.919540 - 3 0.037772 - 4 0.861549 - dtype: float64 - >>> s.apply(f) - x x^2 - 0 0.340445 0.115903 - 1 0.984729 0.969691 - 2 0.919540 0.845555 - 3 0.037772 0.001427 - 4 0.861549 0.742267 + .. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) - New API functions for working with pandas options (:issue:`2097`): diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 040ca048d1224..18054e0b01191 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -555,7 +555,6 @@ Other Deprecations - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :attr:`Series.dt` properties (:issue:`20306`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or NumPy array before operating instead (:issue:`51521`) -- Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) - Deprecated the ``fastpath`` keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 6d5da7cdff3b3..c9ab496295d85 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -43,7 +43,7 @@ Bug fixes Other ~~~~~ -- +- Reverted the deprecation that disallowed :meth:`Series.apply` returning a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object (:issue:`52116`) .. --------------------------------------------------------------------------- .. 
_whatsnew_211.contributors: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9748d4fe66739..1525e316f345f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1326,14 +1326,6 @@ def curried(x): ) if len(mapped) and isinstance(mapped[0], ABCSeries): - warnings.warn( - "Returning a DataFrame from Series.apply when the supplied function " - "returns a Series is deprecated and will be removed in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), - ) # GH52116 - # GH#43986 Need to do list(mapped) in order to get treated as nested # See also GH#25959 regarding EA support return obj._constructor_expanddim(list(mapped), index=obj.index) diff --git a/pandas/core/series.py b/pandas/core/series.py index 78ec1554198df..3c7270107d71d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4635,11 +4635,6 @@ def apply( """ Invoke function on values of Series. - .. deprecated:: 2.1.0 - - If the result from ``func`` is a ``Series``, wrapping the output in a - ``DataFrame`` instead of a ``Series`` has been deprecated. - Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values. diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index d3e5ac1b4ca7a..aeb6a01eb587a 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -420,8 +420,9 @@ def test_agg_evaluate_lambdas(string_series): def test_with_nested_series(datetime_series, op_name): # GH 2316 # .agg with a reducer and a transform, what to do - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "cannot aggregate" + warning = FutureWarning if op_name == "agg" else None + with tm.assert_produces_warning(warning, match=msg): # GH52123 result = getattr(datetime_series, op_name)( lambda x: Series([x, x**2], index=["x", "x^2"]) @@ -429,6 +430,10 @@ def test_with_nested_series(datetime_series, op_name): expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + def test_replicate_describe(string_series): # this also tests a result set that is all scalars @@ -512,10 +517,7 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): index = dti.tz_localize("UTC").index else: index = dti.index - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = Series(index).apply(lambda x: Series([1, 2])) + result = Series(index).apply(lambda x: Series([1, 2])) tm.assert_frame_equal(result, exp) @@ -662,19 +664,7 @@ def test_apply_dictlike_lambda(ops, by_row, expected): def test_apply_retains_column_name(by_row): # GH 16380 df = DataFrame({"x": range(3)}, Index(range(3), name="x")) - func = lambda x: Series(range(x + 1), Index(range(x + 1), name="y")) - - if not by_row: - # GH53400 - msg = "'Series' object cannot be interpreted as an integer" - with pytest.raises(TypeError, match=msg): - df.x.apply(func, by_row=by_row) - return - - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = df.x.apply(func, by_row=by_row) + result 
= df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y"))) expected = DataFrame( [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], columns=Index(range(3), name="y"), @@ -689,3 +679,11 @@ def test_apply_type(): result = s.apply(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_series_apply_unpack_nested_data(): + # GH#55189 + ser = Series([[1, 2, 3], [4, 5, 6, 7]]) + result = ser.apply(lambda x: Series(x)) + expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}) + tm.assert_frame_equal(result, expected) From 49c89d292c343617f417aaccc6cd564f1a42207c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 20 Sep 2023 03:50:03 +0200 Subject: [PATCH 016/131] REGR: Arrow backed objects not propagating exceptions (#54952) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.1.rst | 1 + pandas/core/arrays/arrow/array.py | 16 +++++++++------- pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index c9ab496295d85..0dc113bd9ab7f 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`Series.drop_duplicates` for PyArrow strings (:issue:`54904`) - Fixed regression in :meth:`Series.interpolate` raising when ``fill_value`` was given (:issue:`54920`) - Fixed regression in :meth:`Series.value_counts` raising for numeric data if ``bins`` was specified (:issue:`54857`) +- Fixed regression in comparison operations for PyArrow backed columns not propagating exceptions correctly (:issue:`54944`) - Fixed regression when comparing a :class:`Series` with ``datetime64`` dtype with ``None`` (:issue:`54870`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e67b7035822cc..4b79d0dbb683e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -627,20 +627,22 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - try: + if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): result = pc_func(self._pa_array, self._box_pa(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - if is_scalar(other): + elif is_scalar(other): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8968b9a7f25fe..339e97e735f85 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3027,6 +3027,14 @@ def test_duration_fillna_numpy(pa_type): tm.assert_series_equal(result, expected) +def test_comparison_not_propagating_arrow_error(): + # GH#54944 + a = pd.Series([1 << 63], dtype="uint64[pyarrow]") + b = pd.Series([None], dtype="int64[pyarrow]") + with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"): + a < b + + def test_factorize_chunked_dictionary(): # GH 54844 pa_array = pa.chunked_array( From b303665df0337448abbc6e3107be1f0ff7c98fb5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 19 Sep 2023 21:53:51 -0400 Subject: [PATCH 017/131] DEPR: ArrayManager (#55044) * DEPR: ArrayManager * Fixup * Test fixup * debug CI * Test fixup * warn if PANDAS_DATA_MANAGER is set * single_cpu --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/__init__.py | 14 +++++++ pandas/conftest.py | 9 ++++- pandas/core/config_init.py | 7 ++++ pandas/core/frame.py | 7 ++-- pandas/core/series.py | 10 ++--- pandas/io/parquet.py | 3 +- pandas/tests/extension/conftest.py | 7 +++- pandas/tests/internals/test_managers.py | 49 +++++++++++++++++++++---- pandas/tests/io/test_parquet.py | 11 +++--- pandas/util/_test_decorators.py | 7 +++- scripts/validate_unwanted_patterns.py | 2 + 12 files changed, 98 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 24bed22b3a3fe..da2e30edc80ea 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -199,6 +199,7 @@ Deprecations - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) +- Deprecated the option 
``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) .. --------------------------------------------------------------------------- diff --git a/pandas/__init__.py b/pandas/__init__.py index d11a429987ac4..41e34309232ee 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -1,5 +1,8 @@ from __future__ import annotations +import os +import warnings + __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies @@ -190,6 +193,17 @@ __git_version__ = v.get("full-revisionid") del get_versions, v +# GH#55043 - deprecation of the data_manager option +if "PANDAS_DATA_MANAGER" in os.environ: + warnings.warn( + "The env variable PANDAS_DATA_MANAGER is set. The data_manager option is " + "deprecated and will be removed in a future version. Only the BlockManager " + "will be available. Unset this environment variable to silence this warning.", + FutureWarning, + stacklevel=2, + ) +# Don't allow users to use pandas.os or pandas.warnings +del os, warnings # module level doc-string __doc__ = """ diff --git a/pandas/conftest.py b/pandas/conftest.py index ac0275bf695d4..62f22921f0482 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -49,6 +49,8 @@ utc, ) +from pandas._config.config import _get_option + import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -1983,7 +1985,7 @@ def using_array_manager() -> bool: """ Fixture to check if the array manager is being used. """ - return pd.options.mode.data_manager == "array" + return _get_option("mode.data_manager", silent=True) == "array" @pytest.fixture @@ -1991,7 +1993,10 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" + return ( + pd.options.mode.copy_on_write + and _get_option("mode.data_manager", silent=True) == "block" + ) warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 750b374043193..4652acdcae287 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -454,6 +454,13 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["block", "array"]), ) +cf.deprecate_option( + # GH#55043 + "mode.data_manager", + "data_manager option is deprecated and will be removed in a future " + "version. Only the BlockManager will be available.", +) + # TODO better name? 
copy_on_write_doc = """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4e87e90278e7b..3e32a6d93b023 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -43,6 +43,7 @@ get_option, using_copy_on_write, ) +from pandas._config.config import _get_option from pandas._libs import ( algos as libalgos, @@ -694,7 +695,7 @@ def __init__( NDFrame.__init__(self, data) return - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) # GH47215 if isinstance(index, set): @@ -2411,7 +2412,7 @@ def maybe_reorder( columns = columns.drop(exclude) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) return cls(mgr) @@ -2612,7 +2613,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") diff --git a/pandas/core/series.py b/pandas/core/series.py index 3c7270107d71d..d3a2bb1745cd1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -26,10 +26,8 @@ import numpy as np -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write +from pandas._config.config import _get_option from pandas._libs import ( lib, @@ -404,7 +402,7 @@ def __init__( if fastpath: # data is a ndarray, index is defined if not isinstance(data, (SingleBlockManager, SingleArrayManager)): - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index) elif manager == "array": @@ -510,7 +508,7 @@ def __init__( else: data = sanitize_array(data, index, dtype, copy) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index, refs=refs) elif manager == "array": diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f51b98a929440..ed254191d2736 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -13,6 +13,7 @@ from warnings import catch_warnings from pandas._config import using_pyarrow_string_dtype +from pandas._config.config import _get_option from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -258,7 +259,7 @@ def read( elif using_pyarrow_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index eb60aea7cc8c2..a94f7de283d01 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,6 +2,8 @@ import pytest +from pandas._config.config import _get_option + from pandas import ( Series, options, @@ -212,4 +214,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. 
""" - return options.mode.copy_on_write and options.mode.data_manager == "block" + return ( + options.mode.copy_on_write + and _get_option("mode.data_manager", silent=True) == "block" + ) diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py index 75aa901fce910..f40362c299717 100644 --- a/pandas/tests/internals/test_managers.py +++ b/pandas/tests/internals/test_managers.py @@ -1,6 +1,12 @@ """ Testing interaction between the different managers (BlockManager, ArrayManager) """ +import os +import subprocess +import sys + +import pytest + from pandas.core.dtypes.missing import array_equivalent import pandas as pd @@ -14,12 +20,19 @@ def test_dataframe_creation(): - with pd.option_context("mode.data_manager", "block"): - df_block = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "block"): + df_block = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_block._mgr, BlockManager) - with pd.option_context("mode.data_manager", "array"): - df_array = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + df_array = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_array._mgr, ArrayManager) # also ensure both are seen as equal @@ -45,12 +58,15 @@ def test_dataframe_creation(): def test_series_creation(): - with pd.option_context("mode.data_manager", "block"): - s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "block"): + s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_block._mgr, SingleBlockManager) - with pd.option_context("mode.data_manager", "array"): - s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_array._mgr, SingleArrayManager) # also ensure both are seen as equal @@ -68,3 +84,20 @@ def test_series_creation(): result = s_array._as_manager("block") assert isinstance(result._mgr, SingleBlockManager) tm.assert_series_equal(result, s_array) + + +@pytest.mark.single_cpu +@pytest.mark.parametrize("manager", ["block", "array"]) +def test_array_manager_depr_env_var(manager): + # GH#55043 + test_env = os.environ.copy() + test_env["PANDAS_DATA_MANAGER"] = manager + response = subprocess.run( + [sys.executable, "-c", "import pandas"], + capture_output=True, + env=test_env, + check=True, + ) + msg = "FutureWarning: The env variable PANDAS_DATA_MANAGER is set" + stderr_msg = response.stderr.decode("utf-8") + assert msg in stderr_msg, stderr_msg diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 55445e44b9366..b043f9fab23ae 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,10 +8,8 @@ import numpy as np import pytest -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write +from pandas._config.config import _get_option from 
pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( @@ -61,7 +59,8 @@ pytest.param( "fastparquet", marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array", + not _HAVE_FASTPARQUET + or _get_option("mode.data_manager", silent=True) == "array", reason="fastparquet is not installed or ArrayManager is used", ), ), @@ -88,7 +87,7 @@ def pa(): def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - elif get_option("mode.data_manager") == "array": + elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") return "fastparquet" diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 03011a1ffe622..9be0c3edaa998 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,6 +38,9 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + +from pandas._config.config import _get_option + from pandas.compat import ( IS64, is_platform_windows, @@ -230,12 +233,12 @@ def mark_array_manager_not_yet_implemented(request) -> None: skip_array_manager_not_yet_implemented = pytest.mark.xfail( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Not yet implemented for ArrayManager", ) skip_array_manager_invalid_test = pytest.mark.skipif( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Test that relies on BlockManager internals or specific behaviour", ) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 0931dd209ee05..d765d7bc7dcb9 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -51,6 +51,8 @@ "_chained_assignment_msg", "_chained_assignment_method_msg", "_version_meson", + # TODO(3.0): GH#55043 - remove upon removal of ArrayManager + "_get_option", } From d303b7cb76d099e77eaf9afd7ae33030c39a33f6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 20 Sep 2023 06:53:16 -0400 Subject: [PATCH 018/131] CI: Pin matplotlib < 3.8 (#55210) --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- requirements-dev.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 682708d811878..2522605cf5c38 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index d09d4e5dea648..dc67122c6e72e 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -35,7 +35,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index f458546fc3a1b..c9faaa2146235 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - 
matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 4ee8dc0c6d3fc..f51a8e04fbc7e 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index c520fa17551e0..d1a9e336eaeac 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 # test_numba_vs_cython segfaults with numba 0.57 - numba>=0.55.2, <0.57.0 - numexpr>=2.8.0 diff --git a/environment.yml b/environment.yml index 5caa57ef37ee8..db6138c34f37c 100644 --- a/environment.yml +++ b/environment.yml @@ -36,7 +36,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - openpyxl>=3.0.10 diff --git a/requirements-dev.txt b/requirements-dev.txt index b8e7e376378c6..98339f45a5052 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -25,7 +25,7 @@ gcsfs>=2022.05.0 ipython jinja2>=3.1.2 lxml>=4.8.0 -matplotlib>=3.6.1 +matplotlib>=3.6.1, <3.8 numba>=0.55.2 numexpr>=2.8.0 openpyxl>=3.0.10 From 9dc5163f93db643a764976c52bf76ee6c2ad817a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 20 Sep 2023 07:57:16 -0400 Subject: [PATCH 019/131] DOC: Add release date for 2.1.1 (#55211) --- doc/source/whatsnew/v2.1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 0dc113bd9ab7f..42a7029b1fa77 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_211: -What's new in 2.1.1 (September XX, 2023) +What's new in 2.1.1 (September 20, 2023) ---------------------------------------- These are the changes in pandas 2.1.1. See :ref:`release` for a full changelog From c4efa9203ef7e13a797058600010876fdee7eee6 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:52:34 -0400 Subject: [PATCH 020/131] DOC: Add whatsnew for 2.1.2 (#55207) * DOC: Add whatsnew for 2.1.2 * Update v2.1.2.rst * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.0.rst | 2 +- doc/source/whatsnew/v2.1.1.rst | 2 ++ doc/source/whatsnew/v2.1.2.rst | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v2.1.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1f12859e4081e..dbc7800196007 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.2 v2.1.1 v2.1.0 diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 18054e0b01191..d422fb1be9fdf 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -891,4 +891,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.0.3..v2.1.0|HEAD +.. 
contributors:: v2.0.3..v2.1.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index 42a7029b1fa77..c52f7db8ec3ec 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -51,3 +51,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.0..v2.1.1|HEAD diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst new file mode 100644 index 0000000000000..2c9b10160d144 --- /dev/null +++ b/doc/source/whatsnew/v2.1.2.rst @@ -0,0 +1,39 @@ +.. _whatsnew_212: + +What's new in 2.1.2 (October ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_212.contributors: + +Contributors +~~~~~~~~~~~~ From 8371d18218f854fce4c22104eb78eee9c14c62d8 Mon Sep 17 00:00:00 2001 From: Thomas Guillet Date: Wed, 20 Sep 2023 18:57:32 +0200 Subject: [PATCH 021/131] BUG: manage raw ods file without cell cache (#55219) * BUG: manage raw ods file without cell cache * fixup! BUG: manage raw ods file without cell cache --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/excel/_odfreader.py | 12 +----------- .../tests/io/data/excel/test_unempty_cells.ods | Bin 0 -> 2307 bytes pandas/tests/io/excel/test_odf.py | 11 +++++++++++ 4 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/io/data/excel/test_unempty_cells.ods diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index da2e30edc80ea..b97198c36891c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -223,6 +223,7 @@ Bug fixes - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) Categorical ^^^^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 8016dbbaf7f42..277f64f636731 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -150,7 +150,7 @@ def get_sheet_data( max_row_len = len(table_row) row_repeat = self._get_row_repeat(sheet_row) - if self._is_empty_row(sheet_row): + if len(table_row) == 0: empty_rows += row_repeat else: # add blank rows to our table @@ -182,16 +182,6 @@ def _get_column_repeat(self, cell) -> int: return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) - def _is_empty_row(self, row) -> bool: - """ - Helper function to find empty rows - """ - for column in row.childNodes: - if len(column.childNodes) > 0: - return False - - return True - def _get_cell_value(self, 
cell) -> Scalar | NaTType: from odf.namespaces import OFFICENS diff --git a/pandas/tests/io/data/excel/test_unempty_cells.ods b/pandas/tests/io/data/excel/test_unempty_cells.ods new file mode 100644 index 0000000000000000000000000000000000000000..52458753bae061138b3e8d65d403f1eab8b68086 GIT binary patch literal 2307 zcmZ`*dpMKrAK$DwrRL4FlBCjB^hGM0eX48Xf<5YQwnNrOPd;N1yV{c#vPS%XCM z!=T+s*Dx3|>K{(YpPc4?0#LVz%SsFYkYBr>u`^LOw>H(pq48J`42i4}i1R5)Y{IuF z%bes2W1^APVT;Fd64`w4kc8uC=~-74$BsFYVmVljEqZ5B7lvb3sPm(c)~ua~I}uKe zecd5ZpODcDV=IE1fw6q!*mFV|Xnea{<5uTjvu`2(UJu*1kG3Z(@4w8ew#!3(%edS6 zqqK&Z*R~}%Rw~Njm^y2Fx~+n0d&i;NYGRu#IwAj&Ey_DaDNmQs1zqq2trT1!Wy*?W z$(~6S#_4-F?bd6BW_a-n_zuupnLfYU~?W+ zx)~po>ofsxbt~lgSmSQfwHbaiLv0G`A-IY2gmJ$4O5b5=`GS>GG$swlQ*3URO?xb5 zty_tYlFWxraKbHoKey4RwGV*L(=DWxemmRK43Ap%RxP8cU-a(&k5rbG=X2?C-L&xt zw*Da@5_G!Sw0AG0p3>{Jf(m7nkJj=s{Xr1y3=&J8KnGlg_YYr9j=5()IVuc&%27p} z*g<{d+;Cu-kZuq`RMk_@E%H)x>Ob%p6S1cwvftbEV4nLao-H#RIuk?66Q@`j%Z ziORAvytH$bacAnLIEd<=ozvay&Jgf9$b6tuePYdPZBs;LRffwMYNkSz%@%67IB;BW z`@4VR&C40tH&il;zgH=88;P+wwflH}#yC#fA2=3DJQ{RwYj;?mxN#PiYgM}#oM-GRW7f@V;c^TA%_VtoZo)*2gIL}fJM-xV&pYZ?4qjTbw;Z`Dg_R8G z5r%uBq*iKvs;bGu=O4VSKYxTHTyS}gnh0mZhB8fqG^Ljz)e*UI2ZA}F$N=75P;GaY zF4nC>XKNmq5;~c1(2wAPrH-8~ulvUHVU^Kw6J=C88)oO^fKM-Vy4ViUFE_YZ;S-xZOlg@R zL0I)t1f7cPVB!iqW1DqdteUSnc58}T)Jc@^*o?vmqS;aw@9RDL5}rl={?`-_W*Djr z(+b`b`#>kT9qCc2+59P;joy;~mVZ=dT;g%@dtAYcD>$oRmK_v1>;KRt;&*){r?jCi z@QFtcWo9q|Xe^)z#1X#4!zS9a+e8-!l~`n$T9s;rQt zZgNR?Ka^;6VyZ}r2KbQjR$s@Vj+K;=lr+R`8anJe-q)lM&x*ZsB_ha}4>!N(?Iv9s zZ8cXQM!BP#SoO^q!wRu;SZy_Qo;EX>bTr_l_zEE?Bv1?r`u}k*I?z8ysL0FK!+jm~ z?}7b43;+-cv=dGLHNH1eY)rg$3J1~BqJN5?`L_|Z(Pr0CFi~=@o9;&3#-?4z0l~kt WFbWA4IXD0S7G27s7hy`QJ^c-V>i0YV literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 25079b235d332..9d49718bec357 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -48,3 +48,14 @@ def test_read_newlines_between_xml_elements_table(): result = pd.read_excel("test_newlines.ods") tm.assert_frame_equal(result, expected) + + +def test_read_unempty_cells(): + expected = pd.DataFrame( + [1, np.nan, 3, np.nan, 5], + columns=["Column 1"], + ) + + result = pd.read_excel("test_unempty_cells.ods") + + tm.assert_frame_equal(result, expected) From 61a6335263c6dfc5909357584ba33967a69c9b16 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 20 Sep 2023 12:59:55 -0400 Subject: [PATCH 022/131] BUILD: Fix duplicate files warning (#55206) * BUILD: Fix duplicate files warning * missing __init__.py file --- pandas/_libs/meson.build | 37 ++++++++++++++++++++++++++++++--- pandas/_libs/tslibs/meson.build | 27 +++++++++++++++++++++--- pandas/_libs/window/meson.build | 13 ++++++++++++ pandas/meson.build | 1 - 4 files changed, 71 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 1cf2c4343d844..fd632790546f6 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -114,9 +114,40 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources( +# Basically just __init__.py and the .pyi files +sources_to_install = [ '__init__.py', - subdir: 'pandas/_libs' -) + 'algos.pyi', + 'arrays.pyi', + 'byteswap.pyi', + 'groupby.pyi', + 'hashing.pyi', + 'hashtable.pyi', + 'index.pyi', + 'indexing.pyi', + 'internals.pyi', + 'interval.pyi', + 'join.pyi', + 'json.pyi', + 'lib.pyi', + 'missing.pyi', + 'ops.pyi', + 'ops_dispatch.pyi', + 'parsers.pyi', + 'properties.pyi', + 'reshape.pyi', + 
'sas.pyi', + 'sparse.pyi', + 'testing.pyi', + 'tslib.pyi', + 'writers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs' + ) +endforeach subdir('window') diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 167695b84514c..a1b0c54d1f48c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,7 +31,28 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources( +sources_to_install = [ '__init__.py', - subdir: 'pandas/_libs/tslibs' -) + 'ccalendar.pyi', + 'conversion.pyi', + 'dtypes.pyi', + 'fields.pyi', + 'nattype.pyi', + 'np_datetime.pyi', + 'offsets.pyi', + 'parsing.pyi', + 'period.pyi', + 'strptime.pyi', + 'timedeltas.pyi', + 'timestamps.pyi', + 'timezones.pyi', + 'tzconversion.pyi', + 'vectorized.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/tslibs' + ) +endforeach diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index 85aa060a26406..ad15644f73a0c 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -16,3 +16,16 @@ py.extension_module( subdir: 'pandas/_libs/window', install: true ) + +sources_to_install = [ + '__init__.py', + 'aggregations.pyi', + 'indexers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/window' + ) +endforeach diff --git a/pandas/meson.build b/pandas/meson.build index f02258c98d46a..435103a954d86 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -26,7 +26,6 @@ subdir('_libs') subdirs_list = [ '_config', - '_libs', '_testing', 'api', 'arrays', From a98be065e365c3d673d759414586345b14de50ac Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:00:35 +0200 Subject: [PATCH 023/131] DEPR offsets: rename 'M' to 'ME' (#52064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Frequency: raise warnings when using ‘M’ frequency * Frequency: raise warnings when using ‘M’ frequency II * remove is_period and change str representation for freq in Period [skip ci] * remove is_period and fix some tests [skip ci] * fix some tests * fix some tests II * fix tests in pandas/tests/indexes/period/ [skip ci] * fix tests in pandas/tests/indexes/period/ and correct timedeltas.pyx * update frequencies.py, resample.py, and fix some tests * modify pandas/tseries/frequencies.py * fix tests * fix tests II * fix tests III * rename 'M' to 'ME' in docs * rename 'M' to 'ME' in docs II * rename 'M' to 'ME' in docs III * rename 'M' to 'ME' in docs IV * rename 'M' to 'ME' in docs V * add is_period to to_offset I * add is_period to to_offset II * correct the definition of period_array(…) and fix 19 tests * add is_period to _parse_dtype_strict() and fix tests * add constant OFFSET_TO_PERIOD_FREQSTR to period.pyx and fix tests * correct definitions of extract_ordinals() and _round(), fix tests * add replacement ME to M in _require_matching_freq, _parsed_string_to_bounds, and fix tests * add the constant PERIOD_TO_OFFSET_FREQSTR to period.pyx, correct definition of _resolution_obj and fix tests * fix tests * add the conversion ME to M to _from_datetime64, period_index, raise_on_incompatible and fix tests * fix some tests with resample * correct definitions of to_period, freqstr and get_period_alias, fix tests for plotting * correct pre-commit failures * add key from Grouper to the 
constructor of TimeGrouper and fix tests * add to asfreq() from resampler the conversion ME to M, fix tests * fix tests for for PeriodIndex and base tests for resample * correct the constructor of TimeGrouper and fix tests for resample and plotting * correct the definition of use_dynamic_x() and fix tests for plotting * correct the definition of the method use_dynamic_x, fix tests * correct the definition of the asfreq for PeriodArray, _get_period_alias, and fix tests * correct documentation, fix tests * correct docs: rename ME to M for periods * add pytest.mark.xfail to test_to_timestamp_quarterly_bug * correct mypy error attr-defined * correct the definition of variables which convert M/ME to ME/M in dtypes.pyx, declare to_offset in offsets.pyi, fix mypy errors * created the c version for dicts which convert M/ME to ME/M and fix mypy errors * fix doc build error in 09_timeseries.rst and mypy error * correct the constructor of Period, fix mypy errors * replace in _attrname_to_abbrevs ME with M and correct the constructor of Period * add conversion ME/M to Period constructor, add conversion M/ME to maybe_resample and reverse changes in _attrname_to_abbrevs * correct dict “time rules”, correct the definition of _parsed_string_to_bounds, remove is_period from definition _parse_weekly_str and _parse_dtype_strict * remove the argument is_period from _parse_dtype_strict * add to is_subperiod, is_superperiod and _is_monthly both M and ME, correct definitions of _downsample and _maybe_cast_slice_bound * add dict ME to M to the definition of freqstr, constructor of Period and remove pytest.mark.xfail from test_round_trip_current * refactor freqstr, extract_ordinals, and _require_matching_freq for Period, asfreq for resample and _parsed_string_to_bounds for datetimes * refactor _resolution_obj in dtypes.pyx and freqstr in /indexes/datetimelike.py * define a new function freq_to_period_freqstr in dtypes to convert ME to M * refactor use_dynamic_x for plotting and to_period in arrays/datetimes.py * refactor def _check_plot_works in plotting and test_to_period in class TestDatetimeArray * refactor name method of PeriodDtype, refactor __arrow_array__ and add test for ValueError in test_period.py * in PeriodArray refactor _from_datetime64 and remove redundant if in asfreq, add test for ValueError in test_period_index.py and ignore mypy error * correct def _resolution_obj in DatetimeLikeArrayMixin, refactor def freqstr in PeriodArray and add tests ValueError for ME * correct def _resolution_obj in DatetimeLikeArrayMixin and def to_offset, refactor def freqstr in PeriodArray and add tests for ‘ValueError’ and 'UserWarning' * add tests for 'UserWarning' * refactor methods to_period in DatetimeArray, _from_datetime64 in PeriodArray, fix test in plotting * add freq_to_offset_freqstr to convert M to ME, refactor _resolution_obj, add tests for ‘ValueError’ and 'UserWarning' * fix pre-commit failures * correct the definition of to_period in DatetimeArray, refactor _check_plot_works, fix test_asfreq_2M * correct definitions of _resolution_obj in dtypes.pyx and in DatetimeLikeArrayMixin, _attrname_to_abbrevs and fix test_get_attrname_from_abbrev * correct def asfreq in PeriodArray, remove unused function freq_to_offset_freqstr, fix tests * roll back in test_fillna_period dtype Period[M] with capital P * refactor the function raise_on_incompatible * fix mypy error in pandas/core/arrays/period.py * fix ruff error in pandas/tests/arrays/period/test_constructors.py * remove ME from definitions of is_monthly, 
is_subperiod, correct _maybe_coerce_freq and test_period_ordinal_start_values * fix test_dti_to_period_2monthish * update whatsnew/v2.1.0.rst * add an example for old/new behavior in whatsnew/v2.1.0.rst * corrected typo * replace name of section Deprecations with Other Deprecations * remove ME form is_superperiod, refactored tests * correct a test * move some tests to a new place * correct def asfreq for resampling, refactor asfreq for Period, fix tests * correct tests * correct def _shift_with_freq and fix test for shift * correct docs for asfreq in PeriodArray * correct def _shift_with_freq * add ‘me’ to _dont_uppercase, correct _require_matching_freq, fix tests * minor corrections * correct whatsnew * correct an example in user_guide/reshaping.rst * fix tests for plotting * correct tests for plotting * remove from OFFSET_TO_PERIOD_FREQSTR deprecated freqstr, fix tests --- .../intro_tutorials/09_timeseries.rst | 2 +- doc/source/user_guide/cookbook.rst | 2 +- doc/source/user_guide/groupby.rst | 6 +- doc/source/user_guide/reshaping.rst | 2 +- doc/source/user_guide/timeseries.rst | 18 ++-- doc/source/whatsnew/v0.14.0.rst | 18 +++- doc/source/whatsnew/v0.18.0.rst | 19 +++- doc/source/whatsnew/v0.19.0.rst | 22 ++++- doc/source/whatsnew/v2.0.0.rst | 2 +- doc/source/whatsnew/v2.2.0.rst | 25 +++++ pandas/_libs/tslibs/dtypes.pxd | 2 + pandas/_libs/tslibs/dtypes.pyi | 2 + pandas/_libs/tslibs/dtypes.pyx | 40 +++++++- pandas/_libs/tslibs/offsets.pxd | 2 +- pandas/_libs/tslibs/offsets.pyi | 6 +- pandas/_libs/tslibs/offsets.pyx | 28 +++++- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 28 ++++-- pandas/_libs/tslibs/timestamps.pyx | 2 +- pandas/core/arrays/datetimes.py | 20 ++-- pandas/core/arrays/period.py | 31 +++++-- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/generic.py | 19 ++-- pandas/core/groupby/groupby.py | 2 +- pandas/core/indexes/datetimelike.py | 13 ++- pandas/core/indexes/datetimes.py | 17 ++-- pandas/core/indexes/period.py | 4 +- pandas/core/resample.py | 27 +++++- pandas/plotting/_core.py | 2 +- pandas/plotting/_matplotlib/converter.py | 4 +- pandas/plotting/_matplotlib/timeseries.py | 14 ++- pandas/tests/arithmetic/test_datetime64.py | 2 +- pandas/tests/arithmetic/test_period.py | 4 +- .../tests/arrays/categorical/test_indexing.py | 6 +- .../tests/arrays/period/test_constructors.py | 10 ++ pandas/tests/arrays/test_datetimelike.py | 5 +- pandas/tests/arrays/test_datetimes.py | 8 ++ pandas/tests/base/test_conversion.py | 2 +- pandas/tests/frame/methods/test_asfreq.py | 16 +++- .../frame/methods/test_first_and_last.py | 10 +- pandas/tests/frame/methods/test_reindex.py | 2 +- pandas/tests/frame/test_iteration.py | 2 +- pandas/tests/frame/test_reductions.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_grouping.py | 14 +-- pandas/tests/groupby/test_timegrouper.py | 40 ++++---- .../tests/groupby/transform/test_transform.py | 2 +- .../indexes/datetimes/methods/test_astype.py | 2 +- .../datetimes/methods/test_factorize.py | 2 +- .../indexes/datetimes/methods/test_insert.py | 6 +- .../datetimes/methods/test_to_period.py | 22 +++-- pandas/tests/indexes/datetimes/test_asof.py | 2 +- .../indexes/datetimes/test_constructors.py | 8 +- .../indexes/datetimes/test_date_range.py | 14 +-- pandas/tests/indexes/datetimes/test_delete.py | 6 +- .../tests/indexes/datetimes/test_indexing.py | 2 +- pandas/tests/indexes/datetimes/test_map.py | 2 +- pandas/tests/indexes/datetimes/test_misc.py | 4 +- pandas/tests/indexes/datetimes/test_ops.py | 2 +- 
.../indexes/datetimes/test_partial_slicing.py | 6 +- .../indexes/datetimes/test_scalar_compat.py | 6 +- .../tests/indexes/datetimes/test_timezones.py | 4 +- .../indexes/interval/test_interval_range.py | 2 +- pandas/tests/indexes/multi/test_analytics.py | 4 +- .../tests/indexes/multi/test_constructors.py | 2 +- .../indexes/period/methods/test_factorize.py | 6 +- .../tests/indexes/period/test_constructors.py | 42 ++++++--- pandas/tests/indexes/period/test_period.py | 6 ++ .../tests/indexes/period/test_period_range.py | 37 +++++++- .../indexes/period/test_scalar_compat.py | 2 +- .../indexes/timedeltas/test_scalar_compat.py | 6 +- pandas/tests/io/excel/test_writers.py | 4 +- pandas/tests/io/test_pickle.py | 2 +- pandas/tests/plotting/frame/test_frame.py | 4 +- .../plotting/frame/test_frame_subplots.py | 6 +- pandas/tests/plotting/test_datetimelike.py | 43 +++++---- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/resample/test_base.py | 50 ++++++---- pandas/tests/resample/test_datetime_index.py | 92 ++++++++++--------- pandas/tests/resample/test_period_index.py | 82 +++++++++++------ pandas/tests/resample/test_resample_api.py | 6 +- .../tests/resample/test_resampler_grouper.py | 10 +- pandas/tests/resample/test_time_grouper.py | 2 +- pandas/tests/reshape/test_pivot.py | 28 +++--- pandas/tests/scalar/period/test_period.py | 9 +- .../tests/scalar/timedelta/test_timedelta.py | 3 +- .../series/accessors/test_dt_accessor.py | 2 +- pandas/tests/series/indexing/test_datetime.py | 4 +- .../series/methods/test_combine_first.py | 2 +- pandas/tests/series/test_constructors.py | 2 +- .../tseries/frequencies/test_freq_code.py | 4 +- .../tseries/frequencies/test_inference.py | 8 +- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tests/tslibs/test_parsing.py | 2 +- pandas/tests/tslibs/test_period_asfreq.py | 2 +- pandas/tseries/frequencies.py | 47 +++------- 96 files changed, 726 insertions(+), 397 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 470b3908802b2..b0530087e5b84 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("M").max() + monthly_max = no_2.resample("ME").max() monthly_max A very powerful method on time series data with a datetime index, is the diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 2d2c0a4db4df6..b1a6aa8753be1 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -771,7 +771,7 @@ To create year and month cross tabulation: df = pd.DataFrame( {"value": np.random.randn(36)}, - index=pd.date_range("2011-01-01", freq="M", periods=36), + index=pd.date_range("2011-01-01", freq="ME", periods=36), ) pd.pivot_table( diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 4be62090ec645..b2df1eaec60cc 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1416,7 +1416,7 @@ Groupby a specific column with the desired frequency. This is like resampling. .. 
ipython:: python - df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum() When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification @@ -1426,9 +1426,9 @@ in that you have a named index and a column that could be potential groupers. df = df.set_index("Date") df["Date"] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum() - df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum() Taking the first rows of each group diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index fabe498cb3bdb..83bf453b560ec 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -136,7 +136,7 @@ Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For d .. ipython:: python - pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C") + pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C") .. _reshaping.pivot.margins: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 2e9d683cae417..113efeefb48ce 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -107,7 +107,7 @@ data however will be stored as ``object`` data. pd.Series(pd.period_range("1/1/2011", freq="M", periods=3)) pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) - pd.Series(pd.date_range("1/1/2011", freq="M", periods=3)) + pd.Series(pd.date_range("1/1/2011", freq="ME", periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which is useful for representing missing or null date like values and behaves similar @@ -450,7 +450,7 @@ variety of :ref:`frequency aliases `: .. ipython:: python - pd.date_range(start, periods=1000, freq="M") + pd.date_range(start, periods=1000, freq="ME") pd.bdate_range(start, periods=250, freq="BQS") @@ -882,7 +882,7 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week" :class:`~pandas.tseries.offsets.WeekOfMonth`, ``'WOM'``, "the x-th day of the y-th week of each month" :class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month" - :class:`~pandas.tseries.offsets.MonthEnd`, ``'M'``, "calendar month end" + :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end" :class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin" :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end" :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin" @@ -1246,7 +1246,7 @@ frequencies. We will refer to these aliases as *offset aliases*. 
"C", "custom business day frequency" "D", "calendar day frequency" "W", "weekly frequency" - "M", "month end frequency" + "ME", "month end frequency" "SM", "semi-month end frequency (15th and end of month)" "BM", "business month end frequency" "CBM", "custom business month end frequency" @@ -1690,7 +1690,7 @@ the end of the interval. .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. This might unintendedly lead to looking ahead, where the value for a later @@ -1856,7 +1856,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("M", on="date")[["a"]].sum() + df.resample("ME", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1864,7 +1864,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("M", level="d")[["a"]].sum() + df.resample("ME", level="d")[["a"]].sum() .. _timeseries.iterating-label: @@ -2137,7 +2137,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range("2011-01-01", freq="M", periods=3) + dti = pd.date_range("2011-01-01", freq="ME", periods=3) dti dti.astype("period[M]") @@ -2256,7 +2256,7 @@ and vice-versa using ``to_timestamp``: .. ipython:: python - rng = pd.date_range("1/1/2012", periods=5, freq="M") + rng = pd.date_range("1/1/2012", periods=5, freq="ME") ts = pd.Series(np.random.randn(len(rng)), index=rng) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 33c3de577c063..efb30d087b8b4 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -860,10 +860,20 @@ Enhancements datetime.datetime(2013, 9, 5, 10, 0)]}) df - df.pivot_table(values='Quantity', - index=pd.Grouper(freq='M', key='Date'), - columns=pd.Grouper(freq='M', key='PayDay'), - aggfunc="sum") + .. code-block:: ipython + + In [75]: df.pivot_table(values='Quantity', + ....: index=pd.Grouper(freq='M', key='Date'), + ....: columns=pd.Grouper(freq='M', key='PayDay'), + ....: aggfunc="sum") + Out[75]: + PayDay 2013-09-30 2013-10-31 2013-11-30 + Date + 2013-09-30 NaN 3.0 NaN + 2013-10-31 6.0 NaN 1.0 + 2013-11-30 NaN 9.0 NaN + + [3 rows x 3 columns] - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index a05b9bb1a88ef..02e47553cd184 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -837,9 +837,24 @@ Previously New API -.. ipython:: python +.. code-block:: ipython - s.resample('M').ffill() + In [91]: s.resample('M').ffill() + Out[91]: + 2010-03-31 0 + 2010-04-30 0 + 2010-05-31 0 + 2010-06-30 1 + 2010-07-31 1 + 2010-08-31 1 + 2010-09-30 2 + 2010-10-31 2 + 2010-11-30 2 + 2010-12-31 3 + 2011-01-31 3 + 2011-02-28 3 + 2011-03-31 4 + Freq: M, Length: 13, dtype: int64 .. 
note::
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index d4b879f137698..32b3ced4f9d39 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -498,8 +498,26 @@ Other enhancements
         ),
     )
     df
-    df.resample("M", on="date")[["a"]].sum()
-    df.resample("M", level="d")[["a"]].sum()
+
+  .. code-block:: ipython
+
+     In [74]: df.resample("M", on="date")[["a"]].sum()
+     Out[74]:
+                 a
+     date
+     2015-01-31  6
+     2015-02-28  4
+
+     [2 rows x 1 columns]
+
+     In [75]: df.resample("M", level="d")[["a"]].sum()
+     Out[75]:
+                 a
+     d
+     2015-01-31  6
+     2015-02-28  4
+
+     [2 rows x 1 columns]
 - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`).
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 7d26005315c33..be63da09846c8 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -76,7 +76,7 @@ Below is a possibly non-exhaustive list of changes:
 
 .. ipython:: python
 
-    idx = pd.date_range(start='1/1/2018', periods=3, freq='M')
+    idx = pd.date_range(start='1/1/2018', periods=3, freq='ME')
     idx.array.year
     idx.year
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index b97198c36891c..3ad28222cc57a 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -177,6 +177,31 @@ Other API changes
 
 Deprecations
 ~~~~~~~~~~~~
+
+Deprecate alias ``M`` in favour of ``ME`` for offsets
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The alias ``M`` is deprecated in favour of ``ME`` for offsets; please use ``ME`` for "month end" instead of ``M`` (:issue:`9586`)
+
+For example:
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [7]: pd.date_range('2020-01-01', periods=3, freq='M')
+    Out[7]:
+    DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31'],
+                  dtype='datetime64[ns]', freq='M')
+
+*Future behavior*:
+
+.. ipython:: python
+
+    pd.date_range('2020-01-01', periods=3, freq='ME')
+
+Other Deprecations
+^^^^^^^^^^^^^^^^^^
 - Changed :meth:`Timedelta.resolution_string` to return ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`)
diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd
index 37c149343e40f..e050ac5a6c7b7 100644
--- a/pandas/_libs/tslibs/dtypes.pxd
+++ b/pandas/_libs/tslibs/dtypes.pxd
@@ -11,6 +11,8 @@ cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except?
-1 cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cpdef freq_to_period_freqstr(freq_n, freq_name) +cdef dict c_OFFSET_TO_PERIOD_FREQSTR cdef dict c_DEPR_ABBREVS cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 9974d2e648398..72a8fa8ff0b38 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -6,6 +6,7 @@ from pandas._libs.tslibs.timedeltas import UnitChoices # are imported in tests. _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] +OFFSET_TO_PERIOD_FREQSTR: dict[str, str] DEPR_ABBREVS: dict[str, UnitChoices] def periods_per_day(reso: int) -> int: ... @@ -14,6 +15,7 @@ def is_supported_unit(reso: int) -> bool: ... def npy_unit_to_abbrev(reso: int) -> str: ... def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... +def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index f0f73d242cdf0..cca379c620aeb 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -13,7 +13,6 @@ from pandas._libs.tslibs.np_datetime cimport ( import_pandas_datetime() - cdef class PeriodDtypeBase: """ Similar to an actual dtype, this contains all of the information @@ -186,6 +185,45 @@ _attrname_to_abbrevs = { cdef dict attrname_to_abbrevs = _attrname_to_abbrevs cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} +OFFSET_TO_PERIOD_FREQSTR: dict = { + "WEEKDAY": "D", + "EOM": "M", + "BM": "M", + "BQS": "Q", + "QS": "Q", + "BQ": "Q", + "BA": "A", + "AS": "A", + "BAS": "A", + "MS": "M", + "D": "D", + "B": "B", + "min": "min", + "s": "s", + "ms": "ms", + "us": "us", + "ns": "ns", + "H": "H", + "Q": "Q", + "A": "A", + "W": "W", + "ME": "M", + "Y": "A", + "BY": "A", + "YS": "A", + "BYS": "A", +} +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR + +cpdef freq_to_period_freqstr(freq_n, freq_name): + if freq_n == 1: + freqstr = f"""{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + else: + freqstr = f"""{freq_n}{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + return freqstr + # Map deprecated resolution abbreviations to correct resolution abbreviations DEPR_ABBREVS: dict[str, str]= { "T": "min", diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 215c3f849281f..cda6825de35be 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,7 +1,7 @@ from numpy cimport int64_t -cpdef to_offset(object obj) +cpdef to_offset(object obj, bint is_period=*) cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 1a4742111db89..1f0fad6abb8b4 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -103,11 +103,11 @@ class SingleConstructorOffset(BaseOffset): def __reduce__(self): ... @overload -def to_offset(freq: None) -> None: ... +def to_offset(freq: None, is_period: bool = ...) -> None: ... @overload -def to_offset(freq: _BaseOffsetT) -> _BaseOffsetT: ... +def to_offset(freq: _BaseOffsetT, is_period: bool = ...) -> _BaseOffsetT: ... @overload -def to_offset(freq: timedelta | str) -> BaseOffset: ... 
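# --- Illustrative sketch (not part of the patch series) ---------------------
# A pure-Python equivalent of the ``freq_to_period_freqstr`` helper added in
# dtypes.pyx above, with an abridged copy of the mapping; the Cython version
# is authoritative. It translates an offset alias such as 'ME' into the
# matching period alias 'M', passing unmapped names through unchanged.

_OFFSET_TO_PERIOD_SKETCH = {"ME": "M", "BM": "M", "MS": "M", "Q": "Q", "A": "A"}


def _freq_to_period_freqstr_sketch(freq_n: int, freq_name: str) -> str:
    name = _OFFSET_TO_PERIOD_SKETCH.get(freq_name, freq_name)
    return name if freq_n == 1 else f"{freq_n}{name}"


assert _freq_to_period_freqstr_sketch(1, "ME") == "M"
assert _freq_to_period_freqstr_sketch(2, "ME") == "2M"
assert _freq_to_period_freqstr_sketch(3, "D") == "3D"  # unmapped passthrough
# -----------------------------------------------------------------------------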
+def to_offset(freq: timedelta | str, is_period: bool = ...) -> BaseOffset: ... class Tick(SingleConstructorOffset): _creso: int diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 2b6f84844ff49..74398eb0e2405 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -15,6 +15,7 @@ from cpython.datetime cimport ( time as dt_time, timedelta, ) +import warnings import_datetime() @@ -45,6 +46,8 @@ from pandas._libs.tslibs.ccalendar import ( int_to_weekday, weekday_to_int, ) +from pandas.util._exceptions import find_stack_level + from pandas._libs.tslibs.ccalendar cimport ( dayofweek, @@ -2851,7 +2854,7 @@ cdef class MonthEnd(MonthOffset): Timestamp('2022-01-31 00:00:00') """ _period_dtype_code = PeriodDtypeCode.M - _prefix = "M" + _prefix = "ME" _day_opt = "end" @@ -4457,7 +4460,7 @@ prefix_mapping = { CustomBusinessMonthEnd, # 'CBM' CustomBusinessMonthBegin, # 'CBMS' CustomBusinessHour, # 'CBH' - MonthEnd, # 'M' + MonthEnd, # 'ME' MonthBegin, # 'MS' Nano, # 'ns' SemiMonthEnd, # 'SM' @@ -4502,7 +4505,8 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = {"MS", "ms", "s"} +_dont_uppercase = {"MS", "ms", "s", "me"} + INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4543,7 +4547,7 @@ def _get_offset(name: str) -> BaseOffset: return _offset_map[name] -cpdef to_offset(freq): +cpdef to_offset(freq, bint is_period=False): """ Return DateOffset object from string or datetime.timedelta object. @@ -4611,6 +4615,22 @@ cpdef to_offset(freq): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): + if is_period is False and name == "M": + warnings.warn( + "\'M\' will be deprecated, please use \'ME\' " + "for \'month end\'", + UserWarning, + stacklevel=find_stack_level(), + ) + name = "ME" + if is_period is True and name == "ME": + raise ValueError( + r"for Period, please use \'M\' " + "instead of \'ME\'" + ) + elif is_period is True and name == "M": + name = "ME" + if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ef1a870b8653c..71be1d213437a 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -576,7 +576,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default, # e.g. 
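# --- Illustrative sketch (not part of the patch series) ---------------------
# Expected user-facing behaviour of ``to_offset`` once the offsets.pyx change
# above lands; a sketch assuming this patch is applied, not one of its tests.
import warnings

import pandas as pd
from pandas.tseries.frequencies import to_offset

assert to_offset("ME") == pd.offsets.MonthEnd()  # new offset alias, no warning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert to_offset("M") == pd.offsets.MonthEnd()  # old alias still resolves...
assert any("'ME'" in str(w.message) for w in caught)  # ...but now warns

# Period frequencies keep the old spelling: 'M' stays valid, 'ME' is rejected.
assert to_offset("M", is_period=True) == pd.offsets.MonthEnd()
try:
    to_offset("ME", is_period=True)
except ValueError:
    pass  # "for Period, please use 'M' instead of 'ME'"
else:
    raise AssertionError("expected ValueError for 'ME' with is_period=True")
# -----------------------------------------------------------------------------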
if "Q" is not in date_string and .index raised pass - if date_len == 6 and freq == "M": + if date_len == 6 and freq == "ME": year = int(date_string[:4]) month = int(date_string[4:6]) try: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 21937ab2519e4..5fecc77044b4b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -38,6 +38,11 @@ from libc.time cimport ( tm, ) +from pandas._libs.tslibs.dtypes cimport ( + c_OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) + # import datetime C API import_datetime() @@ -91,6 +96,9 @@ from pandas._libs.tslibs.dtypes cimport ( attrname_to_abbrevs, freq_group_code_to_npy_unit, ) + +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr + from pandas._libs.tslibs.parsing cimport quarter_to_myear from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso @@ -1507,7 +1515,7 @@ def from_ordinals(const int64_t[:] values, freq): int64_t[::1] result = np.empty(len(values), dtype="i8") int64_t val - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if not isinstance(freq, BaseOffset): raise ValueError("freq not specified and cannot be inferred") @@ -1539,7 +1547,7 @@ def extract_ordinals(ndarray values, freq) -> np.ndarray: # if we don't raise here, we'll segfault later! raise TypeError("extract_ordinals values must be object-dtype") - freqstr = Period._maybe_convert_freq(freq).freqstr + freqstr = freq_to_period_freqstr(freq.n, freq.name) for i in range(n): # Analogous to: p = values[i] @@ -1712,10 +1720,12 @@ cdef class PeriodMixin: condition = self.freq != other_freq if condition: + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + other_freqstr = freq_to_period_freqstr(other_freq.n, other_freq.name) msg = DIFFERENT_FREQ.format( cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other_freq.freqstr, + own_freq=freqstr, + other_freq=other_freqstr, ) raise IncompatibleFrequency(msg) @@ -1759,7 +1769,7 @@ cdef class _Period(PeriodMixin): elif isinstance(freq, PeriodDtypeBase): freq = freq._freqstr - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if freq.n <= 0: raise ValueError("Frequency must be positive, because it " @@ -2487,7 +2497,8 @@ cdef class _Period(PeriodMixin): >>> pd.Period('2020-01', 'D').freqstr 'D' """ - return self._dtype._freqstr + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + return freqstr def __repr__(self) -> str: base = self._dtype._dtype_code @@ -2789,7 +2800,8 @@ class Period(_Period): if freq is None and ordinal != NPY_NAT: # Skip NaT, since it doesn't have a resolution freq = attrname_to_abbrevs[reso] - freq = to_offset(freq) + freq = c_OFFSET_TO_PERIOD_FREQSTR.get(freq, freq) + freq = to_offset(freq, is_period=True) elif PyDateTime_Check(value): dt = value @@ -2882,7 +2894,7 @@ cdef _parse_weekly_str(value, BaseOffset freq): if freq is None: day_name = end.day_name()[:3].upper() freqstr = f"W-{day_name}" - freq = to_offset(freqstr) + freq = to_offset(freqstr, is_period=True) # We _should_ have freq.is_on_offset(end) return end, freq diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 65d0d454ac817..1b4332c2d26cf 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1895,7 +1895,7 @@ class Timestamp(_Timestamp): cdef: int64_t nanos - freq = to_offset(freq) + freq = to_offset(freq, is_period=False) freq.nanos # raises on non-fixed freq nanos = delta_to_nanoseconds(freq, self._creso) if nanos == 0: diff 
--git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index dd2d7c0060392..b520f9f4a6deb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -39,7 +39,10 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.dtypes import ( + abbrev_to_npy_unit, + freq_to_period_freqstr, +) from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -1207,6 +1210,8 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq + if isinstance(self.freq, BaseOffset): + freq = freq_to_period_freqstr(self.freq.n, self.freq.name) if freq is None: raise ValueError( @@ -1220,7 +1225,6 @@ def to_period(self, freq=None) -> PeriodArray: res = freq freq = res - return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz) # ----------------------------------------------------------------- @@ -1245,7 +1249,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01', freq='M', periods=3)) + >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) >>> s 0 2018-01-31 1 2018-02-28 @@ -1257,10 +1261,10 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: 2 March dtype: object - >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') @@ -1268,10 +1272,10 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month names in Brazilian Portuguese language. - >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ @@ -1520,7 +1524,7 @@ def isocalendar(self) -> DataFrame: Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="M") + ... pd.date_range("2000-01-01", periods=3, freq="ME") ... 
) >>> datetime_series 0 2000-01-31 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index eeede0ad9e6d2..4532e5bffe7a9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -33,7 +33,10 @@ period as libperiod, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.offsets import ( Tick, @@ -319,6 +322,8 @@ def _from_datetime64(cls, data, freq, tz=None) -> Self: ------- PeriodArray[freq] """ + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) data, freq = dt64arr_to_periodarr(data, freq, tz) dtype = PeriodDtype(freq) return cls(data, dtype=dtype) @@ -386,6 +391,10 @@ def freq(self) -> BaseOffset: """ return self.dtype.freq + @property + def freqstr(self) -> str: + return freq_to_period_freqstr(self.freq.n, self.freq.name) + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype == "i8": return self.asi8 @@ -717,7 +726,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -966,13 +976,16 @@ def raise_on_incompatible(left, right) -> IncompatibleFrequency: # GH#24283 error message format depends on whether right is scalar if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: other_freq = None - elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, BaseOffset)): + elif isinstance(right, BaseOffset): + other_freq = freq_to_period_freqstr(right.n, right.name) + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period)): other_freq = right.freqstr else: other_freq = delta_to_tick(Timedelta(right)).freqstr + own_freq = freq_to_period_freqstr(left.freq.n, left.freq.name) msg = DIFFERENT_FREQ.format( - cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + cls=type(left).__name__, own_freq=own_freq, other_freq=other_freq ) return IncompatibleFrequency(msg) @@ -1110,7 +1123,7 @@ def validate_dtype_freq( IncompatibleFrequency : mismatch between dtype and freq """ if freq is not None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if dtype is not None: dtype = pandas_dtype(dtype) @@ -1173,7 +1186,7 @@ def _get_ordinal_range(start, end, periods, freq, mult: int = 1): ) if freq is not None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) mult = freq.n if start is not None: @@ -1237,10 +1250,10 @@ def _range_from_fields( if quarter is not None: if freq is None: - freq = to_offset("Q") + freq = to_offset("Q", is_period=True) base = FreqGroup.FR_QTR.value else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) if base != FreqGroup.FR_QTR.value: raise AssertionError("base must equal FR_QTR") @@ -1254,7 +1267,7 @@ def _range_from_fields( ) ordinals.append(val) else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, d, h, mn, s in zip(*arrays): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 12de63967c78f..beff71d5e9dd9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ 
-1035,7 +1035,7 @@ def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: if m is not None: freq = m.group("freq") - freq_offset = to_offset(freq) + freq_offset = to_offset(freq, is_period=True) if freq_offset is not None: return freq_offset diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d6508cc43a7c8..c75c6c546aad0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,6 +39,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import ( AlignJoin, AnyArrayLike, @@ -9171,11 +9172,11 @@ def resample( Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or @@ -9407,7 +9408,7 @@ def resample( 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('M', on='week_starting').mean() + >>> df.resample('ME', on='week_starting').mean() price volume week_starting 2018-01-31 10.75 62.5 @@ -9567,7 +9568,7 @@ def first(self, offset) -> Self: ---------- offset : str, DateOffset or dateutil.relativedelta The offset length of the data that will be selected. For instance, - '1M' will display all the rows having their index within the first month. + '1ME' will display all the rows having their index within the first month. Returns ------- @@ -10966,15 +10967,17 @@ def _shift_with_freq(self, periods: int, axis: int, freq) -> Self: raise ValueError(msg) elif isinstance(freq, str): - freq = to_offset(freq) + is_period = isinstance(index, PeriodIndex) + freq = to_offset(freq, is_period=is_period) if isinstance(index, PeriodIndex): orig_freq = to_offset(index.freq) if freq != orig_freq: assert orig_freq is not None # for mypy raise ValueError( - f"Given freq {freq.rule_code} does not match " - f"PeriodIndex freq {orig_freq.rule_code}" + f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} " + f"does not match PeriodIndex freq " + f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}" ) new_ax = index.shift(periods) else: @@ -11697,7 +11700,7 @@ def pct_change( .. deprecated:: 2.1 freq : DateOffset, timedelta, or str, optional - Increment to use from time series API (e.g. 'M' or BDay()). + Increment to use from time series API (e.g. 'ME' or BDay()). **kwargs Additional keyword arguments are passed into `DataFrame.shift` or `Series.shift`. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e6dd6a990d285..d607baf18d6cb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3637,7 +3637,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. Values are assigned to the month of the period. 
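# --- Illustrative sketch (not part of the patch series) ---------------------
# The resample docstring edits above in runnable form: month-end resampling
# is now spelled 'ME'. Hypothetical data mirroring the docstring example;
# assumes this patch is applied.
import pandas as pd

df = pd.DataFrame(
    {"price": [10, 11, 9, 13, 14, 18, 17, 19]},
    index=pd.date_range("2018-01-07", periods=8, freq="W", name="week_starting"),
)
monthly = df.resample("ME").mean()  # previously: df.resample("M").mean()
assert monthly.index.freqstr == "ME"
# -----------------------------------------------------------------------------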
- >>> df.groupby('a').resample('M', include_groups=False).sum() + >>> df.groupby('a').resample('ME', include_groups=False).sum() b a 0 2000-01-31 3 diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d5a292335a5f6..1b08596c99591 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -31,6 +31,7 @@ parsing, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas.compat.numpy import function as nv from pandas.errors import ( InvalidIndexError, @@ -108,8 +109,16 @@ def asi8(self) -> npt.NDArray[np.int64]: @property @doc(DatetimeLikeArrayMixin.freqstr) - def freqstr(self) -> str | None: - return self._data.freqstr + def freqstr(self) -> str: + from pandas import PeriodIndex + + if self._data.freqstr is not None and isinstance( + self._data, (PeriodArray, PeriodIndex) + ): + freq = freq_to_period_freqstr(self._data.freq.n, self._data.freq.name) + return freq + else: + return self._data.freqstr # type: ignore[return-value] @cache_readonly @abstractmethod diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 400747cbf6b8d..ae0feba1f9bcf 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -65,6 +65,8 @@ PeriodIndex, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR + def _new_DatetimeIndex(cls, d): """ @@ -533,7 +535,8 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime): ------- lower, upper: pd.Timestamp """ - per = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + per = Period(parsed, freq=freq) start, end = per.start_time, per.end_time # GH 24076 @@ -944,26 +947,26 @@ def date_range( **Other Parameters** - Changed the `freq` (frequency) to ``'M'`` (month end frequency). + Changed the `freq` (frequency) to ``'ME'`` (month end frequency). - >>> pd.date_range(start='1/1/2018', periods=5, freq='M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') Multiples are allowed - >>> pd.date_range(start='1/1/2018', periods=5, freq='3M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') `freq` can also be specified as an Offset object. >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') Specify `tz` to set the timezone. 
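# --- Illustrative sketch (not part of the patch series) ---------------------
# The date_range docstring above, condensed: only the string alias changes;
# the offset object is untouched, and both spell the same index. Assumes
# this patch is applied.
import pandas as pd

a = pd.date_range(start="1/1/2018", periods=5, freq="3ME")
b = pd.date_range(start="1/1/2018", periods=5, freq=pd.offsets.MonthEnd(3))
assert a.equals(b)
assert a.freqstr == "3ME"
# -----------------------------------------------------------------------------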
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index d05694c00d2a4..b1023febe813d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -16,6 +16,7 @@ Resolution, Tick, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR from pandas.util._decorators import ( cache_readonly, doc, @@ -453,7 +454,8 @@ def _maybe_cast_slice_bound(self, label, side: str): return super()._maybe_cast_slice_bound(label, side) def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): - iv = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + iv = Period(parsed, freq=freq) return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) @doc(DatetimeIndexOpsMixin.shift) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9605bf154a8b7..30d654078bd05 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,6 +24,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import NDFrameT from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -2004,7 +2005,7 @@ def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler: """ Create a TimeGrouper and return our resampler. """ - tg = TimeGrouper(**kwds) + tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type] return tg._get_resampler(obj, kind=kind) @@ -2060,7 +2061,9 @@ class TimeGrouper(Grouper): def __init__( self, + obj: Grouper | None = None, freq: Frequency = "Min", + key: str | None = None, closed: Literal["left", "right"] | None = None, label: Literal["left", "right"] | None = None, how: str = "mean", @@ -2084,9 +2087,21 @@ def __init__( if convention not in {None, "start", "end", "e", "s"}: raise ValueError(f"Unsupported value {convention} for `convention`") - freq = to_offset(freq) + if ( + key is None + and obj is not None + and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + or ( + key is not None + and obj is not None + and getattr(obj[key], "dtype", None) == "period" # type: ignore[index] + ) + ): + freq = to_offset(freq, is_period=True) + else: + freq = to_offset(freq) - end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "A", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2148,7 +2163,7 @@ def __init__( # always sort time groupers kwargs["sort"] = True - super().__init__(freq=freq, axis=axis, **kwargs) + super().__init__(freq=freq, key=key, axis=axis, **kwargs) def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: """ @@ -2170,7 +2185,6 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: """ _, ax, indexer = self._set_grouper(obj, gpr_index=None) - if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2729,6 +2743,9 @@ def asfreq( if how is None: how = "E" + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) + new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 07c77ec4f3e0a..0eb8ab2412e0e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1573,7 +1573,7 @@ def area( ... 'signups': [5, 5, 6, 12, 14, 13], ... 'visits': [20, 42, 28, 62, 81, 50], ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', - ... freq='M')) + ... 
freq='ME')) >>> ax = df.plot.area() Area plots are stacked by default. To produce an unstacked plot, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 3f77a32014bff..e6408a0eae841 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -942,7 +942,7 @@ def __init__( day: int = 1, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.freq = freq self.base = base (self.quarter, self.month, self.day) = (quarter, month, day) @@ -1026,7 +1026,7 @@ def __init__( dynamic_mode: bool = True, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.format = None self.freq = freq self.locs: list[Any] = [] # unused, for matplotlib compat diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 90e6f29dd98ab..77de5c1ad7b42 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -15,7 +15,10 @@ Period, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + FreqGroup, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -188,7 +191,7 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq).rule_code + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) @@ -198,7 +201,7 @@ def _get_freq(ax: Axes, series: Series): freq = getattr(series.index, "freq", None) if freq is None: freq = getattr(series.index, "inferred_freq", None) - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) ax_freq = _get_ax_freq(ax) @@ -232,7 +235,10 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: # FIXME: hack this for 0.10.1, creating more technical debt...sigh if isinstance(data.index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" - base = to_offset(freq_str)._period_dtype_code # type: ignore[attr-defined] + freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) + base = to_offset( + freq_str, is_period=True + )._period_dtype_code # type: ignore[attr-defined] x = data.index if base <= FreqGroup.FR_DAY.value: return x[:1].is_normalized diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 34b526bf97408..0c46d22ddcc2e 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1076,7 +1076,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): # Note: freq here includes both Tick and non-Tick offsets; this is # relevant because historically integer-addition was allowed if we had # a freq. 
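# --- Illustrative sketch (not part of the patch series) ---------------------
# Why the plotting code above routes through OFFSET_TO_PERIOD_FREQSTR: a
# datetime index carries the offset alias 'ME', while the PeriodIndex derived
# from it keeps the period alias 'M'. Assumes this patch is applied.
import pandas as pd

dti = pd.date_range("2020-01-31", periods=3, freq="ME")
assert dti.freqstr == "ME"
assert dti.to_period().freqstr == "M"
# -----------------------------------------------------------------------------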
- @pytest.mark.parametrize("freq", ["H", "D", "W", "M", "MS", "Q", "B", None]) + @pytest.mark.parametrize("freq", ["H", "D", "W", "2ME", "MS", "Q", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( self, request, dtype, box_with_array, freq, tz_naive_fixture diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 2a34566d2f9f5..ee8391830db4c 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -977,10 +977,10 @@ def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): pi = tm.box_expected(pi, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = pi + to_offset("3M") + result = pi + to_offset("3ME") tm.assert_equal(result, expected) - result = to_offset("3M") + pi + result = to_offset("3ME") + pi tm.assert_equal(result, expected) # --------------------------------------------------------------- diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index e15bac9f0b8aa..3377c411a7084 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -141,7 +141,8 @@ def test_getitem_listlike(self): def test_periodindex(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) cat1 = Categorical(idx1) @@ -152,7 +153,8 @@ def test_periodindex(self): tm.assert_index_equal(cat1.categories, exp_idx) idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) cat2 = Categorical(idx2, ordered=True) str(cat2) diff --git a/pandas/tests/arrays/period/test_constructors.py b/pandas/tests/arrays/period/test_constructors.py index ecc9ee745bad8..0ea26a6ece7eb 100644 --- a/pandas/tests/arrays/period/test_constructors.py +++ b/pandas/tests/arrays/period/test_constructors.py @@ -144,3 +144,13 @@ def test_freq_deprecated(): expected = PeriodArray(data, dtype="period[M]") tm.assert_equal(res, expected) + + +def test_period_array_from_datetime64(): + arr = np.array( + ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]" + ) + result = PeriodArray._from_datetime64(arr, freq=MonthEnd(2)) + + expected = period_array(["2020-01-01", "2020-02-01"], freq=MonthEnd(2)) + tm.assert_period_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 1843eb03a2b00..291c687d84125 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -11,6 +11,7 @@ OutOfBoundsDatetime, Timestamp, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr import pandas as pd from pandas import ( @@ -31,7 +32,7 @@ # TODO: more freq variants -@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) +@pytest.fixture(params=["D", "B", "W", "ME", "Q", "Y"]) def freqstr(request): """Fixture returning parametrized frequency in string format.""" return request.param @@ -52,6 +53,7 @@ def period_index(freqstr): warnings.filterwarnings( "ignore", message="Period with BDay freq", category=FutureWarning ) + freqstr = freq_to_period_freqstr(1, freqstr) pi = pd.period_range(start=Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -755,6 +757,7 @@ def test_to_period(self, 
datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) + freqstr = freq_to_period_freqstr(1, freqstr) expected = dti.to_period(freq=freqstr) result = arr.to_period(freq=freqstr) assert isinstance(result, PeriodArray) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index c2d68a79f32d4..a105852395b3a 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -746,6 +746,14 @@ def test_iter_zoneinfo_fold(self, tz): assert str(left) == str(right2) assert left.utcoffset() == right2.utcoffset() + def test_date_range_frequency_M_deprecated(self): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + expected = pd.date_range("1/1/2000", periods=4, freq="2ME") + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq="2M") + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1]) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 5b9618bfb2abf..db13e979a3c2d 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -176,7 +176,7 @@ def test_iter_box(self): assert s.dtype == "Period[M]" for res, exp in zip(s, vals): assert isinstance(res, pd.Period) - assert res.freq == "M" + assert res.freq == "ME" assert res == exp diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 72652df2e1e58..a2f8f3e278395 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -48,7 +48,7 @@ def test_asfreq2(self, frame_or_series): monthly_ts = daily_ts.asfreq(offsets.BMonthEnd()) tm.assert_equal(monthly_ts, ts) - result = ts[:0].asfreq("M") + result = ts[:0].asfreq("ME") assert len(result) == 0 assert result is not ts @@ -221,11 +221,11 @@ def test_asfreq_after_normalize(self, unit): @pytest.mark.parametrize( "freq, freq_half", [ - ("2M", "M"), + ("2ME", "ME"), (MonthEnd(2), MonthEnd(1)), ], ) - def test_asfreq_2M(self, freq, freq_half): + def test_asfreq_2ME(self, freq, freq_half): index = date_range("1/1/2000", periods=6, freq=freq_half) df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)}) expected = df.asfreq(freq=freq) @@ -233,3 +233,13 @@ def test_asfreq_2M(self, freq, freq_half): index = date_range("1/1/2000", periods=3, freq=freq) result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)}) tm.assert_frame_equal(result, expected) + + def test_asfreq_frequency_M_deprecated(self): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + index = date_range("1/1/2000", periods=4, freq="ME") + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + expected = df.asfreq(freq="5ME") + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = df.asfreq(freq="5M") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py index 2e85edc7ed0ea..23355a5549a88 100644 --- a/pandas/tests/frame/methods/test_first_and_last.py +++ b/pandas/tests/frame/methods/test_first_and_last.py @@ -29,7 +29,7 @@ def test_first_subset(self, frame_or_series): assert len(result) == 10 with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("3M") + result = ts.first("3ME") expected = ts[:"3/31/2000"] 
tm.assert_equal(result, expected) @@ -39,7 +39,7 @@ def test_first_subset(self, frame_or_series): tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts[:0].first("3M") + result = ts[:0].first("3ME") tm.assert_equal(result, ts[:0]) def test_first_last_raises(self, frame_or_series): @@ -87,7 +87,7 @@ def test_last_subset(self, frame_or_series): tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts[:0].last("3M") + result = ts[:0].last("3ME") tm.assert_equal(result, ts[:0]) @pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)]) @@ -95,7 +95,7 @@ def test_first_with_first_day_last_of_month(self, frame_or_series, start, period # GH#29623 x = frame_or_series([1] * 100, index=bdate_range(start, periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("1M") + result = x.first("1ME") expected = frame_or_series( [1] * periods, index=bdate_range(start, periods=periods) ) @@ -105,7 +105,7 @@ def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series): # GH#29623 x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("2M") + result = x.first("2ME") expected = frame_or_series( [1] * 23, index=bdate_range("2010-03-31", "2010-04-30") ) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 56bdd2fc664cc..0105c41bd0eca 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -35,7 +35,7 @@ class TestReindexSetIndex: def test_dti_set_index_reindex_datetimeindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) - idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") df = df.set_index(idx1) diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py index 8bc26bff41767..216fd5c2f70c5 100644 --- a/pandas/tests/frame/test_iteration.py +++ b/pandas/tests/frame/test_iteration.py @@ -55,7 +55,7 @@ def test_iterrows_iso8601(self): s = DataFrame( { "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], - "iso8601": date_range("2000-01-01", periods=4, freq="M"), + "iso8601": date_range("2000-01-01", periods=4, freq="ME"), } ) for k, v in s.iterrows(): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 74473bc54d51e..e66557f132c1d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -752,7 +752,7 @@ def test_sum_corner(self): tm.makeDateIndex(0), tm.makeNumericIndex(0, dtype=int), tm.makeNumericIndex(0, dtype=float), - tm.makeDateIndex(0, freq="M"), + tm.makeDateIndex(0, freq="ME"), tm.makePeriodIndex(0), ], ) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fdd959f0e8754..9132be50d5857 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1952,7 +1952,7 @@ def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( { - "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(), + "eventDate": date_range(datetime.today(), periods=20, 
freq="ME").tolist(), "thename": range(20), } ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 9e899bf453548..5adf9ace255ea 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -283,10 +283,10 @@ def test_grouper_creation_bug(self): names=["one", "two", "three"], ), ) - result = s.groupby(Grouper(level="three", freq="M")).sum() + result = s.groupby(Grouper(level="three", freq="ME")).sum() expected = Series( [28], - index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"), + index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="ME", name="three"), ) tm.assert_series_equal(result, expected) @@ -393,12 +393,12 @@ def test_grouper_getting_correct_binner(self): ), ) result = df.groupby( - [Grouper(level="one"), Grouper(level="two", freq="M")] + [Grouper(level="one"), Grouper(level="two", freq="ME")] ).sum() expected = DataFrame( {"A": [31, 28, 21, 31, 28, 21]}, index=MultiIndex.from_product( - [list("ab"), date_range("20130101", freq="M", periods=3)], + [list("ab"), date_range("20130101", freq="ME", periods=3)], names=["one", "two"], ), ) @@ -1083,7 +1083,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start"], "change": [1234, 5678]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1098,7 +1098,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1114,7 +1114,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 assert (Timestamp("2014-09-30"), "start") in grouped.groups diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 1a26559ef4447..a9e67df1fb793 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -147,7 +147,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): df = df.sort_values(by="Quantity", ascending=False) df = df.set_index("Date", drop=False) - g = df.groupby(Grouper(freq="6M")) + g = df.groupby(Grouper(freq="6ME")) assert g.group_keys assert isinstance(g.grouper, BinGrouper) @@ -248,7 +248,7 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -264,32 +264,32 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = 
df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): - df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum() # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): - df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum() # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) expected = DataFrame( @@ -309,7 +309,7 @@ def test_timegrouper_with_reg_groups(self): msg = "The Grouper cannot specify both a key and a level!" with pytest.raises(ValueError, match=msg): df.groupby( - [Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"] ).sum() # single groupers @@ -320,21 +320,23 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME", key="Date")]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) + @pytest.mark.parametrize("freq", ["D", "ME", "A", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( @@ -421,7 +423,7 @@ def test_timegrouper_get_group(self): dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M", key="Date")) + grouped = df.groupby(Grouper(freq="ME", key="Date")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -436,7 +438,7 @@ def test_timegrouper_get_group(self): g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] for df in [df_original, df_reordered]: - grouped = df.groupby(["Buyer", Grouper(freq="M", key="Date")]) + grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")]) for (b, t), expected in zip(g_list, expected_list): dt = Timestamp(t) result = grouped.get_group((b, dt)) @@ -453,7 +455,7 @@ def 
test_timegrouper_get_group(self): ] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M")) + grouped = df.groupby(Grouper(freq="ME")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -475,7 +477,7 @@ def sumfunc_series(x): expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -495,7 +497,7 @@ def sumfunc_value(x): with tm.assert_produces_warning(FutureWarning, match=msg): expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) with tm.assert_produces_warning(FutureWarning, match=msg): - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index acb4b93ba1af3..cf3f41e04902c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -70,7 +70,7 @@ def demean(arr): # GH 8430 df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq="M")) + g = df.groupby(pd.Grouper(freq="ME")) g.transform(lambda x: x - 1) # GH 9700 diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index d339639dc5def..08a2473f22556 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -168,7 +168,7 @@ def test_astype_object(self): @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_astype_object_tz(self, tz): - idx = date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + idx = date_range(start="2013-01-01", periods=4, freq="ME", name="idx", tz=tz) expected_list = [ Timestamp("2013-01-31", tz=tz), Timestamp("2013-02-28", tz=tz), diff --git a/pandas/tests/indexes/datetimes/methods/test_factorize.py b/pandas/tests/indexes/datetimes/methods/test_factorize.py index 3ad927f133fb2..99a3bc910b9ca 100644 --- a/pandas/tests/indexes/datetimes/methods/test_factorize.py +++ b/pandas/tests/indexes/datetimes/methods/test_factorize.py @@ -58,7 +58,7 @@ def test_factorize(self): def test_factorize_preserves_freq(self): # GH#38120 freq should be preserved - idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") + idx3 = date_range("2000-01", periods=4, freq="ME", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index cedf8cd54b81e..9ef43ace747e2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -76,18 +76,18 @@ def test_insert(self): tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = date_range("1/1/2000", periods=3, freq="M", name="idx") + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx") # preserve freq expected_0 = DatetimeIndex( ["1999-12-31", 
"2000-01-31", "2000-02-29", "2000-03-31"], name="idx", - freq="M", + freq="ME", ) expected_3 = DatetimeIndex( ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], name="idx", - freq="M", + freq="ME", ) # reset freq to None diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 2d762710168ab..5f266ea0b42a6 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -20,7 +20,7 @@ class TestToPeriod: def test_dti_to_period(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") pi1 = dti.to_period() pi2 = dti.to_period(freq="D") pi3 = dti.to_period(freq="3D") @@ -67,21 +67,27 @@ def test_to_period_monthish(self): for off in offsets: rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" - rng = date_range("01-Jan-2012", periods=8, freq="M") + rng = date_range("01-Jan-2012", periods=8, freq="ME") prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): date_range("01-Jan-2012", periods=8, freq="EOM") - @pytest.mark.parametrize("freq", ["2M", MonthEnd(2)]) - def test_dti_to_period_2monthish(self, freq): - dti = date_range("2020-01-01", periods=3, freq=freq) + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("2ME", "2M"), + (MonthEnd(2), MonthEnd(2)), + ], + ) + def test_dti_to_period_2monthish(self, freq_offset, freq_period): + dti = date_range("2020-01-01", periods=3, freq=freq_offset) pi = dti.to_period() - tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq)) + tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 diff --git a/pandas/tests/indexes/datetimes/test_asof.py b/pandas/tests/indexes/datetimes/test_asof.py index 7adc400302cb9..f52b6da5b2f07 100644 --- a/pandas/tests/indexes/datetimes/test_asof.py +++ b/pandas/tests/indexes/datetimes/test_asof.py @@ -11,7 +11,7 @@ class TestAsOf: def test_asof_partial(self): - index = date_range("2010-01-01", periods=2, freq="m") + index = date_range("2010-01-01", periods=2, freq="ME") expected = Timestamp("2010-02-28") result = index.asof("2010-02") assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c8f5c5d561339..61c8cc4a50fe2 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -972,8 +972,8 @@ def test_explicit_none_freq(self): def test_dti_constructor_years_only(self, tz_naive_fixture): tz = tz_naive_fixture # GH 6961 - rng1 = date_range("2014", "2015", freq="M", tz=tz) - expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) + rng1 = date_range("2014", "2015", freq="ME", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="ME", tz=tz) rng2 = date_range("2014", "2015", freq="MS", tz=tz) expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) @@ -1010,7 +1010,7 @@ def test_ctor_str_intraday(self): assert rng[0].second == 1 def test_is_(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") assert dti.is_(dti) assert 
dti.is_(dti.view()) assert not dti.is_(dti.copy()) @@ -1036,7 +1036,7 @@ def test_constructor_int64_nocopy(self): assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] + "freq", ["ME", "Q", "A", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d5500a19f2bb6..b93aee1d988de 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -277,10 +277,10 @@ def test_date_range_negative_freq(self): tm.assert_index_equal(rng, exp) assert rng.freq == "-2A" - rng = date_range("2011-01-31", freq="-2M", periods=3) - exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") + rng = date_range("2011-01-31", freq="-2ME", periods=3) + exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2ME") tm.assert_index_equal(rng, exp) - assert rng.freq == "-2M" + assert rng.freq == "-2ME" def test_date_range_bms_bug(self): # #1645 @@ -638,7 +638,7 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_closed(self, freq, inclusive_endpoints_fixture): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -653,7 +653,7 @@ def test_range_closed(self, freq, inclusive_endpoints_fixture): tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -674,7 +674,7 @@ def test_range_closed_with_tz_aware_start_end( tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -750,7 +750,7 @@ def test_range_closed_boundary(self, inclusive_endpoints_fixture): def test_years_only(self): # GH 6961 - dr = date_range("2014", "2015", freq="M") + dr = date_range("2014", "2015", freq="ME") assert dr[0] == datetime(2014, 1, 31) assert dr[-1] == datetime(2014, 12, 31) diff --git a/pandas/tests/indexes/datetimes/test_delete.py b/pandas/tests/indexes/datetimes/test_delete.py index e9de5a055a5c2..3565e516e69d5 100644 --- a/pandas/tests/indexes/datetimes/test_delete.py +++ b/pandas/tests/indexes/datetimes/test_delete.py @@ -10,11 +10,11 @@ class TestDelete: def test_delete(self): - idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") + idx = date_range(start="2000-01-01", periods=5, freq="ME", name="idx") # preserve freq - expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") - expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") + expected_0 = date_range(start="2000-02-01", periods=4, freq="ME", name="idx") + expected_4 = date_range(start="2000-01-01", periods=4, freq="ME", name="idx") # reset freq to None expected_1 = DatetimeIndex( diff --git a/pandas/tests/indexes/datetimes/test_indexing.py 
b/pandas/tests/indexes/datetimes/test_indexing.py index bfecc5d064b11..c3944a4443d67 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -101,7 +101,7 @@ def test_dti_business_getitem_matplotlib_hackaround(self, freq): rng[:, None] def test_getitem_int_list(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") dti2 = dti[[1, 3, 5]] v1 = dti2[0] diff --git a/pandas/tests/indexes/datetimes/test_map.py b/pandas/tests/indexes/datetimes/test_map.py index 45698ef225151..c31e2407190ea 100644 --- a/pandas/tests/indexes/datetimes/test_map.py +++ b/pandas/tests/indexes/datetimes/test_map.py @@ -40,7 +40,7 @@ def test_map_bug_1677(self): def test_index_map(self, name): # see GH#20990 count = 6 - index = date_range("2018-01-01", periods=count, freq="M", name=name).map( + index = date_range("2018-01-01", periods=count, freq="ME", name=name).map( lambda x: (x.year, x.month) ) exp_index = MultiIndex.from_product(((2018,), range(1, 7)), names=[name, name]) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 139190962895d..185134af165f4 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -140,7 +140,7 @@ def test_datetimeindex_accessors4(self): assert dti.is_month_start[0] == 1 def test_datetimeindex_accessors5(self): - freq_m = to_offset("M") + freq_m = to_offset("ME") bm = to_offset("BM") qfeb = to_offset("Q-FEB") qsfeb = to_offset("QS-FEB") @@ -256,7 +256,7 @@ def test_datetime_name_accessors(self, time_locale): assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 - dti = date_range(freq="M", start="2012", end="2013") + dti = date_range(freq="ME", start="2012", end="2013") result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index c782121505642..21dc22bea87dc 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -22,7 +22,7 @@ class TestDatetimeIndexOps: [ ("A", "day"), ("Q", "day"), - ("M", "day"), + ("ME", "day"), ("D", "day"), ("H", "hour"), ("min", "minute"), diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 0ecccd7da3a5c..7ad0dfbaf6cb1 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -453,9 +453,11 @@ def test_getitem_with_datestring_with_UTC_offset(self, start, end): def test_slice_reduce_to_series(self): # GH 27516 - df = DataFrame({"A": range(24)}, index=date_range("2000", periods=24, freq="M")) + df = DataFrame( + {"A": range(24)}, index=date_range("2000", periods=24, freq="ME") + ) expected = Series( - range(12), index=date_range("2000", periods=12, freq="M"), name="A" + range(12), index=date_range("2000", periods=12, freq="ME"), name="A" ) result = df.loc["2000", "A"] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 1c5b6adf0b527..49597ef885183 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -96,7 +96,7 @@ def test_round_daily(self): "freq, error_msg", [ ("Y", " 
is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ], ) @@ -133,9 +133,9 @@ def test_round(self, tz_naive_fixture): msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - rng.round(freq="M") + rng.round(freq="ME") with pytest.raises(ValueError, match=msg): - elt.round(freq="M") + elt.round(freq="ME") # GH#14440 & GH#15578 index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index e9cc69aea0ed5..756a72cf1849a 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -272,8 +272,8 @@ def test_dti_tz_convert_dst(self): def test_tz_convert_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture - idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") - exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME") idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 57783265b04b3..499220b39279d 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -58,7 +58,7 @@ def test_constructor_numeric(self, closed, name, freq, periods): @pytest.mark.parametrize("tz", [None, "US/Eastern"]) @pytest.mark.parametrize( - "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("ME", 11)] ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 7097aa2b61632..87f1439db5fc8 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -98,8 +98,8 @@ def test_numpy_repeat(): def test_append_mixed_dtypes(): # GH 13660 - dti = date_range("2011-01-01", freq="M", periods=3) - dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + dti = date_range("2011-01-01", freq="ME", periods=3) + dti_tz = date_range("2011-01-01", freq="ME", periods=3, tz="US/Eastern") pi = period_range("2011-01", freq="M", periods=3) mi = MultiIndex.from_arrays( diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 91ec1b2475cde..cce6c98d71b47 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -778,7 +778,7 @@ def test_datetimeindex(): idx1 = pd.DatetimeIndex( ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) - idx2 = date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2010/01/01", periods=6, freq="ME", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) expected1 = pd.DatetimeIndex( diff --git a/pandas/tests/indexes/period/methods/test_factorize.py b/pandas/tests/indexes/period/methods/test_factorize.py index 7705da02bb7d2..ac3d09d76157b 100644 --- 
a/pandas/tests/indexes/period/methods/test_factorize.py +++ b/pandas/tests/indexes/period/methods/test_factorize.py @@ -10,7 +10,8 @@ class TestFactorize: def test_factorize(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) @@ -25,7 +26,8 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index a3d5dc63ad7c8..a5bdfa11140d1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -179,15 +179,15 @@ def test_constructor_fromarraylike(self): result = PeriodIndex(idx, freq=offsets.MonthEnd()) tm.assert_index_equal(result, idx) - assert result.freq == "M" + assert result.freq == "ME" result = PeriodIndex(idx, freq="2M") tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq="D") exp = idx.asfreq("D", "e") @@ -205,7 +205,7 @@ def test_constructor_datetime64arr(self): @pytest.mark.parametrize("box", [None, "series", "index"]) def test_constructor_datetime64arr_ok(self, box): # https://github.com/pandas-dev/pandas/issues/23438 - data = date_range("2017", periods=4, freq="M") + data = date_range("2017", periods=4, freq="ME") if box is None: data = data._values elif box == "series": @@ -250,7 +250,7 @@ def test_constructor_empty(self): idx = PeriodIndex([], freq="M") assert isinstance(idx, PeriodIndex) assert len(idx) == 0 - assert idx.freq == "M" + assert idx.freq == "ME" with pytest.raises(ValueError, match="freq not specified"): PeriodIndex([]) @@ -415,14 +415,32 @@ def test_constructor_freq_mult(self): with pytest.raises(ValueError, match=msg): period_range("2011-01", periods=3, freq="0M") - @pytest.mark.parametrize("freq", ["A", "M", "D", "min", "s"]) + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("A", "A"), + ("ME", "M"), + ("D", "D"), + ("min", "min"), + ("s", "s"), + ], + ) @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) - def test_constructor_freq_mult_dti_compat(self, mult, freq): - freqstr = str(mult) + freq - pidx = period_range(start="2014-04-01", freq=freqstr, periods=10) - expected = date_range(start="2014-04-01", freq=freqstr, periods=10).to_period( - freqstr - ) + def test_constructor_freq_mult_dti_compat(self, mult, freq_offset, freq_period): + freqstr_offset = str(mult) + freq_offset + freqstr_period = str(mult) + freq_period + pidx = period_range(start="2014-04-01", freq=freqstr_period, periods=10) + expected = date_range( + start="2014-04-01", freq=freqstr_offset, periods=10 + ).to_period(freqstr_period) + tm.assert_index_equal(pidx, expected) + + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat_month(self, mult): + pidx = period_range(start="2014-04-01", freq=f"{mult}M", periods=10) + expected = date_range( + start="2014-04-01", freq=f"{mult}ME", periods=10 + 
).to_period(f"{mult}M") tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 7191175f16f73..bd40fa37897d8 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -278,6 +278,12 @@ def test_format_empty(self): assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] + def test_period_index_frequency_ME_error_message(self): + msg = "Invalid frequency: 2ME" + + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index c94ddf57c0ee1..63acaba2d4f3e 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -20,7 +20,7 @@ def test_required_arguments(self): with pytest.raises(ValueError, match=msg): period_range("2011-1-1", "2012-1-1", "B") - @pytest.mark.parametrize("freq", ["D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["D", "W", "Q", "A"]) def test_construction_from_string(self, freq): # non-empty expected = date_range( @@ -49,11 +49,39 @@ def test_construction_from_string(self, freq): result = period_range(start=end, end=start, freq=freq, name="foo") tm.assert_index_equal(result, expected) + def test_construction_from_string_monthly(self): + # non-empty + expected = date_range( + start="2017-01-01", periods=5, freq="ME", name="foo" + ).to_period() + start, end = str(expected[0]), str(expected[-1]) + + result = period_range(start=start, end=end, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=start, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq="M", name="foo") + + result = period_range(start=start, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq="M", name="foo") + tm.assert_index_equal(result, expected) + def test_construction_from_period(self): # upsampling start, end = Period("2017Q1", freq="Q"), Period("2018Q1", freq="Q") expected = date_range( - start="2017-03-31", end="2018-03-31", freq="M", name="foo" + start="2017-03-31", end="2018-03-31", freq="ME", name="foo" ).to_period() result = period_range(start=start, end=end, freq="M", name="foo") tm.assert_index_equal(result, expected) @@ -119,3 +147,8 @@ def test_errors(self): msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): period_range(start="2017Q1", periods="foo") + + def test_period_range_frequency_ME_error_message(self): + msg = "Invalid frequency: 2ME" + with pytest.raises(ValueError, match=msg): + period_range(start="Jan-2000", end="Dec-2000", freq="2ME") diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py index c96444981ff14..d8afd29ff31c5 100644 --- a/pandas/tests/indexes/period/test_scalar_compat.py +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -20,7 +20,7 @@ def test_start_time(self): def 
test_end_time(self): # GH#17157 index = period_range(freq="M", start="2016-01-01", end="2016-05-31") - expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="ME") expected_index += Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 22e50a974d9e1..fe3ff1799e763 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -74,15 +74,15 @@ def test_tdi_round(self): msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - td.round(freq="M") + td.round(freq="ME") with pytest.raises(ValueError, match=msg): - elt.round(freq="M") + elt.round(freq="ME") @pytest.mark.parametrize( "freq,msg", [ ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ], ) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index bbd427387625b..e4ce969daab53 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -293,7 +293,7 @@ def test_multiindex_interval_datetimes(self, ext): [ range(4), pd.interval_range( - start=pd.Timestamp("2020-01-01"), periods=4, freq="6M" + start=pd.Timestamp("2020-01-01"), periods=4, freq="6ME" ), ] ) @@ -751,7 +751,7 @@ def test_to_excel_timedelta(self, path): tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, tsframe, path): - xp = tsframe.resample("M", kind="period").mean() + xp = tsframe.resample("ME", kind="period").mean() xp.to_excel(path, sheet_name="sht1") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index a30b3f64bd75c..bb8f0ce214c96 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -526,7 +526,7 @@ def test_pickle_timeseries_periodindex(): prng = period_range("1/1/2011", "1/1/2012", freq="M") ts = Series(np.random.default_rng(2).standard_normal(len(prng)), prng) new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == "M" + assert new_ts.index.freqstr == "M" @pytest.mark.parametrize( diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 11008646f0ad4..47114cab47619 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1759,7 +1759,7 @@ def test_errorbar_with_partial_columns_dti(self): df_err = DataFrame( np.abs(np.random.default_rng(2).standard_normal((10, 2))), columns=[0, 2] ) - ix = date_range("1/1/2000", periods=10, freq="M") + ix = date_range("1/1/2000", periods=10, freq="ME") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) ax = _check_plot_works(df.plot, yerr=df_err, kind="line") @@ -1780,7 +1780,7 @@ def test_errorbar_timeseries(self, kind): d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} # check time-series plots - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") tdf = DataFrame(d, index=ix) tdf_err = DataFrame(d_err, index=ix) diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index bce00600f6615..4d8d8fa4cdee3 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ 
b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -89,7 +89,7 @@ def test_subplots_no_legend(self, kind): @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=True) @@ -112,7 +112,7 @@ def test_subplots_timeseries(self, kind): @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries_rot(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: @@ -361,7 +361,7 @@ def test_subplots_ts_share_axes(self): mpl.pyplot.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) df = DataFrame( np.random.default_rng(2).standard_normal((10, 9)), - index=date_range(start="2014-07-01", freq="M", periods=10), + index=date_range(start="2014-07-01", freq="ME", periods=10), ) for i, ax in enumerate(axes.ravel()): df[i].plot(ax=ax, fontsize=5) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index c2626c2aa30c7..220741cf1ec3d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -14,6 +14,7 @@ BaseOffset, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas import ( DataFrame, @@ -124,7 +125,7 @@ def test_tsplot_period(self, freq): _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -203,14 +204,14 @@ def test_line_plot_period_mlt_series(self, frqncy): _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "A"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -233,12 +234,13 @@ def test_line_plot_period_mlt_frame(self, frqncy): index=idx, columns=["A", "B", "C"], ) - freq = df.index.asfreq(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.asfreq(freq).freq _check_plot_works(df.plot, freq) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -247,11 +249,12 @@ def test_line_plot_datetime_frame(self, freq): index=idx, columns=["A", "B", 
"C"], ) - freq = df.index.to_period(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.to_period(freq).freq _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -435,7 +438,7 @@ def test_get_finder(self): assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder - assert conv.get_finder(to_offset("M")) == conv._monthly_finder + assert conv.get_finder(to_offset("ME")) == conv._monthly_finder assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder assert conv.get_finder(to_offset("A")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder @@ -801,7 +804,7 @@ def test_mixed_freq_irregular_first_df(self): def test_mixed_freq_hf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -825,7 +828,7 @@ def test_mixed_freq_alignment(self): def test_mixed_freq_lf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -862,7 +865,7 @@ def test_mixed_freq_irreg_period(self): def test_mixed_freq_shared_ax(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -877,7 +880,7 @@ def test_mixed_freq_shared_ax(self): def test_mixed_freq_shared_ax_twin_x(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -915,7 +918,7 @@ def test_nat_handling(self): def test_to_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -926,7 +929,7 @@ def test_to_weekly_resampling(self): def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -949,7 +952,7 @@ def test_from_weekly_resampling(self): @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed(self, kind1, 
kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1005,7 +1008,7 @@ def test_from_resampling_area_line_mixed(self, kind1, kind2): @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1211,7 +1214,7 @@ def test_time_musec(self): def test_secondary_upsample(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -1329,7 +1332,7 @@ def test_secondary_legend_nonts_multi_col(self): @pytest.mark.xfail(reason="Api changed in 3.6.0") def test_format_date_axis(self): - rng = date_range("1/1/2012", periods=12, freq="M") + rng = date_range("1/1/2012", periods=12, freq="ME") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(ax=ax) @@ -1632,8 +1635,10 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): if orig_axfreq is None: assert ax.freq == dfreq + if freq is not None: + ax_freq = to_offset(ax.freq, is_period=True) if freq is not None and orig_axfreq is None: - assert ax.freq == freq + assert ax_freq == freq ax = fig.add_subplot(212) kwargs["ax"] = ax diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6f0afab53c267..e77dc0b305171 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -715,7 +715,7 @@ def test_errorbar_plot_yerr_0(self): ) def test_errorbar_plot_ts(self, yerr): # test time series plotting - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") ts = Series(np.arange(12), index=ix, name="x") yerr.index = ix diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c39268f3b9477..ae28a010d8435 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -100,12 +100,12 @@ def test_raises_on_non_datetimelike_index(): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24H' or '3D', not " @@ -113,7 +113,9 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): with pytest.raises(ValueError, match=msg): ser.resample(freq) return - + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = ser.resample(freq) result = getattr(rs, resample_method)() @@ -136,7 +138,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): @pytest.mark.parametrize( "freq", [ - pytest.param("M", marks=pytest.mark.xfail(reason="Don't know why this fails")), + pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")), "D", "H", ], @@ -162,12 +164,12 @@ def test_resample_nat_index_series(freq, series, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -175,7 +177,9 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): with pytest.raises(ValueError, match=msg): ser.resample(freq) return - + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = ser.resample(freq) result = getattr(rs, resample_method)() @@ -188,12 +192,12 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - if freq == "M" and isinstance(df.index, TimedeltaIndex): + if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -201,7 +205,9 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): with pytest.raises(ValueError, match=msg): df.resample(freq, group_keys=False) return - + elif freq == "ME" and isinstance(df.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = df.resample(freq, group_keys=False) result = getattr(rs, resample_method)() if resample_method == "ohlc": @@ -228,13 +234,13 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24H' or '3D', not " @@ -242,7 +248,9 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = empty_frame_dti.resample(freq).count() index = _asfreq_compat(empty_frame_dti.index, freq) @@ -253,13 +261,13 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -267,7 +275,9 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = empty_frame_dti.resample(freq).size() index = _asfreq_compat(empty_frame_dti.index, freq) @@ -298,12 +308,12 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti - if freq == "M" and isinstance(empty_series_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24H' or '3D', not " @@ -311,7 +321,9 @@ def test_apply_to_empty_series(empty_series_dti, freq): with pytest.raises(ValueError, match=msg): empty_series_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_series_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = ser.resample(freq, group_keys=False).apply(lambda x: 1) expected = ser.resample(freq).apply("sum") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index a955fa0b096f0..d929dfa6e1e59 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -255,11 +255,11 @@ class FnClass: def __call__(self, x): return str(type(x)) - df_standard = df.resample("M").apply(fn) - df_lambda = df.resample("M").apply(lambda x: str(type(x))) - df_partial = df.resample("M").apply(partial(fn)) - df_partial2 = df.resample("M").apply(partial(fn, a=2)) - df_class = df.resample("M").apply(FnClass()) + df_standard = df.resample("ME").apply(fn) + df_lambda = df.resample("ME").apply(lambda x: str(type(x))) + df_partial = df.resample("ME").apply(partial(fn)) + df_partial2 = df.resample("ME").apply(partial(fn, a=2)) + df_class = df.resample("ME").apply(FnClass()) tm.assert_frame_equal(df_standard, df_lambda) tm.assert_frame_equal(df_standard, df_partial) @@ -428,14 +428,14 @@ def test_resample_frame_basic_cy_funcs(f, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) - b = Grouper(freq="M") + b = Grouper(freq="ME") g = df.groupby(b) # check all cython functions work g._cython_agg_general(f, alt=None, numeric_only=True) -@pytest.mark.parametrize("freq", ["A", "M"]) +@pytest.mark.parametrize("freq", ["A", "ME"]) def test_resample_frame_basic_M_A(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -443,7 +443,7 @@ def test_resample_frame_basic_M_A(freq, unit): tm.assert_series_equal(result["A"], df["A"].resample(freq).mean()) -@pytest.mark.parametrize("freq", ["W-WED", "M"]) +@pytest.mark.parametrize("freq", ["W-WED", "ME"]) def test_resample_frame_basic_kind(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -517,7 +517,7 @@ def test_upsample_with_limit(unit): @pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10s"]) -@pytest.mark.parametrize("rule", ["Y", "3M", "15D", "30H", "15Min", "30s"]) +@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30H", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit( @@ -670,7 +670,7 @@ def test_resample_reresample(unit): [ ["A-DEC", {"start": "1990", "end": "2000", "freq": "a-dec"}], ["A-JUN", {"start": "1990", "end": "2000", "freq": "a-jun"}], - ["M", {"start": "1990-01", "end": "2000-01", "freq": "M"}], + ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}], ], ) def test_resample_timestamp_to_period( @@ -710,7 +710,7 @@ def test_downsample_non_unique(unit): rng2 = rng.repeat(5).values ts = Series(np.random.default_rng(2).standard_normal(len(rng2)), index=rng2) - result = ts.resample("M").mean() + result = ts.resample("ME").mean() expected = ts.groupby(lambda x: x.month).mean() assert len(result) == 2 @@ -739,8 +739,8 @@ def test_resample_axis1(unit): warning_msg = "DataFrame.resample with axis=1 is deprecated." 
with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("M", axis=1).mean() - expected = df.T.resample("M").mean().T + result = df.resample("ME", axis=1).mean() + expected = df.T.resample("ME").mean().T tm.assert_frame_equal(result, expected) @@ -765,7 +765,7 @@ def test_resample_single_group(end, unit): rng = date_range("2000-1-1", f"2000-{end}-10", freq="D").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - tm.assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) + tm.assert_series_equal(ts.resample("ME").sum(), ts.resample("ME").apply(mysum)) def test_resample_single_group_std(unit): @@ -1045,7 +1045,7 @@ def test_resample_to_period_monthly_buglet(unit): rng = date_range("1/1/2000", "12/31/2000").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - result = ts.resample("M", kind="period").mean() + result = ts.resample("ME", kind="period").mean() exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) @@ -1138,7 +1138,7 @@ def test_monthly_resample_error(unit): dates = date_range("4/16/2012 20:00", periods=5000, freq="h").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! - ts.resample("M") + ts.resample("ME") def test_nanosecond_resample_error(): @@ -1163,20 +1163,20 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) df = DataFrame(rng.month, index=rng) - result = df.resample("M").mean() - expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + result = df.resample("ME").mean() + expected = df.resample("ME", kind="period").mean().to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index = expected.index.as_unit(unit)._with_freq("infer") - assert expected.index.freq == "M" + assert expected.index.freq == "ME" tm.assert_frame_equal(result, expected) - result = df.resample("M", closed="left").mean() - exp = df.shift(1, freq="D").resample("M", kind="period").mean() + result = df.resample("ME", closed="left").mean() + exp = df.shift(1, freq="D").resample("ME", kind="period").mean() exp = exp.to_timestamp(how="end") exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") exp.index = exp.index.as_unit(unit)._with_freq("infer") - assert exp.index.freq == "M" + assert exp.index.freq == "ME" tm.assert_frame_equal(result, exp) rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) @@ -1201,7 +1201,7 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") ts.index = ts.index.as_unit(unit) - resampled = ts.resample("M").mean() + resampled = ts.resample("ME").mean() assert len(resampled) == 1 @@ -1254,7 +1254,7 @@ def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") ts.index = ts.index.as_unit(unit) - result = ts.resample("M", kind="period").mean() + result = ts.resample("ME", kind="period").mean() assert len(result) == 1 assert result.index[0] == Period("2000-04", freq="M") @@ -1321,23 +1321,23 @@ def test_how_lambda_functions(simple_date_range_series, unit): ts = simple_date_range_series("1/1/2000", "4/1/2000") ts.index = ts.index.as_unit(unit) - result = ts.resample("M").apply(lambda x: x.mean()) - exp 
= ts.resample("M").mean() + result = ts.resample("ME").apply(lambda x: x.mean()) + exp = ts.resample("ME").mean() tm.assert_series_equal(result, exp) - foo_exp = ts.resample("M").mean() + foo_exp = ts.resample("ME").mean() foo_exp.name = "foo" - bar_exp = ts.resample("M").std() + bar_exp = ts.resample("ME").std() bar_exp.name = "bar" - result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result = ts.resample("ME").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) result.columns = ["foo", "bar"] tm.assert_series_equal(result["foo"], foo_exp) tm.assert_series_equal(result["bar"], bar_exp) # this is a MI Series, so comparing the names of the results # doesn't make sense - result = ts.resample("M").aggregate( + result = ts.resample("ME").aggregate( {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} ) tm.assert_series_equal(result["foo"], foo_exp, check_names=False) @@ -1398,10 +1398,10 @@ def test_resample_consistency(unit): def test_resample_timegrouper(dates): # GH 7227 df = DataFrame({"A": dates, "B": np.arange(len(dates))}) - result = df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() exp_idx = DatetimeIndex( ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], - freq="M", + freq="ME", name="A", ) expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) @@ -1409,11 +1409,11 @@ def test_resample_timegrouper(dates): expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) df = DataFrame({"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))}) - result = df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() expected = DataFrame( {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, index=exp_idx, @@ -1423,7 +1423,7 @@ def test_resample_timegrouper(dates): expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) @@ -1485,7 +1485,7 @@ def test_resample_nunique_with_date_gap(func, unit): index2 = date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit) index3 = index.append(index2) s = Series(range(len(index3)), index=index3, dtype="int64") - r = s.resample("M") + r = s.resample("ME") result = r.count() expected = func(r) tm.assert_series_equal(result, expected) @@ -1872,9 +1872,9 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), - ("19910906", "19920406", "M", "19910831", "19920430"), - ("19910831", "19920430", "M", "19910831", "19920531"), - ("1991-08", "1992-04", "M", "19910831", "19920531"), + ("19910906", "19920406", "ME", "19910831", "19920430"), + ("19910831", "19920430", "ME", "19910831", "19920531"), + ("1991-08", "1992-04", "ME", "19910831", "19920531"), ], ) def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit): @@ -1895,7 +1895,7 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit) @pytest.mark.parametrize("duplicates", [True, False]) def test_resample_apply_product(duplicates, unit): # GH 
5586 - index = date_range(start="2012-01-31", freq="M", periods=12).as_unit(unit) + index = date_range(start="2012-01-31", freq="ME", periods=12).as_unit(unit) ts = Series(range(12), index=index) df = DataFrame({"A": ts, "B": ts + 2}) @@ -1968,7 +1968,7 @@ def test_resample_calendar_day_with_dst( @pytest.mark.parametrize("func", ["min", "max", "first", "last"]) def test_resample_aggregate_functions_min_count(func, unit): # GH#37768 - index = date_range(start="2020", freq="M", periods=3).as_unit(unit) + index = date_range(start="2020", freq="ME", periods=3).as_unit(unit) ser = Series([1, np.nan, np.nan], index) result = getattr(ser.resample("Q"), func)(min_count=2) expected = Series( @@ -2041,3 +2041,13 @@ def test_resample_empty_series_with_tz(): ) expected = Series([], index=expected_idx, name="values", dtype="float64") tm.assert_series_equal(result, expected) + + +def test_resample_M_deprecated(): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + expected = s.resample("2ME").mean() + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = s.resample("2M").mean() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index bc0c1984cf2f3..db3804a6600b9 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -103,17 +103,19 @@ def test_selection(self, index, freq, kind, kwargs): @pytest.mark.parametrize("month", MONTHS) @pytest.mark.parametrize("meth", ["ffill", "bfill"]) @pytest.mark.parametrize("conv", ["start", "end"]) - @pytest.mark.parametrize("targ", ["D", "B", "M"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")] + ) def test_annual_upsample_cases( - self, targ, conv, meth, month, simple_period_range_series + self, offset, period, conv, meth, month, simple_period_range_series ): ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"A-{month}") - warn = FutureWarning if targ == "B" else None + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): - result = getattr(ts.resample(targ, convention=conv), meth)() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, meth).to_period() + result = getattr(ts.resample(period, convention=conv), meth)() + expected = result.to_timestamp(period, how=conv) + expected = expected.asfreq(offset, meth).to_period() tm.assert_series_equal(result, expected) def test_basic_downsample(self, simple_period_range_series): @@ -182,19 +184,21 @@ def test_annual_upsample(self, simple_period_range_series): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) - @pytest.mark.parametrize("target", ["D", "B", "M"]) @pytest.mark.parametrize("convention", ["start", "end"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")] + ) def test_quarterly_upsample( - self, month, target, convention, simple_period_range_series + self, month, offset, period, convention, simple_period_range_series ): freq = f"Q-{month}" ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) - warn = FutureWarning if target == "B" else None + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): - result = 
ts.resample(target, convention=convention).ffill() - expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, "ffill").to_period() + result = ts.resample(period, convention=convention).ffill() + expected = result.to_timestamp(period, how=convention) + expected = expected.asfreq(offset, "ffill").to_period() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("target", ["D", "B"]) @@ -365,8 +369,8 @@ def test_fill_method_and_how_upsample(self): np.arange(9, dtype="int64"), index=date_range("2010-01-01", periods=9, freq="Q"), ) - last = s.resample("M").ffill() - both = s.resample("M").ffill().resample("M").last().astype("int64") + last = s.resample("ME").ffill() + both = s.resample("ME").ffill().resample("ME").last().astype("int64") tm.assert_series_equal(last, both) @pytest.mark.parametrize("day", DAYS) @@ -630,7 +634,7 @@ def test_resample_bms_2752(self): @pytest.mark.xfail(reason="Commented out for more than 3 years. Should this work?") def test_monthly_convention_span(self): - rng = period_range("2000-01", periods=3, freq="M") + rng = period_range("2000-01", periods=3, freq="ME") ts = Series(np.arange(3), index=rng) # hacky way to get same thing @@ -643,7 +647,7 @@ def test_monthly_convention_span(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "from_freq, to_freq", [("D", "M"), ("Q", "A"), ("M", "Q"), ("D", "W")] + "from_freq, to_freq", [("D", "ME"), ("Q", "A"), ("ME", "Q"), ("D", "W")] ) def test_default_right_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -656,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("M", "QS"), ("H", "D"), ("min", "H")], + [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("H", "D"), ("min", "H")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -837,7 +841,6 @@ def test_resample_with_only_nat(self): ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), - ("19910905 12:00", "19910909 1:00", "H", "M", "3H"), ("19910905", "19910913 06:00", "2H", "24H", "10H"), ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), @@ -851,43 +854,54 @@ def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): result = result.to_timestamp(end_freq) expected = ser.to_timestamp().resample(end_freq, offset=offset).mean() - if end_freq == "M": - # TODO: is non-tick the relevant characteristic? (GH 33815) - expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + def test_resample_with_offset_month(self): + # GH 23882 & 31809 + pi = period_range("19910905 12:00", "19910909 1:00", freq="H") + ser = Series(np.arange(len(pi)), index=pi) + result = ser.resample("M", offset="3H").mean() + result = result.to_timestamp("M") + expected = ser.to_timestamp().resample("ME", offset="3H").mean() + # TODO: is non-tick the relevant characteristic? 
(GH 33815) + expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "first,last,freq,exp_first,exp_last", + "first,last,freq,freq_to_offset,exp_first,exp_last", [ - ("19910905", "19920406", "D", "19910905", "19920406"), - ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ("19910905", "19920406", "D", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "D", "19910905", "19920406"), ( "19910905 06:00", "19920406 06:00", "H", + "H", "19910905 06:00", "19920406 06:00", ), - ("19910906", "19920406", "M", "1991-09", "1992-04"), - ("19910831", "19920430", "M", "1991-08", "1992-04"), - ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ("19910906", "19920406", "M", "ME", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "ME", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "ME", "1991-08", "1992-04"), ], ) - def test_get_period_range_edges(self, first, last, freq, exp_first, exp_last): + def test_get_period_range_edges( + self, first, last, freq, freq_to_offset, exp_first, exp_last + ): first = Period(first) last = Period(last) exp_first = Period(exp_first, freq=freq) exp_last = Period(exp_last, freq=freq) - freq = pd.tseries.frequencies.to_offset(freq) + freq = pd.tseries.frequencies.to_offset(freq_to_offset) result = _get_period_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected def test_sum_min_count(self): # GH 19974 - index = date_range(start="2018", freq="M", periods=6) + index = date_range(start="2018", freq="ME", periods=6) data = np.ones(6) data[3:6] = np.nan s = Series(data, index).to_period() @@ -915,3 +929,11 @@ def test_resample_t_l_deprecated(self): with tm.assert_produces_warning(FutureWarning, match=msg_t): result = ser.resample("T").mean() tm.assert_series_equal(result, expected) + + +def test_resample_frequency_ME_error_message(series_and_frame): + msg = "Invalid frequency: 2ME" + + obj = series_and_frame + with pytest.raises(ValueError, match=msg): + obj.resample("2ME") diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index f331851596317..86a7439410d8b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -599,7 +599,7 @@ def test_multi_agg_axis_1_raises(func): ).T warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - res = df.resample("M", axis=1) + res = df.resample("ME", axis=1) with pytest.raises( NotImplementedError, match="axis other than 0 is not supported" ): @@ -1023,7 +1023,7 @@ def test_df_axis_param_depr(): # Deprecation error when axis=1 is explicitly passed warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=1) + df.resample("ME", axis=1) # Deprecation error when axis=0 is explicitly passed df = df.T @@ -1032,7 +1032,7 @@ def test_df_axis_param_depr(): "will be removed in a future version." 
) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=0) + df.resample("ME", axis=0) def test_series_axis_param_depr(_test_series): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index d47a8132f26bb..f394e3a25dc0f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -310,14 +310,14 @@ def f(x): s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq="M")).apply(f) + expected = df.groupby(pd.Grouper(freq="ME")).apply(f) - result = df.resample("M").apply(f) + result = df.resample("ME").apply(f) tm.assert_frame_equal(result, expected) # A case for series - expected = df["col1"].groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) - result = df["col1"].resample("M").apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="ME"), group_keys=False).apply(f) + result = df["col1"].resample("ME").apply(f) tm.assert_series_equal(result, expected) @@ -469,7 +469,7 @@ def test_resample_groupby_agg_listlike(): # GH 42905 ts = Timestamp("2021-02-28 00:00:00") df = DataFrame({"class": ["beta"], "value": [69]}, index=Index([ts], name="date")) - resampled = df.groupby("class").resample("M")["value"] + resampled = df.groupby("class").resample("ME")["value"] result = resampled.agg(["sum", "size"]) expected = DataFrame( [[69, 1]], diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 8b1eab552c97d..3e0922228cb74 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -70,7 +70,7 @@ def test_apply_iteration(): N = 1000 ind = date_range(start="2000-01-01", freq="D", periods=N) df = DataFrame({"open": 1, "close": 2}, index=ind) - tg = Grouper(freq="M") + tg = Grouper(freq="ME") grouper, _ = tg._get_grouper(df) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 28ad133a0c8d6..8435f4a189c56 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -436,7 +436,7 @@ def test_pivot_no_values(self): }, index=idx, ) - res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="M")) + res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="ME")) exp_columns = MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) exp_columns.names = [None, "dt"] exp = DataFrame( @@ -445,7 +445,7 @@ def test_pivot_no_values(self): tm.assert_frame_equal(res, exp) res = df.pivot_table( - index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M") + index=Grouper(freq="A"), columns=Grouper(key="dt", freq="ME") ) exp = DataFrame( [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns @@ -1436,8 +1436,8 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - index=Grouper(freq="M", key="Date"), - columns=Grouper(freq="M", key="PayDay"), + index=Grouper(freq="ME", key="Date"), + columns=Grouper(freq="ME", key="PayDay"), values="Quantity", aggfunc="sum", ) @@ -1469,7 +1469,7 @@ def test_pivot_timegrouper_double(self): datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), columns=pd.DatetimeIndex( [ @@ -1478,7 +1478,7 @@ def test_pivot_timegrouper_double(self): datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), ) expected.index.name = "Date" @@ -1488,8 +1488,8 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - 
index=Grouper(freq="M", key="PayDay"), - columns=Grouper(freq="M", key="Date"), + index=Grouper(freq="ME", key="PayDay"), + columns=Grouper(freq="ME", key="Date"), values="Quantity", aggfunc="sum", ) @@ -1515,7 +1515,7 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + index=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], columns=["Branch"], values="Quantity", aggfunc="sum", @@ -1525,7 +1525,7 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, index=["Branch"], - columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], values="Quantity", aggfunc="sum", ) @@ -1731,7 +1731,7 @@ def test_daily(self, i): @pytest.mark.parametrize("i", range(1, 13)) def test_monthly(self, i): - rng = date_range("1/1/2000", "12/31/2004", freq="M") + rng = date_range("1/1/2000", "12/31/2004", freq="ME") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) @@ -1770,7 +1770,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): { "item": ["bacon", "cheese", "bacon", "cheese"], "cost": [2.5, 4.5, 3.2, 3.3], - "day": ["M", "M", "T", "T"], + "day": ["ME", "ME", "T", "T"], } ) table = costs.pivot_table( @@ -1782,10 +1782,10 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): ) ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item") tups = [ - ("mean", "cost", "M"), + ("mean", "cost", "ME"), ("mean", "cost", "T"), ("mean", "cost", margins_name), - ("max", "cost", "M"), + ("max", "cost", "ME"), ("max", "cost", "T"), ("max", "cost", margins_name), ] diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index a152da15fe71f..7d07f327e3978 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -69,10 +69,7 @@ def test_construction(self): assert i1 == i3 i4 = Period("2005", freq="M") - i5 = Period("2005", freq="m") - assert i1 != i4 - assert i4 == i5 i1 = Period.now(freq="Q") i2 = Period(datetime.now(), freq="Q") @@ -1614,3 +1611,9 @@ def test_invalid_frequency_error_message(): msg = "Invalid frequency: " with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") + + +def test_invalid_frequency_period_error_message(): + msg = "Invalid frequency: ME" + with pytest.raises(ValueError, match=msg): + Period("2012-01-02", freq="ME") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index cb797a4168088..b1483342eb6e4 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -608,6 +608,7 @@ def test_unit_parser(self, unit, np_unit, wrapper): @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_raises(self, unit): msg = "Units 'M', 'Y', and 'y' are no longer supported" + with pytest.raises(ValueError, match=msg): Timedelta(10, unit) @@ -690,7 +691,7 @@ def test_round_invalid(self): for freq, msg in [ ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ]: with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 
c7f9e7da4f398..fba16749c026e 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -499,7 +499,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.day_name(locale=time_locale).iloc[-1]) - ser = Series(date_range(freq="M", start="2012", end="2013")) + ser = Series(date_range(freq="ME", start="2012", end="2013")) result = ser.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index dbaba146e600e..317967bcbb7ff 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -411,7 +411,7 @@ def compare(slobj): def test_indexing_unordered2(): # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq="M") + rng = date_range(datetime(2005, 1, 1), periods=20, freq="ME") ts = Series(np.arange(len(rng)), index=rng) ts = ts.take(np.random.default_rng(2).permutation(20)) @@ -421,7 +421,7 @@ def test_indexing_unordered2(): def test_indexing(): - idx = date_range("2001-1-1", periods=20, freq="M") + idx = date_range("2001-1-1", periods=20, freq="ME") ts = Series(np.random.default_rng(2).random(len(idx)), index=idx) # getting diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index d2d8eab1cb38b..aabed794ac557 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -16,7 +16,7 @@ class TestCombineFirst: def test_combine_first_period_datetime(self): # GH#3367 - didx = date_range(start="1950-01-31", end="1950-07-31", freq="M") + didx = date_range(start="1950-01-31", end="1950-07-31", freq="ME") pidx = period_range(start=Period("1950-1"), end=Period("1950-7"), freq="M") # check to be consistent with DatetimeIndex for idx in [didx, pidx]: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2c3fdf627788a..d45c655a4c0a2 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1022,7 +1022,7 @@ def test_constructor_dtype_datetime64_8(self): def test_constructor_dtype_datetime64_7(self): # GH6529 # coerce datetime64 non-ns properly - dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="ME") values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index f25477afa2626..417bf0e90201b 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -13,7 +13,7 @@ @pytest.mark.parametrize( "freqstr,exp_freqstr", - [("D", "D"), ("W", "D"), ("M", "D"), ("s", "s"), ("min", "s"), ("H", "s")], + [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("H", "s")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): off = to_offset(freqstr) @@ -89,7 +89,7 @@ def test_cat(args): ("1H", "2021-01-01T09:00:00"), ("1D", "2021-01-02T08:00:00"), ("1W", "2021-01-03T08:00:00"), - ("1M", "2021-01-31T08:00:00"), + ("1ME", "2021-01-31T08:00:00"), ("1Y", "2021-12-31T08:00:00"), ], ) diff --git a/pandas/tests/tseries/frequencies/test_inference.py 
b/pandas/tests/tseries/frequencies/test_inference.py index ab7bda2fa5792..82cceeac2cd25 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -53,7 +53,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] - + ["M", "BM", "BMS"] + + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] ) @@ -162,7 +162,7 @@ def test_fifth_week_of_month(): def test_monthly_ambiguous(): rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"]) - assert rng.inferred_freq == "M" + assert rng.inferred_freq == "ME" def test_annual_ambiguous(): @@ -217,7 +217,7 @@ def test_infer_freq_index(freq, expected): { "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], - "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], "H": [ @@ -450,7 +450,7 @@ def test_series_period_index(freq): frequencies.infer_freq(s) -@pytest.mark.parametrize("freq", ["M", "ms", "s"]) +@pytest.mark.parametrize("freq", ["ME", "ms", "s"]) def test_series_datetime_index(freq): s = Series(date_range("20130101", periods=10, freq=freq)) inferred = frequencies.infer_freq(s) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 29215101a84e0..a7e9854c38f18 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -811,7 +811,7 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "min", "s", "ms", "us"] + lst = ["ME", "MS", "BM", "BMS", "D", "B", "H", "min", "s", "ms", "us"] for k in lst: assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... 
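An illustrative sketch (not part of the patch) of the renaming these tests exercise, assuming a pandas build that includes this change: timestamp-based APIs now spell month-end as "ME", while period APIs keep "M".

import pandas as pd

pd.date_range("2023-01-31", periods=3, freq="ME")  # month-end DatetimeIndex
pd.Period("2023-01", freq="M")                     # Period keeps the "M" alias
# pd.Period("2023-01", freq="ME") raises ValueError: Invalid frequency: ME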
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 2c8a6827a3bf1..ec3579109e7a4 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -168,7 +168,7 @@ def test_parsers_quarter_invalid(date_str): [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], ) def test_parsers_month_freq(date_str, expected): - result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="M") + result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="ME") assert result == expected diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 49cb1af9406fe..99f0a82d6711e 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -15,7 +15,7 @@ def get_freq_code(freqstr: str) -> int: - off = to_offset(freqstr) + off = to_offset(freqstr, is_period=True) # error: "BaseOffset" has no attribute "_period_dtype_code" code = off._period_dtype_code # type: ignore[attr-defined] return code diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index af88bd7b2a6d6..7aa245341cbdd 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -19,6 +19,10 @@ MONTHS, int_to_weekday, ) +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import ( build_field_sarray, month_position_check, @@ -53,58 +57,29 @@ ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin # --------------------------------------------------------------------- -# Offset names ("time rules") and related functions - -_offset_to_period_map = { - "WEEKDAY": "D", - "EOM": "M", - "BM": "M", - "BQS": "Q", - "QS": "Q", - "BQ": "Q", - "BA": "A", - "AS": "A", - "BAS": "A", - "MS": "M", - "D": "D", - "B": "B", - "min": "min", - "s": "s", - "ms": "ms", - "us": "us", - "ns": "ns", - "H": "H", - "Q": "Q", - "A": "A", - "W": "W", - "M": "M", - "Y": "A", - "BY": "A", - "YS": "A", - "BYS": "A", -} +# Offset related functions _need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] for _prefix in _need_suffix: for _m in MONTHS: key = f"{_prefix}-{_m}" - _offset_to_period_map[key] = _offset_to_period_map[_prefix] + OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix] for _prefix in ["A", "Q"]: for _m in MONTHS: _alias = f"{_prefix}-{_m}" - _offset_to_period_map[_alias] = _alias + OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias for _d in DAYS: - _offset_to_period_map[f"W-{_d}"] = f"W-{_d}" + OFFSET_TO_PERIOD_FREQSTR[f"W-{_d}"] = f"W-{_d}" def get_period_alias(offset_str: str) -> str | None: """ Alias to closest period strings BQ->Q etc. 
""" - return _offset_to_period_map.get(offset_str, None) + return OFFSET_TO_PERIOD_FREQSTR.get(offset_str, None) # --------------------------------------------------------------------- @@ -394,7 +369,7 @@ def _get_monthly_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BM"}.get(pos_check) def _is_business_daily(self) -> bool: # quick check: cannot be business daily @@ -584,7 +559,7 @@ def _maybe_coerce_freq(code) -> str: """ assert code is not None if isinstance(code, DateOffset): - code = code.rule_code + code = freq_to_period_freqstr(1, code.name) if code in {"min", "s", "ms", "us", "ns"}: return code else: From 28f274cb107ef0a252eb4cb954c1cfbe43968395 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Sep 2023 10:04:23 -0700 Subject: [PATCH 024/131] BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes (#54964) * REF: separate out _nbins_to_bins * Cast x to Index early * BUG: IntervalIndex.get_indexer incorrectly matching ints to datetimes * GH ref --- doc/source/whatsnew/v2.2.0.rst | 3 + pandas/core/indexes/base.py | 6 +- pandas/core/reshape/tile.py | 168 +++++++----------- .../tests/indexes/interval/test_indexing.py | 2 +- pandas/tests/reshape/test_cut.py | 26 ++- 5 files changed, 92 insertions(+), 113 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 3ad28222cc57a..8eab623a2b5f7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -288,6 +288,8 @@ Strings Interval ^^^^^^^^ - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) +- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Indexing @@ -349,6 +351,7 @@ Styler Other ^^^^^ +- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3adb4dfa227db..8703fef1e5940 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3938,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool: if isinstance(self.dtype, IntervalDtype): if isinstance(target.dtype, IntervalDtype): return False - # See https://github.com/pandas-dev/pandas/issues/47772 the commented - # out code can be restored (instead of hardcoding `return True`) - # once that issue is fixed # "Index" has no attribute "left" - # return self.left._should_compare(target) # type: ignore[attr-defined] - return True + return self.left._should_compare(target) # type: ignore[attr-defined] return False @final diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 126f589f5df71..980e8aa41669f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -17,10 +17,8 @@ Timestamp, lib, ) -from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - DT64NS_DTYPE, ensure_platform_int, is_bool_dtype, is_integer, @@ -243,7 +241,7 @@ def cut( original = x x_idx = _preprocess_for_cut(x) - x_idx, dtype = _coerce_to_type(x_idx) + x_idx, _ = _coerce_to_type(x_idx) if not np.iterable(bins): bins = _nbins_to_bins(x_idx, bins, right) @@ -254,16 +252,8 @@ def cut( else: bins = Index(bins) - if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): - bins = np.asarray(bins, dtype=DT64NS_DTYPE) - else: - bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins, dtype) - - # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype("float64")) < 0).any(): + if not bins.is_monotonic_increasing: raise ValueError("bins must increase monotonically.") - bins = Index(bins) fac, bins = _bins_to_cuts( x_idx, @@ -272,12 +262,11 @@ def cut( labels=labels, precision=precision, include_lowest=include_lowest, - dtype=dtype, duplicates=duplicates, ordered=ordered, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def qcut( @@ -343,13 +332,11 @@ def qcut( """ original = x x_idx = _preprocess_for_cut(x) - x_idx, dtype = _coerce_to_type(x_idx) + x_idx, _ = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x_idx) - x_np = x_np[~np.isnan(x_np)] - bins = np.quantile(x_np, quantiles) + bins = x_idx.to_series().dropna().quantile(quantiles) fac, bins = _bins_to_cuts( x_idx, @@ -357,11 +344,10 @@ def qcut( labels=labels, precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: @@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: rng = (x_idx.min(), x_idx.max()) mn, mx = rng - if np.isinf(mn) or np.isinf(mx): + is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance( + x_idx.dtype, DatetimeTZDtype + ) + + if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)): # GH#24314 raise ValueError( "cannot specify integer `bins` when input data contains infinity" ) if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + if is_dt_or_td: + # using seconds=1 is 
pretty arbitrary here + td = Timedelta(seconds=1) + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn - td, end=mx + td, periods=nbins + 1, freq=None + ) + else: + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) else: # adjust end points after binning - bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + if is_dt_or_td: + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn, end=mx, periods=nbins + 1, freq=None + ) + else: + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) adj = (mx - mn) * 0.001 # 0.1% of the range if right: bins[0] -= adj @@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: def _bins_to_cuts( - x: Index, + x_idx: Index, bins: Index, right: bool = True, labels=None, precision: int = 3, include_lowest: bool = False, - dtype: DtypeObj | None = None, duplicates: str = "raise", ordered: bool = True, ): @@ -422,7 +430,7 @@ def _bins_to_cuts( if isinstance(bins, IntervalIndex): # we have a fast-path here - ids = bins.get_indexer(x) + ids = bins.get_indexer(x_idx) cat_dtype = CategoricalDtype(bins, ordered=True) result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) return result, bins @@ -437,12 +445,29 @@ def _bins_to_cuts( bins = unique_bins side: Literal["left", "right"] = "left" if right else "right" - ids = ensure_platform_int(bins.searchsorted(x, side=side)) + + try: + ids = bins.searchsorted(x_idx, side=side) + except TypeError as err: + # e.g. 
test_datetime_nan_error if bins are DatetimeArray and x_idx + # is integers + if x_idx.dtype.kind == "m": + raise ValueError("bins must be of timedelta64 dtype") from err + elif x_idx.dtype.kind == bins.dtype.kind == "M": + raise ValueError( + "Cannot use timezone-naive bins with timezone-aware values, " + "or vice-versa" + ) from err + elif x_idx.dtype.kind == "M": + raise ValueError("bins must be of datetime64 dtype") from err + else: + raise + ids = ensure_platform_int(ids) if include_lowest: - ids[np.asarray(x) == bins[0]] = 1 + ids[x_idx == bins[0]] = 1 - na_mask = isna(x) | (ids == len(bins)) | (ids == 0) + na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -454,7 +479,7 @@ def _bins_to_cuts( if labels is None: labels = _format_labels( - bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + bins, precision, right=right, include_lowest=include_lowest ) elif ordered and len(set(labels)) != len(labels): raise ValueError( @@ -513,68 +538,7 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) x = Index(x_arr) - if dtype is not None: - # GH 19768: force NaT to NaN during integer conversion - x_arr = np.where(x.notna(), x.view(np.int64), np.nan) - x = Index(x_arr) - - return x, dtype - - -def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None): - """ - if the passed bin is of datetime/timedelta type, - this method converts it to integer - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Raises - ------ - ValueError if bins are not of a compat dtype to dtype - """ - bins_dtype = infer_dtype(bins, skipna=False) - if lib.is_np_dtype(dtype, "m"): - if bins_dtype in ["timedelta", "timedelta64"]: - bins = to_timedelta(bins).view(np.int64) - else: - raise ValueError("bins must be of timedelta64 dtype") - elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): - if bins_dtype in ["datetime", "datetime64"]: - bins = to_datetime(bins) - if lib.is_np_dtype(bins.dtype, "M"): - # As of 2.0, to_datetime may give non-nano, so we need to convert - # here until the rest of this file recognizes non-nano - bins = bins.astype("datetime64[ns]", copy=False) - bins = bins.view(np.int64) - else: - raise ValueError("bins must be of datetime64 dtype") - - return bins - - -def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): - """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is - datelike - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Returns - ------- - bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is - datelike - """ - if isinstance(dtype, DatetimeTZDtype): - bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) - elif lib.is_np_dtype(dtype, "mM"): - bins = Index(bins.astype(np.int64), dtype=dtype) - return bins + return Index(x), dtype def _format_labels( @@ -582,21 +546,20 @@ def _format_labels( precision: int, right: bool = True, include_lowest: bool = False, - dtype: DtypeObj | None = None, ): """based on the dtype, return our labels""" closed: IntervalLeftRight = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] - if isinstance(dtype, DatetimeTZDtype): - formatter = lambda x: Timestamp(x, tz=dtype.tz) + if isinstance(bins.dtype, DatetimeTZDtype): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "M"): - 
formatter = Timestamp + elif lib.is_np_dtype(bins.dtype, "M"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "m"): - formatter = Timedelta + elif lib.is_np_dtype(bins.dtype, "m"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") else: precision = _infer_precision(precision, bins) @@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index: return Index(x) -def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): +def _postprocess_for_cut(fac, bins, retbins: bool, original): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi if not retbins: return fac - bins = _convert_bin_to_datelike_type(bins, dtype) if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): bins = bins._values diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index e65ae52e348c6..db8f697b95cd8 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -307,9 +307,9 @@ def test_get_indexer_datetime(self): result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str)) tm.assert_numpy_array_equal(result, expected) - # TODO this should probably be deprecated? # https://github.com/pandas-dev/pandas/issues/47772 result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8) + expected = np.array([-1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 3a284f7732ac1..8c4c51289870b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -495,15 +495,33 @@ def test_datetime_cut(data): tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize( - "bins", - [ - 3, +@pytest.mark.parametrize("box", [list, np.array, Index, Series]) +def test_datetime_tz_cut_mismatched_tzawareness(box): + # GH#54964 + bins = box( [ Timestamp("2013-01-01 04:57:07.200000"), Timestamp("2013-01-01 21:00:00"), Timestamp("2013-01-02 13:00:00"), Timestamp("2013-01-03 05:00:00"), + ] + ) + ser = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + msg = "Cannot use timezone-naive bins with timezone-aware values" + with pytest.raises(ValueError, match=msg): + cut(ser, bins) + + +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"), ], ], ) From 2a08b052305ed35150d1355bc537548bec697778 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 20 Sep 2023 18:39:03 -0400 Subject: [PATCH 025/131] Faster ensure_string_array (#55183) --- pandas/_libs/lib.pyx | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0c0610f72044e..23c8066b973f8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -776,6 +776,7 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) bint already_copied = True + ndarray[object] newarr if hasattr(arr, "to_numpy"): @@ -800,8 +801,30 @@ cpdef ndarray[object] ensure_string_array( # 
short-circuit, all elements are str return result + if arr.dtype.kind == "f": # non-optimized path + for i in range(n): + val = arr[i] + + if not already_copied: + result = result.copy() + already_copied = True + + if not checknull(val): + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = f"{val}" + + return result + + newarr = np.asarray(arr, dtype=object) for i in range(n): - val = arr[i] + val = newarr[i] if isinstance(val, str): continue @@ -2831,9 +2854,14 @@ NoDefault = Literal[_NoDefault.no_default] @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, - object na_value=no_default, cnp.dtype dtype=np.dtype(object) - ) -> np.ndarray: +def map_infer_mask( + ndarray[object] arr, + object f, + const uint8_t[:] mask, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> np.ndarray: """ Substitute for np.vectorize with pandas-friendly dtype inference. From 52036d9f67ad00be7fcf81e0dbf9159f74dd7853 Mon Sep 17 00:00:00 2001 From: Boyd Kane <33420535+beyarkay@users.noreply.github.com> Date: Fri, 22 Sep 2023 02:10:28 +0200 Subject: [PATCH 026/131] Typo: 'whenlines' -> 'when lines' (#55233) --- pandas/io/json/_json.py | 2 +- pandas/tests/io/json/test_readlines.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ecab14a54beff..d596891197aaa 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -175,7 +175,7 @@ def to_json( if mode == "a" and (not lines or orient != "records"): msg = ( - "mode='a' (append) is only supported when" + "mode='a' (append) is only supported when " "lines is True and orient is 'records'" ) raise ValueError(msg) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index c2f915e33df8a..d7baba87bba31 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -399,7 +399,7 @@ def test_to_json_append_orient(orient_): # Test ValueError when orient is not 'records' df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): @@ -411,7 +411,7 @@ def test_to_json_append_lines(): # Test ValueError when lines is not True df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): From b43d79d5cb7d9797321f3c7252206b0078124c47 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Fri, 22 Sep 2023 08:00:51 +0530 Subject: [PATCH 027/131] BUG: groupby.rank with nullable types (#54460) --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/masked.py | 3 +++ pandas/tests/groupby/test_rank.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d422fb1be9fdf..51b4c4f297b07 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -789,6 +789,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, 
:meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` returns wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`) +- Bug in :meth:`DataFrame.groupby.rank` on nullable datatypes when passing ``na_option="bottom"`` or ``na_option="top"`` (:issue:`54206`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2cf28c28427ab..fdcbe67bbc371 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1487,6 +1487,9 @@ def _groupby_op( else: result_mask = np.zeros(ngroups, dtype=bool) + if how == "rank" and kwargs.get("na_option") in ["top", "bottom"]: + result_mask[:] = False + res_values = op._cython_op_ndim_compat( self._data, min_count=min_count, diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 5d85a0783e024..a3b7da3fa836c 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -710,3 +710,12 @@ def test_rank_categorical(): expected = df.astype(object).groupby("col1").rank() tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("na_option", ["top", "bottom"]) +def test_groupby_op_with_nullables(na_option): + # GH 54206 + df = DataFrame({"x": [None]}, dtype="Float64") + result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) + expected = Series([1.0], dtype="Float64", name=result.name) + tm.assert_series_equal(result, expected) From 2d16863717c82457522aa915249ba185df79a9c2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:59:38 -0400 Subject: [PATCH 028/131] PERF: Add type-hints in tzconversion.pyx (#55241) --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8eab623a2b5f7..ebca1605be8d5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -238,7 +238,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) -- +- Performance improvement when localizing time to UTC (:issue:`55241`) .. --------------------------------------------------------------------------- .. 
_whatsnew_220.bug_fixes: diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e17d264333264..a96779aa33255 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -462,7 +462,7 @@ cdef str _render_tstamp(int64_t val, NPY_DATETIMEUNIT creso): cdef _get_utc_bounds( - ndarray vals, + ndarray[int64_t] vals, int64_t* tdata, Py_ssize_t ntrans, const int64_t[::1] deltas, @@ -472,7 +472,7 @@ cdef _get_utc_bounds( # result_a) or right of the DST transition (store in result_b) cdef: - ndarray result_a, result_b + ndarray[int64_t] result_a, result_b Py_ssize_t i, n = vals.size int64_t val, v_left, v_right Py_ssize_t isl, isr, pos_left, pos_right From ee9a736c723f0d6b7097cd608d57a977e8285a77 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 22 Sep 2023 18:50:51 +0200 Subject: [PATCH 029/131] DOC: point out that explicitly passing axis=None is deprecated for sum, prod, var, std, sem (#55240) point out that explicitly passing axis=None is deprecated for sum, prod, var, std, sem --- pandas/core/generic.py | 45 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c75c6c546aad0..d0fab0037d99d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12532,6 +12532,39 @@ def last_valid_index(self) -> Hashable | None: {examples} """ +_sum_prod_doc = """ +{desc} + +Parameters +---------- +axis : {axis_descr} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + +skipna : bool, default True + Exclude NA/null values when computing the result. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + +{min_count}\ +**kwargs + Additional keyword arguments to be passed to the function. + +Returns +------- +{name1} or scalar\ +{see_also}\ +{examples} +""" + _num_ddof_doc = """ {desc} @@ -12539,6 +12572,13 @@ def last_valid_index(self) -> Hashable | None: ---------- axis : {axis_descr} For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -13246,7 +13286,7 @@ def make_doc(name: str, ndim: int) -> str: kwargs = {"min_count": ""} elif name == "sum": - base_doc = _num_doc + base_doc = _sum_prod_doc desc = ( "Return the sum of the values over the requested axis.\n\n" "This is equivalent to the method ``numpy.sum``." @@ -13256,7 +13296,7 @@ def make_doc(name: str, ndim: int) -> str: kwargs = {"min_count": _min_count_stub} elif name == "prod": - base_doc = _num_doc + base_doc = _sum_prod_doc desc = "Return the product of the values over the requested axis." 
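# Illustrative aside (not from the diff): the shared templates now take a
# ``name`` field, so the axis=None deprecation text names the method being
# documented, e.g.:
#
#     template = "The behavior of DataFrame.{name} with ``axis=None`` is deprecated"
#     template.format(name="prod")
#     # -> "The behavior of DataFrame.prod with ``axis=None`` is deprecated"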
see_also = _stat_func_see_also examples = _prod_examples @@ -13540,6 +13580,7 @@ def make_doc(name: str, ndim: int) -> str: docstr = base_doc.format( desc=desc, + name=name, name1=name1, name2=name2, axis_descr=axis_descr, From cae3f16986e73d2dc9a245a52149d8aec1307ab2 Mon Sep 17 00:00:00 2001 From: rebecca-palmer Date: Fri, 22 Sep 2023 18:02:02 +0100 Subject: [PATCH 030/131] DOC: fix build failure with sphinx 7 and SOURCE_DATE_EPOCH (#54653) * DOC: fix build failure with sphinx 7 and SOURCE_DATE_EPOCH * DOC: address review comments --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index accbff596b12d..86d2494707ce2 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -162,7 +162,7 @@ # General information about the project. project = "pandas" # We have our custom "pandas_footer.html" template, using copyright for the current year -copyright = f"{datetime.now().year}" +copyright = f"{datetime.now().year}," # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the From f19f81fc1ed21a797ada3ede1c17645d12cc261f Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 22 Sep 2023 20:55:10 +0100 Subject: [PATCH 031/131] BUG: incompatible dtype when creating string column with loc (#55249) dont raise unnecessary warning --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/dtypes/missing.py | 3 +++ pandas/tests/frame/indexing/test_indexing.py | 15 +++++++++++++++ pandas/tests/frame/indexing/test_set_value.py | 5 +---- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 2c9b10160d144..97aeb56924e65 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - .. 
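A minimal reproduction, mirroring the regression test added below:

import pandas as pd

df = pd.DataFrame({"x": [1]})
df.loc[df["x"] == 1, "y"] = "1"  # creates a new "y" column; this used to emit
                                 # a spurious incompatible-dtype FutureWarning

The whole point of the change: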
--------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7117e34b23ca4..8760c8eeca454 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,6 +624,9 @@ def infer_fill_value(val): return np.array("NaT", dtype=DT64NS_DTYPE) elif dtype in ["timedelta", "timedelta64"]: return np.array("NaT", dtype=TD64NS_DTYPE) + return np.array(np.nan, dtype=object) + elif val.dtype.kind == "U": + return np.array(np.nan, dtype=val.dtype) return np.nan diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index b324291bab31e..370cbf0f33174 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1890,6 +1890,21 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): df.loc[key] = 1 +def test_adding_new_conditional_column() -> None: + # https://github.com/pandas-dev/pandas/issues/55025 + df = DataFrame({"x": [1]}) + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame({"x": [1], "y": ["1"]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"x": [1]}) + # try inserting something which numpy would store as 'object' + value = lambda x: x + df.loc[df["x"] == 1, "y"] = value + expected = DataFrame({"x": [1], "y": [value]}) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index cd9ffa0f129a2..32312868adacb 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -26,10 +26,7 @@ def test_set_value_resize(self, float_frame): assert float_frame._get_value("foobar", "qux") == 0 res = float_frame.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - res._set_value("foobar", "baz", "sam") + res._set_value("foobar", "baz", "sam") assert res["baz"].dtype == np.object_ res = float_frame.copy() From d9a2ad17f8a34f9aa954b43ee52c3e0ccc0fe2b4 Mon Sep 17 00:00:00 2001 From: Amith KK Date: Sat, 23 Sep 2023 05:16:31 +0530 Subject: [PATCH 032/131] ENH: Add on_bad_lines for pyarrow (#54643) * ENH: Add on_bad_lines for pyarrow (SQUASHED) * Update to appropriate version in docstring * Address review comments * Refine whatsnew * Add "error" value * Condense What's New * Move to "Other Enhancements" * Refactor tests in "test_read_errors" to work with added capabilities * Conditionally import pyarrow error types * Revert changes in v2.2.0.rst > enhancements * Address review comments * Address review comments * Wrap ArrowInvalid with ParserError * Change ArrowInvalid to optional import --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/io/parsers/arrow_parser_wrapper.py | 45 ++++++++++++++++--- pandas/io/parsers/readers.py | 13 ++++-- .../io/parser/common/test_read_errors.py | 39 ++++++++++++++-- pandas/tests/io/parser/test_unsupported.py | 10 +++-- 5 files changed, 93 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ebca1605be8d5..930e03ae7d75a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -73,6 +73,8 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ + +- :func:`read_csv` now supports ``on_bad_lines`` parameter with 
``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index bb6bcd3c4d6a0..765a4ffcd2cb9 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -1,11 +1,17 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + ParserError, + ParserWarning, +) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.inference import is_integer @@ -85,6 +91,30 @@ def _get_pyarrow_options(self) -> None: and option_name in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } + + on_bad_lines = self.kwds.get("on_bad_lines") + if on_bad_lines is not None: + if callable(on_bad_lines): + self.parse_options["invalid_row_handler"] = on_bad_lines + elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: + self.parse_options[ + "invalid_row_handler" + ] = None # PyArrow raises an exception by default + elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: + + def handle_warning(invalid_row): + warnings.warn( + f"Expected {invalid_row.expected_columns} columns, but found " + f"{invalid_row.actual_columns}: {invalid_row.text}", + ParserWarning, + stacklevel=find_stack_level(), + ) + return "skip" + + self.parse_options["invalid_row_handler"] = handle_warning + elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: + self.parse_options["invalid_row_handler"] = lambda _: "skip" + self.convert_options = { option_name: option_value for option_name, option_value in self.kwds.items() @@ -190,12 +220,15 @@ def read(self) -> DataFrame: pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() - table = pyarrow_csv.read_csv( - self.src, - read_options=pyarrow_csv.ReadOptions(**self.read_options), - parse_options=pyarrow_csv.ParseOptions(**self.parse_options), - convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), - ) + try: + table = pyarrow_csv.read_csv( + self.src, + read_options=pyarrow_csv.ReadOptions(**self.read_options), + parse_options=pyarrow_csv.ParseOptions(**self.parse_options), + convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), + ) + except pa.ArrowInvalid as e: + raise ParserError(e) from e dtype_backend = self.kwds["dtype_backend"] diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index acf35ebd6afe5..6ce6ac71b1ddd 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -401,6 +401,13 @@ expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine='python'`` + .. versionchanged:: 2.2.0 + + - Callable, function with signature + as described in `pyarrow documentation + _` when ``engine='pyarrow'`` + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. 
If this option @@ -494,7 +501,6 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "on_bad_lines", "delim_whitespace", "quoting", "lineterminator", @@ -2142,9 +2148,10 @@ def _refine_defaults_read( elif on_bad_lines == "skip": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP elif callable(on_bad_lines): - if engine != "python": + if engine not in ["python", "pyarrow"]: raise ValueError( - "on_bad_line can only be a callable function if engine='python'" + "on_bad_line can only be a callable function " + "if engine='python' or 'pyarrow'" ) kwds["on_bad_lines"] = on_bad_lines else: diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 0c5a2e0d04e5a..4e82dca83e2d0 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -1,5 +1,5 @@ """ -Tests that work on both the Python and C engines but do not have a +Tests that work on the Python, C and PyArrow engines but do not have a specific classification into the other test modules. """ import codecs @@ -21,7 +21,8 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_empty_decimal_marker(all_parsers): @@ -33,10 +34,17 @@ def test_empty_decimal_marker(all_parsers): msg = "Only length-1 decimal markers supported" parser = all_parsers + if parser.engine == "pyarrow": + msg = ( + "only single character unicode strings can be " + "converted to Py_UCS4, got length 0" + ) + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), decimal="") +@skip_pyarrow def test_bad_stream_exception(all_parsers, csv_dir_path): # see gh-13652 # @@ -57,6 +65,7 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): parser.read_csv(stream) +@skip_pyarrow def test_malformed(all_parsers): # see gh-6607 parser = all_parsers @@ -71,6 +80,7 @@ def test_malformed(all_parsers): parser.read_csv(StringIO(data), header=1, comment="#") +@skip_pyarrow @pytest.mark.parametrize("nrows", [5, 3, None]) def test_malformed_chunks(all_parsers, nrows): data = """ignore @@ -90,6 +100,7 @@ def test_malformed_chunks(all_parsers, nrows): reader.read(nrows) +@skip_pyarrow def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -109,6 +120,7 @@ def test_catch_too_many_names(all_parsers): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@skip_pyarrow @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -147,6 +159,10 @@ def test_error_bad_lines(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" msg = "Expected 1 fields in line 3, saw 3" + + if parser.engine == "pyarrow": + msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -156,9 +172,13 @@ def test_warn_bad_lines(all_parsers): parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) + match_msg = "Skipping line" + + if parser.engine == "pyarrow": + match_msg = "Expected 1 columns, but found 3: 1,2,3" with tm.assert_produces_warning( - ParserWarning, match="Skipping line", check_stacklevel=False + ParserWarning, match=match_msg, check_stacklevel=False ): result = parser.read_csv(StringIO(data), on_bad_lines="warn") 
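# Illustrative aside: with the new pyarrow support, the equivalent call
#     parser.read_csv(StringIO(data), engine="pyarrow", on_bad_lines="warn")
# routes the bad row through handle_warning, emitting
# "Expected 1 columns, but found 3: 1,2,3" and skipping it; that is why
# match_msg above branches on parser.engine.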
tm.assert_frame_equal(result, expected) @@ -174,10 +194,14 @@ def test_read_csv_wrong_num_columns(all_parsers): parser = all_parsers msg = "Expected 6 fields in line 3, saw 7" + if parser.engine == "pyarrow": + msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12" + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) +@skip_pyarrow def test_null_byte_char(request, all_parsers): # see gh-2741 data = "\x00,foo" @@ -200,6 +224,7 @@ def test_null_byte_char(request, all_parsers): parser.read_csv(StringIO(data), names=names) +@skip_pyarrow @pytest.mark.filterwarnings("always::ResourceWarning") def test_open_file(request, all_parsers): # GH 39024 @@ -238,6 +263,8 @@ def test_bad_header_uniform_error(all_parsers): "Could not construct index. Requested to use 1 " "number of columns, but 3 left to parse." ) + elif parser.engine == "pyarrow": + msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") @@ -253,9 +280,13 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers): a,b """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) + match_msg = "Skipping line" + + if parser.engine == "pyarrow": + match_msg = "Expected 2 columns, but found 3: a,b,c" with tm.assert_produces_warning( - ParserWarning, match="Skipping line", check_stacklevel=False + ParserWarning, match=match_msg, check_stacklevel=False ): result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index ac171568187cd..b489c09e917af 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -151,13 +151,17 @@ def test_pyarrow_engine(self): with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) - def test_on_bad_lines_callable_python_only(self, all_parsers): + def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 + # GH 54643 sio = StringIO("a,b\n1,2") bad_lines_func = lambda x: x parser = all_parsers - if all_parsers.engine != "python": - msg = "on_bad_line can only be a callable function if engine='python'" + if all_parsers.engine not in ["python", "pyarrow"]: + msg = ( + "on_bad_line can only be a callable " + "function if engine='python' or 'pyarrow'" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(sio, on_bad_lines=bad_lines_func) else: From 9e1096e8373bc99675fd1b3490cfb7cf26041395 Mon Sep 17 00:00:00 2001 From: jfadia <90651438+jfadia@users.noreply.github.com> Date: Fri, 22 Sep 2023 16:48:53 -0700 Subject: [PATCH 033/131] DOC: Fix doc for first_valid_index (#55236) * Fix doc for first_valid_index * Replace notes with examples * Add explicit prints --- pandas/core/generic.py | 46 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0fab0037d99d..427687d9614f9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12466,11 +12466,6 @@ def first_valid_index(self) -> Hashable | None: ------- type of index - Notes - ----- - If all elements are non-NA/null, returns None. - Also returns None for empty {klass}. 
- Examples -------- For Series: @@ -12481,6 +12476,22 @@ def first_valid_index(self) -> Hashable | None: >>> s.last_valid_index() 2 + >>> s = pd.Series([None, None]) + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If all elements in Series are NA/null, returns None. + + >>> s = pd.Series() + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If Series is empty, returns None. + For DataFrame: >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}}) @@ -12493,6 +12504,31 @@ def first_valid_index(self) -> Hashable | None: 1 >>> df.last_valid_index() 2 + + >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}}) + >>> df + A B + 0 None None + 1 None None + 2 None None + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If all elements in DataFrame are NA/null, returns None. + + >>> df = pd.DataFrame() + >>> df + Empty DataFrame + Columns: [] + Index: [] + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If DataFrame is empty, returns None. """ return self._find_valid_index(how="first") From d01669f5cdfff4edc889994331901b1f97946fdd Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:21:30 -0400 Subject: [PATCH 034/131] CLN: Remove temp_setattr from groupby.transform (#55262) --- pandas/core/groupby/groupby.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d607baf18d6cb..a022bfd1bd9bc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2012,15 +2012,13 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. - # Temporarily set observed for dealing with categoricals. - with com.temp_setattr(self, "observed", True): - with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) return self._wrap_transform_fast_result(result) From 37221468b5ed118595606d4b0e67321b163a8b09 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 25 Sep 2023 23:38:06 +0530 Subject: [PATCH 035/131] Updated `pre-commit` hooks versions. (#55277) Updated pre-commit hooks version and checked all the files accordingly. 
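For context, one kind of rewrite the bumped pyupgrade hook applies when run with --py39-plus (per the config below) is the PEP 585 generics cleanup; a hypothetical before/after:

# before: from typing import List
#         def dedupe(items: List[int]) -> List[int]: ...
def dedupe(items: list[int]) -> list[int]:  # after --py39-plus
    return sorted(set(items))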
--- .pre-commit-config.yaml | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c01bf65818167..b0b511e1048c6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,11 +20,11 @@ ci: repos: - repo: https://github.com/hauntsaninja/black-pre-commit-mirror # black compiled with mypyc - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.287 + rev: v0.0.291 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -107,7 +107,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py39-plus] diff --git a/pyproject.toml b/pyproject.toml index 4e1c77413efda..df89bd129901f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -187,7 +187,7 @@ environment = {CFLAGS="-g0"} [tool.black] target-version = ['py39', 'py310'] -required-version = '23.7.0' +required-version = '23.9.1' exclude = ''' ( asv_bench/env From f9ade667e2ebc6cc1731065ccbf40d4e70d9c3b5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 11:08:55 -0700 Subject: [PATCH 036/131] Bump pypa/cibuildwheel from 2.15.0 to 2.16.0 (#55276) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.15.0 to 2.16.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.15.0...v2.16.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 83d14b51092e6..4c7a7b329777b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@v2.16.0 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From bdc2cd4c6423c23e8796d80b589dea937fffc2aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Lucas=20Mayer?= Date: Mon, 25 Sep 2023 15:20:52 -0300 Subject: [PATCH 037/131] ASV: add GroupBy.sum benchmark with timedelta and integer types (#55273) add SumTimeDelta asv benchmark Co-authored-by: Willian Wang --- asv_bench/benchmarks/groupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b206523dfe851..4567b5b414301 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -841,6 +841,23 @@ def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() +class SumTimeDelta: + # GH 20660 + def setup(self): + N = 10**4 + self.df = DataFrame( + np.random.randint(1000, 100000, (N, 100)), + index=np.random.randint(200, size=(N,)), + ).astype("timedelta64[ns]") + self.df_int = self.df.copy().astype("int64") + + def time_groupby_sum_timedelta(self): + self.df.groupby(lambda x: x).sum() + + def time_groupby_sum_int(self): + 
self.df_int.groupby(lambda x: x).sum() + + class Transform: def setup(self): n1 = 400 From ce814c211c787216f10d251081d44ea8996ca9f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Sep 2023 08:22:33 -1000 Subject: [PATCH 038/131] TST: Clean up autouse fixtures (#55269) * TST: Clean up autouse fixtures * Use another flavor_read_html --- pandas/tests/arithmetic/conftest.py | 18 - pandas/tests/arithmetic/test_numeric.py | 7 + pandas/tests/frame/test_arithmetic.py | 11 +- pandas/tests/io/formats/test_format.py | 15 +- pandas/tests/io/test_html.py | 444 +++++++++++++----------- pandas/tests/series/test_arithmetic.py | 9 +- pandas/tests/test_expressions.py | 118 +++---- 7 files changed, 313 insertions(+), 309 deletions(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 7dd5169202ba4..7ec77e5b65b7e 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -7,23 +7,8 @@ RangeIndex, ) import pandas._testing as tm -from pandas.core.computation import expressions as expr -@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS - - -# ------------------------------------------------------------------ - - -# doctest with +SKIP for one fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): """ @@ -58,9 +43,6 @@ def one(request): zeros.extend([0, 0.0, -0.0]) -# doctest with +SKIP for zero fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=zeros) def zero(request): """ diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index fa17c24fffb26..c93a03ad0f479 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -29,6 +29,13 @@ ) +@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param + + @pytest.fixture(params=[Index, Series, tm.to_array]) def box_pandas_1d_array(request): """ diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 1488fa65fabc0..24a70e55e2f0e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -33,11 +33,10 @@ @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param class DummyElement: @@ -1074,7 +1073,7 @@ def test_frame_with_frame_reindex(self): ], ids=lambda x: x.__name__, ) - def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements, request): + def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): skip = { (operator.truediv, "bool"), (operator.pow, "bool"), diff 
--git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f3a1ebe23b568..53ee449c2dc0c 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -22,8 +22,6 @@ import pytest import pytz -from pandas._config import config - import pandas as pd from pandas import ( DataFrame, @@ -51,17 +49,6 @@ def get_local_am_pm(): return am_local, pm_local -@pytest.fixture(autouse=True) -def clean_config(): - curr_deprecated_options = config._deprecated_options.copy() - curr_registered_options = config._registered_options.copy() - curr_global_config = config._global_config.copy() - yield - config._deprecated_options = curr_deprecated_options - config._registered_options = curr_registered_options - config._global_config = curr_global_config - - @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -3604,7 +3591,7 @@ def test_repr_html_ipython_config(ip): df._repr_html_() """ ) - result = ip.run_cell(code) + result = ip.run_cell(code, silent=True) assert not result.error_in_exec diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6cf90749e5b30..dcee52011a691 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -99,15 +99,18 @@ def test_same_ordering(datapath): assert_framelist_equal(dfs_lxml, dfs_bs4) -@pytest.mark.parametrize( - "flavor", - [ +@pytest.fixture( + params=[ pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]), pytest.param("lxml", marks=td.skip_if_no("lxml")), ], ) +def flavor_read_html(request): + return partial(read_html, flavor=request.param) + + class TestReadHtml: - def test_literal_html_deprecation(self): + def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 msg = ( "Passing literal html to 'read_html' is deprecated and " @@ -116,7 +119,7 @@ def test_literal_html_deprecation(self): ) with tm.assert_produces_warning(FutureWarning, match=msg): - self.read_html( + flavor_read_html( """ @@ -147,12 +150,7 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") - @pytest.fixture(autouse=True) - def set_defaults(self, flavor): - self.read_html = partial(read_html, flavor=flavor) - yield - - def test_to_html_compat(self): + def test_to_html_compat(self, flavor_read_html): df = ( tm.makeCustomDataframe( 4, @@ -165,12 +163,12 @@ def test_to_html_compat(self): .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[ - 0 - ] + res = flavor_read_html( + StringIO(out), attrs={"class": "dataframe"}, index_col=0 + )[0] tm.assert_frame_equal(res, df) - def test_dtype_backend(self, string_storage, dtype_backend): + def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): # GH#50286 df = DataFrame( { @@ -196,7 +194,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - result = self.read_html(StringIO(out), dtype_backend=dtype_backend)[0] + result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -227,16 +225,16 @@ def test_dtype_backend(self, string_storage, dtype_backend): @pytest.mark.network @pytest.mark.single_cpu - def test_banklist_url(self, httpserver, banklist_data): + def test_banklist_url(self, httpserver, banklist_data, flavor_read_html): with open(banklist_data, 
encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html( + df1 = flavor_read_html( # lxml cannot find attrs leave out for now httpserver.url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) # lxml cannot find attrs leave out for now - df2 = self.read_html( + df2 = flavor_read_html( httpserver.url, match="Metcalf Bank", ) # attrs={"class": "dataTable"}) @@ -245,165 +243,169 @@ def test_banklist_url(self, httpserver, banklist_data): @pytest.mark.network @pytest.mark.single_cpu - def test_spam_url(self, httpserver, spam_data): + def test_spam_url(self, httpserver, spam_data, flavor_read_html): with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html(httpserver.url, match=".*Water.*") - df2 = self.read_html(httpserver.url, match="Unit") + df1 = flavor_read_html(httpserver.url, match=".*Water.*") + df2 = flavor_read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow - def test_banklist(self, banklist_data): - df1 = self.read_html(banklist_data, match=".*Florida.*", attrs={"id": "table"}) - df2 = self.read_html(banklist_data, match="Metcalf Bank", attrs={"id": "table"}) + def test_banklist(self, banklist_data, flavor_read_html): + df1 = flavor_read_html( + banklist_data, match=".*Florida.*", attrs={"id": "table"} + ) + df2 = flavor_read_html( + banklist_data, match="Metcalf Bank", attrs={"id": "table"} + ) assert_framelist_equal(df1, df2) - def test_spam(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*") - df2 = self.read_html(spam_data, match="Unit") + def test_spam(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*") + df2 = flavor_read_html(spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" assert df1[0].columns[0] == "Nutrient" - def test_spam_no_match(self, spam_data): - dfs = self.read_html(spam_data) + def test_spam_no_match(self, spam_data, flavor_read_html): + dfs = flavor_read_html(spam_data) for df in dfs: assert isinstance(df, DataFrame) - def test_banklist_no_match(self, banklist_data): - dfs = self.read_html(banklist_data, attrs={"id": "table"}) + def test_banklist_no_match(self, banklist_data, flavor_read_html): + dfs = flavor_read_html(banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) - def test_spam_header(self, spam_data): - df = self.read_html(spam_data, match=".*Water.*", header=2)[0] + def test_spam_header(self, spam_data, flavor_read_html): + df = flavor_read_html(spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty - def test_skiprows_int(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_int(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_range(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=range(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=range(2)) + def test_skiprows_range(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=range(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=range(2)) assert_framelist_equal(df1, df2) - def test_skiprows_list(self, 
spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) - df2 = self.read_html(spam_data, match="Unit", skiprows=[2, 1]) + def test_skiprows_list(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) - def test_skiprows_set(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows={1, 2}) - df2 = self.read_html(spam_data, match="Unit", skiprows={2, 1}) + def test_skiprows_set(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows={1, 2}) + df2 = flavor_read_html(spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) - def test_skiprows_slice(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_slice(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_slice_short(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(2)) + def test_skiprows_slice_short(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) - def test_skiprows_slice_long(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) + def test_skiprows_slice_long(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) - def test_skiprows_ndarray(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=np.arange(2)) + def test_skiprows_ndarray(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) - def test_skiprows_invalid(self, spam_data): + def test_skiprows_invalid(self, spam_data, flavor_read_html): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): - self.read_html(spam_data, match=".*Water.*", skiprows="asdf") + flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf") - def test_index(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + def test_index(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_no_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_no_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, 
index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_with_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_with_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_infer_types(self, spam_data): + def test_infer_types(self, spam_data, flavor_read_html): # 10892 infer_types removed - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_string_io(self, spam_data): + def test_string_io(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data1 = StringIO(f.read()) with open(spam_data, encoding="UTF-8") as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, match=".*Water.*") - df2 = self.read_html(data2, match="Unit") + df1 = flavor_read_html(data1, match=".*Water.*") + df2 = flavor_read_html(data2, match="Unit") assert_framelist_equal(df1, df2) - def test_string(self, spam_data): + def test_string(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data = f.read() - df1 = self.read_html(StringIO(data), match=".*Water.*") - df2 = self.read_html(StringIO(data), match="Unit") + df1 = flavor_read_html(StringIO(data), match=".*Water.*") + df2 = flavor_read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) - def test_file_like(self, spam_data): + def test_file_like(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: - df1 = self.read_html(f, match=".*Water.*") + df1 = flavor_read_html(f, match=".*Water.*") with open(spam_data, encoding="UTF-8") as f: - df2 = self.read_html(f, match="Unit") + df2 = flavor_read_html(f, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.network @pytest.mark.single_cpu - def test_bad_url_protocol(self, httpserver): + def test_bad_url_protocol(self, httpserver, flavor_read_html): httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): - self.read_html("git://github.com", match=".*Water.*") + flavor_read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network @pytest.mark.single_cpu - def test_invalid_url(self, httpserver): + def test_invalid_url(self, httpserver, flavor_read_html): httpserver.serve_content("Name or service not known", code=404) with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): - self.read_html(httpserver.url, match=".*Water.*") + flavor_read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow - def test_file_url(self, banklist_data): + def test_file_url(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) assert isinstance(dfs, list) @@ -411,54 +413,78 @@ def test_file_url(self, banklist_data): assert isinstance(df, DataFrame) @pytest.mark.slow - def test_invalid_table_attrs(self, 
banklist_data): + def test_invalid_table_attrs(self, banklist_data, flavor_read_html): url = banklist_data with pytest.raises(ValueError, match="No tables found"): - self.read_html( + flavor_read_html( url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) - def _bank_data(self, path, **kwargs): - return self.read_html(path, match="Metcalf", attrs={"id": "table"}, **kwargs) - @pytest.mark.slow - def test_multiindex_header(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1])[0] + def test_multiindex_header(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1] + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_index(self, banklist_data): - df = self._bank_data(banklist_data, index_col=[0, 1])[0] + def test_multiindex_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1] + )[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], index_col=[0, 1])[0] + def test_multiindex_header_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + )[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows_tuples(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index_skiprows(self, banklist_data): - df = self._bank_data( - banklist_data, header=[0, 1], index_col=[0, 1], skiprows=1 + def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + skiprows=1, )[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_regex_idempotency(self, banklist_data): + def test_regex_idempotency(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile("Florida")), attrs={"id": "table"}, @@ -467,10 +493,10 @@ def test_regex_idempotency(self, banklist_data): for df in dfs: assert isinstance(df, DataFrame) - def test_negative_skiprows(self, spam_data): + def test_negative_skiprows(self, spam_data, flavor_read_html): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(spam_data, match="Water", skiprows=-1) + flavor_read_html(spam_data, match="Water", skiprows=-1) @pytest.fixture def 
python_docs(self): @@ -523,20 +549,20 @@ def python_docs(self): @pytest.mark.network @pytest.mark.single_cpu - def test_multiple_matches(self, python_docs, httpserver): + def test_multiple_matches(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network @pytest.mark.single_cpu - def test_python_docs_table(self, python_docs, httpserver): + def test_python_docs_table(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == ["Pyth", "What"] - def test_empty_tables(self): + def test_empty_tables(self, flavor_read_html): """ Make sure that read_html ignores empty tables. """ @@ -560,13 +586,13 @@ def test_empty_tables(self):
""" - result = self.read_html(StringIO(html)) + result = flavor_read_html(StringIO(html)) assert len(result) == 1 - def test_multiple_tbody(self): + def test_multiple_tbody(self, flavor_read_html): # GH-20690 # Read all tbody tags within a single table. - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -595,12 +621,12 @@ def test_multiple_tbody(self): tm.assert_frame_equal(result, expected) - def test_header_and_one_column(self): + def test_header_and_one_column(self, flavor_read_html): """ Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -621,11 +647,11 @@ def test_header_and_one_column(self): tm.assert_frame_equal(result, expected) - def test_thead_without_tr(self): + def test_thead_without_tr(self, flavor_read_html): """ Ensure parser adds within on malformed HTML. """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -653,7 +679,7 @@ def test_thead_without_tr(self): tm.assert_frame_equal(result, expected) - def test_tfoot_read(self): + def test_tfoot_read(self, flavor_read_html): """ Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot @@ -685,16 +711,16 @@ def test_tfoot_read(self): data1 = data_template.format(footer="") data2 = data_template.format(footer="") - result1 = self.read_html(StringIO(data1))[0] - result2 = self.read_html(StringIO(data2))[0] + result1 = flavor_read_html(StringIO(data1))[0] + result2 = flavor_read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) - def test_parse_header_of_non_string_column(self): + def test_parse_header_of_non_string_column(self, flavor_read_html): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str - result = self.read_html( + result = flavor_read_html( StringIO( """
<tr><td>footA</td><th>footB</th></tr>
@@ -717,7 +743,7 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) @pytest.mark.slow - def test_banklist_header(self, banklist_data, datapath): + def test_banklist_header(self, banklist_data, datapath, flavor_read_html): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -726,7 +752,7 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] + df = flavor_read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -765,19 +791,19 @@ def try_remove_ws(x): tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow - def test_gold_canyon(self, banklist_data): + def test_gold_canyon(self, banklist_data, flavor_read_html): gc = "Gold Canyon" with open(banklist_data, encoding="utf-8") as f: raw_text = f.read() assert gc in raw_text - df = self.read_html(banklist_data, match="Gold Canyon", attrs={"id": "table"})[ - 0 - ] + df = flavor_read_html( + banklist_data, match="Gold Canyon", attrs={"id": "table"} + )[0] assert gc in df.to_string() - def test_different_number_of_cols(self): - expected = self.read_html( + def test_different_number_of_cols(self, flavor_read_html): + expected = flavor_read_html( StringIO( """
@@ -813,7 +839,7 @@ def test_different_number_of_cols(self): index_col=0, )[0] - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -848,9 +874,9 @@ def test_different_number_of_cols(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_1(self): + def test_colspan_rowspan_1(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -873,7 +899,7 @@ def test_colspan_rowspan_1(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_copy_values(self): + def test_colspan_rowspan_copy_values(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -881,7 +907,7 @@ def test_colspan_rowspan_copy_values(self): # X x Y Z W # A B b z C - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -908,7 +934,7 @@ def test_colspan_rowspan_copy_values(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_both_not_1(self): + def test_colspan_rowspan_both_not_1(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -916,7 +942,7 @@ def test_colspan_rowspan_both_not_1(self): # A B b b C # a b b b D - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -940,7 +966,7 @@ def test_colspan_rowspan_both_not_1(self): tm.assert_frame_equal(result, expected) - def test_rowspan_at_end_of_row(self): + def test_rowspan_at_end_of_row(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -948,7 +974,7 @@ def test_rowspan_at_end_of_row(self): # A B # C b - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -969,10 +995,10 @@ def test_rowspan_at_end_of_row(self): tm.assert_frame_equal(result, expected) - def test_rowspan_only_rows(self): + def test_rowspan_only_rows(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -990,9 +1016,9 @@ def test_rowspan_only_rows(self): tm.assert_frame_equal(result, expected) - def test_header_inferred_from_rows_with_only_th(self): + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1018,15 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self): tm.assert_frame_equal(result, expected) - def test_parse_dates_list(self): + def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() - res = self.read_html(StringIO(expected), parse_dates=[1], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(StringIO(expected), parse_dates=["date"], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) - def test_parse_dates_combine(self): + def test_parse_dates_combine(self, flavor_read_html): raw_dates = Series(date_range("1/1/2001", periods=10)) df = DataFrame( { @@ -1034,32 +1060,32 @@ def test_parse_dates_combine(self): "time": raw_dates.map(lambda x: str(x.time())), } ) - res = self.read_html( + res = flavor_read_html( StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_wikipedia_states_table(self, datapath): + def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" - result = self.read_html(data, match="Arizona", header=1)[0] + result = flavor_read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") assert np.allclose(result.loc[0, "sq mi"], 665384.04) - def test_wikipedia_states_multiindex(self, datapath): + def test_wikipedia_states_multiindex(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") - result = self.read_html(data, match="Arizona", index_col=0)[0] + result = flavor_read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) - def test_parser_error_on_empty_header_row(self): - result = self.read_html( + def test_parser_error_on_empty_header_row(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1083,9 +1109,9 @@ def test_parser_error_on_empty_header_row(self): ) tm.assert_frame_equal(result[0], expected) - def test_decimal_rows(self): + def test_decimal_rows(self, flavor_read_html): # GH 12907 - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1113,7 +1139,7 @@ def test_decimal_rows(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) - def test_bool_header_arg(self, spam_data, arg): + def test_bool_header_arg(self, spam_data, arg, flavor_read_html): # GH 6114 msg = re.escape( "Passing a bool to header is invalid. Use header=None for no header or " @@ -1121,11 +1147,11 @@ def test_bool_header_arg(self, spam_data, arg): "column names" ) with pytest.raises(TypeError, match=msg): - self.read_html(spam_data, header=arg) + flavor_read_html(spam_data, header=arg) - def test_converters(self): + def test_converters(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1150,9 +1176,9 @@ def test_converters(self): tm.assert_frame_equal(result, expected) - def test_na_values(self): + def test_na_values(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1177,7 +1203,7 @@ def test_na_values(self): tm.assert_frame_equal(result, expected) - def test_keep_default_na(self): + def test_keep_default_na(self, flavor_read_html): html_data = """
@@ -1195,15 +1221,15 @@ def test_keep_default_na(self):
""" expected_df = DataFrame({"a": ["N/A", "NA"]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=False)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=True)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) - def test_preserve_empty_rows(self): - result = self.read_html( + def test_preserve_empty_rows(self, flavor_read_html): + result = flavor_read_html( StringIO( """ @@ -1228,8 +1254,8 @@ def test_preserve_empty_rows(self): tm.assert_frame_equal(result, expected) - def test_ignore_empty_rows_when_inferring_header(self): - result = self.read_html( + def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1251,7 +1277,7 @@ def test_ignore_empty_rows_when_inferring_header(self): tm.assert_frame_equal(result, expected) - def test_multiple_header_rows(self): + def test_multiple_header_rows(self, flavor_read_html): # Issue #13434 expected_df = DataFrame( data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] @@ -1261,20 +1287,20 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) - html_df = self.read_html(StringIO(html))[0] + html_df = flavor_read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self, datapath): + def test_works_on_valid_markup(self, datapath, flavor_read_html): filename = datapath("io", "data", "html", "valid_markup.html") - dfs = self.read_html(filename, index_col=0) + dfs = flavor_read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self, datapath): + def test_fallback_success(self, datapath, flavor_read_html): banklist_data = datapath("io", "data", "html", "banklist.html") - self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) + flavor_read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): rng = date_range("2000-01-01", periods=10) @@ -1309,7 +1335,7 @@ def test_to_html_borderless(self): (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), ], ) - def test_displayed_only(self, displayed_only, exp0, exp1): + def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): # GH 20027 data = """ @@ -1331,7 +1357,7 @@ def test_displayed_only(self, displayed_only, exp0, exp1): """ - dfs = self.read_html(StringIO(data), displayed_only=displayed_only) + dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: @@ -1340,7 +1366,7 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table @pytest.mark.parametrize("displayed_only", [True, False]) - def test_displayed_only_with_many_elements(self, displayed_only): + def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html): html_table = """
@@ -1357,7 +1383,9 @@ def test_displayed_only_with_many_elements(self, displayed_only):
""" - result = read_html(StringIO(html_table), displayed_only=displayed_only)[0] + result = flavor_read_html(StringIO(html_table), displayed_only=displayed_only)[ + 0 + ] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @@ -1365,23 +1393,23 @@ def test_displayed_only_with_many_elements(self, displayed_only): "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" ) - def test_encode(self, html_encoding_file): + def test_encode(self, html_encoding_file, flavor_read_html): base_path = os.path.basename(html_encoding_file) root = os.path.splitext(base_path)[0] _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: - from_string = self.read_html( + from_string = flavor_read_html( fobj.read(), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: - from_file_like = self.read_html( + from_file_like = flavor_read_html( BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() - from_filename = self.read_html( + from_filename = flavor_read_html( html_encoding_file, encoding=encoding, index_col=0 ).pop() tm.assert_frame_equal(from_string, from_file_like) @@ -1393,10 +1421,10 @@ def test_encode(self, html_encoding_file): pytest.skip() raise - def test_parse_failure_unseekable(self): + def test_parse_failure_unseekable(self, flavor_read_html): # Issue #17975 - if self.read_html.keywords.get("flavor") == "lxml": + if flavor_read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): @@ -1408,12 +1436,12 @@ def seekable(self):
return False bad = UnseekableStringIO( """ <table><tr><td>spam<br />eggs</td></tr></table>
""" ) - assert self.read_html(bad) + assert flavor_read_html(bad) with pytest.raises(ValueError, match="passed a non-rewindable file object"): - self.read_html(bad) + flavor_read_html(bad) - def test_parse_failure_rewinds(self): + def test_parse_failure_rewinds(self, flavor_read_html): # Issue #17975 class MockFile: @@ -1444,12 +1472,12 @@ def __iter__(self) -> Iterator: good = MockFile("
<table><tr><td>spam<br />eggs</td></tr></table>
") bad = MockFile("
<table><tr>spam<br />eggs</tr></table>
") - assert self.read_html(good) - assert self.read_html(bad) + assert flavor_read_html(good) + assert flavor_read_html(bad) @pytest.mark.slow @pytest.mark.single_cpu - def test_importcheck_thread_safety(self, datapath): + def test_importcheck_thread_safety(self, datapath, flavor_read_html): # see gh-16928 class ErrorThread(threading.Thread): @@ -1462,8 +1490,8 @@ def run(self): self.err = None filename = datapath("io", "data", "html", "valid_markup.html") - helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) - helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,)) helper_thread1.start() helper_thread2.start() @@ -1472,17 +1500,17 @@ def run(self): pass assert None is helper_thread1.err is helper_thread2.err - def test_parse_path_object(self, datapath): + def test_parse_path_object(self, datapath, flavor_read_html): # GH 37705 file_path_string = datapath("io", "data", "html", "spam.html") file_path = Path(file_path_string) - df1 = self.read_html(file_path_string)[0] - df2 = self.read_html(file_path)[0] + df1 = flavor_read_html(file_path_string)[0] + df2 = flavor_read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_parse_br_as_space(self): + def test_parse_br_as_space(self, flavor_read_html): # GH 29528: pd.read_html() convert
to space - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1502,7 +1530,7 @@ def test_parse_br_as_space(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) - def test_extract_links(self, arg): + def test_extract_links(self, arg, flavor_read_html): gh_13141_data = """
@@ -1565,7 +1593,7 @@ def test_extract_links(self, arg): elif arg == "header": head_exp = gh_13141_expected["head_extract"] - result = self.read_html(StringIO(gh_13141_data), extract_links=arg)[0] + result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan) tm.assert_frame_equal(result, expected) @@ -1578,7 +1606,7 @@ def test_extract_links_bad(self, spam_data): with pytest.raises(ValueError, match=msg): read_html(spam_data, extract_links="incorrect") - def test_extract_links_all_no_header(self): + def test_extract_links_all_no_header(self, flavor_read_html): # GH 48316 data = """
@@ -1589,7 +1617,7 @@ def test_extract_links_all_no_header(self):
""" - result = self.read_html(StringIO(data), extract_links="all")[0] + result = flavor_read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) @@ -1601,7 +1629,7 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_html("test", dtype_backend="numpy") - def test_style_tag(self): + def test_style_tag(self, flavor_read_html): # GH 48316 data = """ @@ -1622,6 +1650,6 @@ def test_style_tag(self):
""" - result = self.read_html(StringIO(data))[0] + result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 55fc77fb5705f..8547fd6988791 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -30,11 +30,10 @@ @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield def _permute(obj): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 1e66cefbcfdd0..dfec99f0786eb 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -102,12 +102,6 @@ def _array_mixed2(_mixed2): @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") class TestExpressions: - @pytest.fixture(autouse=True) - def save_min_elements(self): - min_elements = expr._MIN_ELEMENTS - yield - expr._MIN_ELEMENTS = min_elements - @staticmethod def call_op(df, other, flex: bool, opname: str): if flex: @@ -140,21 +134,24 @@ def call_op(df, other, flex: bool, opname: str): @pytest.mark.parametrize( "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"] ) - def test_run_arithmetic(self, request, fixture, flex, arith): + def test_run_arithmetic(self, request, fixture, flex, arith, monkeypatch): df = request.getfixturevalue(fixture) - expr._MIN_ELEMENTS = 0 - result, expected = self.call_op(df, df, flex, arith) - - if arith == "truediv": - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + result, expected = self.call_op(df, df, flex, arith) - for i in range(len(df.columns)): - result, expected = self.call_op(df.iloc[:, i], df.iloc[:, i], flex, arith) if arith == "truediv": - assert expected.dtype.kind == "f" + assert all(x.kind == "f" for x in expected.dtypes.values) tm.assert_equal(expected, result) + for i in range(len(df.columns)): + result, expected = self.call_op( + df.iloc[:, i], df.iloc[:, i], flex, arith + ) + if arith == "truediv": + assert expected.dtype.kind == "f" + tm.assert_equal(expected, result) + @pytest.mark.parametrize( "fixture", [ @@ -168,7 +165,7 @@ def test_run_arithmetic(self, request, fixture, flex, arith): ], ) @pytest.mark.parametrize("flex", [True, False]) - def test_run_binary(self, request, fixture, flex, comparison_op): + def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch): """ tests solely that the result is the same whether or not numexpr is enabled. 
Need to test whether the function does the correct thing @@ -179,18 +176,19 @@ def test_run_binary(self, request, fixture, flex, comparison_op): with option_context("compute.use_numexpr", False): other = df.copy() + 1 - expr._MIN_ELEMENTS = 0 - expr.set_test_mode(True) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + expr.set_test_mode(True) - result, expected = self.call_op(df, other, flex, arith) + result, expected = self.call_op(df, other, flex, arith) - used_numexpr = expr.get_test_result() - assert used_numexpr, "Did not use numexpr as expected." - tm.assert_equal(expected, result) + used_numexpr = expr.get_test_result() + assert used_numexpr, "Did not use numexpr as expected." + tm.assert_equal(expected, result) - for i in range(len(df.columns)): - binary_comp = other.iloc[:, i] + 1 - self.call_op(df.iloc[:, i], binary_comp, flex, "add") + for i in range(len(df.columns)): + binary_comp = other.iloc[:, i] + 1 + self.call_op(df.iloc[:, i], binary_comp, flex, "add") def test_invalid(self): array = np.random.default_rng(2).standard_normal(1_000_001) @@ -406,7 +404,7 @@ def test_bool_ops_column_name_dtype(self, test_input, expected): "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv") ) @pytest.mark.parametrize("axis", (0, 1)) - def test_frame_series_axis(self, axis, arith, _frame): + def test_frame_series_axis(self, axis, arith, _frame, monkeypatch): # GH#26736 Dataframe.floordiv(Series, axis=1) fails df = _frame @@ -415,15 +413,16 @@ def test_frame_series_axis(self, axis, arith, _frame): else: other = df.iloc[:, 0] - expr._MIN_ELEMENTS = 0 + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) - op_func = getattr(df, arith) + op_func = getattr(df, arith) - with option_context("compute.use_numexpr", False): - expected = op_func(other, axis=axis) + with option_context("compute.use_numexpr", False): + expected = op_func(other, axis=axis) - result = op_func(other, axis=axis) - tm.assert_frame_equal(expected, result) + result = op_func(other, axis=axis) + tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "op", @@ -436,29 +435,32 @@ def test_frame_series_axis(self, axis, arith, _frame): ) @pytest.mark.parametrize("box", [DataFrame, Series, Index]) @pytest.mark.parametrize("scalar", [-5, 5]) - def test_python_semantics_with_numexpr_installed(self, op, box, scalar): + def test_python_semantics_with_numexpr_installed( + self, op, box, scalar, monkeypatch + ): # https://github.com/pandas-dev/pandas/issues/36047 - expr._MIN_ELEMENTS = 0 - data = np.arange(-50, 50) - obj = box(data) - method = getattr(obj, op) - result = method(scalar) - - # compare result with numpy - with option_context("compute.use_numexpr", False): - expected = method(scalar) - - tm.assert_equal(result, expected) - - # compare result element-wise with Python - for i, elem in enumerate(data): - if box == DataFrame: - scalar_result = result.iloc[i, 0] - else: - scalar_result = result[i] - try: - expected = getattr(int(elem), op)(scalar) - except ZeroDivisionError: - pass - else: - assert scalar_result == expected + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + data = np.arange(-50, 50) + obj = box(data) + method = getattr(obj, op) + result = method(scalar) + + # compare result with numpy + with option_context("compute.use_numexpr", False): + expected = method(scalar) + + tm.assert_equal(result, expected) + + # compare result element-wise with Python + for i, elem in enumerate(data): + if box == DataFrame: + scalar_result = 
result.iloc[i, 0] + else: + scalar_result = result[i] + try: + expected = getattr(int(elem), op)(scalar) + except ZeroDivisionError: + pass + else: + assert scalar_result == expected From 2e3cb7685887f94997b9b83d0662aeb7a406e665 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 25 Sep 2023 14:42:45 -0400 Subject: [PATCH 039/131] BUG: DatetimeIndex.union returning object dtype for indexes with the same tz but different units (#55259) * add DatetimeTZDtype._get_common_type * mypy --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/dtypes/dtypes.py | 7 +++++++ pandas/tests/indexes/datetimes/test_setops.py | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 930e03ae7d75a..ca09a69f603ab 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -259,7 +259,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Timedelta diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index beff71d5e9dd9..196c95c3673e9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -928,6 +928,13 @@ def __setstate__(self, state) -> None: self._tz = state["tz"] self._unit = state["unit"] + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + if all(isinstance(t, DatetimeTZDtype) and t.tz == self.tz for t in dtypes): + np_dtype = np.max([cast(DatetimeTZDtype, t).base for t in [self, *dtypes]]) + unit = np.datetime_data(np_dtype)[0] + return type(self)(unit=unit, tz=self.tz) + return super()._get_common_dtype(dtypes) + @cache_readonly def index_class(self) -> type_t[DatetimeIndex]: from pandas import DatetimeIndex diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index b56bad7f2e833..ca784948a5d29 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -189,6 +189,14 @@ def test_union_with_DatetimeIndex(self, sort): # Fails with "AttributeError: can't set attribute" i2.union(i1, sort=sort) + def test_union_same_timezone_different_units(self): + # GH 55238 + idx1 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("ms") + idx2 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + result = idx1.union(idx2) + expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + tm.assert_index_equal(result, expected) + # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): first = tm.makeDateIndex(10) From 764c460cdd1d14dfa9a28d9367ab9874ac8037ba Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Sep 2023 11:43:40 -0700 Subject: [PATCH 040/131] BUG: Series[slc]=foo raising with IntervalIndex (#55258) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 12 +++------ .../tests/indexing/interval/test_interval.py | 27 +++++++++++++++++++ 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ca09a69f603ab..0760840f9950a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -292,6 +292,7 @@ Interval - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. 
(:issue:`55015`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) +- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) - Indexing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8703fef1e5940..e23887159c9c6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4204,15 +4204,9 @@ def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): self._validate_indexer("slice", key.step, "getitem") return key - # convert the slice to an indexer here - - # special case for interval_dtype bc we do not do partial-indexing - # on integer Intervals when slicing - # TODO: write this in terms of e.g. should_partial_index? - ints_are_positional = self._should_fallback_to_positional or isinstance( - self.dtype, IntervalDtype - ) - is_positional = is_index_slice and ints_are_positional + # convert the slice to an indexer here; checking that the user didn't + # pass a positional slice to loc + is_positional = is_index_slice and self._should_fallback_to_positional # if we are mixed and have integers if is_positional: diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 52a1d433712ff..ae25724972fde 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -134,6 +134,33 @@ def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): tm.assert_equal(result, expected) + def test_setitem_interval_with_slice(self): + # GH#54722 + ii = IntervalIndex.from_breaks(range(4, 15)) + ser = Series(range(10), index=ii) + + orig = ser.copy() + + # This should be a no-op (used to raise) + ser.loc[1:3] = 20 + tm.assert_series_equal(ser, orig) + + ser.loc[6:8] = 19 + orig.iloc[1:4] = 19 + tm.assert_series_equal(ser, orig) + + ser2 = Series(range(5), index=ii[::2]) + orig2 = ser2.copy() + + # this used to raise + ser2.loc[6:8] = 22 # <- raises on main, sets on branch + orig2.iloc[1] = 22 + tm.assert_series_equal(ser2, orig2) + + ser2.loc[5:7] = 21 + orig2.iloc[:2] = 21 + tm.assert_series_equal(ser2, orig2) + class TestIntervalIndexInsideMultiIndex: def test_mi_intervalindex_slicing_with_scalar(self): From 89bd5695df3ccf5b9186b4645118656656831593 Mon Sep 17 00:00:00 2001 From: Artur Barseghyan Date: Tue, 26 Sep 2023 02:11:06 +0200 Subject: [PATCH 041/131] Update pyproject.toml - replace `output_formatting` with `output-formatting` (#55275) * Update pyproject.toml * Update v2.1.2.rst * Update v2.1.2.rst * Update doc/source/whatsnew/v2.1.2.rst * Update install.rst * Update package-checks.yml --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/package-checks.yml | 2 +- doc/source/getting_started/install.rst | 6 +++--- doc/source/whatsnew/v2.1.2.rst | 2 +- pyproject.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 64a94d7fde5a9..a2c42af53c3a8 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", 
"aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2c0787397e047..4a0ad4ae658c3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -247,14 +247,14 @@ Dependency Minimum Version pip ext Visualization ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[plot, output_formatting]"``. +Installable with ``pip install "pandas[plot, output-formatting]"``. ========================= ================== ================== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================== ============================================================= matplotlib 3.6.1 plot Plotting library -Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style -tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_) +Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style +tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ================== ============================================================= Computation diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97aeb56924e65..6fec66ec8d556 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,7 +29,7 @@ Bug fixes Other ~~~~~ -- +- Fixed non-working installation of optional dependency group ``output_formatting``. Replacing underscore ``_`` with a dash ``-`` fixes broken dependency resolution. A correct way to use now is ``pip install pandas[output-formatting]``. - .. 
--------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index df89bd129901f..89432c2353ea8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ sql-other = ['SQLAlchemy>=1.4.36'] html = ['beautifulsoup4>=4.11.1', 'html5lib>=1.1', 'lxml>=4.8.0'] xml = ['lxml>=4.8.0'] plot = ['matplotlib>=3.6.1'] -output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] +output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] compression = ['zstandard>=0.17.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] From 1f167626987f0d87970f966a083124b8d1fe6ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 26 Sep 2023 11:48:56 -0400 Subject: [PATCH 042/131] TYP: Misc changes for pandas-stubs; use Protocol to avoid str in Sequence (#55263) * TYP: misc changes for pandas-stubs test * re-write changes from 47233 with SequenceNotStr * pyupgrade --- pandas/_typing.py | 43 +++++++++++++++++++++++++++++---- pandas/core/frame.py | 13 +++++----- pandas/core/generic.py | 7 +++--- pandas/core/methods/describe.py | 2 +- pandas/core/resample.py | 2 +- pandas/core/reshape/merge.py | 28 +++++++++++---------- pandas/core/series.py | 2 +- pandas/io/formats/csvs.py | 7 +++--- pandas/io/formats/format.py | 3 ++- pandas/tests/io/test_sql.py | 2 -- 10 files changed, 73 insertions(+), 36 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index c2bbebfbe2857..0e2a0881f0122 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -24,6 +24,7 @@ Type as type_t, TypeVar, Union, + overload, ) import numpy as np @@ -85,6 +86,8 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] + from typing import SupportsIndex + if sys.version_info >= (3, 10): from typing import TypeGuard # pyright: ignore[reportUnusedImport] else: @@ -109,10 +112,40 @@ # list-like -# Cannot use `Sequence` because a string is a sequence, and we don't want to -# accept that. Could refine if https://github.com/python/typing/issues/256 is -# resolved to differentiate between Sequence[str] and str -ListLike = Union[AnyArrayLike, list, tuple, range] +# from https://github.com/hauntsaninja/useful_types +# includes Sequence-like objects but excludes str and bytes +_T_co = TypeVar("_T_co", covariant=True) + + +class SequenceNotStr(Protocol[_T_co]): + @overload + def __getitem__(self, index: SupportsIndex, /) -> _T_co: + ... + + @overload + def __getitem__(self, index: slice, /) -> Sequence[_T_co]: + ... + + def __contains__(self, value: object, /) -> bool: + ... + + def __len__(self) -> int: + ... + + def __iter__(self) -> Iterator[_T_co]: + ... + + def index(self, value: Any, /, start: int = 0, stop: int = ...) -> int: + ... + + def count(self, value: Any, /) -> int: + ... + + def __reversed__(self) -> Iterator[_T_co]: + ... 
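The comment above the Protocol states the goal: accept sequence-like containers while excluding ``str`` and ``bytes``, which would otherwise satisfy ``Sequence[str]`` structurally. A hedged sketch of how a static type checker treats it (illustrative only; ``SequenceNotStr`` lives in the private ``pandas._typing`` module, so user code should not import it):

    from pandas._typing import SequenceNotStr

    def join_labels(labels: SequenceNotStr[str]) -> str:
        # list, tuple and Index-like containers of str match the Protocol
        return "|".join(labels)

    join_labels(["a", "b"])  # accepted by the type checker
    join_labels(("a", "b"))  # accepted
    join_labels("ab")        # rejected: under typeshed's stubs, str.__contains__
                             # accepts only str, so str does not satisfy the
                             # Protocol's __contains__(value: object)

Nothing is enforced at runtime (the Protocol is not ``runtime_checkable``); the payoff is purely in signatures such as ``header: bool | SequenceNotStr[str]`` in the ``to_string``/``to_latex`` overloads below.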
+ + +ListLike = Union[AnyArrayLike, SequenceNotStr, range] # scalars @@ -120,7 +153,7 @@ DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date] -IntStrT = TypeVar("IntStrT", int, str) +IntStrT = TypeVar("IntStrT", bound=Union[int, str]) # timestamp and timedelta convertible types diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e32a6d93b023..432c0a745c7a0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -240,6 +240,7 @@ Renamer, Scalar, Self, + SequenceNotStr, SortKind, StorageOptions, Suffixes, @@ -1187,7 +1188,7 @@ def to_string( buf: None = ..., columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list[str] = ..., + header: bool | SequenceNotStr[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1212,7 +1213,7 @@ def to_string( buf: FilePath | WriteBuffer[str], columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list[str] = ..., + header: bool | SequenceNotStr[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1250,7 +1251,7 @@ def to_string( buf: FilePath | WriteBuffer[str] | None = None, columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, - header: bool | list[str] = True, + header: bool | SequenceNotStr[str] = True, index: bool = True, na_rep: str = "NaN", formatters: fmt.FormattersType | None = None, @@ -10563,9 +10564,9 @@ def merge( self, right: DataFrame | Series, how: MergeHow = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 427687d9614f9..738f4cbe6bc43 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -72,6 +72,7 @@ Renamer, Scalar, Self, + SequenceNotStr, SortKind, StorageOptions, Suffixes, @@ -3273,7 +3274,7 @@ def to_latex( self, buf: None = ..., columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., + header: bool_t | SequenceNotStr[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3300,7 +3301,7 @@ def to_latex( self, buf: FilePath | WriteBuffer[str], columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., + header: bool_t | SequenceNotStr[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3330,7 +3331,7 @@ def to_latex( self, buf: FilePath | WriteBuffer[str] | None = None, columns: Sequence[Hashable] | None = None, - header: bool_t | list[str] = True, + header: bool_t | SequenceNotStr[str] = True, index: bool_t = True, na_rep: str = "NaN", formatters: FormattersType | None = None, diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 5bb6bebd8a87b..dcdf0067d45b0 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -301,7 +301,7 @@ def describe_timestamp_as_categorical_1d( names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) 
- result = [data.count(), count_unique] + result: list[float | Timestamp] = [data.count(), count_unique] dtype = None if count_unique > 0: top, freq = objcounts.index[0], objcounts.iloc[0] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 30d654078bd05..b6323e8c8b5f9 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1541,7 +1541,7 @@ def count(self): return result - def quantile(self, q: float | AnyArrayLike = 0.5, **kwargs): + def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs): """ Return value at the given quantile. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6d1ff07e07c76..4b9fcc80af4bb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -138,9 +138,9 @@ def merge( left: DataFrame | Series, right: DataFrame | Series, how: MergeHow = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, @@ -187,9 +187,9 @@ def merge( def _cross_merge( left: DataFrame, right: DataFrame, - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, @@ -239,7 +239,9 @@ def _cross_merge( return res -def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces): +def _groupby_and_merge( + by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces +): """ groupby & merge; we are always performing a left-by type operation @@ -255,7 +257,7 @@ def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces): by = [by] lby = left.groupby(by, sort=False) - rby: groupby.DataFrameGroupBy | None = None + rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None # if we can groupby the rhs # then we can get vastly better perf @@ -295,8 +297,8 @@ def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces): def merge_ordered( - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -737,9 +739,9 @@ def __init__( left: DataFrame | Series, right: DataFrame | Series, how: MergeHow | Literal["asof"] = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = True, diff --git a/pandas/core/series.py b/pandas/core/series.py index d3a2bb1745cd1..fd50a85f3c2e3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2141,7 +2141,7 @@ def groupby( # Statistics, overridden ndarray methods # TODO: integrate bottleneck - def count(self): + def count(self) -> int: """ Return number of non-NA/null observations in the Series. 
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 8d0edd88ffb6c..569c8aaf6cef1 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -21,6 +21,7 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas._typing import SequenceNotStr from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( @@ -109,7 +110,7 @@ def decimal(self) -> str: return self.fmt.decimal @property - def header(self) -> bool | list[str]: + def header(self) -> bool | SequenceNotStr[str]: return self.fmt.header @property @@ -213,7 +214,7 @@ def _need_to_save_header(self) -> bool: return bool(self._has_aliases or self.header) @property - def write_cols(self) -> Sequence[Hashable]: + def write_cols(self) -> SequenceNotStr[Hashable]: if self._has_aliases: assert not isinstance(self.header, bool) if len(self.header) != len(self.cols): @@ -224,7 +225,7 @@ def write_cols(self) -> Sequence[Hashable]: else: # self.cols is an ndarray derived from Index._format_native_types, # so its entries are strings, i.e. hashable - return cast(Sequence[Hashable], self.cols) + return cast(SequenceNotStr[Hashable], self.cols) @property def encoded_labels(self) -> list[Hashable]: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2297f7945a264..922d0f37bee3a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -105,6 +105,7 @@ FloatFormatType, FormattersType, IndexLabel, + SequenceNotStr, StorageOptions, WriteBuffer, ) @@ -566,7 +567,7 @@ def __init__( frame: DataFrame, columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool | list[str] = True, + header: bool | SequenceNotStr[str] = True, index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f015c9efe7122..e1839fc1b0a67 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3161,8 +3161,6 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): def func(storage, dtype_backend, conn_name): - string_array: StringArray | ArrowStringArray - string_array_na: StringArray | ArrowStringArray if storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) From 98f5a787940e02966f2747edbcbd992f0031c277 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 26 Sep 2023 16:57:40 +0100 Subject: [PATCH 043/131] BUG: df.resample('MS', closed='right') incorrectly places bins (#55283) * hardcode allowed freqs for hack * whatsnew * tests for business day issues * move whatsnew to 2.1.2 --- doc/source/whatsnew/v2.1.2.rst | 4 +- pandas/core/resample.py | 12 +++- pandas/tests/resample/test_datetime_index.py | 65 +++++++++++++++++++- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 6fec66ec8d556..1a25b848e0f84 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - @@ -21,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b6323e8c8b5f9..e9b2bacd9e1df 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2297,7 +2297,17 @@ def _adjust_bin_edges( ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]: # Some hacks for > daily data, see #1471, #1458, #1483 - if self.freq != "D" and is_superperiod(self.freq, "D"): + if self.freq.name in ("BM", "ME", "W") or self.freq.name.split("-")[0] in ( + "BQ", + "BA", + "Q", + "A", + "W", + ): + # If the right end-point is on the last day of the month, roll forwards + # until the last moment of that day. Note that we only do this for offsets + # which correspond to the end of a super-daily period - "month start", for + # example, is excluded. if self.closed == "right": # GH 21459, GH 9119: Adjust the bins relative to the wall time edges_dti = binner.tz_localize(None) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index d929dfa6e1e59..113e2d8986ad2 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -660,7 +660,7 @@ def test_resample_reresample(unit): s = Series(np.random.default_rng(2).random(len(dti)), dti) bs = s.resample("B", closed="right", label="right").mean() result = bs.resample("8H").mean() - assert len(result) == 22 + assert len(result) == 25 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) @@ -2051,3 +2051,66 @@ def test_resample_M_deprecated(): with tm.assert_produces_warning(UserWarning, match=depr_msg): result = s.resample("2M").mean() tm.assert_series_equal(result, expected) + + +def test_resample_ms_closed_right(): + # https://github.com/pandas-dev/pandas/issues/55271 + dti = date_range(start="2020-01-31", freq="1min", periods=6000) + df = DataFrame({"ts": dti}, index=dti) + grouped = df.resample("MS", closed="right") + result = grouped.last() + expected = DataFrame( + {"ts": [datetime(2020, 2, 1), datetime(2020, 2, 4, 3, 59)]}, + index=DatetimeIndex([datetime(2020, 1, 1), datetime(2020, 2, 1)], freq="MS"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("freq", ["B", "C"]) +def test_resample_c_b_closed_right(freq: str): + # https://github.com/pandas-dev/pandas/issues/55281 + dti = date_range(start="2020-01-31", freq="1min", periods=6000) + df = DataFrame({"ts": dti}, index=dti) + grouped = df.resample(freq, closed="right") + result = grouped.last() + expected = DataFrame( + { + "ts": [ + datetime(2020, 1, 31), + datetime(2020, 2, 3), + datetime(2020, 2, 4), + datetime(2020, 2, 4, 3, 59), + ] + }, + index=DatetimeIndex( + [ + datetime(2020, 1, 30), + datetime(2020, 1, 31), + datetime(2020, 2, 3), + datetime(2020, 2, 4), + ], + 
freq=freq, + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_resample_b_55282(): + # https://github.com/pandas-dev/pandas/issues/55282 + s = Series( + [1, 2, 3, 4, 5, 6], index=date_range("2023-09-26", periods=6, freq="12H") + ) + result = s.resample("B", closed="right", label="right").mean() + expected = Series( + [1.0, 2.5, 4.5, 6.0], + index=DatetimeIndex( + [ + datetime(2023, 9, 26), + datetime(2023, 9, 27), + datetime(2023, 9, 28), + datetime(2023, 9, 29), + ], + freq="B", + ), + ) + tm.assert_series_equal(result, expected) From 61d2056e4251b8d1c387f2577f0407062b9ab903 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:02:55 -1000 Subject: [PATCH 044/131] TST: xfail test due to new numexpr version (#55300) --- pandas/tests/frame/test_arithmetic.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 24a70e55e2f0e..bb9a76829c77d 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -22,14 +22,12 @@ ) import pandas._testing as tm from pandas.core.computation import expressions as expr -from pandas.core.computation.expressions import ( - _MIN_ELEMENTS, - NUMEXPR_INSTALLED, -) +from pandas.core.computation.expressions import _MIN_ELEMENTS from pandas.tests.frame.common import ( _check_mixed_float, _check_mixed_int, ) +from pandas.util.version import Version @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) @@ -501,10 +499,19 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed") @pytest.mark.parametrize("opname", ["floordiv", "pow"]) - def test_floordiv_axis0_numexpr_path(self, opname): + def test_floordiv_axis0_numexpr_path(self, opname, request): # case that goes through numexpr and has to fall back to masked_arith_op + ne = pytest.importorskip("numexpr") + if ( + Version(ne.__version__) >= Version("2.8.7") + and opname == "pow" + and "python" in request.node.callspec.id + ): + request.node.add_marker( + pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454") + ) + op = getattr(operator, opname) arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 From 824a2738dc0b2cdd8dce2d4256c9dc34bb589e6b Mon Sep 17 00:00:00 2001 From: Hedeer El Showk <144284759+hedeershowk@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:50:45 -0400 Subject: [PATCH 045/131] BUG: add pyarrow autogenerated prefix (#55115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add pyarrow autogenerated prefix * whats new bug fix * test with no head and pyarrow * only test pyarrow * BUG: This fixes #55009 (`raw=True` caused `apply` method of `DataFrame` to ignore passed arguments) (#55089) * fixes #55009 * update documentation * write documentation * add test * change formatting * cite DataDrame directly in docs Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * PR review feedback * Update doc/source/whatsnew/v2.2.0.rst 
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * alphabetical whatsnew --------- Co-authored-by: Martin Šícho Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 6 ++++++ pandas/tests/io/parser/test_header.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0760840f9950a..445b93705cde5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -314,6 +314,7 @@ MultiIndex I/O ^^^ - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 765a4ffcd2cb9..35965c90ee7fb 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -130,6 +130,12 @@ def handle_warning(invalid_row): ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index d72174c40478e..d6eab59074dd6 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -684,3 +684,21 @@ def test_header_delim_whitespace(all_parsers): result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="string[pyarrow]", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") + tm.assert_frame_equal(result, expected) From 69b5d5ae40cd544ba6e47dc9ba715f0e8951ac46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 28 Sep 2023 12:16:11 -0400 Subject: [PATCH 046/131] TYP: overload for DataFrame.to_xml (#55313) * TYP: overload for DataFrame.to_xml * black --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 49 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 445b93705cde5..6ddb1613076ac 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -218,6 +218,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except 
``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`) - Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 432c0a745c7a0..a16597221ac92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3272,6 +3272,55 @@ def to_html( render_links=render_links, ) + @overload + def to_xml( + self, + path_or_buffer: None = ..., + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> str: + ... + + @overload + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> None: + ... 
+ + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", From 8f20e82c4eee12d605ad4eb9d49b4fc37069e2dd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 29 Sep 2023 10:50:50 -0400 Subject: [PATCH 047/131] CLN: Clean up some iteration logic in tslib (#55320) --- pandas/_libs/tslib.pyx | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 20a18cf56779f..43252ffb5bf13 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -455,18 +455,18 @@ cpdef array_to_datetime( set out_tzoffset_vals = set() tzinfo tz_out = None bint found_tz = False, found_naive = False - cnp.broadcast mi + cnp.flatiter it = cnp.PyArray_IterNew(values) # specify error conditions assert is_raise or is_ignore or is_coerce result = np.empty((values).shape, dtype="M8[ns]") - mi = cnp.PyArray_MultiIterNew2(result, values) iresult = result.view("i8").ravel() for i in range(n): # Analogous to `val = values[i]` - val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + val = cnp.PyArray_GETITEM(values, cnp.PyArray_ITER_DATA(it)) + cnp.PyArray_ITER_NEXT(it) try: if checknull_with_nat_and_na(val): @@ -511,7 +511,6 @@ cpdef array_to_datetime( if parse_today_now(val, &iresult[i], utc): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" - cnp.PyArray_MultiIter_NEXT(mi) continue _ts = convert_str_to_tsobject( @@ -540,13 +539,10 @@ cpdef array_to_datetime( else: raise TypeError(f"{type(val)} is not convertible to datetime") - cnp.PyArray_MultiIter_NEXT(mi) - except (TypeError, OverflowError, ValueError) as ex: ex.args = (f"{ex}, at position {i}",) if is_coerce: iresult[i] = NPY_NAT - cnp.PyArray_MultiIter_NEXT(mi) continue elif is_raise: raise From c8e7a98d6e8ef88d2c397d524dcfdb302f6fb4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Fri, 29 Sep 2023 18:34:43 +0200 Subject: [PATCH 048/131] DOC: Removing repeated NA (#54821) * Remove repetead NA * Added file to toctree * changed _api.missing_value * Added NaT to missing values section --------- Co-authored-by: Marco Edward Gorelli --- doc/source/reference/arrays.rst | 13 ------------- doc/source/reference/index.rst | 1 + doc/source/reference/missing_value.rst | 24 ++++++++++++++++++++++++ 3 files changed, 25 insertions(+), 13 deletions(-) create mode 100644 doc/source/reference/missing_value.rst diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 41ddbd048e6c5..fe65364896f54 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -134,11 +134,6 @@ is the missing value for datetime data. Timestamp -.. autosummary:: - :toctree: api/ - - NaT - Properties ~~~~~~~~~~ .. autosummary:: @@ -257,11 +252,6 @@ is the missing value for timedelta data. Timedelta -.. autosummary:: - :toctree: api/ - - NaT - Properties ~~~~~~~~~~ .. autosummary:: @@ -465,7 +455,6 @@ pandas provides this through :class:`arrays.IntegerArray`. UInt16Dtype UInt32Dtype UInt64Dtype - NA .. _api.arrays.float_na: @@ -484,7 +473,6 @@ Nullable float Float32Dtype Float64Dtype - NA .. _api.arrays.categorical: @@ -621,7 +609,6 @@ with a bool :class:`numpy.ndarray`. :template: autosummary/class_without_autosummary.rst BooleanDtype - NA .. 
Dtype attributes which are manually listed in their docstrings: including diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 6d3ce3d31f005..7da02f7958416 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -53,6 +53,7 @@ are mentioned in the documentation. options extensions testing + missing_value .. This is to prevent warnings in the doc build. We don't want to encourage .. these methods. diff --git a/doc/source/reference/missing_value.rst b/doc/source/reference/missing_value.rst new file mode 100644 index 0000000000000..3bf22aef765d1 --- /dev/null +++ b/doc/source/reference/missing_value.rst @@ -0,0 +1,24 @@ +{{ header }} + +.. _api.missing_value: + +============== +Missing values +============== +.. currentmodule:: pandas + +NA is the way to represent missing values for nullable dtypes (see below): + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + NA + +NaT is the missing value for timedelta and datetime data (see below): + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + NaT From 6f0cd8d9ed251aa80f7415c565176fc33487110a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:18:59 +0100 Subject: [PATCH 049/131] ENH: Implement masked algorithm for value_counts (#54984) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/hashtable.pyi | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 41 +++++++++++-------- pandas/core/algorithms.py | 8 ++-- pandas/core/arrays/masked.py | 32 ++++++--------- pandas/core/arrays/sparse/array.py | 2 +- pandas/tests/libs/test_hashtable.py | 19 +++++++-- .../tests/series/methods/test_value_counts.py | 19 +++++++++ 8 files changed, 78 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 6ddb1613076ac..9dc095e6de6ff 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,6 +77,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 2bc6d74fe6aee..0ac914e86f699 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -240,7 +240,7 @@ def value_count( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ..., -) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values] +) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... 
# np.ndarray[same-as-values] # arr and values should have same dtype def ismember( diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index b9cf6011481af..19acd4acbdee7 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): {{endif}} cdef: - Py_ssize_t i = 0 + Py_ssize_t i = 0, na_counter = 0, na_add = 0 Py_ssize_t n = len(values) kh_{{ttype}}_t *table @@ -49,9 +49,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 bint uses_mask = mask is not None bint isna_entry = False - if uses_mask and not dropna: - raise NotImplementedError("uses_mask not implemented with dropna=False") - # we track the order in which keys are first seen (GH39009), # khash-map isn't insertion-ordered, thus: # table maps keys to counts @@ -82,25 +79,31 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 for i in range(n): val = {{to_c_type}}(values[i]) + if uses_mask: + isna_entry = mask[i] + if dropna: - if uses_mask: - isna_entry = mask[i] - else: + if not uses_mask: isna_entry = is_nan_{{c_type}}(val) if not dropna or not isna_entry: - k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - table.vals[k] += 1 + if uses_mask and isna_entry: + na_counter += 1 else: - k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = 1 - result_keys.append(val) + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + result_keys.append(val) {{endif}} # collect counts in the order corresponding to result_keys: + if na_counter > 0: + na_add = 1 cdef: - int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) + int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64) for i in range(table.size): {{if dtype == 'object'}} @@ -110,9 +113,13 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 {{endif}} result_counts[i] = table.vals[k] + if na_counter > 0: + result_counts[table.size] = na_counter + result_keys.append(val) + kh_destroy_{{ttype}}(table) - return result_keys.to_array(), result_counts.base + return result_keys.to_array(), result_counts.base, na_counter @cython.wraparound(False) @@ -399,10 +406,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): ndarray[htfunc_t] modes int64_t[::1] counts - int64_t count, max_count = -1 + int64_t count, _, max_count = -1 Py_ssize_t nkeys, k, j = 0 - keys, counts = value_count(values, dropna, mask=mask) + keys, counts, _ = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d74bb8b83e4e..c952178f4c998 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -924,7 +924,7 @@ def value_counts_internal( else: values = _ensure_arraylike(values, func_name="value_counts") - keys, counts = value_counts_arraylike(values, dropna) + keys, counts, _ = value_counts_arraylike(values, dropna) if keys.dtype == np.float16: keys = keys.astype(np.float32) @@ -949,7 +949,7 @@ def value_counts_internal( # Called once from SparseArray, otherwise could be private def value_counts_arraylike( values: np.ndarray, dropna: bool, mask: 
npt.NDArray[np.bool_] | None = None -) -> tuple[ArrayLike, npt.NDArray[np.int64]]: +) -> tuple[ArrayLike, npt.NDArray[np.int64], int]: """ Parameters ---------- @@ -965,7 +965,7 @@ def value_counts_arraylike( original = values values = _ensure_data(values) - keys, counts = htable.value_count(values, dropna, mask=mask) + keys, counts, na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period @@ -975,7 +975,7 @@ def value_counts_arraylike( keys, counts = keys[mask], counts[mask] res_keys = _reconstruct_data(keys, original.dtype, original) - return res_keys, counts + return res_keys, counts, na_counter def duplicated( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fdcbe67bbc371..9b85fb0477e6f 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1044,28 +1044,22 @@ def value_counts(self, dropna: bool = True) -> Series: ) from pandas.arrays import IntegerArray - keys, value_counts = algos.value_counts_arraylike( - self._data, dropna=True, mask=self._mask + keys, value_counts, na_counter = algos.value_counts_arraylike( + self._data, dropna=dropna, mask=self._mask ) + mask_index = np.zeros((len(value_counts),), dtype=np.bool_) + mask = mask_index.copy() - if dropna: - res = Series(value_counts, index=keys, name="count", copy=False) - res.index = res.index.astype(self.dtype) - res = res.astype("Int64") - return res + if na_counter > 0: + mask_index[-1] = True - # if we want nans, count the mask - counts = np.empty(len(value_counts) + 1, dtype="int64") - counts[:-1] = value_counts - counts[-1] = self._mask.sum() - - index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value) - index = index.astype(self.dtype) - - mask = np.zeros(len(counts), dtype="bool") - counts_array = IntegerArray(counts, mask) - - return Series(counts_array, index=index, name="count", copy=False) + arr = IntegerArray(value_counts, mask) + index = Index( + self.dtype.construct_array_type()( + keys, mask_index # type: ignore[arg-type] + ) + ) + return Series(arr, index=index, name="count", copy=False) @doc(ExtensionArray.equals) def equals(self, other) -> bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 00cbe1286c195..4d5eef960293f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -881,7 +881,7 @@ def value_counts(self, dropna: bool = True) -> Series: Series, ) - keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0 and (not self._null_fill_value or not dropna): mask = isna(keys) if self._null_fill_value else keys == self.fill_value diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index b78e6426ca17f..2c8f4c4149528 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -586,15 +586,26 @@ def test_value_count(self, dtype, writable): expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) + def test_value_count_mask(self, dtype): + if dtype == np.object_: + pytest.skip("mask not implemented for object dtype") + values = np.array([1] * 5, 
dtype=dtype) + mask = np.zeros((5,), dtype=np.bool_) + mask[1] = True + mask[4] = True + keys, counts, na_counter = ht.value_count(values, False, mask=mask) + assert len(keys) == 2 + assert na_counter == 2 + def test_value_count_stable(self, dtype, writable): # GH12679 values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(keys, values) assert np.all(counts == 1) @@ -685,9 +696,9 @@ def test_unique_label_indices(): class TestHelpFunctionsWithNans: def test_value_count(self, dtype): values = np.array([np.nan, np.nan, np.nan], dtype=dtype) - keys, counts = ht.value_count(values, True) + keys, counts, _ = ht.value_count(values, True) assert len(keys) == 0 - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) assert len(keys) == 1 and np.all(np.isnan(keys)) assert counts[0] == 3 diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f54489ac8a8b4..bde9902fec6e9 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -250,3 +250,22 @@ def test_value_counts_complex_numbers(self, input_array, expected): # GH 17927 result = Series(input_array).value_counts() tm.assert_series_equal(result, expected) + + def test_value_counts_masked(self): + # GH#54984 + dtype = "Int64" + ser = Series([1, 2, None, 2, None, 3], dtype=dtype) + result = ser.value_counts(dropna=False) + expected = Series( + [2, 2, 1, 1], + index=Index([2, None, 1, 3], dtype=dtype), + dtype=dtype, + name="count", + ) + tm.assert_series_equal(result, expected) + + result = ser.value_counts(dropna=True) + expected = Series( + [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" + ) + tm.assert_series_equal(result, expected) From 0db27c0df815718bb4cb31ae69dddfc68828bc69 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Mon, 2 Oct 2023 11:48:18 +0200 Subject: [PATCH 050/131] DOC: add parameters and examples to CustomBusinessMonthBegin/End (#55328) * add parameters and examples to CustomBusinessMonthBegin/End * move _attributes to _CustomBusinessMonth --- pandas/_libs/tslibs/offsets.pyx | 136 ++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 74398eb0e2405..8fdba8992f627 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4329,28 +4329,6 @@ cdef class CustomBusinessHour(BusinessHour): cdef class _CustomBusinessMonth(BusinessMixin): - """ - DateOffset subclass representing custom business month(s). - - Increments between beginning/end of month dates. - - Parameters - ---------- - n : int, default 1 - The number of months represented. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar``. - holidays : list - List/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar``. - calendar : np.busdaycalendar - Calendar to integrate. - offset : timedelta, default timedelta(0) - Time offset to apply. 
- """ - _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] ) @@ -4426,10 +4404,124 @@ cdef class _CustomBusinessMonth(BusinessMixin): cdef class CustomBusinessMonthEnd(_CustomBusinessMonth): + """ + DateOffset subclass representing custom business month(s). + + Increments between end of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthEnd() + Timestamp('2022-08-31 00:00:00') + + Custom business month end can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. + + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthEnd(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Thu 28 Jul 2022 00:00', 'Wed 31 Aug 2022 00:00', + 'Thu 29 Sep 2022 00:00', 'Thu 27 Oct 2022 00:00', + 'Wed 30 Nov 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. + + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... '2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'], + dtype='datetime64[ns]', freq='CBM') + """ + _prefix = "CBM" cdef class CustomBusinessMonthBegin(_CustomBusinessMonth): + """ + DateOffset subclass representing custom business month(s). + + Increments between beginning of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthBegin() + Timestamp('2022-09-01 00:00:00') + + Custom business month start can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. 
+ + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthBegin(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Wed 03 Aug 2022 00:00', 'Thu 01 Sep 2022 00:00', + 'Wed 05 Oct 2022 00:00', 'Wed 02 Nov 2022 00:00', + 'Thu 01 Dec 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. + + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... '2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthBegin(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-08-02', '2022-09-01', '2022-10-03', '2022-11-02'], + dtype='datetime64[ns]', freq='CBMS') + """ + _prefix = "CBMS" From 9282d9ff003f9a116110f6fc21c7c181d7c6109f Mon Sep 17 00:00:00 2001 From: Philippe THOMY Date: Mon, 2 Oct 2023 18:04:15 +0200 Subject: [PATCH 051/131] PDEP-12: compact-and-reversible-JSON-interface.md (#53714) * Create 0007-compact-and-reversible-JSON-interface.md * change PDEP number (7 -> 12) * Add FAQ to the PDEPS 0012 * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * pre-commit codespell * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * delete summary * delete mermaid flowchart * with summary, without mermaid flowchart * rename Annexe -> Appendix * add tableschema specification * add orient="table" * Add Table Schema extension * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md --- ...2-compact-and-reversible-JSON-interface.md | 437 ++++++++++++++++++ 1 file changed, 437 insertions(+) create mode 100644 web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md new file mode 100644 index 0000000000000..4fe4b935f144b --- /dev/null +++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md @@ -0,0 +1,437 @@ +# PDEP-12: Compact and reversible JSON interface + +- Created: 16 June 2023 +- Status: Rejected +- Discussion: + [#53252](https://github.com/pandas-dev/pandas/issues/53252) + [#55038](https://github.com/pandas-dev/pandas/issues/55038) +- Author: [Philippe THOMY](https://github.com/loco-philippe) +- Revision: 3 + + +#### Summary +- [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract) + - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description) + - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description) +- [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope) +- [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation) + - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?) 
  - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?)
  - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?)
- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description)
  - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing)
  - [Correspondence between TableSchema and pandas](./0012-compact-and-reversible-JSON-interface.md/#Correspondence-between-TableSchema-and-pandas)
  - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format)
  - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion)
- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact)
  - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage)
  - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility)
  - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework)
  - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do)
- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation)
  - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules)
  - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options)
- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.)
- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/#Synthesis)
- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision)
- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline)
- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history)
-------------------------
## Abstract

### Problem description
The `dtype` and "Python type" are not explicitly taken into account in the current JSON interface.

As a result, the JSON interface is not always reversible and has inconsistencies related to the handling of the `dtype`.

Another consequence is the partial application of the Table Schema specification in the `orient="table"` option (6 Table Schema data types are taken into account out of the 24 defined).

Some problems with the current JSON interface are detailed in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_json_pandas.ipynb#Current-Json-interface).


### Feature Description
To have a simple, compact and reversible solution, I propose to use the [JSON-NTV format (Named and Typed Value)](https://github.com/loco-philippe/NTV#readme) - which integrates the notion of type - and its JSON-TAB variation for tabular data (the JSON-NTV format is defined in an [IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/), not yet an RFC).

This solution makes it possible to include a large number of types (not necessarily pandas `dtype`), which allows:
- a Table Schema JSON interface (`orient="table"`) which respects the Table Schema specification (going from 6 types to 20 types),
- a global JSON interface for all pandas data formats.

#### Global JSON interface example
In the example below, a DataFrame with several data types is converted to JSON.

The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility).

With the existing JSON interface, this conversion is not possible.
+ +This example uses `ntv_pandas` module defined in the [ntv-pandas repository](https://github.com/loco-philippe/ntv-pandas#readme). + +*data example* +```python +In [1]: from shapely.geometry import Point + from datetime import date + import pandas as pd + import ntv_pandas as npd + +In [2]: data = {'index': [100, 200, 300, 400, 500, 600], + 'dates::date': [date(1964,1,1), date(1985,2,5), date(2022,1,21), date(1964,1,1), date(1985,2,5), date(2022,1,21)], + 'value': [10, 10, 20, 20, 30, 30], + 'value32': pd.Series([12, 12, 22, 22, 32, 32], dtype='int32'), + 'res': [10, 20, 30, 10, 20, 30], + 'coord::point': [Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4), Point(5,6)], + 'names': pd.Series(['john', 'eric', 'judith', 'mila', 'hector', 'maria'], dtype='string'), + 'unique': True } + +In [3]: df = pd.DataFrame(data).set_index('index') + +In [4]: df +Out[4]: dates::date value value32 res coord::point names unique + index + 100 1964-01-01 10 12 10 POINT (1 2) john True + 200 1985-02-05 10 12 20 POINT (3 4) eric True + 300 2022-01-21 20 22 30 POINT (5 6) judith True + 400 1964-01-01 20 22 10 POINT (7 8) mila True + 500 1985-02-05 30 32 20 POINT (3 4) hector True + 600 2022-01-21 30 32 30 POINT (5 6) maria True +``` + +*JSON representation* + +```python +In [5]: df_to_json = npd.to_json(df) + pprint(df_to_json, width=120) +Out[5]: {':tab': {'coord::point': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [3.0, 4.0], [5.0, 6.0]], + 'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'], + 'index': [100, 200, 300, 400, 500, 600], + 'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'], + 'res': [10, 20, 30, 10, 20, 30], + 'unique': [True, True, True, True, True, True], + 'value': [10, 10, 20, 20, 30, 30], + 'value32::int32': [12, 12, 22, 22, 32, 32]}} +``` + +*Reversibility* + +```python +In [5]: df_from_json = npd.read_json(df_to_json) + print('df created from JSON is equal to initial df ? ', df_from_json.equals(df)) +Out[5]: df created from JSON is equal to initial df ? True +``` +Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb) + +#### Table Schema JSON interface example +In the example below, a DataFrame with several Table Schema data types is converted to JSON. + +The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility). + +With the existing Table Schema JSON interface, this conversion is not possible. 

```python
In [1]: from shapely.geometry import Point
        from datetime import date

In [2]: df = pd.DataFrame({
            'end february::date': ['date(2023,2,28)', 'date(2024,2,29)', 'date(2025,2,28)'],
            'coordinates::point': ['Point([2.3, 48.9])', 'Point([5.4, 43.3])', 'Point([4.9, 45.8])'],
            'contact::email': ['john.doe@table.com', 'lisa.minelli@schema.com', 'walter.white@breaking.com']
            })

In [3]: df
Out[3]: end february::date coordinates::point             contact::email
      0         2023-02-28   POINT (2.3 48.9)         john.doe@table.com
      1         2024-02-29   POINT (5.4 43.3)    lisa.minelli@schema.com
      2         2025-02-28   POINT (4.9 45.8)  walter.white@breaking.com
```

*JSON representation*

```python
In [4]: df_to_table = npd.to_json(df, table=True)
        pprint(df_to_table, width=140, sort_dicts=False)
Out[4]: {'schema': {'fields': [{'name': 'index', 'type': 'integer'},
                               {'name': 'end february', 'type': 'date'},
                               {'name': 'coordinates', 'type': 'geopoint', 'format': 'array'},
                               {'name': 'contact', 'type': 'string', 'format': 'email'}],
                    'primaryKey': ['index'],
                    'pandas_version': '1.4.0'},
         'data': [{'index': 0, 'end february': '2023-02-28', 'coordinates': [2.3, 48.9], 'contact': 'john.doe@table.com'},
                  {'index': 1, 'end february': '2024-02-29', 'coordinates': [5.4, 43.3], 'contact': 'lisa.minelli@schema.com'},
                  {'index': 2, 'end february': '2025-02-28', 'coordinates': [4.9, 45.8], 'contact': 'walter.white@breaking.com'}]}
```

*Reversibility*

```python
In [5]: df_from_table = npd.read_json(df_to_table)
        print('df created from JSON is equal to initial df ? ', df_from_table.equals(df))
Out[5]: df created from JSON is equal to initial df ?  True
```
Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_table_pandas.ipynb).

## Scope
The objective is to make the proposed JSON interface available for any type of data, through the existing `orient="table"` option or through a new option `orient="ntv"`.

The proposed interface is compatible with existing data.

## Motivation

### Why extend the `orient=table` option to other data types?
- The Table Schema specification defines 24 data types, of which only 6 are taken into account in the pandas interface.

### Why is it important to have a compact and reversible JSON interface ?
- a reversible interface provides an exchange format.
- a textual exchange format facilitates exchanges between platforms (e.g. OpenData)
- a JSON exchange format can be used at API level

### Is it relevant to take an extended type into account ?
- it avoids the need for an additional data schema
- it increases the semantic scope of the data processed by pandas
- it is an answer to several issues (e.g. #12997, #14358, #16492, #35420, #35464, #36211, #39537, #49585, #50782, #51375, #52595, #53252)
- the use of a complementary type avoids having to modify the pandas data model

### Is this only useful for pandas ?
- the JSON-TAB format is applicable to tabular data and multi-dimensional data.
- this JSON interface can therefore be used for any application using tabular or multi-dimensional data. It would, for example, allow reversible data exchanges between a pandas `DataFrame` and an Xarray `DataArray` (Xarray issue under construction) [see example DataFrame / DataArray](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Multidimensional-data).
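The examples above rely on a simple naming rule: an optional NTV type is appended to the column name after a `::` separator. As an illustration only (the helper below is a sketch, not part of the NTV or pandas API), such a name can be read back as follows:

```python
from __future__ import annotations

# Sketch only: `split_ntv_name` is a hypothetical helper used to illustrate
# the "name::type" convention of the examples above; the authoritative rules
# are given in the JSON-NTV / JSON-TAB specifications.
def split_ntv_name(column_name: str) -> tuple[str, str | None]:
    """Split an NTV column name into its name and optional NTV type."""
    name, sep, ntv_type = column_name.partition("::")
    return (name, ntv_type if sep else None)

assert split_ntv_name("dates::date") == ("dates", "date")
assert split_ntv_name("value32::int32") == ("value32", "int32")
assert split_ntv_name("unique") == ("unique", None)  # no NTV type: dtype rules apply
```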

## Description

The proposed solution is based on several key points:
- data typing
- correspondence between Table Schema and pandas
- JSON format for tabular data
- conversion to and from the JSON format

### Data typing
Data types are defined and managed in the NTV project (each type has a name, a JSON encoder and a JSON decoder).

Pandas `dtype`s are compatible with NTV types:

| **pandas dtype**   | **NTV type** |
|--------------------|--------------|
| intxx              | intxx        |
| uintxx             | uintxx       |
| floatxx            | floatxx      |
| datetime[ns]       | datetime     |
| datetime[ns, tz]   | datetimetz   |
| timedelta[ns]      | durationiso  |
| string             | string       |
| boolean            | boolean      |

Note:
- datetime with timezone is a single NTV type (ISO 8601 string)
- `CategoricalDtype` and `SparseDtype` are included in the tabular JSON format
- the `object` `dtype` depends on the context (see below)
- `PeriodDtype` and `IntervalDtype` are still to be defined

JSON types (implicit or explicit) are converted to `dtype` following the pandas JSON interface:

| **JSON type**  | **pandas dtype**  |
|----------------|-------------------|
| number         | int64 / float64   |
| string         | string / object   |
| array          | object            |
| object         | object            |
| true, false    | boolean           |
| null           | NaT / NaN / None  |

Note:
- if an NTV type is defined, the `dtype` is adjusted accordingly
- the handling of null-type data still needs to be clarified

The other NTV types are associated with the `object` `dtype`.

### Correspondence between Table Schema and pandas
The Table Schema typing is carried by two attributes, `format` and `type`.

The table below shows the correspondence between Table Schema format / type and pandas NTV type / dtype:

| **format / type**  | **NTV type / dtype**    |
|--------------------|-------------------------|
| default / datetime |        / datetime64[ns] |
| default / number   |        / float64        |
| default / integer  |        / int64          |
| default / boolean  |        / bool           |
| default / string   |        / object         |
| default / duration |        / timedelta64[ns]|
| email / string     | email / string          |
| uri / string       | uri / string            |
| default / object   | object / object         |
| default / array    | array / object          |
| default / date     | date / object           |
| default / time     | time / object           |
| default / year     | year / int64            |
| default / yearmonth| month / int64           |
| array / geopoint   | point / object          |
| default / geojson  | geojson / object        |

Note:
- other Table Schema formats are defined and remain to be studied (uuid, binary, topojson, specific formats for geopoint and datation)
- the first six lines correspond to the existing interface

### JSON format
The JSON format for the Table Schema interface is the existing one.

The JSON format for the global interface is defined in the [JSON-TAB](https://github.com/loco-philippe/NTV/blob/main/documentation/JSON-TAB-standard.pdf) specification.
It includes the naming rules originally defined in the [JSON-ND project](https://github.com/glenkleidon/JSON-ND) and support for categorical data.
The specification has to be updated to include sparse data.

### Conversion
When data is associated with a non-`object` `dtype`, pandas conversion methods are used.
Otherwise, the NTV conversion is used.
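
As an illustration of the naming rule and of the type correspondence above, here is a minimal sketch. It is not part of pandas or `ntv_pandas`; the helper name and the reduced mapping are assumptions made for this sketch only.

```python
# Illustrative sketch: a reduced version of the NTV type -> pandas dtype
# correspondence shown above. NTV types without a native pandas dtype
# (date, point, email, ...) fall back to the ``object`` dtype.
NTV_TO_DTYPE = {
    "int64": "int64",
    "float64": "float64",
    "boolean": "boolean",
    "string": "string",
    "datetime": "datetime64[ns]",
    "durationiso": "timedelta64[ns]",
}


def split_ntv_label(label):
    """Split a 'name::ntvtype' column label into (name, ntv_type)."""
    name, sep, ntv_type = label.partition("::")
    return name, (ntv_type if sep else None)


name, ntv_type = split_ntv_label("dates::date")  # -> ('dates', 'date')
dtype = NTV_TO_DTYPE.get(ntv_type, "object")     # 'date' -> 'object'
```

The two subsections below spell out when the pandas conversion and when the NTV conversion are used in each direction.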

#### pandas -> JSON
- `NTV type` is not defined: use `to_json()`
- `NTV type` is defined and `dtype` is not `object`: use `to_json()`
- `NTV type` is defined and `dtype` is `object`: use the NTV conversion (if a pandas conversion does not exist)

#### JSON -> pandas
- `NTV type` is compatible with a `dtype`: use `read_json()`
- `NTV type` is not compatible with a `dtype`: use the NTV conversion (if a pandas conversion does not exist)

## Usage and Impact

### Usage
It seems to me that this proposal responds to important issues:
- having an efficient text format for data exchange

  The alternative CSV format is not reversible and is obsolete (its last revision dates from 2005). Current CSV tools do not comply with the standard.

- taking into account "semantic" data in pandas objects

- having a complete Table Schema interface

### Compatibility
The interface can be used without NTV types (compatibility with existing data - [see examples](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb#Appendix-:-Series-tests)).

If the interface is made available through a new `orient` option in the JSON interface, the use of this feature is decoupled from the other features.

### Impacts on the pandas framework
Initially, the impacts are very limited:
- modification of the `name` of `Series` or `DataFrame` columns (no functional impact),
- addition of an option in the JSON interface (e.g. `orient='ntv'`) and of the associated methods (no functional interference with the other methods).

In later stages, several developments could be considered:
- validation of the `name` of `Series` or `DataFrame` columns,
- management of the NTV type as a "complementary-object-dtype",
- functional extensions depending on the NTV type.

### Risk to do / risk not to do
The JSON-NTV format and the JSON-TAB format are not (yet) widely recognized and used formats. The risk for pandas is that this function is not used (which would have no functional impact).

On the other hand, early use by pandas would allow better consideration of the expectations and needs of pandas, as well as a reflection on the evolution of the types supported by pandas.

## Implementation

### Modules
Two modules are defined for NTV:

- json-ntv

  this module manages NTV data without dependencies on other modules

- ntvconnector

  these modules manage the conversion between objects and JSON data; they depend on the corresponding object modules (e.g. the connector for shapely locations depends on shapely).

The pandas integration of the JSON interface requires importing only the json-ntv module.

### Implementation options
The interface can be implemented as NTV connectors (`SeriesConnector` and `DataFrameConnector`) and as a new `orient` option of the pandas JSON interface.

Several pandas implementations are possible:

1. External:

   In this implementation, the interface is available only on the NTV side.
   This option would mean that this evolution of the JSON interface is not useful or strategic for pandas.

2. NTV side:

   In this implementation, the interface is available on both sides and the conversion is located inside NTV.
   This option is the one that minimizes the impacts on the pandas side.

3. pandas side:

   In this implementation, the interface is available on both sides and the conversion is located inside pandas.
   This option allows pandas to keep control of this evolution.

4. pandas restricted:

   In this implementation, the pandas interface and the conversion are located inside pandas and apply only to non-`object` `dtype`s.
   This option makes it possible to offer a compact and reversible interface while prohibiting the introduction of types incompatible with the existing `dtype`s.

## F.A.Q.

**Q: Does `orient="table"` not do what you are proposing already?**

**A**: In principle, yes, this option takes into account the notion of type.

But this is very limited (see the examples added in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)):
- **Types and JSON interface**
  - the only way to keep the types in the JSON interface is to use the `orient='table'` option
  - a few dtypes are not allowed in the JSON table interface: period, timedelta64, interval
  - allowed types are not always kept in the JSON table interface
  - data with `object` dtype is kept only if the data is a string
  - with the categorical dtype, the underlying dtype is not included in the JSON interface
- **Data compactness**
  - the JSON table interface is not compact (in the example in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#data-compactness), the size is triple or quadruple the size of the compact format)
- **Reversibility**
  - the interface is reversible only with a few dtypes: int64, float64, bool, string, datetime64 and, partially, categorical
- **External types**
  - the interface does not accept external types
  - Table Schema defines 20 data types but the `orient="table"` interface takes into account 5 data types (see the [table](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Converting-table-schema-type-to-pandas-dtype))
  - to integrate external types, it is necessary to first create `ExtensionArray` and `ExtensionDtype` objects

The current interface is not compatible with the data structure defined by Table Schema. For this to be possible, it is necessary to integrate a "type extension" like the one proposed (this has moreover been partially achieved with the notion of `extDtype` found in the interface for several formats).

**Q: In general, we should only have 1 `"table"` format for pandas in read_json/to_json. There is also the issue of backwards compatibility if we do change the format. The fact that the table interface is buggy is not a reason to add a new interface (I'd rather fix those bugs). Can the existing format be adapted in a way that fixes the type issues/issues with roundtripping?**

**A**: I will add two additional remarks:
- the types defined in Table Schema are only partially taken into account (examples of types not taken into account in the interface: string-uri, array, date, time, year, geopoint, string-email);
- the `read_json()` interface also works with the following data: `{'simple': [1,2,3] }` (contrary to what is indicated in the documentation), but it is impossible to recreate this simple JSON with `to_json()`.

I think that the problem cannot be limited to bug fixes and that a clear strategy must be defined for the JSON interface, in particular given the gradual abandonment, in open-data solutions, of the obsolete CSV format in favor of a JSON format.

As stated, the proposed solution addresses several shortcomings of the current interface and could simply fit into the pandas environment (the other option would be to consider that the JSON interface is a peripheral function of pandas and can remain external to pandas), regardless of the `orient='table'` option.

It is nevertheless possible to merge the proposed format and the `orient='table'` format in order to have an explicit management of the notion of `extDtype`.

**Q: As far as I can tell, JSON NTV is not in any form a standardised JSON format. I believe that pandas (and geopandas, which is where I came from to this issue) should try to follow either de facto or de jure standards and not opt in to a file format that does not have any community support at this moment. This can obviously change in the future and that is where this PR should be revised. Why would pandas use this standard?**

**A**: As indicated in the issue (and detailed in [the attached Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)), the JSON interface is not reversible (`to_json` followed by `read_json` does not always return the initial object) and several shortcomings and bugs are present. The main cause of this problem is that the data type is not taken into account in the JSON format (or only very partially with the `orient='table'` option).

The proposal addresses this problem ([the example at the beginning of the Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#0---Simple-example) simply and clearly illustrates the interest of the proposal).

Regarding the underlying JSON-NTV format, its impact is quite low for tabular data (it is limited to adding the type in the field name).
Nevertheless, the question is relevant: the JSON-NTV format ([IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/)) is a shared, documented, supported and implemented format, but its community support is indeed still limited for the moment and only asks to grow.

## Synthesis
To conclude,
- if it is important (or strategic) to have a reversible JSON interface for any type of data, the proposal can be accepted;
- if not, a third-party package listed in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) that reads/writes this format to/from pandas DataFrames should be considered.

## Core team decision
The vote was open from September 11 to September 26:
- The final tally is 0 approvals, 5 abstentions and 7 disapprovals. The quorum has been met. The PDEP fails.

**Disapprove comments**:
- 1 Given the newness of the proposed JSON NTV format, I would support (as described in the PDEP): "if not, a third-party package listed in the ecosystem that reads/writes this format to/from pandas DataFrames should be considered"
- 2 Same reason as -1-, this should be a third party package for now
- 3 Not mature enough, and not clear what the market size would be.
- 4 for the same reason I left in the PDEP: "I think this (JSON-NTV format) does not meet the bar of being a commonly used format for implementation within pandas"
- 5 agree with -4-
- 6 agree with the other core-dev responders. I think work on the existing JSON interface is extremely valuable. A number of the original issues raised are just bug fixes / extensions of already existing functionality. Trying to start anew is likely not worth the migration effort.
That said, if a format is well supported in the community, we can reconsider in the future (obviously JSON is well supported, but the actual specification detailed here is too new / not accepted as a standard)
- 7 while I do think having a more comprehensive JSON format would be worthwhile, making a new format part of pandas means an implicit endorsement of a standard that is still being reviewed by the broader community.

**Decision**:
- add the `ntv-pandas` package to the [ecosystem](https://pandas.pydata.org/community/ecosystem.html)
- revisit this PDEP at a later stage, for example in half a year to one year (based on the evolution of the Internet-Draft [JSON semantic format (JSON-NTV)](https://www.ietf.org/archive/id/draft-thomy-json-ntv-01.html) and on the usage of [ntv-pandas](https://github.com/loco-philippe/ntv-pandas#readme))

## Timeline
Not applicable

## PDEP History

- 16 June 2023: Initial draft
- 22 July 2023: Add F.A.Q.
- 06 September 2023: Add Table Schema extension
- 01 October 2023: Add Core team decision

From a0a6e048ad5c31b87f84789d093ac6a70f4c7f5e Mon Sep 17 00:00:00 2001
From: James Spencer
Date: Mon, 2 Oct 2023 17:31:17 +0100
Subject: [PATCH 052/131] TYP: Correct type annotation for to_dict. (#55130)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Correct type annotation for to_dict.

The `into` argument of DataFrame.to_dict and Series.to_dict can be
either a class or instance of a class of dict; this is covariant -
subclasses of dict can also be used. The argument was annotated as
`type[dict]` though, so type checkers marked passing initialized
objects (required for collections.defaultdict) as an incorrect argument
type.

Fix by annotating `into` to take either a subclass of dict or an
initialized instance of a subclass of dict.

* Use generic MutableMapping type for to_dict method.

Unfortunately a generic type annotation with a default triggers an
existing mypy limitation (https://github.com/python/mypy/issues/3737).
The current workaround is to use overloads and then not annotate the
implementation containing the default parameter; this still enables
mypy to deduce correct return types.

Two overloads are added for Series.to_dict, even though they could be
combined using a Union type, as at least two overloads are required for
a single method.
* Fix formatting * return annotation for non-overload * no keyword should return dict * swap overload order to work for dict subclasses that are passed as keywords * fix tests --------- Co-authored-by: Torsten Wörtwein --- pandas/_typing.py | 2 + pandas/core/frame.py | 41 +++++++++++--- pandas/core/methods/to_dict.py | 62 +++++++++++++++++++-- pandas/core/series.py | 37 +++++++++--- pandas/tests/series/methods/test_to_dict.py | 4 +- 5 files changed, 123 insertions(+), 23 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 0e2a0881f0122..f18c67fcb0c90 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -4,6 +4,7 @@ Hashable, Iterator, Mapping, + MutableMapping, Sequence, ) from datetime import ( @@ -103,6 +104,7 @@ TypeGuard: Any = None HashableT = TypeVar("HashableT", bound=Hashable) +MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping) # array-like diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a16597221ac92..51fceb1f09a62 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -230,6 +230,7 @@ Level, MergeHow, MergeValidate, + MutableMappingT, NaAction, NaPosition, NsmallestNlargestKeep, @@ -1927,6 +1928,27 @@ def _create_data_for_split_and_tight_to_dict( def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: + ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: + ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, into: type[dict] = ..., index: bool = ..., ) -> dict: @@ -1936,11 +1958,14 @@ def to_dict( def to_dict( self, orient: Literal["records"], + *, into: type[dict] = ..., index: bool = ..., ) -> list[dict]: ... + # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") @deprecate_nonkeyword_arguments( version="3.0", allowed_args=["self", "orient"], name="to_dict" ) @@ -1949,9 +1974,10 @@ def to_dict( orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, - ) -> dict | list[dict]: + ) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. @@ -1979,7 +2005,7 @@ def to_dict( 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -1993,9 +2019,10 @@ def to_dict( Returns ------- - dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + dict, list or collections.abc.MutableMapping + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` + parameter. 
See Also -------- @@ -2054,7 +2081,7 @@ def to_dict( """ from pandas.core.methods.to_dict import to_dict - return to_dict(self, orient, into, index) + return to_dict(self, orient, into=into, index=index) @deprecate_nonkeyword_arguments( version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index f4e0dcddcd34a..3295c4741c03d 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Literal, + overload, ) import warnings @@ -16,17 +17,66 @@ from pandas.core import common as com if TYPE_CHECKING: + from pandas._typing import MutableMappingT + from pandas import DataFrame +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> MutableMappingT: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> list[MutableMappingT]: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., +) -> dict: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., +) -> list[dict]: + ... + + +# error: Incompatible default for argument "into" (default has type "type[dict +# [Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") def to_dict( df: DataFrame, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, -) -> dict | list[dict]: +) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. @@ -54,7 +104,7 @@ def to_dict( 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -69,8 +119,8 @@ def to_dict( Returns ------- dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` parameter. 
""" if not df.columns.is_unique: warnings.warn( @@ -103,7 +153,7 @@ def to_dict( are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) if orient == "dict": - return into_c((k, v.to_dict(into)) for k, v in df.items()) + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) elif orient == "list": object_dtype_indices_as_set: set[int] = set(box_native_indices) diff --git a/pandas/core/series.py b/pandas/core/series.py index fd50a85f3c2e3..d5785a2171cb3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -48,6 +48,7 @@ from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import find_stack_level @@ -167,6 +168,7 @@ IndexKeyFunc, IndexLabel, Level, + MutableMappingT, NaPosition, NumpySorter, NumpyValueArrayLike, @@ -1922,21 +1924,40 @@ def keys(self) -> Index: """ return self.index - def to_dict(self, into: type[dict] = dict) -> dict: + @overload + def to_dict( + self, *, into: type[MutableMappingT] | MutableMappingT + ) -> MutableMappingT: + ... + + @overload + def to_dict(self, *, into: type[dict] = ...) -> dict: + ... + + # error: Incompatible default for argument "into" (default has type "type[ + # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_dict" + ) + def to_dict( + self, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] + ) -> MutableMappingT: """ Convert Series to {label -> value} dict or dict-like object. Parameters ---------- into : class, default dict - The collections.abc.Mapping subclass to use as the return - object. Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. + The collections.abc.MutableMapping subclass to use as the return + object. Can be the actual class or an empty instance of the mapping + type you want. If you want a collections.defaultdict, you must + pass it initialized. Returns ------- - collections.abc.Mapping + collections.abc.MutableMapping Key-value representation of Series. 
Examples @@ -1945,10 +1966,10 @@ def to_dict(self, into: type[dict] = dict) -> dict: >>> s.to_dict() {0: 1, 1: 2, 2: 3, 3: 4} >>> from collections import OrderedDict, defaultdict - >>> s.to_dict(OrderedDict) + >>> s.to_dict(into=OrderedDict) OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) >>> dd = defaultdict(list) - >>> s.to_dict(dd) + >>> s.to_dict(into=dd) defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) """ # GH16122 diff --git a/pandas/tests/series/methods/test_to_dict.py b/pandas/tests/series/methods/test_to_dict.py index 4c3d9592eebe3..41c01f4537f23 100644 --- a/pandas/tests/series/methods/test_to_dict.py +++ b/pandas/tests/series/methods/test_to_dict.py @@ -13,12 +13,12 @@ class TestSeriesToDict: ) def test_to_dict(self, mapping, datetime_series): # GH#16122 - result = Series(datetime_series.to_dict(mapping), name="ts") + result = Series(datetime_series.to_dict(into=mapping), name="ts") expected = datetime_series.copy() expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) - from_method = Series(datetime_series.to_dict(collections.Counter)) + from_method = Series(datetime_series.to_dict(into=collections.Counter)) from_constructor = Series(collections.Counter(datetime_series.items())) tm.assert_series_equal(from_method, from_constructor) From 93abfb3915957aa1078d5b1d87c4c5c2c118f97f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:36:52 +0100 Subject: [PATCH 053/131] BUG: Fix convert_dtypes for all na column and arrow backend (#55346) * BUG: Fix convert_dtypes for all na column and arrow backend BUG: Fix convert_dtypes for all na column and arrow backend * Add test * Update cast.py * Fix * Fix typing --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/dtypes/cast.py | 11 ++++++++++- pandas/tests/series/methods/test_convert_dtypes.py | 8 ++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9dc095e6de6ff..b3671d3618791 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -281,7 +281,7 @@ Numeric Conversion ^^^^^^^^^^ -- +- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) - Strings diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 74e785be06356..3208a742738a3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1133,7 +1133,16 @@ def convert_dtypes( base_dtype = np.dtype(str) else: base_dtype = inferred_dtype - pa_type = to_pyarrow_type(base_dtype) + if ( + base_dtype.kind == "O" # type: ignore[union-attr] + and len(input_array) > 0 + and isna(input_array).all() + ): + import pyarrow as pa + + pa_type = pa.null() + else: + pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d1c79d0f00365..f621604faae4b 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -265,3 +265,11 @@ def test_convert_dtypes_pyarrow_to_np_nullable(self): result = ser.convert_dtypes(dtype_backend="numpy_nullable") expected = pd.Series(range(2), dtype="Int32") tm.assert_series_equal(result, expected) + + def test_convert_dtypes_pyarrow_null(self): + # GH#55346 + pa = 
pytest.importorskip("pyarrow") + ser = pd.Series([None, None]) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) + tm.assert_series_equal(result, expected) From 9938f688aa4fd4221d0ca10a3e05c91be9ddb258 Mon Sep 17 00:00:00 2001 From: Matthias Bussonnier Date: Mon, 2 Oct 2023 18:48:21 +0200 Subject: [PATCH 054/131] DOC: Typo: missing space. (#55354) Typo: missing space. Without trailing space the interpreted text is not close. Even if it were we want a space in the documentation. (Note, if one does not want a space between the closing backtick and the subsequent text one must use backslash-space. --- pandas/io/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 918fe4d22ea62..bd3b515dbca2f 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -88,7 +88,7 @@ class _XMLFrameParser: Parse only the attributes at the specified ``xpath``. names : list - Column names for :class:`~pandas.DataFrame`of parsed XML data. + Column names for :class:`~pandas.DataFrame` of parsed XML data. dtype : dict Data type for data or columns. E.g. {{'a': np.float64, From cd8516aa93159c7d97df3c285d3709e890df3063 Mon Sep 17 00:00:00 2001 From: Matthias Bussonnier Date: Mon, 2 Oct 2023 18:48:56 +0200 Subject: [PATCH 055/131] DOC: Typo: directive must have a space after `::` (#55353) Typo: directive must have a space after `::` Otherwise it may be interpreted as a comment. --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e9b2bacd9e1df..7d37a9f1d5113 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -986,7 +986,7 @@ def interpolate( downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - .. deprecated::2.1.0 + .. deprecated:: 2.1.0 ``**kwargs`` : optional Keyword arguments to pass on to the interpolating function. 
From 618bf88acb4325ef92719829d885c2727489df73 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 2 Oct 2023 12:50:57 -0400 Subject: [PATCH 056/131] BUG: MultiIndex.get_indexer with method not raising for non-monotonic (#55352) raise ValueError for get_indexer with method and non-monotonic --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/multi/test_indexing.py | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b3671d3618791..bc8c258b40ebc 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -310,7 +310,7 @@ Missing MultiIndex ^^^^^^^^^^ -- +- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) - I/O diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e23887159c9c6..9017ff121976b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4011,8 +4011,8 @@ def _get_fill_indexer( self, target: Index, method: str_t, limit: int | None = None, tolerance=None ) -> npt.NDArray[np.intp]: if self._is_multi: - # TODO: get_indexer_with_fill docstring says values must be _sorted_ - # but that doesn't appear to be enforced + if not (self.is_monotonic_increasing or self.is_monotonic_decreasing): + raise ValueError("index must be monotonic increasing or decreasing") # error: "IndexEngine" has no attribute "get_indexer_with_fill" engine = self._engine with warnings.catch_warnings(): diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 78b2c493ec116..d86692477f381 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -342,6 +342,19 @@ def test_get_indexer_methods(self): expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) + @pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill", "nearest"]) + def test_get_indexer_methods_raise_for_non_monotonic(self, method): + # 53452 + mi = MultiIndex.from_arrays([[0, 4, 2], [0, 4, 2]]) + if method == "nearest": + err = NotImplementedError + msg = "not implemented yet for MultiIndex" + else: + err = ValueError + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(err, match=msg): + mi.get_indexer([(1, 1)], method=method) + def test_get_indexer_three_or_more_levels(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests get_indexer() on MultiIndexes with 3+ levels From 71062f6c4ba78c2408993698d135795632b88e93 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:52:26 +0100 Subject: [PATCH 057/131] REGR: join segfaulting for arrow string with nulls (#55348) * REGR: join segfaulting for arrow string with nulls * Fix not installed --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/reshape/merge.py | 2 ++ pandas/tests/frame/methods/test_join.py | 7 ++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..a5ba365f2d456 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -15,7 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Fixed bug 
where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) -- +- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4b9fcc80af4bb..ba6579a739f54 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2443,6 +2443,8 @@ def _factorize_keys( .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 if how == "right": return rlab, llab, count return llab, rlab, count diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 2d4ac1d4a4444..3d21faf8b1729 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -158,9 +158,14 @@ def test_join_invalid_validate(left_no_dup, right_no_dup): left_no_dup.merge(right_no_dup, on="a", validate="invalid") -def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups): +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): # GH 46622 # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) left_no_dup.join( right_w_dups, on="a", From f4427918624e21faac3b31725a357e55f8292821 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 10:34:26 -0700 Subject: [PATCH 058/131] Bump pypa/cibuildwheel from 2.16.0 to 2.16.1 (#55350) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.16.0 to 2.16.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.16.0...v2.16.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4c7a7b329777b..efa14dd966eb1 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.0 + uses: pypa/cibuildwheel@v2.16.1 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 7e681836d46fdcd4aa72cc0538f3ebf86516975d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 10:35:25 -0700 Subject: [PATCH 059/131] [pre-commit.ci] pre-commit autoupdate (#55356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pylint-dev/pylint: v3.0.0a7 → v3.0.0b0](https://github.com/pylint-dev/pylint/compare/v3.0.0a7...v3.0.0b0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0b511e1048c6..c911edfa03670 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -84,7 +84,7 @@ repos: '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a7 + rev: v3.0.0b0 hooks: - id: pylint stages: [manual] From 6e6a6831aaa2d1606aa897b34f106e655180034b Mon Sep 17 00:00:00 2001 From: torext Date: Tue, 3 Oct 2023 17:46:01 +0200 Subject: [PATCH 060/131] BUG: Silence `Period[B]` warnings in plotting code (#55138) * Silence `Period[B]` warnings in plotting code * Add issue number and removal TODO * Add entry for issue `#55138` to `v2.1.2.rst` --------- Co-authored-by: torext --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/plotting/_matplotlib/converter.py | 61 ++++++++++++++++------- pandas/tests/plotting/frame/test_frame.py | 9 ++++ pandas/tests/plotting/test_series.py | 7 +++ 4 files changed, 60 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a5ba365f2d456..8aba2bcbecf2f 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,6 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index e6408a0eae841..5a7ceabbf554e 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -13,6 +13,7 @@ Any, cast, ) +import warnings import matplotlib.dates as mdates from matplotlib.ticker import ( @@ -239,18 +240,29 @@ def _convert_1d(values, units, axis): if not hasattr(axis, "freq"): raise TypeError("Axis must have `freq` set to convert to Periods") valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) - if isinstance(values, valid_types) or is_integer(values) or is_float(values): - return get_datevalue(values, axis.freq) - elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 - elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == "period": - # https://github.com/pandas-dev/pandas/issues/24304 - # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 - elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] return values @@ -575,11 +587,18 @@ def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 - dates_ = period_range( - start=Period(ordinal=vmin, freq=freq), - end=Period(ordinal=vmax, freq=freq), - freq=freq, - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, + ) # Initialize the output info = np.zeros( @@ -1072,7 +1091,13 @@ def __call__(self, x, pos: int = 0) -> str: fmt = self.formatdict.pop(x, "") if isinstance(fmt, np.bytes_): fmt = fmt.decode("utf-8") - period = Period(ordinal=int(x), freq=self.freq) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) assert isinstance(period, Period) return period.strftime(fmt) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 47114cab47619..2ef1f065f603d 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2487,6 +2487,15 @@ def 
test_secondary_y(self, secondary_y): assert ax.get_ylim() == (0, 100) assert ax.get_yticks()[0] == 99 + @pytest.mark.slow + def test_plot_no_warning(self): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + df = tm.makeTimeDataFrame() + with tm.assert_produces_warning(False): + _ = df.plot() + _ = df.T.plot() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index e77dc0b305171..6719e39612f5d 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -973,3 +973,10 @@ def test_series_none_color(self): ax = series.plot(color=None) expected = _unpack_cycler(mpl.pyplot.rcParams)[:1] _check_colors(ax.get_lines(), linecolors=expected) + + @pytest.mark.slow + def test_plot_no_warning(self, ts): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + with tm.assert_produces_warning(False): + _ = ts.plot() From 3bf0f647ca42bf21de76f9e6027de38cb5589147 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 3 Oct 2023 11:48:01 -0400 Subject: [PATCH 061/131] ENH/PERF: add ExtensionArray.duplicated (#55255) * PERF: Series.duplicated for pyarrow timestamp and duration types * whatsnew * fix setup * add ExtensionArray.duplicated * fix * simplify * add SparseArray.duplicated * simplify * docs * pass mask * mypy * use mask * add optional to docstring * revert asv change --- asv_bench/benchmarks/algorithms.py | 18 ++++++++++++++++- doc/source/reference/extensions.rst | 1 + doc/source/whatsnew/v2.2.0.rst | 2 ++ pandas/core/algorithms.py | 19 +++++++----------- pandas/core/arrays/arrow/array.py | 25 ++++++++++++++++++++++++ pandas/core/arrays/base.py | 27 ++++++++++++++++++++++++++ pandas/core/arrays/masked.py | 8 ++++++++ pandas/core/arrays/sparse/array.py | 9 +++++++++ pandas/core/base.py | 5 ++++- pandas/tests/extension/base/methods.py | 12 ++++++++++++ 10 files changed, 112 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 2584e1f13853a..192f19c36b47d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,6 +1,7 @@ from importlib import import_module import numpy as np +import pyarrow as pa import pandas as pd @@ -72,7 +73,16 @@ class Duplicated: params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] @@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), + "timestamp[ms][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) + ), + "duration[s][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) + ), }[dtype] if not unique: data = data.repeat(5) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 83f830bb11198..e412793a328a3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -49,6 +49,7 @@ objects. 
api.extensions.ExtensionArray.copy api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.duplicated api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index bc8c258b40ebc..fa3cef6d9457d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -76,6 +76,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - @@ -241,6 +242,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c952178f4c998..4ff3de2fc7b2b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,7 +55,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( - ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -979,14 +978,16 @@ def value_counts_arraylike( def duplicated( - values: ArrayLike, keep: Literal["first", "last", False] = "first" + values: ArrayLike, + keep: Literal["first", "last", False] = "first", + mask: npt.NDArray[np.bool_] | None = None, ) -> npt.NDArray[np.bool_]: """ Return boolean ndarray denoting duplicate values. Parameters ---------- - values : nd.array, ExtensionArray or Series + values : np.ndarray or ExtensionArray Array over which to check for duplicate values. keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -994,21 +995,15 @@ def duplicated( - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. 
+ mask : ndarray[bool], optional + array indicating which elements to exclude from checking Returns ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - - if isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) - values = _ensure_data(values) - return htable.duplicated(values, keep=keep) + return htable.duplicated(values, keep=keep, mask=mask) def mode( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..0579aa3760531 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + algorithms as algos, missing, roperator, ) @@ -1289,6 +1290,30 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the ArrowExtensionArray of unique values. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 933944dbd4632..c06bf7366447b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -61,6 +61,7 @@ roperator, ) from pandas.core.algorithms import ( + duplicated, factorize_array, isin, map_array, @@ -125,6 +126,7 @@ class ExtensionArray: astype copy dropna + duplicated factorize fillna equals @@ -1116,6 +1118,31 @@ def dropna(self) -> Self: # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + mask = self.isna().astype(np.bool_, copy=False) + return duplicated(values=self, keep=keep, mask=mask) + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9b85fb0477e6f..56d3711c7d13b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -952,6 +952,14 @@ def copy(self) -> Self: mask = self._mask.copy() return self._simple_new(data, mask) + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the BaseMaskedArray of unique values. diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4d5eef960293f..cf349220e4ba7 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -28,6 +28,7 @@ from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, @@ -830,6 +831,14 @@ def _first_fill_value_loc(self): diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: uniques = algos.unique(self.sp_values) if len(self.sp_values) != len(self): diff --git a/pandas/core/base.py b/pandas/core/base.py index 3026189e747bb..d4421560bcea7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"): @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: - return algorithms.duplicated(self._values, keep=keep) + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4e0bc8d804bab..e10c6ef9a7018 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): From 0ed3626669031613825d0b6634be830ead237429 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 18:54:15 +0200 Subject: [PATCH 062/131] BUG: ndim of string block incorrect with string inference (#55363) * BUG: ndim of string block incorrect with string inference * Update test_constructors.py --- pandas/core/internals/construction.py | 2 +- 
pandas/tests/frame/test_constructors.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6f30bc650aa36..d6aeda3d418ed 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -383,7 +383,7 @@ def ndarray_to_mgr( new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), BlockPlacement(slice(i, i + 1)), - ndim=1, + ndim=2, ) for i, data in enumerate(obj_columns) ] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd851ab244cb8..4b41050c86467 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2743,6 +2743,13 @@ def test_frame_string_inference_array_string_dtype(self): df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) tm.assert_frame_equal(df, expected) + def test_frame_string_inference_block_dim(self): + # GH#55363 + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): From 06a54adb45bd9c94ad460de09f942aa331814096 Mon Sep 17 00:00:00 2001 From: Eric Han <117511920+eh2077@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:55:34 +0800 Subject: [PATCH 063/131] Fix typo of extra backtick (#55361) --- doc/source/user_guide/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 5def84b91705c..2c612e31d33b6 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -451,7 +451,7 @@ Merge Concat ~~~~~~ -pandas provides various facilities for easily combining together :class:`Series`` and +pandas provides various facilities for easily combining together :class:`Series` and :class:`DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. 
From 59616c57d6abcf0fb4eb5f81928120c7b000ba88 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:48:35 +0200 Subject: [PATCH 064/131] BUG: idxmin/max raising for arrow dtypes (#55368) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 17 ++++++++++++++--- pandas/tests/frame/test_reductions.py | 13 +++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 8aba2bcbecf2f..bd1b301ccb119 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0579aa3760531..2f1f80079e925 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -30,7 +30,10 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.cast import can_hold_element +from pandas.core.dtypes.cast import ( + can_hold_element, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -1624,13 +1627,21 @@ def _reduce( pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: - result = pa.array([pa_result.as_py()], type=pa_result.type) + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) return type(self)(result) if pc.is_null(pa_result).as_py(): return self.dtype.na_value - else: + elif isinstance(pa_result, pa.Scalar): return pa_result.as_py() + else: + return pa_result def _explode(self): """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e66557f132c1d..77f64b18a82f8 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1056,6 +1056,19 @@ def test_idxmax_numeric_only(self, numeric_only): expected = Series([1, 0, 1], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_idxmax_arrow_types(self): + # GH#55368 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" From 7df42118d2fca966c86a75ac6bf2c1d0df718eae Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 22:01:34 +0200 Subject: [PATCH 065/131] BUG: eq not implemented for categorical and arrow backed strings (#55364) --- doc/source/whatsnew/v2.1.2.rst | 
1 + pandas/core/arrays/arrow/array.py | 5 ++++- pandas/tests/indexes/categorical/test_equals.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index bd1b301ccb119..8371e02a3372c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2f1f80079e925..2c788411eb089 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -35,6 +35,7 @@ infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -631,7 +632,9 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) elif is_scalar(other): try: diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..a8353f301a3c3 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) From f196319e52b5823e89648af0c9d26dfdbe821d0c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 22:02:00 +0200 Subject: [PATCH 066/131] BUG: interpolate raising wrong error for ea (#55347) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/base.py | 1 - pandas/tests/frame/methods/test_interpolate.py | 6 ++++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 8371e02a3372c..ed3e97a64c840 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,6 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) +- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` 
(:issue:`55281`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c06bf7366447b..177c105688d0c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -893,7 +893,6 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, copy: bool, **kwargs, ) -> Self: diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() From 8664572fe63dbcb865588d9d564b49a572c3351e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 22:02:36 +0200 Subject: [PATCH 067/131] BUG: Index.insert raising when inserting None into new string dtype (#55365) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_arrow.py | 5 +++++ pandas/tests/indexes/base_class/test_reshape.py | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ed3e97a64c840..38ef8c8455b9d 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -27,6 +27,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..e904123849821 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -613,3 +613,8 @@ def _reduce( ) else: return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..6586f5f9de480 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH#55365 + pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + 
tm.assert_index_equal(result, expected)
+
     @pytest.mark.parametrize(
         "pos,expected",
         [
From 22452174d44fd9821289e823e5d3bce21cd68315 Mon Sep 17 00:00:00 2001
From: paulreece <96156234+paulreece@users.noreply.github.com>
Date: Tue, 3 Oct 2023 17:56:40 -0400
Subject: [PATCH 068/131] This fix enables you to preserve the datetime precision when using the melt method. (#55270)

It fixes the issues raised in #55254.
---
 doc/source/whatsnew/v2.2.0.rst    |  1 +
 pandas/core/reshape/melt.py       |  4 ++-
 pandas/tests/reshape/test_melt.py | 41 +++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index fa3cef6d9457d..d498c84358448 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -254,6 +254,7 @@ Bug fixes
 - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
 - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
 - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
+- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime precision (:issue:`55254`)
 - Bug in :meth:`pandas.read_excel` with an ODS file without a cached formatted cell for float values (:issue:`55219`)

 Categorical
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index 74e6a6a28ccb0..387d43f47fe9b 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -134,7 +134,9 @@ def melt(
 
     mcolumns = id_vars + var_name + [value_name]
 
-    if frame.shape[1] > 0:
+    if frame.shape[1] > 0 and not any(
+        not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes
+    ):
         mdata[value_name] = concat(
             [frame.iloc[:, i] for i in range(frame.shape[1])]
         ).values
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 941478066a7d8..ef748e264188c 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -459,6 +459,47 @@ def test_melt_ea_columns(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_melt_preserves_datetime(self):
+        df = DataFrame(
+            data=[
+                {
+                    "type": "A0",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"),
+                },
+                {
+                    "type": "A1",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"),
+                },
+            ],
+            index=["aaaa", "bbbb"],
+        )
+        result = df.melt(
+            id_vars=["type"],
+            value_vars=["start_date", "end_date"],
+            var_name="start/end",
+            value_name="date",
+        )
+        expected = DataFrame(
+            {
+                "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"},
+                "start/end": {
+                    0: "start_date",
+                    1: "start_date",
+                    2: "end_date",
+                    3: "end_date",
+                },
+                "date": {
+                    0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"),
+                    3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"),
+                },
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestLreshape:
     def test_pairs(self):
From 4976b692ce638594046d79becadb692576fade33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?=
<8431159+mtsokol@users.noreply.github.com> Date: Wed, 4 Oct 2023 00:42:22 +0200 Subject: [PATCH 069/131] MAINT: Remove `np.int_` and `np.uint` (#55369) MAINT: Remove np.int_ and np.uint --- doc/source/user_guide/enhancingperf.rst | 2 +- pandas/compat/numpy/__init__.py | 15 +++++++++++++++ pandas/core/algorithms.py | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 4 +++- pandas/tests/dtypes/cast/test_infer_dtype.py | 2 +- pandas/tests/extension/base/dim2.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 4 ++-- pandas/tests/frame/methods/test_shift.py | 9 +++++---- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_reductions.py | 8 ++++++-- pandas/tests/groupby/aggregate/test_cython.py | 2 +- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 4 ++-- pandas/tests/groupby/test_min_max.py | 2 +- pandas/tests/groupby/test_quantile.py | 2 +- pandas/tests/groupby/test_size.py | 2 +- pandas/tests/groupby/test_value_counts.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 4 +++- pandas/tests/indexes/datetimes/test_indexing.py | 4 +++- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexing/multiindex/test_partial.py | 2 +- pandas/tests/indexing/test_loc.py | 12 ++++++------ pandas/tests/indexing/test_partial.py | 4 ++-- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/plotting/test_series.py | 7 +++++-- pandas/tests/reshape/merge/test_merge.py | 4 ++-- pandas/tests/scalar/test_na_scalar.py | 11 ++++++----- pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/test_repr.py | 4 ++-- pyproject.toml | 2 ++ 31 files changed, 81 insertions(+), 49 deletions(-) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index bc2f4420da784..c4721f3a6b09c 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``. 
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): ...: assert (col_a.dtype == np.float64 - ...: and col_b.dtype == np.float64 and col_N.dtype == np.int_) + ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int)) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index d376fa4c1919e..1b974a92f8188 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -21,6 +21,21 @@ ) +np_long: type +np_ulong: type + +if _nlv >= Version("2.0.0.dev0"): + try: + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + __all__ = [ "np", "_np_version", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4ff3de2fc7b2b..dd45969a13fd7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1636,7 +1636,7 @@ def safe_sort( else: mask = None else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index dd8c3eda9ed05..71156a4d84ae5 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd @@ -51,7 +53,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int_) + assert isinstance(getattr(s, op)(), np_long) elif op == "count": # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index ed08df74461ef..50eaa1f4d8713 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -170,7 +170,7 @@ def test_infer_dtype_from_scalar(value, expected): @pytest.mark.parametrize( "arr, expected", [ - ([1], np.int_), + ([1], np.dtype(int)), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), (np.array([[1.0, 2.0]]), np.float64), diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 3d1274df0a21b..12006248b1db3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -248,7 +248,7 @@ def get_reduction_result_dtype(dtype): return NUMPY_INT_TO_DTYPE[np.dtype(int)] else: # i.e. 
dtype.kind == "u" - return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] + return NUMPY_INT_TO_DTYPE[np.dtype("uint")] if method in ["sum", "prod"]: # std and var are not dtype-preserving diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..bc48d00c4a8a8 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -108,11 +108,11 @@ def test_setitem_list(self, float_frame): data["A"] = newcolumndata def test_setitem_list2(self): - df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int) df.loc[1, ["tt1", "tt2"]] = [1, 2] result = df.loc[df.index[1], ["tt1", "tt2"]] - expected = Series([1, 2], df.columns, dtype=np.int_, name=1) + expected = Series([1, 2], df.columns, dtype=int, name=1) tm.assert_series_equal(result, expected) df["tt1"] = df["tt2"] = "0" diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 60c05767a5e1a..6bd441400dc54 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long import pandas.util._test_decorators as td import pandas as pd @@ -471,22 +472,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self): df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(2, axis=1, fill_value=np.int_(0)) + result = df3.shift(2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([-1, -1, 0, 1], axis=1) - expected.iloc[:, :2] = np.int_(0) + expected.iloc[:, :2] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) # Case with periods < 0 df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + result = df3.shift(-2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([2, 3, -1, -1], axis=1) - expected.iloc[:, -2:] = np.int_(0) + expected.iloc[:, -2:] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 4b41050c86467..543a5be9544cc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1823,7 +1823,7 @@ def test_constructor_single_value(self): DataFrame("a", [1, 2], ["a", "c"], float) def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name + intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 77f64b18a82f8..66f813c80ed16 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,10 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import ( + np_long, + np_ulong, +) import pandas.util._test_decorators as td import pandas as pd @@ -1713,11 +1717,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np.int_), + ("prod", np.int8, 1, np_long), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", 
np.uint8, 0, np.uint64), - ("prod", np.uint8, 1, np.uint), + ("prod", np.uint8, 1, np_ulong), ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 865fda0ab54a2..5c99882cef6d2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -229,7 +229,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) - grps = np.arange(0, 25, 5, dtype=np.int_) + grps = np.arange(0, 25, 5, dtype=int) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None, numeric_only=True diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 41bbfcf6840a9..a92880c87b847 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -824,7 +824,7 @@ def test_nlargest_and_smallest_noop(data, groups, dtype, method): data = list(reversed(data)) ser = Series(data, name="a") result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9132be50d5857..4ca8b0e317bd2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3004,12 +3004,12 @@ def test_groupby_reduce_period(): res = gb.max() expected = ser[-10:] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) res = gb.min() expected = ser[:10] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..30c7e1df1e691 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -108,7 +108,7 @@ def test_max_inat_not_all_na(): # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_series_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index efe7b171d630d..805fef2125fda 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -468,7 +468,7 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 76b26c04f9f3a..c275db9d1788c 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -39,7 +39,7 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): if sort: expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 944dda8977882..45a33d3b70f71 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -996,7 +996,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values): result = gp.value_counts(sort=True, normalize=normalize) expected = DataFrame( { - "level_0": np.array([4, 4, 5], dtype=np.int_), + "level_0": np.array([4, 4, 5], dtype=int), "A": [1, 1, 2], "level_2": [8, 8, 7], "B": [1, 3, 2], diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f44cbbf560584..d08757a206e67 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DataFrame, @@ -54,7 +56,7 @@ def test_time_overflow_for_32bit_machines(self): # (which has value 1e9) and since the max value for np.int32 is ~2e9, # and since those machines won't promote np.int32 to np.int64, we get # overflow. - periods = np.int_(1000) + periods = np_long(1000) idx1 = date_range(start="2000", periods=periods, freq="s") assert len(idx1) == periods diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c3944a4443d67..d877110e72b26 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DatetimeIndex, @@ -91,7 +93,7 @@ def test_dti_business_getitem(self, freq): assert fancy_indexed.freq is None # 32-bit vs. 
64-bit platforms - assert rng[4] == rng[np.int_(4)] + assert rng[4] == rng[np_long(4)] @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem_matplotlib_hackaround(self, freq): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bc04c1c6612f4..8fd1e296fb79a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -457,7 +457,7 @@ def test_fancy(self, simple_index): ["string", "int64", "int32", "uint64", "uint32", "float64", "float32"], indirect=True, ) - @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + @pytest.mark.parametrize("dtype", [int, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) empty_index = type(index)([], dtype=index.dtype) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index de989ad550f2b..081da385ebcc3 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -166,7 +166,7 @@ def test_getitem_intkey_leading_level( mi = ser.index assert isinstance(mi, MultiIndex) if dtype is int: - assert mi.levels[0].dtype == np.int_ + assert mi.levels[0].dtype == np.dtype(int) else: assert mi.levels[0].dtype == np.float64 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a2693c85e507f..20568342222c3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -442,7 +442,7 @@ def test_loc_to_fail(self): ) msg = ( - rf"\"None of \[Index\(\[1, 2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[1, 2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -460,7 +460,7 @@ def test_loc_to_fail2(self): s.loc[-1] msg = ( - rf"\"None of \[Index\(\[-1, -2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-1, -2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -476,7 +476,7 @@ def test_loc_to_fail2(self): s["a"] = 2 msg = ( - rf"\"None of \[Index\(\[-2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -493,7 +493,7 @@ def test_loc_to_fail3(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - rf"\"None of \[Index\(\[3\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[3\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -510,7 +510,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.int_().dtype}')] are in the [index]" + msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -1209,7 +1209,7 @@ def test_loc_setitem_empty_append_raises(self): df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.int_().dtype}'\)\] " + rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 8e5cde42ec91b..8f499644f1013 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -402,7 +402,7 @@ def test_series_partial_set(self): # raises 
as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}'\)\] " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -483,7 +483,7 @@ def test_series_partial_set_with_name(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}', " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}', " r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 597bc2975268e..09567e7012313 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -468,7 +468,7 @@ def test_set_change_dtype(self, mgr): np.random.default_rng(2).standard_normal(N).astype(int), ) idx = mgr2.items.get_loc("quux") - assert mgr2.iget(idx).dtype == np.int_ + assert mgr2.iget(idx).dtype == np.dtype(int) mgr2.iset( mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c79fdd9145a6a..9f7840588f89e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -47,7 +47,7 @@ def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( @@ -87,7 +87,7 @@ def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6719e39612f5d..bf4474d085b11 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -6,7 +6,10 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import np_version_gte1p24 +from pandas.compat.numpy import ( + np_long, + np_version_gte1p24, +) import pandas.util._test_decorators as td import pandas as pd @@ -561,7 +564,7 @@ def test_plot_fails_with_dupe_color_and_style(self): [ ["scott", 20], [None, 20], - [None, np.int_(20)], + [None, np_long(20)], [0.5, np.linspace(-100, 100, 20)], ], ) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d889ae2e4806b..d203a04a7fffc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -365,7 +365,7 @@ def test_merge_join_key_dtype_cast(self): lkey = np.array([1]) rkey = np.array([2]) df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer") - assert df["key_0"].dtype == np.int_ + assert df["key_0"].dtype == np.dtype(int) def test_handle_join_key_pass_array(self): left = DataFrame( @@ -389,7 +389,7 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") - expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0") + expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], 
dtype=int, name="key_0") tm.assert_series_equal(merged["key_0"], expected) left = DataFrame({"value": np.arange(3)}) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 287b7557f50f9..44ce5c79db348 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.missing import NA +from pandas.compat.numpy import np_long from pandas.core.dtypes.common import is_scalar @@ -102,9 +103,9 @@ def test_comparison_ops(comparison_op, other): -0.0, False, np.bool_(False), - np.int_(0), + np_long(0), np.float64(0), - np.int_(-0), + np_long(-0), np.float64(-0), ], ) @@ -123,7 +124,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] + "value", [1, 1.0, True, np.bool_(True), np_long(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -133,14 +134,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float64, np.bool_, np.int_)): + elif not isinstance(value, (np.float64, np.bool_, np_long)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np_long(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index f3075c116883a..9d6611cd53068 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -197,7 +197,7 @@ def test_reindex_int(datetime_series): # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - assert reindexed_int.dtype == np.int_ + assert reindexed_int.dtype == np.dtype(int) def test_reindex_bool(datetime_series): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index f294885fb8f4d..be68918d2a380 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -348,7 +348,7 @@ def test_categorical_series_repr(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" +Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -374,7 +374,7 @@ def test_categorical_series_repr_ordered(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" +Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 
6 < 7 < 8 < 9]""" assert repr(s) == exp diff --git a/pyproject.toml b/pyproject.toml index 89432c2353ea8..a8388a9ff52de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -505,6 +505,8 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", + # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged + "ignore:.*In the future `np.long` will be defined as.*:FutureWarning", ] junit_family = "xunit2" markers = [ From 1c1bb854cce61a8d653c468bcce70db84ac07cf9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 3 Oct 2023 20:29:44 -0400 Subject: [PATCH 070/131] BUG: DataFrame.join reordering index levels when joining on subset of levels (#55370) * BUG: DataFrame.join reordering index levels when joining on subset of levels * update more tests * move whatsnew to notable section --- doc/source/whatsnew/v2.2.0.rst | 33 ++++++++++++++++++++--- pandas/core/indexes/base.py | 7 +++++ pandas/tests/reshape/merge/test_join.py | 16 +++++++---- pandas/tests/reshape/merge/test_multi.py | 28 +++++++++++-------- pandas/tests/series/methods/test_align.py | 4 +-- pandas/tests/series/test_arithmetic.py | 4 +-- 6 files changed, 69 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d498c84358448..7a177344a42c7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -133,10 +133,36 @@ and ``sort=False``: result -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder +index levels when joining on two indexes with different levels (:issue:`34133`). + +.. ipython:: python + + left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) + right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + result = left.join(right) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + left right + B A C + 1 x 1 1 2 + 2 x 2 1 2 + +*New Behavior* + +.. ipython:: python + + result .. --------------------------------------------------------------------------- .. 
_whatsnew_220.api_breaking: @@ -342,6 +368,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Sparse ^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9017ff121976b..11f2cc8ebf1ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4725,6 +4725,13 @@ def _join_multi(self, other: Index, how: JoinHow): multi_join_idx = multi_join_idx.remove_unused_levels() + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + return multi_join_idx, lidx, ridx jl = next(iter(overlap)) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 20daa388c2c88..c630ba6a43cb1 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -902,7 +902,7 @@ def test_join_inner_multiindex_deterministic_order(): result = left.join(right, how="inner") expected = DataFrame( {"e": [5], "f": [6]}, - index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), + index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")), ) tm.assert_frame_equal(result, expected) @@ -926,10 +926,16 @@ def test_join_multiindex_one_level(join_type): ) right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",))) result = left.join(right, how=join_type) - expected = DataFrame( - {"c": [3], "d": [4]}, - index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), - ) + if join_type == "right": + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), + ) + else: + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index ab010bdb909f1..c029acf0c8938 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -69,11 +69,6 @@ def on_cols_multi(): return ["Origin", "Destination", "Period"] -@pytest.fixture -def idx_cols_multi(): - return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] - - class TestMergeMulti: def test_merge_on_multikey(self, left, right, join_type): on_cols = ["key1", "key2"] @@ -815,9 +810,13 @@ def test_join_multi_levels2(self): class TestJoinMultiMulti: - def test_join_multi_multi( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi - ): + def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi): + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) # Multi-index join tests expected = ( merge( @@ -826,7 +825,7 @@ def test_join_multi_multi( how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) @@ -834,11 +833,18 @@ def test_join_multi_multi( tm.assert_frame_equal(result, expected) def test_join_multi_empty_frames( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + self, 
left_multi, right_multi, join_type, on_cols_multi ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) + expected = ( merge( left_multi.reset_index(), @@ -846,7 +852,7 @@ def test_join_multi_empty_frames( how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index e1b3dd4888ef5..d36fd5335bdfc 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -240,10 +240,10 @@ def test_align_left_different_named_levels(): result_left, result_right = left.align(right) expected_left = Series( - [2], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) expected_right = Series( - [1], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) tm.assert_series_equal(result_left, expected_left) tm.assert_series_equal(result_right, expected_right) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 8547fd6988791..d40ff6c139653 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -941,8 +941,8 @@ def test_series_varied_multiindex_alignment(): expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( - [("x", 1, "a"), ("x", 2, "a"), ("y", 1, "a"), ("y", 2, "a")], - names=["xy", "num", "ab"], + [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)], + names=["ab", "xy", "num"], ), ) tm.assert_series_equal(result, expected) From 6b84daa3be369d9032cb2965d539407b9b98ff45 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 4 Oct 2023 10:39:20 +0200 Subject: [PATCH 071/131] BUG: Inserting ndim=0 array does not infer string dtype (#55366) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/construction.py | 5 +++++ pandas/tests/frame/indexing/test_indexing.py | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 38ef8c8455b9d..e89a268c4256e 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..e661d590ab330 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -562,7 +562,12 
@@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index bc48d00c4a8a8..de8df15a9d747 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1905,6 +1905,19 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. From 6a83910674a2be1f95d75cfd935fdff22b09bd33 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 4 Oct 2023 10:39:37 +0200 Subject: [PATCH 072/131] BUG: rank raising for arrow string dtypes (#55362) --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/arrow/array.py | 31 ++++++++++++++++++++----- pandas/core/arrays/string_arrow.py | 28 +++++++++++++++++++++- pandas/tests/frame/methods/test_rank.py | 12 ++++++++++ 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index e89a268c4256e..df19b01610a7a 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,6 +29,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2c788411eb089..12fe9b30f3f52 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1747,7 +1747,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1756,9 +1756,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. 
- """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1773,7 +1770,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1812,7 +1809,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e904123849821..db11e74951ee5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from collections.abc import Sequence from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" @@ -584,7 +607,10 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): - result = result.to_numpy() + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) From da01e38e2affc046b329f054f87f011ce3af4eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 4 Oct 2023 10:46:59 -0400 Subject: [PATCH 073/131] TYP: read_csv's usecols (#55330) * TYP: read_csv's usecols * adjust doc * use range --- pandas/_typing.py | 9 ++++++ pandas/io/parsers/readers.py | 54 ++++++++---------------------------- 2 files changed, 21 insertions(+), 42 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f18c67fcb0c90..de01434c09c39 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -509,3 +509,12 @@ def 
closed(self) -> bool: # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] + +# read_csv: usecols +UsecolsArgType = Union[ + SequenceNotStr[Hashable], + range, + AnyArrayLike, + Callable[[HashableT], bool], + None, +] diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6ce6ac71b1ddd..32f8a1fe81a9f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -77,11 +77,11 @@ DtypeArg, DtypeBackend, FilePath, - HashableT, IndexLabel, ReadCsvBuffer, Self, StorageOptions, + UsecolsArgType, ) _doc_read_csv_and_table = ( r""" @@ -142,7 +142,7 @@ Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g., when you have a malformed file with delimiters at the end of each line. -usecols : list of Hashable or Callable, optional +usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings @@ -645,10 +645,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -707,10 +704,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -770,10 +764,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -833,10 +824,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -907,10 +895,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1005,10 +990,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: 
list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1065,10 +1047,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1125,10 +1104,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1185,10 +1161,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1258,10 +1231,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, From 06cdcc0576e387551462511f82d3eed8c02be570 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:46:01 +0200 Subject: [PATCH 074/131] BUG: is_string_dtype(pd.Index([], dtype='O')) returns False (#54997) * correct def is_all_strings * add a test, correct whatsnew/v2.2.0.rst * move the test and add parametrization, correct v2.2.0.rst * remove the line from v2.2.0.rst * add the note to v2.1.1.rst * correct the whatsnew note and move it to 2.2.0 --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/common.py | 9 ++++++--- pandas/tests/dtypes/test_common.py | 25 +++++++++++++++++-------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7a177344a42c7..b04e0e228e72f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -279,6 +279,7 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied 
function (:issue:`55009`) - Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 9da4eac6a42c8..42e909a6b9856 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1664,9 +1664,12 @@ def is_all_strings(value: ArrayLike) -> bool: dtype = value.dtype if isinstance(dtype, np.dtype): - return dtype == np.dtype("object") and lib.is_string_array( - np.asarray(value), skipna=False - ) + if len(value) == 0: + return dtype == np.dtype("object") + else: + return dtype == np.dtype("object") and lib.is_string_array( + np.asarray(value), skipna=False + ) elif isinstance(dtype, CategoricalDtype): return dtype.categories.inferred_type == "string" return dtype == "string" diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4507857418e9e..6f6cc5a5ad5d8 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -301,14 +301,23 @@ def test_is_categorical_dtype(): assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) -def test_is_string_dtype(): - assert not com.is_string_dtype(int) - assert not com.is_string_dtype(pd.Series([1, 2])) - - assert com.is_string_dtype(str) - assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(["a", "b"])) - assert com.is_string_dtype(pd.StringDtype()) +@pytest.mark.parametrize( + "dtype, expected", + [ + (int, False), + (pd.Series([1, 2]), False), + (str, True), + (object, True), + (np.array(["a", "b"]), True), + (pd.StringDtype(), True), + (pd.Index([], dtype="O"), True), + ], +) +def test_is_string_dtype(dtype, expected): + # GH#54661 + + result = com.is_string_dtype(dtype) + assert result is expected @pytest.mark.parametrize( From f3f0795d8076f950efc75a80af4ddbe803238fc5 Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:38:17 +0200 Subject: [PATCH 075/131] ENH: propagating attrs always uses deepcopy (#55314) Always using a deepcopy prevents shared state and thus unintentional modification of the attrs of other objects. IMHO this safety has a higher priority than the slight performance cost of the deepcopy. The implementation now skips the copying if *attrs* are not used (i.e. an empty dict). This check takes only ~20ns. Thus, the attrs propagation mechanism has no performance impact if attrs are not used. Closes #54134. Co-authored-by: Tim Hoffmann --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/generic.py | 28 +++++++++++++++++++++------- pandas/tests/frame/test_api.py | 9 +++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b04e0e228e72f..4749ceec4a330 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -74,12 +74,12 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 738f4cbe6bc43..2e6c8919eff38 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +from copy import deepcopy import datetime as dt from functools import partial import gc @@ -368,6 +369,13 @@ def attrs(self) -> dict[Hashable, Any]: -------- DataFrame.flags : Global flags applying to this object. + Notes + ----- + Many operations that create new datasets will copy ``attrs``. Copies + are always deep so that changing ``attrs`` will only affect the + present dataset. ``pandas.concat`` copies ``attrs`` only if all input + datasets have the same ``attrs``. + Examples -------- For Series: @@ -6191,8 +6199,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: stable across pandas releases. """ if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # We want attrs propagation to have minimal performance + # impact if attrs are not used; i.e. attrs is an empty dict. + # One could make the deepcopy unconditionally, but a deepcopy + # of an empty dict is 50x more expensive than the empty check. + self.attrs = deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. 
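# A minimal, hypothetical sketch (illustrative values only, not part of the
# patch) of the guarantee the deepcopy in ``__finalize__`` above provides:
# mutating the ``attrs`` of a derived object no longer leaks back into the
# parent object it was derived from.
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
df.attrs["tags"] = {"spam"}

result = df.rename(columns=str)  # __finalize__ propagates attrs to result
result.attrs["tags"].add("ham")  # mutate the propagated copy...

# ...and the parent's attrs stay untouched, because they were deep-copied:
assert df.attrs["tags"] == {"spam"}
assert result.attrs["tags"] == {"spam", "ham"}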
@@ -6201,11 +6213,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..06bf169bf4dbc 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -315,6 +315,15 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + def test_attrs_deepcopy(self): + df = DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["tags"] = {"spam", "ham"} + + result = df.rename(columns=str) + assert result.attrs == df.attrs + assert result.attrs["tags"] is not df.attrs["tags"] + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( self, allows_duplicate_labels, frame_or_series, using_copy_on_write From eb6317a3cf45dfeb76f81655310604de029ad596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Sok=C3=B3=C5=82?= <8431159+mtsokol@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:44:43 +0200 Subject: [PATCH 076/131] MAINT: Add warning filter for `np.long` (#55397) MAINT: Add warning filter for np.long --- pandas/compat/numpy/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 1b974a92f8188..51c9892b64a08 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,6 @@ """ support numpy compatibility across versions """ +import warnings + import numpy as np from pandas.util.version import Version @@ -26,8 +28,14 @@ if _nlv >= Version("2.0.0.dev0"): try: - np_long = np.long # type: ignore[attr-defined] - np_ulong = np.ulong # type: ignore[attr-defined] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*In the future `np\.long` will be defined as.*", + FutureWarning, + ) + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] except AttributeError: np_long = np.int_ np_ulong = np.uint From 6a8055d8a52e1f691acefececa3e7d835f16a062 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 4 Oct 2023 19:05:44 +0200 Subject: [PATCH 077/131] WARN: sort_values for midx warns unnecessarily if nan is in level (#55386) --- pandas/core/indexes/multi.py | 2 +- pandas/tests/indexes/multi/test_sorting.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 144045d40a086..041ef2d742c16 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1948,7 +1948,7 @@ def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIn # indexer to reorder the level codes indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) - level_codes = 
algos.take_nd(ri, level_codes) + level_codes = algos.take_nd(ri, level_codes, fill_value=-1) new_levels.append(lev) new_codes.append(level_codes) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 08c1a4092952c..b4dcef71dcf50 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -338,3 +338,12 @@ def test_sort_values_with_na_na_position(dtype, na_position): ] expected = MultiIndex.from_arrays(arrays) tm.assert_index_equal(result, expected) + + +def test_sort_unnecessary_warning(): + # GH#55386 + midx = MultiIndex.from_tuples([(1.5, 2), (3.5, 3), (0, 1)]) + midx = midx.set_levels([2.5, np.nan, 1], level=0) + result = midx.sort_values() + expected = MultiIndex.from_tuples([(1, 3), (2.5, 1), (np.nan, 2)]) + tm.assert_index_equal(result, expected) From 364c9cb65acf47119dc251b25226ab415cc9a100 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 4 Oct 2023 19:07:26 +0200 Subject: [PATCH 078/131] BUG: all not ignoring na when skipna=True (#55367) * BUG: all not ignoring na when skipna=True * Update v2.1.2.rst --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_arrow.py | 8 +++++--- pandas/tests/reductions/test_reductions.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index df19b01610a7a..87b71eac76d1a 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,6 +29,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index db11e74951ee5..24b99b5d4852e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -631,9 +631,11 @@ def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): if name in ["any", "all"]: - arr = pc.and_kleene( - pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") - ) + if not skipna and name == "all": + nas = pc.invert(pc.is_null(self._pa_array)) + arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") return ArrowExtensionArray(arr)._reduce( name, skipna=skipna, keepdims=keepdims, **kwargs ) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 021252500e814..560b2377ada70 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1087,7 +1087,8 @@ def test_any_all_pyarrow_string(self): ser = Series([None, "a"], dtype="string[pyarrow_numpy]") assert ser.any() - assert not ser.all() + assert ser.all() + assert not 
ser.all(skipna=False) ser = Series([None, ""], dtype="string[pyarrow_numpy]") assert not ser.any() From 4145278fa169a79fb44db15c563091e695c54d64 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Oct 2023 11:18:01 -0700 Subject: [PATCH 079/131] TYP: format.formats (#55393) * TYP: format.formats * Fix MultiIndex._values --- pandas/core/arrays/categorical.py | 16 +++++---- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/indexes/multi.py | 6 ++-- pandas/io/formats/format.py | 57 +++++++++++++++++-------------- 4 files changed, 44 insertions(+), 37 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8d2633c10b428..4152d2a50ea63 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1819,7 +1819,7 @@ def _empty( # type: ignore[override] return arr._from_backing_data(backing) - def _internal_get_values(self): + def _internal_get_values(self) -> ArrayLike: """ Return the values. @@ -1827,15 +1827,19 @@ def _internal_get_values(self): Returns ------- - np.ndarray or Index - A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods. + np.ndarray or ExtensionArray + A numpy array or ExtensionArray of the same dtype as + categorical.categories.dtype. """ # if we are a datetime and period index, return Index to keep metadata if needs_i8_conversion(self.categories.dtype): - return self.categories.take(self._codes, fill_value=NaT) + return self.categories.take(self._codes, fill_value=NaT)._values elif is_integer_dtype(self.categories.dtype) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, fill_value=np.nan) + return ( + self.categories.astype("object") + .take(self._codes, fill_value=np.nan) + ._values + ) return np.array(self) def check_for_ordered(self, op) -> None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b7b81b8271106..931a220d7ab29 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -471,7 +471,7 @@ def _format_native_types( from pandas.io.formats.format import get_format_timedelta64 # Relies on TimeDelta._repr_base - formatter = get_format_timedelta64(self._ndarray, na_rep) + formatter = get_format_timedelta64(self, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension return np.frompyfunc(formatter, 1, 1)(self._ndarray) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 041ef2d742c16..8955329a7afbf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -73,9 +73,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, ABCSeries, - ABCTimedeltaIndex, ) from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import ( @@ -768,8 +766,8 @@ def _values(self) -> np.ndarray: vals = cast("CategoricalIndex", vals) vals = vals._data._internal_get_values() - if isinstance(vals.dtype, ExtensionDtype) or isinstance( - vals, (ABCDatetimeIndex, ABCTimedeltaIndex) + if isinstance(vals.dtype, ExtensionDtype) or lib.is_np_dtype( + vals.dtype, "mM" ): vals = vals.astype(object) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 922d0f37bee3a..8b65a09ee5ac5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -72,6 +72,7 @@ from pandas.core.arrays import ( Categorical, DatetimeArray, + ExtensionArray, TimedeltaArray, ) from pandas.core.arrays.string_ import StringDtype @@ 
-108,6 +109,7 @@ SequenceNotStr, StorageOptions, WriteBuffer, + npt, ) from pandas import ( @@ -1216,7 +1218,7 @@ def get_buffer( def format_array( - values: Any, + values: ArrayLike, formatter: Callable | None, float_format: FloatFormatType | None = None, na_rep: str = "NaN", @@ -1233,7 +1235,7 @@ def format_array( Parameters ---------- - values + values : np.ndarray or ExtensionArray formatter float_format na_rep @@ -1258,10 +1260,13 @@ def format_array( fmt_klass: type[GenericArrayFormatter] if lib.is_np_dtype(values.dtype, "M"): fmt_klass = Datetime64Formatter + values = cast(DatetimeArray, values) elif isinstance(values.dtype, DatetimeTZDtype): fmt_klass = Datetime64TZFormatter + values = cast(DatetimeArray, values) elif lib.is_np_dtype(values.dtype, "m"): fmt_klass = Timedelta64Formatter + values = cast(TimedeltaArray, values) elif isinstance(values.dtype, ExtensionDtype): fmt_klass = ExtensionArrayFormatter elif lib.is_np_dtype(values.dtype, "fc"): @@ -1300,7 +1305,7 @@ def format_array( class GenericArrayFormatter: def __init__( self, - values: Any, + values: ArrayLike, digits: int = 7, formatter: Callable | None = None, na_rep: str = "NaN", @@ -1622,9 +1627,11 @@ def _format_strings(self) -> list[str]: class Datetime64Formatter(GenericArrayFormatter): + values: DatetimeArray + def __init__( self, - values: np.ndarray | Series | DatetimeIndex | DatetimeArray, + values: DatetimeArray, nat_rep: str = "NaT", date_format: None = None, **kwargs, @@ -1637,21 +1644,23 @@ def _format_strings(self) -> list[str]: """we by definition have DO NOT have a TZ""" values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) + dti = DatetimeIndex(values) if self.formatter is not None and callable(self.formatter): - return [self.formatter(x) for x in values] + return [self.formatter(x) for x in dti] - fmt_values = values._data._format_native_types( + fmt_values = dti._data._format_native_types( na_rep=self.nat_rep, date_format=self.date_format ) return fmt_values.tolist() class ExtensionArrayFormatter(GenericArrayFormatter): + values: ExtensionArray + def _format_strings(self) -> list[str]: values = extract_array(self.values, extract_numpy=True) + values = cast(ExtensionArray, values) formatter = self.formatter fallback_formatter = None @@ -1813,13 +1822,10 @@ def get_format_datetime64( def get_format_datetime64_from_values( - values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None + values: DatetimeArray, date_format: str | None ) -> str | None: """given values and a date_format, return a string format""" - if isinstance(values, np.ndarray) and values.ndim > 1: - # We don't actually care about the order of values, and DatetimeIndex - # only accepts 1D values - values = values.ravel() + assert isinstance(values, DatetimeArray) ido = is_dates_only(values) if ido: @@ -1829,6 +1835,8 @@ def get_format_datetime64_from_values( class Datetime64TZFormatter(Datetime64Formatter): + values: DatetimeArray + def _format_strings(self) -> list[str]: """we by definition have a TZ""" ido = is_dates_only(self.values) @@ -1842,9 +1850,11 @@ def _format_strings(self) -> list[str]: class Timedelta64Formatter(GenericArrayFormatter): + values: TimedeltaArray + def __init__( self, - values: np.ndarray | TimedeltaIndex, + values: TimedeltaArray, nat_rep: str = "NaT", box: bool = False, **kwargs, @@ -1861,7 +1871,7 @@ def _format_strings(self) -> list[str]: def get_format_timedelta64( - values: np.ndarray | TimedeltaIndex | TimedeltaArray, + values: TimedeltaArray, 
nat_rep: str | float = "NaT", box: bool = False, ) -> Callable: @@ -1872,18 +1882,13 @@ def get_format_timedelta64( If box, then show the return in quotes """ values_int = values.view(np.int64) + values_int = cast("npt.NDArray[np.int64]", values_int) consider_values = values_int != iNaT one_day_nanos = 86400 * 10**9 - # error: Unsupported operand types for % ("ExtensionArray" and "int") - not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator] - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "Union[Any, ExtensionArray, ndarray]"; expected - # "Union[Union[int, float, complex, str, bytes, generic], - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type] + not_midnight = values_int % one_day_nanos != 0 + both = np.logical_and(consider_values, not_midnight) even_days = both.sum() == 0 if even_days: @@ -1941,7 +1946,7 @@ def just(x: str) -> str: return result -def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> list[str]: +def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. @@ -1987,7 +1992,7 @@ def _trim_zeros_single_float(str_float: str) -> str: def _trim_zeros_float( - str_floats: np.ndarray | list[str], decimal: str = "." + str_floats: ArrayLike | list[str], decimal: str = "." ) -> list[str]: """ Trims the maximum number of trailing zeros equally from @@ -2000,7 +2005,7 @@ def _trim_zeros_float( def is_number_with_decimal(x) -> bool: return re.match(number_regex, x) is not None - def should_trim(values: np.ndarray | list[str]) -> bool: + def should_trim(values: ArrayLike | list[str]) -> bool: """ Determine if an array of strings should be trimmed. From 6d4819bb01276a86657a83f7251104f550ceb1f8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Oct 2023 11:19:05 -0700 Subject: [PATCH 080/131] REF: simplify Categorical.__repr__ (#55391) * REF: remove unused arguments from private Categorical methods * REF: remove unused args from CategoricalFormatter * REF: remove CategoricalFormatter --- pandas/core/arrays/categorical.py | 67 +++++++++++++++--------------- pandas/io/formats/format.py | 69 +------------------------------ 2 files changed, 35 insertions(+), 101 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4152d2a50ea63..5b4a2a524f600 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2151,21 +2151,6 @@ def _formatter(self, boxed: bool = False): # Defer to CategoricalFormatter's formatter. return None - def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: - """ - a short repr displaying only max_vals and an optional (but default - footer) - """ - num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - - result = f"{head[:-1]}, ..., {tail[1:]}" - if footer: - result = f"{result}\n{self._repr_footer()}" - - return str(result) - def _repr_categories(self) -> list[str]: """ return the base repr for the categories @@ -2221,33 +2206,49 @@ def _repr_categories_info(self) -> str: # replace to simple save space by return f"{levheader}[{levstring.replace(' < ... < ', ' ... 
')}]" - def _repr_footer(self) -> str: - info = self._repr_categories_info() - return f"Length: {len(self)}\n{info}" - - def _get_repr( - self, length: bool = True, na_rep: str = "NaN", footer: bool = True - ) -> str: + def _get_values_repr(self) -> str: from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter( - self, length=length, na_rep=na_rep, footer=footer + assert len(self) > 0 + + vals = self._internal_get_values() + fmt_values = fmt.format_array( + vals, + None, + float_format=None, + na_rep="NaN", + quoting=QUOTE_NONNUMERIC, ) - result = formatter.to_string() - return str(result) + + fmt_values = [i.strip() for i in fmt_values] + joined = ", ".join(fmt_values) + result = "[" + joined + "]" + return result def __repr__(self) -> str: """ String representation. """ - _maxlen = 10 - if len(self._codes) > _maxlen: - result = self._tidy_repr(_maxlen) - elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen) + footer = self._repr_categories_info() + length = len(self) + max_len = 10 + if length > max_len: + # In long cases we do not display all entries, so we add Length + # information to the __repr__. + num = max_len // 2 + head = self[:num]._get_values_repr() + tail = self[-(max_len - num) :]._get_values_repr() + body = f"{head[:-1]}, ..., {tail[1:]}" + length_info = f"Length: {len(self)}" + result = f"{body}\n{length_info}\n{footer}" + elif length > 0: + body = self._get_values_repr() + result = f"{body}\n{footer}" else: - msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = f"[], {msg}" + # In the empty case we use a comma instead of newline to get + # a more compact __repr__ + body = "[]" + result = f"{body}, {footer}" return result diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8b65a09ee5ac5..cf9324cdbb9f2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -12,10 +12,7 @@ Sequence, ) from contextlib import contextmanager -from csv import ( - QUOTE_NONE, - QUOTE_NONNUMERIC, -) +from csv import QUOTE_NONE from decimal import Decimal from functools import partial from io import StringIO @@ -200,70 +197,6 @@ """ -class CategoricalFormatter: - def __init__( - self, - categorical: Categorical, - buf: IO[str] | None = None, - length: bool = True, - na_rep: str = "NaN", - footer: bool = True, - ) -> None: - self.categorical = categorical - self.buf = buf if buf is not None else StringIO("") - self.na_rep = na_rep - self.length = length - self.footer = footer - self.quoting = QUOTE_NONNUMERIC - - def _get_footer(self) -> str: - footer = "" - - if self.length: - if footer: - footer += ", " - footer += f"Length: {len(self.categorical)}" - - level_info = self.categorical._repr_categories_info() - - # Levels are added in a newline - if footer: - footer += "\n" - footer += level_info - - return str(footer) - - def _get_formatted_values(self) -> list[str]: - return format_array( - self.categorical._internal_get_values(), - None, - float_format=None, - na_rep=self.na_rep, - quoting=self.quoting, - ) - - def to_string(self) -> str: - categorical = self.categorical - - if len(categorical) == 0: - if self.footer: - return self._get_footer() - else: - return "" - - fmt_values = self._get_formatted_values() - - fmt_values = [i.strip() for i in fmt_values] - values = ", ".join(fmt_values) - result = ["[" + values + "]"] - if self.footer: - footer = self._get_footer() - if footer: - result.append(footer) - - return str("\n".join(result)) - - class SeriesFormatter: def 
__init__( self, From a5c79461f8a5cf252ea83aec90c2eb9c89fe93a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Wed, 4 Oct 2023 20:20:59 +0200 Subject: [PATCH 081/131] DOC: Fixing EX03 - Removing flake8 errors in docstrings (#55401) fixing EX03 errors --- ci/code_checks.sh | 10 ---------- pandas/core/arrays/datetimes.py | 2 +- pandas/core/generic.py | 20 ++++++++++---------- pandas/core/indexing.py | 24 ++++++++++++------------ pandas/core/series.py | 2 +- 5 files changed, 24 insertions(+), 34 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index aba42f3733a3f..6caa39ae42926 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -63,16 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then MSG='Partially validate docstrings (EX03)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ - pandas.Series.loc \ - pandas.Series.iloc \ - pandas.Series.pop \ - pandas.Series.describe \ - pandas.Series.skew \ - pandas.Series.var \ - pandas.Series.last \ - pandas.Series.tz_convert \ - pandas.Series.tz_localize \ - pandas.Series.dt.month_name \ pandas.Series.dt.day_name \ pandas.Series.str.len \ pandas.Series.cat.set_categories \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b520f9f4a6deb..8e60180c7bac4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1276,7 +1276,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ values = self._local_timestamps() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2e6c8919eff38..6f2850559fd77 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9698,7 +9698,7 @@ def last(self, offset) -> Self: Get the rows for the last 3 days: - >>> ts.last('3D') # doctest: +SKIP + >>> ts.last('3D') # doctest: +SKIP A 2018-04-13 3 2018-04-15 4 @@ -11208,7 +11208,7 @@ def tz_convert( Pass None to convert to UTC and get a tz-naive index: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_convert(None) 2018-09-14 23:30:00 1 dtype: int64 @@ -11326,7 +11326,7 @@ def tz_localize( Pass None to convert to tz-naive index and preserve local time: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_localize(None) 2018-09-15 01:30:00 1 dtype: int64 @@ -11569,10 +11569,10 @@ def describe( Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']), + >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), ... 'numeric': [1, 2, 3], ... 'object': ['a', 'b', 'c'] - ... }) + ... }) >>> df.describe() numeric count 3.0 @@ -12688,9 +12688,9 @@ def last_valid_index(self) -> Hashable | None: Examples -------- >>> df = pd.DataFrame({'person_id': [0, 1, 2, 3], -... 'age': [21, 25, 62, 43], -... 'height': [1.61, 1.87, 1.49, 2.01]} -... ).set_index('person_id') +... 'age': [21, 25, 62, 43], +... 'height': [1.61, 1.87, 1.49, 2.01]} +... 
).set_index('person_id') >>> df age height person_id @@ -13516,7 +13516,7 @@ def make_doc(name: str, ndim: int) -> str: With a DataFrame >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]}, - ... index=['tiger', 'zebra', 'cow']) + ... index=['tiger', 'zebra', 'cow']) >>> df a b c tiger 1 2 1 @@ -13540,7 +13540,7 @@ def make_doc(name: str, ndim: int) -> str: getting an error. >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']}, - ... index=['tiger', 'zebra', 'cow']) + ... index=['tiger', 'zebra', 'cow']) >>> df.skew(numeric_only=True) a 0.0 dtype: float64""" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 871e5817fdf0d..f3a130672a2e5 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -187,7 +187,7 @@ def iloc(self) -> _iLocIndexer: -------- >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] >>> df = pd.DataFrame(mydict) >>> df a b c d @@ -328,7 +328,7 @@ def loc(self) -> _LocIndexer: DataFrame.at : Access a single value for a row/column label pair. DataFrame.iloc : Access group of rows and columns by integer position(s). DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. + Series/DataFrame. Series.loc : Access group of values using labels. Examples @@ -336,8 +336,8 @@ def loc(self) -> _LocIndexer: **Getting values** >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) >>> df max_speed shield cobra 1 2 @@ -380,8 +380,8 @@ def loc(self) -> _LocIndexer: Alignable boolean Series: >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] - max_speed shield + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield sidewinder 7 8 Index (same behavior as ``df.reindex``) @@ -407,7 +407,7 @@ def loc(self) -> _LocIndexer: Multiple conditional using ``&`` that returns a boolean Series >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] - max_speed shield + max_speed shield viper 4 5 Multiple conditional using ``|`` that returns a boolean Series @@ -496,7 +496,7 @@ def loc(self) -> _LocIndexer: Another example using integers for the index >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) + ... index=[7, 8, 9], columns=['max_speed', 'shield']) >>> df max_speed shield 7 1 2 @@ -517,13 +517,13 @@ def loc(self) -> _LocIndexer: A number of examples using a DataFrame with a MultiIndex >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') ... ] >>> index = pd.MultiIndex.from_tuples(tuples) >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] + ... 
[1, 4], [7, 1], [16, 36]] >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) >>> df max_speed shield diff --git a/pandas/core/series.py b/pandas/core/series.py index d5785a2171cb3..8ddce3d5cab26 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5186,7 +5186,7 @@ def pop(self, item: Hashable) -> Any: Examples -------- - >>> ser = pd.Series([1,2,3]) + >>> ser = pd.Series([1, 2, 3]) >>> ser.pop(0) 1 From d943c26c8587d1410eccb557db436d97b52b51b8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Oct 2023 13:38:17 -0700 Subject: [PATCH 082/131] REF: formats.format functions that should be DTA methods (#55394) * REF: move get_format_datetime64_from_values to DatetimeArray * REF: make is_dates_only a DatetimeArray method * mypy fixup * suggested simplification * redundant check --- pandas/core/arrays/datetimes.py | 29 +++++++++++++--- pandas/core/indexes/datetimes.py | 12 ++----- pandas/io/formats/format.py | 46 ++------------------------ pandas/tests/io/formats/test_format.py | 10 ++++-- 4 files changed, 38 insertions(+), 59 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8e60180c7bac4..fae42f170a6b6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -28,12 +28,14 @@ get_resolution, get_supported_reso, get_unit_from_dtype, + iNaT, ints_to_pydatetime, is_date_array_normalized, is_supported_unit, is_unitless, normalize_i8_timestamps, npy_unit_to_abbrev, + periods_per_day, timezones, to_offset, tz_convert_from_utc, @@ -735,14 +737,33 @@ def astype(self, dtype, copy: bool = True): def _format_native_types( self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: - from pandas.io.formats.format import get_format_datetime64_from_values - - fmt = get_format_datetime64_from_values(self, date_format) + if date_format is None and self._is_dates_only: + # Only dates and no timezone: provide a default format + date_format = "%Y-%m-%d" return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso + self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso ) + @property + def _is_dates_only(self) -> bool: + """ + Check if we are round times at midnight (and no timezone), which will + be given a more compact __repr__ than other cases. + """ + if self.tz is not None: + return False + + values_int = self.asi8 + consider_values = values_int != iNaT + dtype = cast(np.dtype, self.dtype) # since we checked tz above + reso = get_unit_from_dtype(dtype) + ppd = periods_per_day(reso) + + # TODO: can we reuse is_date_array_normalized? 
would need a skipna kwd + even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 + return even_days + # ----------------------------------------------------------------- # Comparison Methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ae0feba1f9bcf..62cdce36ed5fb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -266,6 +266,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: return libindex.DatetimeEngine _data: DatetimeArray + _values: DatetimeArray tz: dt.tzinfo | None # -------------------------------------------------------------------- @@ -393,19 +394,12 @@ def _is_dates_only(self) -> bool: ------- bool """ - - from pandas.io.formats.format import is_dates_only - delta = getattr(self.freq, "delta", None) if delta and delta % dt.timedelta(days=1) != dt.timedelta(days=0): return False - # error: Argument 1 to "is_dates_only" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "Union[ndarray, - # DatetimeArray, Index, DatetimeIndex]" - - return self.tz is None and is_dates_only(self._values) # type: ignore[arg-type] + return self._values._is_dates_only def __reduce__(self): d = {"data": self._data, "name": self.name} @@ -428,7 +422,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 - formatter = get_format_datetime64(is_dates_only_=self._is_dates_only) + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: f"'{formatter(x)}'" # -------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cf9324cdbb9f2..6310eb070247e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,9 +42,7 @@ NaT, Timedelta, Timestamp, - get_unit_from_dtype, iNaT, - periods_per_day, ) from pandas._libs.tslibs.nattype import NaTType @@ -1691,31 +1689,6 @@ def format_percentiles( return [i + "%" for i in out] -def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool: - # return a boolean if we are only dates (and don't have a timezone) - if not isinstance(values, Index): - values = values.ravel() - - if not isinstance(values, (DatetimeArray, DatetimeIndex)): - values = DatetimeIndex(values) - - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - # error: Argument 1 to "py_get_unit_from_dtype" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" - reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type] - ppd = periods_per_day(reso) - - # TODO: can we reuse is_date_array_normalized? 
would need a skipna kwd - even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 - if even_days: - return True - return False - - def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep @@ -1741,12 +1714,12 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" - if is_dates_only_: + if is_dates_only: return lambda x: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) @@ -1754,25 +1727,12 @@ def get_format_datetime64( return lambda x: _format_datetime64(x, nat_rep=nat_rep) -def get_format_datetime64_from_values( - values: DatetimeArray, date_format: str | None -) -> str | None: - """given values and a date_format, return a string format""" - assert isinstance(values, DatetimeArray) - - ido = is_dates_only(values) - if ido: - # Only dates and no timezone: provide a default format - return date_format or "%Y-%m-%d" - return date_format - - class Datetime64TZFormatter(Datetime64Formatter): values: DatetimeArray def _format_strings(self) -> list[str]: """we by definition have a TZ""" - ido = is_dates_only(self.values) + ido = self.values._is_dates_only values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 53ee449c2dc0c..642ee6446e200 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3308,9 +3308,13 @@ def format_func(x): assert result == ["10:10", "12:12"] def test_datetime64formatter_tz_ms(self): - x = Series( - np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") - ).dt.tz_localize("US/Pacific") + x = ( + Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ) + .dt.tz_localize("US/Pacific") + ._values + ) result = fmt.Datetime64TZFormatter(x).get_result() assert result[0].strip() == "2999-01-01 00:00:00-08:00" assert result[1].strip() == "2999-01-02 00:00:00-08:00" From 6c58a217f5d9a2520d9d4bee665edff5397bbfc6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Oct 2023 17:41:22 -0700 Subject: [PATCH 083/131] REF: improve privatization in io.formats.format (#55389) * REF: privatize where possible in io.formats.format * REF: de-privatize VALID_JUSTIFY_PARAMETERS, avoid multiple get_option calls --- pandas/core/frame.py | 2 +- pandas/io/formats/format.py | 81 ++++++++++++------------ pandas/tests/io/formats/test_format.py | 50 +++++++-------- pandas/tests/io/formats/test_printing.py | 10 +-- pandas/tests/io/formats/test_to_html.py | 2 +- 5 files changed, 73 insertions(+), 72 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 51fceb1f09a62..b1ca7557c11ca 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3266,7 +3266,7 @@ def to_html( ... 
''' >>> assert html_string == df.to_html() """ - if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: + if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") formatter = fmt.DataFrameFormatter( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6310eb070247e..356db34918447 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -20,7 +20,6 @@ import re from shutil import get_terminal_size from typing import ( - IO, TYPE_CHECKING, Any, Callable, @@ -172,7 +171,7 @@ Character recognized as decimal separator, e.g. ',' in Europe. """ -_VALID_JUSTIFY_PARAMETERS = ( +VALID_JUSTIFY_PARAMETERS = ( "left", "right", "center", @@ -196,10 +195,15 @@ class SeriesFormatter: + """ + Implement the main logic of Series.to_string, which underlies + Series.__repr__. + """ + def __init__( self, series: Series, - buf: IO[str] | None = None, + *, length: bool | str = True, header: bool = True, index: bool = True, @@ -211,7 +215,7 @@ def __init__( min_rows: int | None = None, ) -> None: self.series = series - self.buf = buf if buf is not None else StringIO() + self.buf = StringIO() self.name = name self.na_rep = na_rep self.header = header @@ -355,7 +359,7 @@ def to_string(self) -> str: return str("".join(result)) -class TextAdjustment: +class _TextAdjustment: def __init__(self) -> None: self.encoding = get_option("display.encoding") @@ -371,7 +375,7 @@ def adjoin(self, space: int, *lists, **kwargs) -> str: ) -class EastAsianTextAdjustment(TextAdjustment): +class _EastAsianTextAdjustment(_TextAdjustment): def __init__(self) -> None: super().__init__() if get_option("display.unicode.ambiguous_as_wide"): @@ -410,12 +414,12 @@ def _get_pad(t): return [x.rjust(_get_pad(x)) for x in texts] -def get_adjustment() -> TextAdjustment: +def get_adjustment() -> _TextAdjustment: use_east_asian_width = get_option("display.unicode.east_asian_width") if use_east_asian_width: - return EastAsianTextAdjustment() + return _EastAsianTextAdjustment() else: - return TextAdjustment() + return _TextAdjustment() def get_dataframe_repr_params() -> dict[str, Any]: @@ -469,16 +473,9 @@ def get_series_repr_params() -> dict[str, Any]: True """ width, height = get_terminal_size() - max_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.max_rows") - ) - min_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.min_rows") - ) + max_rows_opt = get_option("display.max_rows") + max_rows = height if max_rows_opt == 0 else max_rows_opt + min_rows = height if max_rows_opt == 0 else get_option("display.min_rows") return { "name": True, @@ -490,7 +487,11 @@ def get_series_repr_params() -> dict[str, Any]: class DataFrameFormatter: - """Class for processing dataframe formatting options and data.""" + """ + Class for processing dataframe formatting options and data. + + Used by DataFrame.to_string, which backs DataFrame.__repr__. + """ __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -1102,16 +1103,16 @@ def save_to_buffer( """ Perform serialization. Write to buf or return as string if buf is None. 
""" - with get_buffer(buf, encoding=encoding) as f: - f.write(string) + with _get_buffer(buf, encoding=encoding) as fd: + fd.write(string) if buf is None: # error: "WriteBuffer[str]" has no attribute "getvalue" - return f.getvalue() # type: ignore[attr-defined] + return fd.getvalue() # type: ignore[attr-defined] return None @contextmanager -def get_buffer( +def _get_buffer( buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None ) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]: """ @@ -1188,24 +1189,24 @@ def format_array( ------- List[str] """ - fmt_klass: type[GenericArrayFormatter] + fmt_klass: type[_GenericArrayFormatter] if lib.is_np_dtype(values.dtype, "M"): - fmt_klass = Datetime64Formatter + fmt_klass = _Datetime64Formatter values = cast(DatetimeArray, values) elif isinstance(values.dtype, DatetimeTZDtype): - fmt_klass = Datetime64TZFormatter + fmt_klass = _Datetime64TZFormatter values = cast(DatetimeArray, values) elif lib.is_np_dtype(values.dtype, "m"): - fmt_klass = Timedelta64Formatter + fmt_klass = _Timedelta64Formatter values = cast(TimedeltaArray, values) elif isinstance(values.dtype, ExtensionDtype): - fmt_klass = ExtensionArrayFormatter + fmt_klass = _ExtensionArrayFormatter elif lib.is_np_dtype(values.dtype, "fc"): fmt_klass = FloatArrayFormatter elif lib.is_np_dtype(values.dtype, "iu"): - fmt_klass = IntArrayFormatter + fmt_klass = _IntArrayFormatter else: - fmt_klass = GenericArrayFormatter + fmt_klass = _GenericArrayFormatter if space is None: space = 12 @@ -1233,7 +1234,7 @@ def format_array( return fmt_obj.get_result() -class GenericArrayFormatter: +class _GenericArrayFormatter: def __init__( self, values: ArrayLike, @@ -1315,7 +1316,7 @@ def _format(x): vals = extract_array(self.values, extract_numpy=True) if not isinstance(vals, np.ndarray): raise TypeError( - "ExtensionArray formatting should use ExtensionArrayFormatter" + "ExtensionArray formatting should use _ExtensionArrayFormatter" ) inferred = lib.map_infer(vals, is_float) is_float_type = ( @@ -1345,7 +1346,7 @@ def _format(x): return fmt_values -class FloatArrayFormatter(GenericArrayFormatter): +class FloatArrayFormatter(_GenericArrayFormatter): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -1546,7 +1547,7 @@ def _format_strings(self) -> list[str]: return list(self.get_result_as_array()) -class IntArrayFormatter(GenericArrayFormatter): +class _IntArrayFormatter(_GenericArrayFormatter): def _format_strings(self) -> list[str]: if self.leading_space is False: formatter_str = lambda x: f"{x:d}".format(x=x) @@ -1557,7 +1558,7 @@ def _format_strings(self) -> list[str]: return fmt_values -class Datetime64Formatter(GenericArrayFormatter): +class _Datetime64Formatter(_GenericArrayFormatter): values: DatetimeArray def __init__( @@ -1586,7 +1587,7 @@ def _format_strings(self) -> list[str]: return fmt_values.tolist() -class ExtensionArrayFormatter(GenericArrayFormatter): +class _ExtensionArrayFormatter(_GenericArrayFormatter): values: ExtensionArray def _format_strings(self) -> list[str]: @@ -1727,7 +1728,7 @@ def get_format_datetime64( return lambda x: _format_datetime64(x, nat_rep=nat_rep) -class Datetime64TZFormatter(Datetime64Formatter): +class _Datetime64TZFormatter(_Datetime64Formatter): values: DatetimeArray def _format_strings(self) -> list[str]: @@ -1742,7 +1743,7 @@ def _format_strings(self) -> list[str]: return fmt_values -class Timedelta64Formatter(GenericArrayFormatter): +class _Timedelta64Formatter(_GenericArrayFormatter): 
values: TimedeltaArray def __init__( @@ -1809,7 +1810,7 @@ def _make_fixed_width( strings: list[str], justify: str = "right", minimum: int | None = None, - adj: TextAdjustment | None = None, + adj: _TextAdjustment | None = None, ) -> list[str]: if len(strings) == 0 or justify == "all": return strings diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 642ee6446e200..0087149021895 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2933,9 +2933,9 @@ def test_to_string_empty_col(self): class TestGenericArrayFormatter: def test_1d_array(self): - # GenericArrayFormatter is used on types for which there isn't a dedicated + # _GenericArrayFormatter is used on types for which there isn't a dedicated # formatter. np.bool_ is one of those types. - obj = fmt.GenericArrayFormatter(np.array([True, False])) + obj = fmt._GenericArrayFormatter(np.array([True, False])) res = obj.get_result() assert len(res) == 2 # Results should be right-justified. @@ -2943,14 +2943,14 @@ def test_1d_array(self): assert res[1] == " False" def test_2d_array(self): - obj = fmt.GenericArrayFormatter(np.array([[True, False], [False, True]])) + obj = fmt._GenericArrayFormatter(np.array([[True, False], [False, True]])) res = obj.get_result() assert len(res) == 2 assert res[0] == " [True, False]" assert res[1] == " [False, True]" def test_3d_array(self): - obj = fmt.GenericArrayFormatter( + obj = fmt._GenericArrayFormatter( np.array([[[True, True], [False, False]], [[False, True], [True, False]]]) ) res = obj.get_result() @@ -3187,64 +3187,64 @@ def test_all(self): class TestTimedelta64Formatter: def test_days(self): x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() + result = fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'1 days'" - result = fmt.Timedelta64Formatter(x[1:2], box=True).get_result() + result = fmt._Timedelta64Formatter(x[1:2], box=True).get_result() assert result[0].strip() == "'1 days'" - result = fmt.Timedelta64Formatter(x, box=False).get_result() + result = fmt._Timedelta64Formatter(x, box=False).get_result() assert result[0].strip() == "0 days" assert result[1].strip() == "1 days" - result = fmt.Timedelta64Formatter(x[1:2], box=False).get_result() + result = fmt._Timedelta64Formatter(x[1:2], box=False).get_result() assert result[0].strip() == "1 days" def test_days_neg(self): x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(-x, box=True).get_result() + result = fmt._Timedelta64Formatter(-x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'-1 days'" def test_subdays(self): y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(y, box=True).get_result() + result = fmt._Timedelta64Formatter(y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'0 days 00:00:01'" def test_subdays_neg(self): y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(-y, box=True).get_result() + result = fmt._Timedelta64Formatter(-y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): x = pd.to_timedelta(list(range(1)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() + result = 
fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" x = pd.to_timedelta(list(range(1)), unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() + result = fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" -class TestDatetime64Formatter: +class Test_Datetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): x = Series([Timestamp(200)]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "1970-01-01 00:00:00.000000200" def test_dates_display(self): @@ -3252,35 +3252,35 @@ def test_dates_display(self): # make sure that we are consistently display date formatting x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" @@ -3291,7 +3291,7 @@ def test_datetime64formatter_yearmonth(self): def format_func(x): return x.strftime("%Y-%m") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == ["2016-01", "2016-02"] @@ -3303,7 +3303,7 @@ def test_datetime64formatter_hoursecond(self): def format_func(x): return x.strftime("%H:%M") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == 
["10:10", "12:12"] @@ -3315,7 +3315,7 @@ def test_datetime64formatter_tz_ms(self): .dt.tz_localize("US/Pacific") ._values ) - result = fmt.Datetime64TZFormatter(x).get_result() + result = fmt._Datetime64TZFormatter(x).get_result() assert result[0].strip() == "2999-01-01 00:00:00-08:00" assert result[1].strip() == "2999-01-02 00:00:00-08:00" diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 2d0dc0d937709..492657587ae0e 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -48,7 +48,7 @@ def test_adjoin_unicode(self): adjoined = printing.adjoin(2, *data) assert adjoined == expected - adj = fmt.EastAsianTextAdjustment() + adj = fmt._EastAsianTextAdjustment() expected = """あ dd ggg b ええ hhh @@ -73,7 +73,7 @@ def test_adjoin_unicode(self): assert adj.len(cols[2]) == 26 def test_justify(self): - adj = fmt.EastAsianTextAdjustment() + adj = fmt._EastAsianTextAdjustment() def just(x, *args, **kwargs): # wrapper to test single str @@ -95,7 +95,7 @@ def just(x, *args, **kwargs): assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): - adj = fmt.EastAsianTextAdjustment() + adj = fmt._EastAsianTextAdjustment() assert adj.len("abc") == 3 assert adj.len("abc") == 3 @@ -106,11 +106,11 @@ def test_east_asian_len(self): assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): - adj = fmt.EastAsianTextAdjustment() + adj = fmt._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 4 with cf.option_context("display.unicode.ambiguous_as_wide", True): - adj = fmt.EastAsianTextAdjustment() + adj = fmt._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 6 data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 5811485406b86..38a2bb52930e3 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -68,7 +68,7 @@ def biggie_df_fixture(request): return df -@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS) +@pytest.fixture(params=fmt.VALID_JUSTIFY_PARAMETERS) def justify(request): return request.param From e4b7174137476e2c1d31ac12067a0565bbf7f262 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Oct 2023 09:13:25 -0700 Subject: [PATCH 084/131] BUG: TimedeltaIndex.__repr__ with non-nano and round values (#55405) * BUG: TimedeltaIndex.__repr__ with non-nano and round values * GH ref * mypy fixup * update doctest * REF: remove redundant _is_dates_only * Fix wrong types passed to formatters * CLN: remove unused import --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/categorical.py | 6 ++-- pandas/core/arrays/datetimelike.py | 25 +++++++++++++++ pandas/core/arrays/datetimes.py | 21 ------------ pandas/core/indexes/timedeltas.py | 3 +- pandas/io/formats/format.py | 12 +------ .../tests/indexes/timedeltas/test_formats.py | 12 +++++++ pandas/tests/io/formats/test_format.py | 32 +++++++++---------- 8 files changed, 59 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4749ceec4a330..7743f762d8898 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -296,7 +296,7 @@ Datetimelike Timedelta ^^^^^^^^^ -- +- Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in 
the nanosecond cases (:issue:`55405`) - Timezones diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5b4a2a524f600..5059f5d000ccd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2167,11 +2167,11 @@ def _repr_categories(self) -> list[str]: ) if len(self.categories) > max_categories: num = max_categories // 2 - head = format_array(self.categories[:num]) - tail = format_array(self.categories[-num:]) + head = format_array(self.categories[:num]._values) + tail = format_array(self.categories[-num:]._values) category_strs = head + ["..."] + tail else: - category_strs = format_array(self.categories) + category_strs = format_array(self.categories._values) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 52596f29ffc0c..a2960a2870882 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -41,6 +41,7 @@ iNaT, ints_to_pydatetime, ints_to_pytimedelta, + periods_per_day, to_offset, ) from pandas._libs.tslibs.fields import ( @@ -2312,6 +2313,30 @@ def interpolate( return self return type(self)._simple_new(out_data, dtype=self.dtype) + # -------------------------------------------------------------- + # Unsorted + + @property + def _is_dates_only(self) -> bool: + """ + Check if we are round times at midnight (and no timezone), which will + be given a more compact __repr__ than other cases. For TimedeltaArray + we are checking for multiples of 24H. + """ + if not lib.is_np_dtype(self.dtype): + # i.e. we have a timezone + return False + + values_int = self.asi8 + consider_values = values_int != iNaT + reso = get_unit_from_dtype(self.dtype) + ppd = periods_per_day(reso) + + # TODO: can we reuse is_date_array_normalized? would need a skipna kwd + # (first attempt at this was less performant than this implementation) + even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 + return even_days + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fae42f170a6b6..9544a6163562f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -28,14 +28,12 @@ get_resolution, get_supported_reso, get_unit_from_dtype, - iNaT, ints_to_pydatetime, is_date_array_normalized, is_supported_unit, is_unitless, normalize_i8_timestamps, npy_unit_to_abbrev, - periods_per_day, timezones, to_offset, tz_convert_from_utc, @@ -745,25 +743,6 @@ def _format_native_types( self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso ) - @property - def _is_dates_only(self) -> bool: - """ - Check if we are round times at midnight (and no timezone), which will - be given a more compact __repr__ than other cases. - """ - if self.tz is not None: - return False - - values_int = self.asi8 - consider_values = values_int != iNaT - dtype = cast(np.dtype, self.dtype) # since we checked tz above - reso = get_unit_from_dtype(dtype) - ppd = periods_per_day(reso) - - # TODO: can we reuse is_date_array_normalized? 
would need a skipna kwd - even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 - return even_days - # ----------------------------------------------------------------- # Comparison Methods diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index cd6a4883946d2..5ce3dd33eee48 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -336,8 +336,7 @@ def timedelta_range( **Specify a unit** >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s") - TimedeltaIndex(['1 days 00:00:00', '100001 days 00:00:00', - '200001 days 00:00:00'], + TimedeltaIndex(['1 days', '100001 days', '200001 days'], dtype='timedelta64[s]', freq='100000D') """ if freq is None and com.any_none(periods, start, end): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 356db34918447..c6c09c2636852 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -41,7 +41,6 @@ NaT, Timedelta, Timestamp, - iNaT, ) from pandas._libs.tslibs.nattype import NaTType @@ -103,7 +102,6 @@ SequenceNotStr, StorageOptions, WriteBuffer, - npt, ) from pandas import ( @@ -1775,15 +1773,7 @@ def get_format_timedelta64( If box, then show the return in quotes """ - values_int = values.view(np.int64) - values_int = cast("npt.NDArray[np.int64]", values_int) - - consider_values = values_int != iNaT - - one_day_nanos = 86400 * 10**9 - not_midnight = values_int % one_day_nanos != 0 - both = np.logical_and(consider_values, not_midnight) - even_days = both.sum() == 0 + even_days = values._is_dates_only if even_days: format = None diff --git a/pandas/tests/indexes/timedeltas/test_formats.py b/pandas/tests/indexes/timedeltas/test_formats.py index 751f9e4cc9eee..ee090bd0aaf0a 100644 --- a/pandas/tests/indexes/timedeltas/test_formats.py +++ b/pandas/tests/indexes/timedeltas/test_formats.py @@ -8,6 +8,18 @@ class TestTimedeltaIndexRendering: + def test_repr_round_days_non_nano(self): + # GH#55405 + # we should get "1 days", not "1 days 00:00:00" with non-nano + tdi = TimedeltaIndex(["1 days"], freq="D").as_unit("s") + result = repr(tdi) + expected = "TimedeltaIndex(['1 days'], dtype='timedelta64[s]', freq='D')" + assert result == expected + + result2 = repr(Series(tdi)) + expected2 = "0 1 days\ndtype: timedelta64[s]" + assert result2 == expected2 + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): idx1 = TimedeltaIndex([], freq="D") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0087149021895..fe6ddd3aeb1d4 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3186,7 +3186,7 @@ def test_all(self): class TestTimedelta64Formatter: def test_days(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values result = fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'1 days'" @@ -3202,48 +3202,48 @@ def test_days(self): assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values result = fmt._Timedelta64Formatter(-x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'-1 days'" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") + y = 
pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values result = fmt._Timedelta64Formatter(y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'0 days 00:00:01'" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values result = fmt._Timedelta64Formatter(-y, box=True).get_result() assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [NaT], unit="D") + x = pd.to_timedelta(list(range(1)) + [NaT], unit="D")._values result = fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" - x = pd.to_timedelta(list(range(1)), unit="D") + x = pd.to_timedelta(list(range(1)), unit="D")._values result = fmt._Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" class Test_Datetime64Formatter: def test_mixed(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT])._values result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT]) + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT])._values result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): - x = Series([Timestamp(200)]) + x = Series([Timestamp(200)])._values result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "1970-01-01 00:00:00.000000200" @@ -3252,41 +3252,41 @@ def test_dates_display(self): # make sure that we are consistently display date formatting x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan - result = fmt._Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = np.nan - result = fmt._Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) x.iloc[1] = np.nan - result = fmt._Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan - result = fmt._Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) x.iloc[1] = np.nan - result = fmt._Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() 
== "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" def test_datetime64formatter_yearmonth(self): - x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)])._values def format_func(x): return x.strftime("%Y-%m") @@ -3298,7 +3298,7 @@ def format_func(x): def test_datetime64formatter_hoursecond(self): x = Series( pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") - ) + )._values def format_func(x): return x.strftime("%H:%M") From 940a1b4fafdeca220dc24f15555f6c334dd3d864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 5 Oct 2023 12:15:16 -0400 Subject: [PATCH 085/131] TYP: read_csv and read_excel fixes (#55410) --- pandas/io/excel/_base.py | 11 ++++++----- pandas/io/parsers/readers.py | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 073115cab8695..6def0024d9073 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -80,6 +80,7 @@ IntStrT, ReadBuffer, Self, + SequenceNotStr, StorageOptions, WriteExcelBuffer, ) @@ -387,7 +388,7 @@ def read_excel( sheet_name: str | int = ..., *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | Sequence[int] | None = ..., usecols: int | str @@ -426,7 +427,7 @@ def read_excel( sheet_name: list[IntStrT] | None, *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | Sequence[int] | None = ..., usecols: int | str @@ -465,7 +466,7 @@ def read_excel( sheet_name: str | int | list[IntStrT] | None = 0, *, header: int | Sequence[int] | None = 0, - names: list[str] | None = None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols: int | str @@ -730,7 +731,7 @@ def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, dtype: DtypeArg | None = None, @@ -1589,7 +1590,7 @@ def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, converters=None, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 32f8a1fe81a9f..df5b08029a08b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -666,7 +666,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -726,7 +726,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -786,7 +786,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | 
None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -846,7 +846,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -920,7 +920,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -1009,7 +1009,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -1066,7 +1066,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1123,7 +1123,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -1180,7 +1180,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1253,7 +1253,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration From a76b3f4c73a351a11d2cae6236773192e630d4f5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 5 Oct 2023 12:18:56 -0400 Subject: [PATCH 086/131] CLN: Replace _selected_obj with _obj_with_exclusions in SeriesGroupBy (#55392) --- pandas/core/groupby/generic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dbabd04a87c36..904ab9bdfc6dd 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -158,7 +158,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: def _get_data_to_aggregate( self, *, numeric_only: bool = False, name: str | None = None ) -> SingleManager: - ser = self._selected_obj + ser = self._obj_with_exclusions single = ser._mgr if numeric_only and not is_numeric_dtype(ser.dtype): # GH#41291 match Series behavior @@ -448,7 +448,7 @@ def _aggregate_named(self, func, *args, **kwargs): initialized = False for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, 
axis=self.axis ): # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations object.__setattr__(group, "name", name) @@ -519,7 +519,7 @@ def _cython_transform( ): assert axis == 0 # handled by caller - obj = self._selected_obj + obj = self._obj_with_exclusions try: result = self.grouper._cython_operation( @@ -546,7 +546,7 @@ def _transform_general( results = [] for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, axis=self.axis ): # this setattr is needed for test_transform_lambda_with_datetimetz object.__setattr__(group, "name", name) @@ -618,7 +618,7 @@ def true_and_notna(x) -> bool: indices = [ self._get_index(name) for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, axis=self.axis ) if true_and_notna(group) ] @@ -1164,7 +1164,7 @@ def nlargest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nlargest, n=n, keep=keep) - data = self._selected_obj + data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. result = self._python_apply_general(f, data, not_indexed_same=True) @@ -1175,7 +1175,7 @@ def nsmallest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nsmallest, n=n, keep=keep) - data = self._selected_obj + data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. result = self._python_apply_general(f, data, not_indexed_same=True) From 40f219ccf74182843cb87e6d00f1af4736105987 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 5 Oct 2023 18:49:46 -0400 Subject: [PATCH 087/131] CI: Pin Cython (#55414) --- .github/workflows/unit-tests.yml | 6 +++--- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 139cfcde95b2c..96010a4a0227d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . --no-build-isolation --no-index python -m pip list diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 2522605cf5c38..ebd1556b8a5f5 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index dc67122c6e72e..4d0406814c873 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 7fd3a65ec91f8..9795a1fb39c6f 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer[toml] - meson[ninja]=1.2.1 - meson-python=0.13.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 893341350f4ef..c259286a5359c 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.2.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index c9faaa2146235..f6df5a6e894a7 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b7cc6e9e891ce..586768765325e 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 
f51a8e04fbc7e..3751651a2a2f2 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 4923c94ab08f3..db0723cd3b8fa 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index d1a9e336eaeac..65f72278a0291 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 From 7ec95e4be4b814b92b671a91d544feee2a106d87 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 6 Oct 2023 11:40:25 +0200 Subject: [PATCH 088/131] DEPR: 'A' for yearly frequency and YearEnd in favour of 'Y' (#55252) --- asv_bench/benchmarks/tslibs/period.py | 2 +- doc/source/user_guide/io.rst | 2 +- doc/source/user_guide/timeseries.rst | 48 ++++----- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/tslibs/dtypes.pxd | 2 + pandas/_libs/tslibs/dtypes.pyi | 1 + pandas/_libs/tslibs/dtypes.pyx | 68 +++++++----- pandas/_libs/tslibs/offsets.pyx | 33 +++--- pandas/_libs/tslibs/timestamps.pyx | 2 +- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/period.py | 18 ++-- pandas/core/dtypes/common.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 8 +- pandas/core/resample.py | 4 +- pandas/core/series.py | 8 +- pandas/tests/arithmetic/test_period.py | 28 ++--- .../tests/arrays/categorical/test_astype.py | 2 +- .../tests/arrays/period/test_arrow_compat.py | 2 +- .../tests/arrays/period/test_constructors.py | 4 +- pandas/tests/arrays/test_array.py | 2 +- pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/arrays/test_period.py | 4 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/frame/methods/test_asfreq.py | 4 +- pandas/tests/frame/methods/test_join.py | 2 +- pandas/tests/frame/methods/test_reindex.py | 2 +- pandas/tests/frame/methods/test_set_index.py | 4 +- .../tests/frame/methods/test_to_timestamp.py | 12 +-- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/frame/test_reductions.py | 4 +- pandas/tests/frame/test_repr_info.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 4 +- .../indexes/datetimelike_/test_sort_values.py | 8 +- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/datetimes/test_constructors.py | 8 +- .../indexes/datetimes/test_date_range.py | 41 +++---- pandas/tests/indexes/datetimes/test_ops.py | 4 +- .../indexes/period/methods/test_asfreq.py | 14 +-- .../indexes/period/methods/test_astype.py | 2 +- .../indexes/period/methods/test_is_full.py | 10 +- .../indexes/period/methods/test_shift.py | 10 +- .../period/methods/test_to_timestamp.py | 8 +- .../tests/indexes/period/test_constructors.py | 10 +- pandas/tests/indexes/period/test_formats.py | 12 +-- pandas/tests/indexes/period/test_indexing.py | 6 +- .../indexes/period/test_partial_slicing.py | 4 +- pandas/tests/indexes/period/test_period.py | 37 ++++--- .../tests/indexes/period/test_period_range.py | 2 +- 
pandas/tests/indexes/period/test_pickle.py | 2 +- .../tests/indexes/period/test_resolution.py | 2 +- pandas/tests/indexes/period/test_setops.py | 12 +-- pandas/tests/indexes/period/test_tools.py | 4 +- pandas/tests/indexes/test_base.py | 2 +- .../timedeltas/test_timedelta_range.py | 40 +++++++ pandas/tests/indexing/test_loc.py | 14 +-- pandas/tests/internals/test_internals.py | 4 +- .../tests/io/json/test_json_table_schema.py | 6 +- pandas/tests/plotting/test_datetimelike.py | 34 +++--- pandas/tests/resample/test_base.py | 2 +- pandas/tests/resample/test_datetime_index.py | 12 +-- pandas/tests/resample/test_period_index.py | 52 ++++----- pandas/tests/resample/test_time_grouper.py | 8 +- pandas/tests/reshape/concat/test_concat.py | 4 +- pandas/tests/reshape/test_pivot.py | 10 +- pandas/tests/scalar/period/test_asfreq.py | 102 +++++++++--------- pandas/tests/scalar/period/test_period.py | 54 +++++----- pandas/tests/series/methods/test_align.py | 2 +- pandas/tests/series/test_arithmetic.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/series/test_repr.py | 2 +- .../tseries/frequencies/test_freq_code.py | 8 +- .../tseries/frequencies/test_inference.py | 30 +++--- pandas/tests/tseries/offsets/test_offsets.py | 4 +- pandas/tests/tslibs/test_conversion.py | 2 +- pandas/tests/tslibs/test_libfrequencies.py | 2 - pandas/tests/tslibs/test_parsing.py | 6 +- pandas/tests/tslibs/test_period_asfreq.py | 2 +- pandas/tseries/frequencies.py | 6 +- 81 files changed, 490 insertions(+), 413 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index a92fbbe8d4dbe..67f3b7736018d 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -72,7 +72,7 @@ def time_now(self, freq): self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq("A") + self.per.asfreq("Y") def time_str(self, freq): str(self.per) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6bd181740c78d..c2fe277c4f4e5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2332,7 +2332,7 @@ A few notes on the generated table schema: .. ipython:: python - s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="Y-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 113efeefb48ce..3ec303323b887 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -895,7 +895,7 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQ``, "business quarter end" :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" - :class:`~pandas.tseries.offsets.YearEnd`, ``'A'``, "calendar year end" + :class:`~pandas.tseries.offsets.YearEnd`, ``'Y'``, "calendar year end" :class:`~pandas.tseries.offsets.YearBegin`, ``'AS'`` or ``'BYS'``,"calendar year begin" :class:`~pandas.tseries.offsets.BYearEnd`, ``'BA'``, "business year end" :class:`~pandas.tseries.offsets.BYearBegin`, ``'BAS'``, "business year begin" @@ -1258,7 +1258,7 @@ frequencies. We will refer to these aliases as *offset aliases*. 
"BQ", "business quarter end frequency" "QS", "quarter start frequency" "BQS", "business quarter start frequency" - "A, Y", "year end frequency" + "Y", "year end frequency" "BA, BY", "business year end frequency" "AS, YS", "year start frequency" "BAS, BYS", "business year start frequency" @@ -1321,7 +1321,7 @@ frequencies. We will refer to these aliases as *period aliases*. "W", "weekly frequency" "M", "monthly frequency" "Q", "quarterly frequency" - "A, Y", "yearly frequency" + "Y", "yearly frequency" "H", "hourly frequency" "min", "minutely frequency" "s", "secondly frequency" @@ -1331,8 +1331,8 @@ frequencies. We will refer to these aliases as *period aliases*. .. deprecated:: 2.2.0 - Aliases ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases - ``min``, ``s``, ``ms``, ``us``, and ``ns``. + Aliases ``A``, ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases + ``Y``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. Combining aliases @@ -1383,18 +1383,18 @@ For some frequencies you can specify an anchoring suffix: "(B)Q(S)\-SEP", "quarterly frequency, year ends in September" "(B)Q(S)\-OCT", "quarterly frequency, year ends in October" "(B)Q(S)\-NOV", "quarterly frequency, year ends in November" - "(B)A(S)\-DEC", "annual frequency, anchored end of December. Same as 'A'" - "(B)A(S)\-JAN", "annual frequency, anchored end of January" - "(B)A(S)\-FEB", "annual frequency, anchored end of February" - "(B)A(S)\-MAR", "annual frequency, anchored end of March" - "(B)A(S)\-APR", "annual frequency, anchored end of April" - "(B)A(S)\-MAY", "annual frequency, anchored end of May" - "(B)A(S)\-JUN", "annual frequency, anchored end of June" - "(B)A(S)\-JUL", "annual frequency, anchored end of July" - "(B)A(S)\-AUG", "annual frequency, anchored end of August" - "(B)A(S)\-SEP", "annual frequency, anchored end of September" - "(B)A(S)\-OCT", "annual frequency, anchored end of October" - "(B)A(S)\-NOV", "annual frequency, anchored end of November" + "(B)Y(S)\-DEC", "annual frequency, anchored end of December. Same as 'Y'" + "(B)Y(S)\-JAN", "annual frequency, anchored end of January" + "(B)Y(S)\-FEB", "annual frequency, anchored end of February" + "(B)Y(S)\-MAR", "annual frequency, anchored end of March" + "(B)Y(S)\-APR", "annual frequency, anchored end of April" + "(B)Y(S)\-MAY", "annual frequency, anchored end of May" + "(B)Y(S)\-JUN", "annual frequency, anchored end of June" + "(B)Y(S)\-JUL", "annual frequency, anchored end of July" + "(B)Y(S)\-AUG", "annual frequency, anchored end of August" + "(B)Y(S)\-SEP", "annual frequency, anchored end of September" + "(B)Y(S)\-OCT", "annual frequency, anchored end of October" + "(B)Y(S)\-NOV", "annual frequency, anchored end of November" These can be used as arguments to ``date_range``, ``bdate_range``, constructors for ``DatetimeIndex``, as well as various other timeseries-related functions @@ -1690,7 +1690,7 @@ the end of the interval. .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. This might unintendedly lead to looking ahead, where the value for a later @@ -1995,7 +1995,7 @@ Because ``freq`` represents a span of ``Period``, it cannot be negative like "-3 .. ipython:: python - pd.Period("2012", freq="A-DEC") + pd.Period("2012", freq="Y-DEC") pd.Period("2012-1-1", freq="D") @@ -2008,7 +2008,7 @@ frequency. 
Arithmetic is not allowed between ``Period`` with different ``freq`` .. ipython:: python - p = pd.Period("2012", freq="A-DEC") + p = pd.Period("2012", freq="Y-DEC") p + 1 p - 3 p = pd.Period("2012-01", freq="2M") @@ -2050,7 +2050,7 @@ return the number of frequency units between them: .. ipython:: python - pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC") + pd.Period("2012", freq="Y-DEC") - pd.Period("2002", freq="Y-DEC") PeriodIndex and period_range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2184,7 +2184,7 @@ method. Let's start with the fiscal year 2011, ending in December: .. ipython:: python - p = pd.Period("2011", freq="A-DEC") + p = pd.Period("2011", freq="Y-DEC") p We can convert it to a monthly frequency. Using the ``how`` parameter, we can @@ -2211,10 +2211,10 @@ input period: p = pd.Period("2011-12", freq="M") - p.asfreq("A-NOV") + p.asfreq("Y-NOV") Note that since we converted to an annual frequency that ends the year in -November, the monthly period of December 2011 is actually in the 2012 A-NOV +November, the monthly period of December 2011 is actually in the 2012 Y-NOV period. .. _timeseries.quarterly: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7743f762d8898..dc9c3f661dab1 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -251,6 +251,7 @@ Other Deprecations - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) +- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. 
denoting annual frequencies with various fiscal year ends (:issue:`52536`) - Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index e050ac5a6c7b7..bda4fcf04234b 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -13,6 +13,8 @@ cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cpdef freq_to_period_freqstr(freq_n, freq_name) cdef dict c_OFFSET_TO_PERIOD_FREQSTR +cdef dict c_OFFSET_DEPR_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR cdef dict c_DEPR_ABBREVS cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 72a8fa8ff0b38..d8680ed2d27b4 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -7,6 +7,7 @@ from pandas._libs.tslibs.timedeltas import UnitChoices _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] OFFSET_TO_PERIOD_FREQSTR: dict[str, str] +OFFSET_DEPR_FREQSTR: dict[str, str] DEPR_ABBREVS: dict[str, UnitChoices] def periods_per_day(reso: int) -> int: ... diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index cca379c620aeb..ebb6e3a240cbe 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -101,19 +101,19 @@ cdef class PeriodDtypeBase: _period_code_map = { # Annual freqs with various fiscal year ends. - # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 - "A-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end - "A-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end - "A-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end - "A-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end - "A-APR": PeriodDtypeCode.A_APR, # Annual - April year end - "A-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end - "A-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end - "A-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end - "A-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end - "A-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end - "A-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end - "A-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end + # eg, 2005 for Y-FEB runs Mar 1, 2004 to Feb 28, 2005 + "Y-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end + "Y-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end + "Y-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end + "Y-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end + "Y-APR": PeriodDtypeCode.A_APR, # Annual - April year end + "Y-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end + "Y-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end + "Y-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end + "Y-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end + "Y-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end + "Y-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end + "Y-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end # Quarterly frequencies with various fiscal year ends. 
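As an illustrative sketch (not part of the patch), the annual code map above can be exercised directly; this assumes a pandas version that accepts the ``Y``-prefixed spellings introduced here (2.2 and later; earlier versions spell them ``A-FEB``, ``A-DEC``, and so on):

    import pandas as pd

    # Under Y-FEB, the period labelled "2005" runs Mar 1, 2004 through
    # Feb 28, 2005, matching the comment in the code map above.
    p = pd.Period("2005", freq="Y-FEB")
    print(p.start_time)   # 2004-03-01 00:00:00
    print(p.end_time)     # 2005-02-28 23:59:59.999999999

    # The bare yearly alias anchors to a December year end.
    print(pd.Period("2012", freq="Y"))   # Period('2012', 'Y-DEC')
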
# eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 @@ -156,22 +156,22 @@ _reverse_period_code_map = { # Yearly aliases; careful not to put these in _reverse_period_code_map _period_code_map.update({"Y" + key[1:]: _period_code_map[key] for key in _period_code_map - if key.startswith("A-")}) + if key.startswith("Y-")}) _period_code_map.update({ "Q": 2000, # Quarterly - December year end (default quarterly) - "A": PeriodDtypeCode.A, # Annual + "Y": PeriodDtypeCode.A, # Annual "W": 4000, # Weekly "C": 5000, # Custom Business Day }) cdef set _month_names = { - x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-") + x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("Y-") } # Map attribute-name resolutions to resolution abbreviations _attrname_to_abbrevs = { - "year": "A", + "year": "Y", "quarter": "Q", "month": "M", "day": "D", @@ -192,9 +192,9 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "BQS": "Q", "QS": "Q", "BQ": "Q", - "BA": "A", - "AS": "A", - "BAS": "A", + "BA": "Y", + "AS": "Y", + "BAS": "Y", "MS": "M", "D": "D", "B": "B", @@ -205,15 +205,19 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "ns": "ns", "H": "H", "Q": "Q", - "A": "A", + "Y": "Y", "W": "W", "ME": "M", - "Y": "A", - "BY": "A", - "YS": "A", - "BYS": "A", + "BY": "Y", + "YS": "Y", + "BYS": "Y", +} +OFFSET_DEPR_FREQSTR: dict[str, str]= { + "M": "ME", } cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_OFFSET_DEPR_FREQSTR = OFFSET_DEPR_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = {v: k for k, v in OFFSET_DEPR_FREQSTR.items()} cpdef freq_to_period_freqstr(freq_n, freq_name): if freq_n == 1: @@ -226,6 +230,20 @@ cpdef freq_to_period_freqstr(freq_n, freq_name): # Map deprecated resolution abbreviations to correct resolution abbreviations DEPR_ABBREVS: dict[str, str]= { + "A": "Y", + "a": "Y", + "A-DEC": "Y-DEC", + "A-JAN": "Y-JAN", + "A-FEB": "Y-FEB", + "A-MAR": "Y-MAR", + "A-APR": "Y-APR", + "A-MAY": "Y-MAY", + "A-JUN": "Y-JUN", + "A-JUL": "Y-JUL", + "A-AUG": "Y-AUG", + "A-SEP": "Y-SEP", + "A-OCT": "Y-OCT", + "A-NOV": "Y-NOV", "T": "min", "t": "min", "S": "s", diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 8fdba8992f627..534d8e9926b38 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -15,6 +15,7 @@ from cpython.datetime cimport ( time as dt_time, timedelta, ) + import warnings import_datetime() @@ -48,7 +49,6 @@ from pandas._libs.tslibs.ccalendar import ( ) from pandas.util._exceptions import find_stack_level - from pandas._libs.tslibs.ccalendar cimport ( dayofweek, get_days_in_month, @@ -58,6 +58,8 @@ from pandas._libs.tslibs.ccalendar cimport ( from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( c_DEPR_ABBREVS, + c_OFFSET_DEPR_FREQSTR, + c_REVERSE_OFFSET_DEPR_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -2496,7 +2498,7 @@ cdef class YearEnd(YearOffset): """ _default_month = 12 - _prefix = "A" + _prefix = "Y" _day_opt = "end" cdef readonly: @@ -4539,7 +4541,7 @@ prefix_mapping = { offset._prefix: offset for offset in [ YearBegin, # 'AS' - YearEnd, # 'A' + YearEnd, # 'Y' BYearBegin, # 'BAS' BYearEnd, # 'BA' BusinessDay, # 'B' @@ -4581,8 +4583,7 @@ _lite_rule_alias = { "W": "W-SUN", "Q": "Q-DEC", - "A": "A-DEC", # YearEnd(month=12), - "Y": "A-DEC", + "Y": "Y-DEC", # YearEnd(month=12), "AS": "AS-JAN", # YearBegin(month=1), "YS": "AS-JAN", "BA": "BA-DEC", # BYearEnd(month=12), @@ -4707,21 +4708,22 @@ cpdef to_offset(freq, bint 
is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name == "M": + if is_period is False and name in c_OFFSET_DEPR_FREQSTR: warnings.warn( - "\'M\' will be deprecated, please use \'ME\' " - "for \'month end\'", + f"\'{name}\' will be deprecated, please use " + f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", UserWarning, stacklevel=find_stack_level(), ) - name = "ME" - if is_period is True and name == "ME": + name = c_OFFSET_DEPR_FREQSTR[name] + if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: raise ValueError( - r"for Period, please use \'M\' " - "instead of \'ME\'" + f"for Period, please use " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"instead of \'{name}\'" ) - elif is_period is True and name == "M": - name = "ME" + elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: + name = c_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") @@ -4740,6 +4742,7 @@ cpdef to_offset(freq, bint is_period=False): stacklevel=find_stack_level(), ) prefix = c_DEPR_ABBREVS[prefix] + if prefix in {"D", "H", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3H" or # "2.5T", so we can construct a Timedelta with the @@ -4753,7 +4756,7 @@ cpdef to_offset(freq, bint is_period=False): offset *= stride_sign else: stride = int(stride) - offset = _get_offset(name) + offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1b4332c2d26cf..8bf1ebb9bf608 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1249,7 +1249,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> # Year end frequency >>> ts.to_period(freq='Y') - Period('2020', 'A-DEC') + Period('2020', 'Y-DEC') >>> # Month end frequency >>> ts.to_period(freq='M') diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 12fe9b30f3f52..e844c0542df69 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2525,7 +2525,7 @@ def _round_temporally( if offset is None: raise ValueError(f"Must specify a valid frequency: {freq}") pa_supported_unit = { - "A": "year", + "Y": "year", "AS": "year", "Q": "quarter", "QS": "quarter", diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 9544a6163562f..e0d4098dd1f33 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2042,7 +2042,7 @@ def isocalendar(self) -> DataFrame: >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y") >>> idx DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='A-DEC') + dtype='datetime64[ns]', freq='Y-DEC') >>> idx.is_leap_year array([ True, False, False]) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4532e5bffe7a9..2468535397568 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -548,7 +548,7 @@ def __arrow_array__(self, type=None): >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") >>> idx - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') >>> idx.dayofyear Index([365, 366, 365], dtype='int64') """, @@ -712,10 +712,10 @@ def asfreq(self, freq=None, how: str = 
"E") -> Self: Examples -------- - >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y') >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], - dtype='period[A-DEC]') + dtype='period[Y-DEC]') >>> pidx.asfreq('M') PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', @@ -1025,18 +1025,18 @@ def period_array( Examples -------- - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A')]) + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y')]) ['2017', '2018'] - Length: 2, dtype: period[A-DEC] + Length: 2, dtype: period[Y-DEC] - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A'), + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y'), ... pd.NaT]) ['2017', '2018', 'NaT'] - Length: 3, dtype: period[A-DEC] + Length: 3, dtype: period[Y-DEC] Integers that look like years are handled diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 42e909a6b9856..3d12e334e7c0f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -402,7 +402,7 @@ def is_period_dtype(arr_or_dtype) -> bool: False >>> is_period_dtype(pd.Period("2017-01-01")) False - >>> is_period_dtype(pd.PeriodIndex([], freq="A")) + >>> is_period_dtype(pd.PeriodIndex([], freq="Y")) True """ warnings.warn( diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b1ca7557c11ca..28641031f7b21 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12144,7 +12144,7 @@ def to_period( For the yearly frequency >>> idx.to_period("Y") - PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') + PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') """ new_obj = self.copy(deep=copy and not using_copy_on_write()) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f2850559fd77..e131d689b6a40 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9187,11 +9187,11 @@ def resample( Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'ME', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' - for all frequency offsets except for 'ME', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or @@ -9362,12 +9362,12 @@ def resample( assigned to the first quarter of the period. >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', - ... freq='A', + ... freq='Y', ... 
periods=2)) >>> s 2012 1 2013 2 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 >>> s.resample('Q', convention='start').asfreq() 2012Q1 1.0 2012Q2 NaN diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7d37a9f1d5113..b669028ac60bd 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2101,7 +2101,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "A", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "Y", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2301,7 +2301,7 @@ def _adjust_bin_edges( "BQ", "BA", "Q", - "A", + "Y", "W", ): # If the right end-point is on the last day of the month, roll forwards diff --git a/pandas/core/series.py b/pandas/core/series.py index 8ddce3d5cab26..a5e692431e890 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5672,7 +5672,7 @@ def to_timestamp( 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 The resulting frequency of the Timestamps is `YearBegin` @@ -5691,7 +5691,7 @@ def to_timestamp( 2023-01-31 1 2024-01-31 2 2025-01-31 3 - Freq: A-JAN, dtype: int64 + Freq: Y-JAN, dtype: int64 """ if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") @@ -5726,12 +5726,12 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 Viewing the index >>> s.index - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') """ if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ee8391830db4c..bee1e1a385672 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -286,14 +286,14 @@ def test_parr_cmp_pi_mismatched_freq(self, freq, box_with_array): msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period" with pytest.raises(TypeError, match=msg): - base <= Period("2011", freq="A") + base <= Period("2011", freq="Y") with pytest.raises(TypeError, match=msg): - Period("2011", freq="A") >= base + Period("2011", freq="Y") >= base # TODO: Could parametrize over boxes for idx? 
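As an illustrative sketch (not part of the patch) of the behavior these tests pin down: comparing period-dtype data against a ``Period`` of a different frequency raises rather than silently coercing. This assumes pandas 2.x semantics:

    import pandas as pd

    base = pd.period_range("2011-01", periods=4, freq="M")
    try:
        base <= pd.Period("2011", freq="Y")   # monthly vs yearly
    except TypeError as err:
        print(err)   # Invalid comparison between dtype=period[M] and Period
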
- idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = r"Invalid comparison between dtype=period\[A-DEC\] and PeriodArray" + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="Y") + rev_msg = r"Invalid comparison between dtype=period\[Y-DEC\] and PeriodArray" idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg with pytest.raises(TypeError, match=idx_msg): base <= idx @@ -405,18 +405,18 @@ def test_cmp_series_period_series_mixed_freq(self): # GH#13200 base = Series( [ - Period("2011", freq="A"), + Period("2011", freq="Y"), Period("2011-02", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-04", freq="M"), ] ) ser = Series( [ - Period("2012", freq="A"), + Period("2012", freq="Y"), Period("2011-01", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-05", freq="M"), ] ) @@ -934,9 +934,9 @@ def test_pi_add_sub_int_array_freqn_gt1(self): def test_pi_sub_isub_offset(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng - pd.offsets.YearEnd(5) - expected = period_range("2009", "2019", freq="A") + expected = period_range("2009", "2019", freq="Y") tm.assert_index_equal(result, expected) rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) @@ -1176,17 +1176,17 @@ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): def test_add_iadd_timedeltalike_annual(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng + pd.offsets.YearEnd(5) - expected = period_range("2019", "2029", freq="A") + expected = period_range("2019", "2029", freq="Y") tm.assert_index_equal(result, expected) rng += pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq - rng = period_range("2014", "2024", freq="A") - msg = "Input has different freq(=.+)? from Period.*?\\(freq=A-DEC\\)" + rng = period_range("2014", "2024", freq="Y") + msg = "Input has different freq(=.+)? 
from Period.*?\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index d2f9f6dffab49..7fba150c9113f 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -32,7 +32,7 @@ def test_astype_nan_to_int(self, cls, values): [ array(["2019", "2020"], dtype="datetime64[ns, UTC]"), array([0, 0], dtype="timedelta64[ns]"), - array([Period("2019"), Period("2020")], dtype="period[A-DEC]"), + array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"), array([Interval(0, 1), Interval(1, 2)], dtype="interval"), array([1, np.nan], dtype="Int64"), ], diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 903fc3177aa84..6c04d7c603d4c 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -33,7 +33,7 @@ def test_arrow_extension_type(): "data, freq", [ (pd.date_range("2017", periods=3), "D"), - (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + (pd.date_range("2017", periods=3, freq="Y"), "Y-DEC"), ], ) def test_arrow_array(data, freq): diff --git a/pandas/tests/arrays/period/test_constructors.py b/pandas/tests/arrays/period/test_constructors.py index 0ea26a6ece7eb..d034162f1b46e 100644 --- a/pandas/tests/arrays/period/test_constructors.py +++ b/pandas/tests/arrays/period/test_constructors.py @@ -71,11 +71,11 @@ def test_from_datetime64_freq_2M(freq): "data, freq, msg", [ ( - [pd.Period("2017", "D"), pd.Period("2017", "A")], + [pd.Period("2017", "D"), pd.Period("2017", "Y")], None, "Input has different freq", ), - ([pd.Period("2017", "D")], "A", "Input has different freq"), + ([pd.Period("2017", "D")], "Y", "Input has different freq"), ], ) def test_period_array_raises(data, freq, msg): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 2746cd91963a0..0aeedf4d03919 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -350,7 +350,7 @@ def test_array_inference(data, expected): "data", [ # mix of frequencies - [pd.Period("2000", "D"), pd.Period("2001", "A")], + [pd.Period("2000", "D"), pd.Period("2001", "Y")], # mix of closed [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")], # Mix of timezones diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a105852395b3a..fc46e5a372806 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -747,7 +747,7 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() def test_date_range_frequency_M_deprecated(self): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." 
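For context, a hedged standalone sketch of the "M" -> "ME" rename this hunk exercises; it assumes a pandas development build where the rename has landed and reuses the warning class and message from the test itself.

import pandas as pd
import pandas._testing as tm

msg = "'M' will be deprecated, please use 'ME' instead."
# The deprecated spelling still works but warns; both spellings agree.
with tm.assert_produces_warning(UserWarning, match=msg):
    result = pd.date_range("1/1/2000", periods=4, freq="2M")
assert result.equals(pd.date_range("1/1/2000", periods=4, freq="2ME"))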
expected = pd.date_range("1/1/2000", periods=4, freq="2ME") with tm.assert_produces_warning(UserWarning, match=depr_msg): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index d1e954bc2ebe2..43a80a92573c5 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -82,9 +82,9 @@ def test_setitem(key, value, expected): def test_setitem_raises_incompatible_freq(): arr = PeriodArray(np.arange(3), dtype="period[D]") with pytest.raises(IncompatibleFrequency, match="freq"): - arr[0] = pd.Period("2000", freq="A") + arr[0] = pd.Period("2000", freq="Y") - other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[A]") + other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[Y]") with pytest.raises(IncompatibleFrequency, match="freq"): arr[[0, 1]] = other diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index db13e979a3c2d..20ee2d443f340 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -192,9 +192,9 @@ def test_iter_box(self): "datetime64[ns, US/Central]", ), ( - pd.PeriodIndex([2018, 2019], freq="A"), + pd.PeriodIndex([2018, 2019], freq="Y"), PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"), ), (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"), ( diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 6f6cc5a5ad5d8..8e99c074a2c55 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -274,7 +274,7 @@ def test_is_period_dtype(): assert not com.is_period_dtype(pd.Period("2017-01-01")) assert com.is_period_dtype(PeriodDtype(freq="D")) - assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="Y")) def test_is_interval_dtype(): diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index a2f8f3e278395..0527bfb16492c 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -104,7 +104,7 @@ def test_asfreq_keep_index_name(self, frame_or_series): assert index_name == obj.asfreq("10D").index.name def test_asfreq_ts(self, frame_or_series): - index = period_range(freq="A", start="1/1/2001", end="12/31/2010") + index = period_range(freq="Y", start="1/1/2001", end="12/31/2010") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 3)), index=index ) @@ -235,7 +235,7 @@ def test_asfreq_2ME(self, freq, freq_half): tm.assert_frame_equal(result, expected) def test_asfreq_frequency_M_deprecated(self): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." 
index = date_range("1/1/2000", periods=4, freq="ME") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 3d21faf8b1729..735f6c50ab739 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -22,7 +22,7 @@ def frame_with_period_index(): return DataFrame( data=np.arange(20).reshape(4, 5), columns=list("abcde"), - index=period_range(start="2000", freq="A", periods=4), + index=period_range(start="2000", freq="Y", periods=4), ) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0105c41bd0eca..bba86b481eadc 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -36,7 +36,7 @@ def test_dti_set_index_reindex_datetimeindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") - idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") + idx2 = date_range("2013", periods=6, freq="Y", tz="Asia/Tokyo") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5984e591dd6c1..f755ef0c2763d 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -493,7 +493,7 @@ def test_set_index_period(self): idx1 = idx1.append(idx1) idx2 = period_range("2013-01-01 09:00", periods=2, freq="H") idx2 = idx2.append(idx2).append(idx2) - idx3 = period_range("2005", periods=6, freq="A") + idx3 = period_range("2005", periods=6, freq="Y") df = df.set_index(idx1) df = df.set_index(idx2, append=True) @@ -694,7 +694,7 @@ def test_set_index_periodindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011/01/01", periods=6, freq="M") - idx2 = period_range("2013", periods=6, freq="A") + idx2 = period_range("2013", periods=6, freq="Y") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index e72b576fca833..478708ce90488 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -16,7 +16,7 @@ import pandas._testing as tm -def _get_with_delta(delta, freq="A-DEC"): +def _get_with_delta(delta, freq="Y-DEC"): return date_range( to_datetime("1/1/2001") + delta, to_datetime("12/31/2009") + delta, @@ -27,7 +27,7 @@ def _get_with_delta(delta, freq="A-DEC"): class TestToTimestamp: def test_to_timestamp(self, frame_or_series): K = 5 - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -36,7 +36,7 @@ def test_to_timestamp(self, frame_or_series): obj["mix"] = "a" obj = tm.get_obj(obj, frame_or_series) - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="Y-DEC") exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") result = obj.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) @@ -71,7 +71,7 @@ def test_to_timestamp(self, frame_or_series): def test_to_timestamp_columns(self): K = 5 - index = period_range(freq="A", 
start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") df = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -82,7 +82,7 @@ def test_to_timestamp_columns(self): # columns df = df.T - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="Y-DEC") exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") result = df.to_timestamp("D", "end", axis=1) tm.assert_index_equal(result.columns, exp_index) @@ -122,7 +122,7 @@ def test_to_timestamp_columns(self): assert result2.columns.freqstr == "AS-JAN" def test_to_timestamp_invalid_axis(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 5)), index=index ) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bb9a76829c77d..2c3b732fe7196 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1236,7 +1236,7 @@ def test_frame_add_tz_mismatch_converts_to_utc(self): assert result.index.tz is timezone.utc def test_align_frame(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = DataFrame( np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 66f813c80ed16..0d5c2e3cd6c13 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -912,7 +912,7 @@ def test_mean_datetimelike(self): "A": np.arange(3), "B": date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), - "D": pd.period_range("2016", periods=3, freq="A"), + "D": pd.period_range("2016", periods=3, freq="Y"), } ) result = df.mean(numeric_only=True) @@ -937,7 +937,7 @@ def test_mean_datetimelike_numeric_only_false(self): tm.assert_series_equal(result, expected) # mean of period is not allowed - df["D"] = pd.period_range("2016", periods=3, freq="A") + df["D"] = pd.period_range("2016", periods=3, freq="Y") with pytest.raises(TypeError, match="mean is not implemented for Period"): df.mean(numeric_only=False) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 64d516e484991..55c239f7284c1 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -339,7 +339,7 @@ def test_repr_np_nat_with_object(self, arg, box, expected): assert result == expected def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="Y-DEC")}) # it works! 
repr(df) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a9e67df1fb793..a3dc9e3087c7b 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -193,7 +193,7 @@ def test_timegrouper_with_reg_groups(self): ).set_index(["Date", "Buyer"]) msg = "The default value of numeric_only" - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="Y"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -336,7 +336,7 @@ def test_timegrouper_with_reg_groups(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["D", "ME", "A", "Q-APR"]) + @pytest.mark.parametrize("freq", ["D", "ME", "Y", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( diff --git a/pandas/tests/indexes/datetimelike_/test_sort_values.py b/pandas/tests/indexes/datetimelike_/test_sort_values.py index ab1c15f003d4d..cf919bfa29d10 100644 --- a/pandas/tests/indexes/datetimelike_/test_sort_values.py +++ b/pandas/tests/indexes/datetimelike_/test_sort_values.py @@ -127,7 +127,7 @@ def test_sort_values_with_freq_periodindex(self, freq): @pytest.mark.parametrize( "idx", [ - PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A"), + PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="Y"), Index([2011, 2012, 2013], name="idx"), # for compatibility check ], ) @@ -275,10 +275,10 @@ def test_sort_values_without_freq_datetimeindex( ), ( PeriodIndex( - ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y" ), PeriodIndex( - ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="Y" ), ), ( @@ -308,7 +308,7 @@ def test_sort_values_without_freq_periodindex_nat(self): def test_order_stability_compat(): # GH#35922. 
sort_values is stable both for normal and datetime-like Index - pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 5f266ea0b42a6..7712a4166329c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -60,7 +60,7 @@ def test_to_period_quarterlyish(self, off): def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "A-DEC" + assert prng.freq == "Y-DEC" def test_to_period_monthish(self): offsets = ["MS", "BM"] diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 61c8cc4a50fe2..7dee58e63fa88 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -753,7 +753,7 @@ def test_constructor_invalid_dtype_raises(self, dtype): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): - idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") + idx = date_range(start="2000-01-01", periods=1, freq="Y", name="TEST") assert idx.name == "TEST" def test_000constructor_resolution(self): @@ -978,8 +978,8 @@ def test_dti_constructor_years_only(self, tz_naive_fixture): rng2 = date_range("2014", "2015", freq="MS", tz=tz) expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) - rng3 = date_range("2014", "2020", freq="A", tz=tz) - expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) + rng3 = date_range("2014", "2020", freq="Y", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="Y", tz=tz) rng4 = date_range("2014", "2020", freq="AS", tz=tz) expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) @@ -1036,7 +1036,7 @@ def test_constructor_int64_nocopy(self): assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["ME", "Q", "A", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] + "freq", ["ME", "Q", "Y", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index b93aee1d988de..f0996d7af917d 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -7,6 +7,7 @@ time, timedelta, ) +import re import numpy as np import pytest @@ -252,12 +253,11 @@ def test_begin_year_alias(self, freq): ) tm.assert_index_equal(rng, exp) - @pytest.mark.parametrize("freq", ["A", "Y"]) - def test_end_year_alias(self, freq): + def test_end_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="Y") exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq="Y" ) tm.assert_index_equal(rng, exp) @@ -272,10 +272,10 @@ 
def test_business_end_year_alias(self, freq): def test_date_range_negative_freq(self): # GH 11018 - rng = date_range("2011-12-31", freq="-2A", periods=3) - exp = DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") + rng = date_range("2011-12-31", freq="-2Y", periods=3) + exp = DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2Y") tm.assert_index_equal(rng, exp) - assert rng.freq == "-2A" + assert rng.freq == "-2Y" rng = date_range("2011-01-31", freq="-2ME", periods=3) exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2ME") @@ -638,7 +638,7 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) def test_range_closed(self, freq, inclusive_endpoints_fixture): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -653,7 +653,7 @@ def test_range_closed(self, freq, inclusive_endpoints_fixture): tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) def test_range_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -674,7 +674,7 @@ def test_range_closed_with_tz_aware_start_end( tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -839,20 +839,23 @@ def test_freq_dateoffset_with_relateivedelta_nanos(self): @pytest.mark.parametrize( "freq,freq_depr", [ - ("min", "T"), - ("s", "S"), - ("ms", "L"), - ("us", "U"), - ("ns", "N"), + ("2Y", "2A"), + ("200Y-MAY", "200A-MAY"), + ("2min", "2T"), + ("1s", "1S"), + ("2ms", "2L"), + ("1us", "1U"), + ("2ns", "2N"), ], ) - def test_frequencies_T_S_L_U_N_deprecated(self, freq, freq_depr): + def test_frequencies_A_T_S_L_U_N_deprecated(self, freq, freq_depr): # GH#52536 - msg = f"'{freq_depr}' is deprecated and will be removed in a future version." + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." 
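A similar hedged sketch of the annual-alias deprecation tested here, assuming a build where "A" is deprecated in favour of "Y"; this one uses the stdlib warnings module rather than pandas' private test helpers.

import warnings
import pandas as pd

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    deprecated = pd.date_range("1/1/2000", periods=2, freq="2A")
# The old alias warns but still produces the same year-end index.
assert any(issubclass(w.category, FutureWarning) for w in caught)
assert deprecated.equals(pd.date_range("1/1/2000", periods=2, freq="2Y"))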
- expected = date_range("1/1/2000", periods=4, freq=freq) + expected = date_range("1/1/2000", periods=2, freq=freq) with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) + result = date_range("1/1/2000", periods=2, freq=freq_depr) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 21dc22bea87dc..ac6d0a97956e4 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -20,7 +20,7 @@ class TestDatetimeIndexOps: @pytest.mark.parametrize( "freq,expected", [ - ("A", "day"), + ("Y", "day"), ("Q", "day"), ("ME", "day"), ("D", "day"), @@ -33,7 +33,7 @@ class TestDatetimeIndexOps: ) def test_resolution(self, request, tz_naive_fixture, freq, expected): tz = tz_naive_fixture - if freq == "A" and not IS64 and isinstance(tz, tzlocal): + if freq == "Y" and not IS64 and isinstance(tz, tzlocal): request.node.add_marker( pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index 89ea4fb6472d0..f9838ce272296 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -10,7 +10,7 @@ class TestPeriodIndex: def test_asfreq(self): - pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi1 = period_range(freq="Y", start="1/1/2001", end="1/1/2001") pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") @@ -26,42 +26,42 @@ def test_asfreq(self): assert pi1.asfreq("Min", "s") == pi6 assert pi1.asfreq("s", "s") == pi7 - assert pi2.asfreq("A", "s") == pi1 + assert pi2.asfreq("Y", "s") == pi1 assert pi2.asfreq("M", "s") == pi3 assert pi2.asfreq("D", "s") == pi4 assert pi2.asfreq("H", "s") == pi5 assert pi2.asfreq("Min", "s") == pi6 assert pi2.asfreq("s", "s") == pi7 - assert pi3.asfreq("A", "s") == pi1 + assert pi3.asfreq("Y", "s") == pi1 assert pi3.asfreq("Q", "s") == pi2 assert pi3.asfreq("D", "s") == pi4 assert pi3.asfreq("H", "s") == pi5 assert pi3.asfreq("Min", "s") == pi6 assert pi3.asfreq("s", "s") == pi7 - assert pi4.asfreq("A", "s") == pi1 + assert pi4.asfreq("Y", "s") == pi1 assert pi4.asfreq("Q", "s") == pi2 assert pi4.asfreq("M", "s") == pi3 assert pi4.asfreq("H", "s") == pi5 assert pi4.asfreq("Min", "s") == pi6 assert pi4.asfreq("s", "s") == pi7 - assert pi5.asfreq("A", "s") == pi1 + assert pi5.asfreq("Y", "s") == pi1 assert pi5.asfreq("Q", "s") == pi2 assert pi5.asfreq("M", "s") == pi3 assert pi5.asfreq("D", "s") == pi4 assert pi5.asfreq("Min", "s") == pi6 assert pi5.asfreq("s", "s") == pi7 - assert pi6.asfreq("A", "s") == pi1 + assert pi6.asfreq("Y", "s") == pi1 assert pi6.asfreq("Q", "s") == pi2 assert pi6.asfreq("M", "s") == pi3 assert pi6.asfreq("D", "s") == pi4 assert pi6.asfreq("H", "s") == pi5 assert pi6.asfreq("s", "s") == pi7 - assert pi7.asfreq("A", "s") == pi1 + assert pi7.asfreq("Y", "s") == pi1 assert pi7.asfreq("Q", "s") == pi2 assert pi7.asfreq("M", "s") == pi3 assert pi7.asfreq("D", "s") == pi4 diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index e54cd73a35f59..07595b6b8c1dd 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ 
b/pandas/tests/indexes/period/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_conversion(self): expected = Index([str(x) for x in idx], name="idx") tm.assert_index_equal(result, expected) - idx = period_range("1990", "2009", freq="A", name="idx") + idx = period_range("1990", "2009", freq="Y", name="idx") result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, idx.asi8) diff --git a/pandas/tests/indexes/period/methods/test_is_full.py b/pandas/tests/indexes/period/methods/test_is_full.py index 490f199a59ed7..b4105bedbe21d 100644 --- a/pandas/tests/indexes/period/methods/test_is_full.py +++ b/pandas/tests/indexes/period/methods/test_is_full.py @@ -4,19 +4,19 @@ def test_is_full(): - index = PeriodIndex([2005, 2007, 2009], freq="A") + index = PeriodIndex([2005, 2007, 2009], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2006, 2007], freq="A") + index = PeriodIndex([2005, 2006, 2007], freq="Y") assert index.is_full - index = PeriodIndex([2005, 2005, 2007], freq="A") + index = PeriodIndex([2005, 2005, 2007], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2005, 2006], freq="A") + index = PeriodIndex([2005, 2005, 2006], freq="Y") assert index.is_full - index = PeriodIndex([2006, 2005, 2005], freq="A") + index = PeriodIndex([2006, 2005, 2005], freq="Y") with pytest.raises(ValueError, match="Index is not monotonic"): index.is_full diff --git a/pandas/tests/indexes/period/methods/test_shift.py b/pandas/tests/indexes/period/methods/test_shift.py index 48dc5f0e64d08..d649dd3da0864 100644 --- a/pandas/tests/indexes/period/methods/test_shift.py +++ b/pandas/tests/indexes/period/methods/test_shift.py @@ -29,16 +29,16 @@ def test_pi_shift_ndarray(self): tm.assert_index_equal(result, expected) def test_shift(self): - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2002", end="12/1/2010") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2002", end="12/1/2010") tm.assert_index_equal(pi1.shift(0), pi1) assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2000", end="12/1/2008") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2000", end="12/1/2008") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) @@ -117,6 +117,6 @@ def test_shift_gh8083(self): def test_shift_periods(self): # GH #22458 : argument 'n' was deprecated in favor of 'periods' - idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") + idx = period_range(freq="Y", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 8bb0c3518c835..462f66eef7269 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -47,7 +47,7 @@ def test_to_timestamp_non_contiguous(self): tm.assert_datetime_array_equal(result, expected, check_freq=False) def test_to_timestamp_freq(self): - idx = period_range("2017", periods=12, freq="A-DEC") + idx = period_range("2017", periods=12, freq="Y-DEC") result = idx.to_timestamp() expected = date_range("2017", periods=12, 
freq="AS-JAN") tm.assert_index_equal(result, expected) @@ -72,12 +72,12 @@ def test_to_timestamp_pi_nat(self): tm.assert_index_equal(result3, exp) assert result3.freqstr == "3M" - msg = "Frequency must be positive, because it represents span: -2A" + msg = "Frequency must be positive, because it represents span: -2Y" with pytest.raises(ValueError, match=msg): - result.to_period(freq="-2A") + result.to_period(freq="-2Y") def test_to_timestamp_preserve_name(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009", name="foo") assert index.name == "foo" conv = index.to_timestamp("D") diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index a5bdfa11140d1..ac4edb10d9352 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -166,7 +166,7 @@ def test_constructor_fromarraylike(self): msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): - PeriodIndex(data=Period("2007", freq="A")) + PeriodIndex(data=Period("2007", freq="Y")) result = PeriodIndex(iter(idx)) tm.assert_index_equal(result, idx) @@ -418,7 +418,7 @@ def test_constructor_freq_mult(self): @pytest.mark.parametrize( "freq_offset, freq_period", [ - ("A", "A"), + ("Y", "Y"), ("ME", "M"), ("D", "D"), ("min", "min"), @@ -453,7 +453,7 @@ def test_constructor_freq_combined(self): tm.assert_index_equal(pidx, expected) def test_constructor(self): - pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi = period_range(freq="Y", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") @@ -526,7 +526,7 @@ def test_constructor(self): Period("2006-12-31", ("w", 1)) @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "min", "s", "ms", "us", "ns", "H"] + "freq", ["M", "Q", "Y", "D", "B", "min", "s", "ms", "us", "ns", "H"] ) @pytest.mark.filterwarnings( r"ignore:Period with BDay freq is deprecated:FutureWarning" @@ -539,7 +539,7 @@ def test_recreate_from_data(self, freq): def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] - index = PeriodIndex(raw, freq="A") + index = PeriodIndex(raw, freq="Y") expected = Index([str(num) for num in raw]) res = index.map(str) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 87bbb96377a79..67deeccff4e2a 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -55,7 +55,7 @@ def test_representation(self, method): idx2 = PeriodIndex(["2011-01-01"], freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") @@ -73,7 +73,7 @@ def test_representation(self, method): "dtype='period[D]')" ) - exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]')" + exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" exp6 = ( "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " @@ -101,7 +101,7 @@ def test_representation_to_series(self): idx2 = PeriodIndex(["2011-01-01"], 
freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") @@ -125,7 +125,7 @@ def test_representation_to_series(self): exp5 = """0 2011 1 2012 2 2013 -dtype: period[A-DEC]""" +dtype: period[Y-DEC]""" exp6 = """0 2011-01-01 09:00 1 2012-02-01 10:00 @@ -157,7 +157,7 @@ def test_summary(self): idx2 = PeriodIndex(["2011-01-01"], freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") @@ -177,7 +177,7 @@ def test_summary(self): Freq: D""" exp5 = """PeriodIndex: 3 entries, 2011 to 2013 -Freq: A-DEC""" +Freq: Y-DEC""" exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT Freq: H""" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 109a4a41e2841..f5af550d94ab1 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -238,9 +238,9 @@ def test_getitem_day(self, idx_range): class TestGetLoc: def test_get_loc_msg(self): - idx = period_range("2000-1-1", freq="A", periods=10) - bad_period = Period("2012", "A") - with pytest.raises(KeyError, match=r"^Period\('2012', 'A-DEC'\)$"): + idx = period_range("2000-1-1", freq="Y", periods=10) + bad_period = Period("2012", "Y") + with pytest.raises(KeyError, match=r"^Period\('2012', 'Y-DEC'\)$"): idx.get_loc(bad_period) try: diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 3a272f53091b5..5bc76340badaf 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -14,7 +14,7 @@ class TestPeriodIndex: def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) original = ts.copy() @@ -28,7 +28,7 @@ def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): assert (ts[1:3] == 1).all() # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) result = ts["2007"] diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index bd40fa37897d8..0c445b4cdf770 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -18,7 +18,7 @@ class TestPeriodIndex: def test_make_time_series(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) assert isinstance(series, Series) @@ -67,7 +67,7 @@ def test_values(self): 
tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): - pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi = period_range(freq="Y", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") @@ -157,7 +157,7 @@ def test_period_index_length(self): @pytest.mark.parametrize( "periodindex", [ - period_range(freq="A", start="1/1/2001", end="12/1/2005"), + period_range(freq="Y", start="1/1/2001", end="12/1/2005"), period_range(freq="Q", start="1/1/2001", end="12/1/2002"), period_range(freq="M", start="1/1/2001", end="1/1/2002"), period_range(freq="D", start="12/1/2001", end="6/1/2001"), @@ -187,7 +187,7 @@ def test_fields(self, periodindex, field): assert getattr(x, field) == val def test_is_(self): - create_index = lambda: period_range(freq="A", start="1/1/2001", end="12/1/2009") + create_index = lambda: period_range(freq="Y", start="1/1/2001", end="12/1/2009") index = create_index() assert index.is_(index) assert not index.is_(create_index()) @@ -199,23 +199,23 @@ def test_is_(self): assert ind2.is_(index) assert not index.is_(index[:]) assert not index.is_(index.asfreq("M")) - assert not index.is_(index.asfreq("A")) + assert not index.is_(index.asfreq("Y")) assert not index.is_(index - 2) assert not index.is_(index - 0) def test_index_unique(self): - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") - expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="Y-JUN") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 def test_negative_ordinals(self): - Period(ordinal=-1000, freq="A") - Period(ordinal=0, freq="A") + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="A") - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="A") + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") tm.assert_index_equal(idx1, idx2) def test_pindex_fieldaccessor_nat(self): @@ -267,14 +267,14 @@ def test_with_multi_index(self): def test_map(self): # test_map_dictlike generally tests - index = PeriodIndex([2005, 2007, 2009], freq="A") + index = PeriodIndex([2005, 2007, 2009], freq="Y") result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) def test_format_empty(self): # GH35712 - empty_idx = PeriodIndex([], freq="A") + empty_idx = PeriodIndex([], freq="Y") assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] @@ -284,6 +284,17 @@ def test_period_index_frequency_ME_error_message(self): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + @pytest.mark.parametrize("freq", ["2A", "A-DEC", "200A-AUG"]) + def test_a_deprecated_from_time_series(self, freq): + # GH#52536 + freq_msg = freq[freq.index("A") :] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." 
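And the PeriodIndex side of the same rename, sketched under the same assumption; the expected length and dtype string come straight from the surrounding hunks.

import pandas as pd

pi = pd.period_range(start="1/1/2001", end="12/1/2009", freq="Y")
assert len(pi) == 9                        # one period per year, 2001..2009
assert str(pi.dtype) == "period[Y-DEC]"    # formerly rendered as "A-DEC"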
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + index = period_range(freq=freq, start="1/1/2001", end="12/1/2009") + series = Series(1, index=index) + assert isinstance(series, Series) + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 63acaba2d4f3e..bee8a1282d08b 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -20,7 +20,7 @@ def test_required_arguments(self): with pytest.raises(ValueError, match=msg): period_range("2011-1-1", "2012-1-1", "B") - @pytest.mark.parametrize("freq", ["D", "W", "Q", "A"]) + @pytest.mark.parametrize("freq", ["D", "W", "Q", "Y"]) def test_construction_from_string(self, freq): # non-empty expected = date_range( diff --git a/pandas/tests/indexes/period/test_pickle.py b/pandas/tests/indexes/period/test_pickle.py index cb981ab10064f..7d359fdabb6f1 100644 --- a/pandas/tests/indexes/period/test_pickle.py +++ b/pandas/tests/indexes/period/test_pickle.py @@ -12,7 +12,7 @@ class TestPickle: - @pytest.mark.parametrize("freq", ["D", "M", "A"]) + @pytest.mark.parametrize("freq", ["D", "M", "Y"]) def test_pickle_round_trip(self, freq): idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq=freq) result = tm.round_trip_pickle(idx) diff --git a/pandas/tests/indexes/period/test_resolution.py b/pandas/tests/indexes/period/test_resolution.py index 6c876b4f9366f..98ccfe6569798 100644 --- a/pandas/tests/indexes/period/test_resolution.py +++ b/pandas/tests/indexes/period/test_resolution.py @@ -7,7 +7,7 @@ class TestResolution: @pytest.mark.parametrize( "freq,expected", [ - ("A", "year"), + ("Y", "year"), ("Q", "quarter"), ("M", "month"), ("D", "day"), diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index dd05210e417b0..9610db5f0336b 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -81,8 +81,8 @@ def test_union(self, sort): other6 = period_range("2000-04-01", freq="M", periods=7) expected6 = period_range("2000-01-01", freq="M", periods=10) - rng7 = period_range("2003-01-01", freq="A", periods=5) - other7 = period_range("1998-01-01", freq="A", periods=8) + rng7 = period_range("2003-01-01", freq="Y", periods=5) + other7 = period_range("1998-01-01", freq="Y", periods=8) expected7 = PeriodIndex( [ "2003", @@ -96,7 +96,7 @@ def test_union(self, sort): "2001", "2002", ], - freq="A", + freq="Y", ) rng8 = PeriodIndex( @@ -293,9 +293,9 @@ def test_difference(self, sort): expected6 = PeriodIndex(["2000-02-01", "2000-01-01", "2000-03-01"], freq="M") period_rng = ["2003", "2007", "2006", "2005", "2004"] - rng7 = PeriodIndex(period_rng, freq="A") - other7 = period_range("1998-01-01", freq="A", periods=8) - expected7 = PeriodIndex(["2007", "2006"], freq="A") + rng7 = PeriodIndex(period_rng, freq="Y") + other7 = period_range("1998-01-01", freq="Y", periods=8) + expected7 = PeriodIndex(["2007", "2006"], freq="Y") for rng, other, expected in [ (rng1, other1, expected1), diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 18668fd357fd8..2a9149844a353 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -27,7 +27,7 @@ class TestPeriodRepresentation: ("us", "1970-01-01"), ("ns", "1970-01-01"), ("M", "1970-01"), - ("A", 1970), + ("Y", 1970), ], ) 
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @@ -43,7 +43,7 @@ def test_freq(self, freq, base_date): class TestPeriodIndexConversion: def test_tolist(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") rs = index.tolist() for x in rs: assert isinstance(x, Period) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8fd1e296fb79a..6afab569797f2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -980,7 +980,7 @@ def test_str_attribute(self, method): Index(range(5)), tm.makeDateIndex(10), MultiIndex.from_tuples([("foo", "1"), ("bar", "3")]), - period_range(start="2000", end="2010", freq="A"), + period_range(start="2000", end="2010", freq="Y"), ], ) def test_str_attribute_raises(self, index): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index d0593b3230959..03531547ef042 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,6 +3,7 @@ from pandas import ( Timedelta, + TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -119,3 +120,42 @@ def test_timedelta_range_infer_freq(self): # https://github.com/pandas-dev/pandas/issues/35897 result = timedelta_range("0s", "1s", periods=31) assert result.freq is None + + @pytest.mark.parametrize( + "freq_depr, start, end, expected_values, expected_freq", + [ + ( + "3.5S", + "05:03:01", + "05:03:10", + ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], + "3500ms", + ), + ( + "2.5T", + "5 hours", + "5 hours 8 minutes", + [ + "0 days 05:00:00", + "0 days 05:02:30", + "0 days 05:05:00", + "0 days 05:07:30", + ], + "150s", + ), + ], + ) + def test_timedelta_range_deprecated_freq( + self, freq_depr, start, end, expected_values, expected_freq + ): + # GH#52536 + msg = ( + f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = timedelta_range(start=start, end=end, freq=freq_depr) + expected = TimedeltaIndex( + expected_values, dtype="timedelta64[ns]", freq=expected_freq + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 20568342222c3..59e0ca0591ced 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -260,19 +260,19 @@ def test_loc_npstr(self): @pytest.mark.parametrize( "msg, key", [ - (r"Period\('2019', 'A-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), - (r"Period\('2019', 'A-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), - (r"Period\('2019', 'A-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), ( - r"Period\('2018', 'A-DEC'\), Period\('2016', 'A-DEC'\), 'bar'", + r"Period\('2018', 'Y-DEC'\), Period\('2016', 'Y-DEC'\), 'bar'", (Period(2018), Period(2016), "bar"), ), - (r"Period\('2018', 'A-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), + (r"Period\('2018', 'Y-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), ( - r"Period\('2017', 'A-DEC'\), 'foo', Period\('2015', 'A-DEC'\)", + r"Period\('2017', 'Y-DEC'\), 'foo', Period\('2015', 'Y-DEC'\)", (Period(2017), "foo", Period(2015)), ), - (r"Period\('2017', 'A-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), + (r"Period\('2017', 'Y-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), ], ) def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 09567e7012313..d1c1d72ac4afe 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1335,13 +1335,13 @@ def test_interval_can_hold_element(self, dtype, element): assert not blk._can_hold_element(elem) def test_period_can_hold_element_emptylist(self): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2) assert blk._can_hold_element([]) def test_period_can_hold_element(self, element): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") elem = element(pi) self.check_series_setitem(elem, pi, True) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index fc2edc7559a48..d7a4dfca90b5e 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -150,7 +150,7 @@ def test_as_json_table_type_bool_data(self, bool_type): pd.to_datetime(["2016"], utc=True), pd.Series(pd.to_datetime(["2016"])), pd.Series(pd.to_datetime(["2016"], utc=True)), - pd.period_range("2016", freq="A", periods=3), + pd.period_range("2016", freq="Y", periods=3), ], ) def test_as_json_table_type_date_data(self, date_data): @@ -480,9 +480,9 @@ def test_convert_pandas_type_to_json_field_datetime( assert result == expected def test_convert_pandas_type_to_json_period_range(self): - arr = pd.period_range("2016", freq="A-DEC", periods=4) + arr = pd.period_range("2016", freq="Y-DEC", periods=4) result = convert_pandas_type_to_json_field(arr) - expected = {"name": "values", "type": "datetime", 
"freq": "A-DEC"} + expected = {"name": "values", "type": "datetime", "freq": "Y-DEC"} assert result == expected @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex]) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 220741cf1ec3d..f488ee7da87ac 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -102,7 +102,7 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="Y", periods=3) df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) fig, ax = mpl.pyplot.subplots() @@ -111,13 +111,13 @@ def test_nonnumeric_exclude(self): mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="Y", periods=3) df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): df["A"].plot() - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) @@ -125,7 +125,7 @@ def test_tsplot_period(self, freq): _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -165,8 +165,8 @@ def test_get_datevalue(self): from pandas.plotting._matplotlib.converter import get_datevalue assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "A") == 1987 - assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue(1987, "Y") == 1987 + assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal def test_ts_plot_format_coord(self): @@ -176,7 +176,7 @@ def check_format_of_first_point(ax, expected_string): first_y = first_line.get_ydata()[0] assert expected_string == ax.format_coord(first_x, first_y) - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="Y-DEC")) _, ax = mpl.pyplot.subplots() annual.plot(ax=ax) check_format_of_first_point(ax, "t = 2014 y = 1.000000") @@ -187,14 +187,14 @@ def check_format_of_first_point(ax, expected_string): daily.plot(ax=ax) check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the @@ -204,14 
+204,14 @@ def test_line_plot_period_mlt_series(self, frqncy): _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "Y"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -222,7 +222,7 @@ def test_line_plot_period_frame(self, freq): _check_plot_works(df.plot, df.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) @@ -240,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -254,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -440,7 +440,7 @@ def test_get_finder(self): assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder - assert conv.get_finder(to_offset("A")) == conv._annual_finder + assert conv.get_finder(to_offset("Y")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder def test_finder_daily(self): @@ -523,10 +523,10 @@ def test_finder_monthly_long(self): def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - xp = [Period(x, freq="A").ordinal for x in xp] + xp = [Period(x, freq="Y").ordinal for x in xp] rs = [] for nyears in [5, 10, 19, 49, 99, 199, 599, 1001]: - rng = period_range("1987", periods=nyears, freq="A") + rng = period_range("1987", periods=nyears, freq="Y") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index ae28a010d8435..ad6ff70b14d25 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -96,7 +96,7 @@ def test_raises_on_non_datetimelike_index(): "but got an instance of 'RangeIndex'" ) with pytest.raises(TypeError, match=msg): - xp.resample("A") + xp.resample("Y") @all_ts diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 113e2d8986ad2..b40ae7d2bd8c0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ 
b/pandas/tests/resample/test_datetime_index.py @@ -435,7 +435,7 @@ def test_resample_frame_basic_cy_funcs(f, unit): g._cython_agg_general(f, alt=None, numeric_only=True) -@pytest.mark.parametrize("freq", ["A", "ME"]) +@pytest.mark.parametrize("freq", ["Y", "ME"]) def test_resample_frame_basic_M_A(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -668,8 +668,8 @@ def test_resample_reresample(unit): @pytest.mark.parametrize( "freq, expected_kwargs", [ - ["A-DEC", {"start": "1990", "end": "2000", "freq": "a-dec"}], - ["A-JUN", {"start": "1990", "end": "2000", "freq": "a-jun"}], + ["Y-DEC", {"start": "1990", "end": "2000", "freq": "y-dec"}], + ["Y-JUN", {"start": "1990", "end": "2000", "freq": "y-jun"}], ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}], ], ) @@ -1246,7 +1246,7 @@ def test_corner_cases_period(simple_period_range_series): # miscellaneous test coverage len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] # it works - result = len0pts.resample("A-DEC").mean() + result = len0pts.resample("Y-DEC").mean() assert len(result) == 0 @@ -2021,7 +2021,7 @@ def test_long_rule_non_nano(): "1900-12-31", ] ).astype("datetime64[s]"), - freq="200A-DEC", + freq="200Y-DEC", ) expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx) tm.assert_series_equal(result, expected) @@ -2044,7 +2044,7 @@ def test_resample_empty_series_with_tz(): def test_resample_M_deprecated(): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) expected = s.resample("2ME").mean() diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index db3804a6600b9..f3e2eecf63d6b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -109,7 +109,7 @@ def test_selection(self, index, freq, kind, kwargs): def test_annual_upsample_cases( self, offset, period, conv, meth, month, simple_period_range_series ): - ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"A-{month}") + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"Y-{month}") warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): @@ -120,20 +120,20 @@ def test_annual_upsample_cases( def test_basic_downsample(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("y-dec").mean() expected = ts.groupby(ts.index.year).mean() - expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") + expected.index = period_range("1/1/1990", "6/30/1995", freq="y-dec") tm.assert_series_equal(result, expected) # this is ok - tm.assert_series_equal(ts.resample("a-dec").mean(), result) - tm.assert_series_equal(ts.resample("a").mean(), result) + tm.assert_series_equal(ts.resample("y-dec").mean(), result) + tm.assert_series_equal(ts.resample("y").mean(), result) @pytest.mark.parametrize( "rule,expected_error_msg", [ - ("a-dec", ""), + ("y-dec", ""), ("q-mar", ""), ("M", ""), ("w-thu", ""), @@ -152,7 +152,7 @@ def test_not_subperiod(self, simple_period_range_series, rule, expected_error_ms @pytest.mark.parametrize("freq", ["D", "2D"]) def test_basic_upsample(self, freq, simple_period_range_series): ts = 
simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("y-dec").mean() resampled = result.resample(freq, convention="end").ffill() expected = result.to_timestamp(freq, how="end") @@ -160,7 +160,7 @@ def test_basic_upsample(self, freq, simple_period_range_series): tm.assert_series_equal(resampled, expected) def test_upsample_with_limit(self): - rng = period_range("1/1/2000", periods=5, freq="A") + rng = period_range("1/1/2000", periods=5, freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) result = ts.resample("M", convention="end").ffill(limit=2) @@ -168,13 +168,13 @@ def test_upsample_with_limit(self): tm.assert_series_equal(result, expected) def test_annual_upsample(self, simple_period_range_series): - ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="Y-DEC") df = DataFrame({"a": ts}) rdf = df.resample("D").ffill() exp = df["a"].resample("D").ffill() tm.assert_series_equal(rdf["a"], exp) - rng = period_range("2000", "2003", freq="A-DEC") + rng = period_range("2000", "2003", freq="Y-DEC") ts = Series([1, 2, 3, 4], index=rng) result = ts.resample("M").ffill() @@ -391,13 +391,13 @@ def test_weekly_upsample(self, day, target, convention, simple_period_range_seri def test_resample_to_timestamps(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - result = ts.resample("A-DEC", kind="timestamp").mean() - expected = ts.to_timestamp(how="start").resample("A-DEC").mean() + result = ts.resample("Y-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("Y-DEC").mean() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) def test_resample_to_quarterly(self, simple_period_range_series, month): - ts = simple_period_range_series("1990", "1992", freq=f"A-{month}") + ts = simple_period_range_series("1990", "1992", freq=f"Y-{month}") quar_ts = ts.resample(f"Q-{month}").ffill() stamps = ts.to_timestamp("D", how="start") @@ -415,7 +415,7 @@ def test_resample_to_quarterly(self, simple_period_range_series, month): @pytest.mark.parametrize("how", ["start", "end"]) def test_resample_to_quarterly_start_end(self, simple_period_range_series, how): # conforms, but different month - ts = simple_period_range_series("1990", "1992", freq="A-JUN") + ts = simple_period_range_series("1990", "1992", freq="Y-JUN") result = ts.resample("Q-MAR", convention=how).ffill() expected = ts.asfreq("Q-MAR", how=how) expected = expected.reindex(result.index, method="ffill") @@ -426,21 +426,21 @@ def test_resample_to_quarterly_start_end(self, simple_period_range_series, how): tm.assert_series_equal(result, expected) def test_resample_fill_missing(self): - rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="Y") s = Series(np.random.default_rng(2).standard_normal(4), index=rng) stamps = s.to_timestamp() - filled = s.resample("A").ffill() - expected = stamps.resample("A").ffill().to_period("A") + filled = s.resample("Y").ffill() + expected = stamps.resample("Y").ffill().to_period("Y") tm.assert_series_equal(filled, expected) def test_cant_fill_missing_dups(self): - rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="Y") s = Series(np.random.default_rng(2).standard_normal(5), index=rng) msg = "Reindexing only valid with uniquely valued 
Index objects" with pytest.raises(InvalidIndexError, match=msg): - s.resample("A").ffill() + s.resample("Y").ffill() @pytest.mark.parametrize("freq", ["5min"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @@ -537,13 +537,13 @@ def test_resample_tz_localized(self): ts["second"] = np.cumsum(np.random.default_rng(2).standard_normal(len(rng))) expected = DataFrame( { - "first": ts.resample("A").sum()["first"], - "second": ts.resample("A").mean()["second"], + "first": ts.resample("Y").sum()["first"], + "second": ts.resample("Y").mean()["second"], }, columns=["first", "second"], ) result = ( - ts.resample("A") + ts.resample("Y") .agg({"first": "sum", "second": "mean"}) .reindex(columns=["first", "second"]) ) @@ -573,8 +573,8 @@ def test_quarterly_resampling(self): rng = period_range("2000Q1", periods=10, freq="Q-DEC") ts = Series(np.arange(10), index=rng) - result = ts.resample("A").mean() - exp = ts.to_timestamp().resample("A").mean().to_period() + result = ts.resample("Y").mean() + exp = ts.to_timestamp().resample("Y").mean().to_period() tm.assert_series_equal(result, exp) def test_resample_weekly_bug_1726(self): @@ -647,7 +647,7 @@ def test_monthly_convention_span(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "from_freq, to_freq", [("D", "ME"), ("Q", "A"), ("ME", "Q"), ("D", "W")] + "from_freq, to_freq", [("D", "ME"), ("Q", "Y"), ("ME", "Q"), ("D", "W")] ) def test_default_right_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -676,7 +676,7 @@ def test_all_values_single_bin(self): index = period_range(start="2012-01-01", end="2012-12-31", freq="M") s = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) - result = s.resample("A").mean() + result = s.resample("Y").mean() tm.assert_almost_equal(result.iloc[0], s.mean()) def test_evenly_divisible_with_no_extra_bins(self): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3e0922228cb74..c00366b2e28ce 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -24,7 +24,7 @@ def test_series(): def test_apply(test_series): - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="Y", label="right", closed="right") grouped = test_series.groupby(grouper) @@ -44,18 +44,18 @@ def test_count(test_series): expected = test_series.groupby(lambda x: x.year).count() - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="Y", label="right", closed="right") result = test_series.groupby(grouper).count() expected.index = result.index tm.assert_series_equal(result, expected) - result = test_series.resample("A").count() + result = test_series.resample("Y").count() expected.index = result.index tm.assert_series_equal(result, expected) def test_numpy_reduction(test_series): - result = test_series.resample("A", closed="right").prod() + result = test_series.resample("Y", closed="right").prod() msg = "using SeriesGroupBy.prod" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5dde863f246d1..aa1a74aadae12 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -30,8 +30,8 @@ class TestConcatenate: def test_append_concat(self): # GH#1815 - d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") - d2 = 
date_range("12/31/2000", "12/31/2009", freq="A-DEC") + d1 = date_range("12/31/1990", "12/31/1999", freq="Y-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="Y-DEC") s1 = Series(np.random.default_rng(2).standard_normal(10), d1) s2 = Series(np.random.default_rng(2).standard_normal(10), d2) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8435f4a189c56..2d41b6d355ead 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -445,10 +445,10 @@ def test_pivot_no_values(self): tm.assert_frame_equal(res, exp) res = df.pivot_table( - index=Grouper(freq="A"), columns=Grouper(key="dt", freq="ME") + index=Grouper(freq="Y"), columns=Grouper(key="dt", freq="ME") ) exp = DataFrame( - [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns + [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="Y"), columns=exp_columns ) tm.assert_frame_equal(res, exp) @@ -1273,7 +1273,7 @@ def test_pivot_timegrouper(self, using_array_manager): expected = DataFrame( np.array([10, 18, 3], dtype="int64").reshape(1, 3), - index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A"), + index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="Y"), columns="Carl Joe Mark".split(), ) expected.index.name = "Date" @@ -1281,7 +1281,7 @@ def test_pivot_timegrouper(self, using_array_manager): result = pivot_table( df, - index=Grouper(freq="A"), + index=Grouper(freq="Y"), columns="Buyer", values="Quantity", aggfunc="sum", @@ -1291,7 +1291,7 @@ def test_pivot_timegrouper(self, using_array_manager): result = pivot_table( df, index="Buyer", - columns=Grouper(freq="A"), + columns=Grouper(freq="Y"), values="Quantity", aggfunc="sum", ) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 00285148a3c90..4287a69823aef 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -18,7 +18,7 @@ class TestFreqConversion: """Test frequency conversion of date objects""" @pytest.mark.filterwarnings("ignore:Period with BDay:FutureWarning") - @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) + @pytest.mark.parametrize("freq", ["Y", "Q", "M", "W", "B", "D"]) def test_asfreq_near_zero(self, freq): # GH#19643, GH#19650 per = Period("0001-01-01", freq=freq) @@ -49,7 +49,7 @@ def test_to_timestamp_out_of_bounds(self): per.to_timestamp() def test_asfreq_corner(self): - val = Period(freq="A", year=2007) + val = Period(freq="Y", year=2007) result1 = val.asfreq("5min") result2 = val.asfreq("min") expected = Period("2007-12-31 23:59", freq="min") @@ -61,11 +61,11 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency - ival_A = Period(freq="A", year=2007) + ival_A = Period(freq="Y", year=2007) - ival_AJAN = Period(freq="A-JAN", year=2007) - ival_AJUN = Period(freq="A-JUN", year=2007) - ival_ANOV = Period(freq="A-NOV", year=2007) + ival_AJAN = Period(freq="Y-JAN", year=2007) + ival_AJUN = Period(freq="Y-JUN", year=2007) + ival_ANOV = Period(freq="Y-NOV", year=2007) ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) @@ -133,7 +133,7 @@ def test_conv_annual(self): assert ival_ANOV.asfreq("D", "s") == ival_ANOV_to_D_start assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end - assert ival_A.asfreq("A") == ival_A + assert ival_A.asfreq("Y") == ival_A def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency 
@@ -144,7 +144,7 @@ def test_conv_quarterly(self): ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_A = Period(freq="Y", year=2007) ival_Q_to_M_start = Period(freq="M", year=2007, month=1) ival_Q_to_M_end = Period(freq="M", year=2007, month=3) ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) @@ -175,8 +175,8 @@ def test_conv_quarterly(self): ival_QEJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) - assert ival_Q.asfreq("A") == ival_Q_to_A - assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + assert ival_Q.asfreq("Y") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("Y") == ival_Q_to_A assert ival_Q.asfreq("M", "s") == ival_Q_to_M_start assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end @@ -207,7 +207,7 @@ def test_conv_monthly(self): ival_M = Period(freq="M", year=2007, month=1) ival_M_end_of_year = Period(freq="M", year=2007, month=12) ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) - ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_A = Period(freq="Y", year=2007) ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) @@ -231,8 +231,8 @@ def test_conv_monthly(self): freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) - assert ival_M.asfreq("A") == ival_M_to_A - assert ival_M_end_of_year.asfreq("A") == ival_M_to_A + assert ival_M.asfreq("Y") == ival_M_to_A + assert ival_M_end_of_year.asfreq("Y") == ival_M_to_A assert ival_M.asfreq("Q") == ival_M_to_Q assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q @@ -282,14 +282,14 @@ def test_conv_weekly(self): ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) - ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_A = Period(freq="Y", year=2007) ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) ival_W_to_M = Period(freq="M", year=2007, month=1) if Period(freq="D", year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq="A", year=2007) + ival_W_to_A_end_of_year = Period(freq="Y", year=2007) else: - ival_W_to_A_end_of_year = Period(freq="A", year=2008) + ival_W_to_A_end_of_year = Period(freq="Y", year=2008) if Period(freq="D", year=2007, month=3, day=31).weekday == 6: ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) @@ -321,8 +321,8 @@ def test_conv_weekly(self): freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) - assert ival_W.asfreq("A") == ival_W_to_A - assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + assert ival_W.asfreq("Y") == ival_W_to_A + assert ival_W_end_of_year.asfreq("Y") == ival_W_to_A_end_of_year assert ival_W.asfreq("Q") == ival_W_to_Q assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter @@ -394,7 +394,7 @@ def test_conv_business(self): ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) - ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_A = Period(freq="Y", year=2007) ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) ival_B_to_M = Period(freq="M", year=2007, month=1) ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) 
@@ -414,8 +414,8 @@ def test_conv_business(self): freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_B.asfreq("A") == ival_B_to_A - assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Y") == ival_B_to_A + assert ival_B_end_of_year.asfreq("Y") == ival_B_to_A assert ival_B.asfreq("Q") == ival_B_to_Q assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q assert ival_B.asfreq("M") == ival_B_to_M @@ -452,11 +452,11 @@ def test_conv_daily(self): ival_B_friday = Period(freq="B", year=2007, month=1, day=5) ival_B_monday = Period(freq="B", year=2007, month=1, day=8) - ival_D_to_A = Period(freq="A", year=2007) + ival_D_to_A = Period(freq="Y", year=2007) - ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) - ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) - ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) + ival_Deoq_to_AJAN = Period(freq="Y-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="Y-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="Y-DEC", year=2007) ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) @@ -480,13 +480,13 @@ def test_conv_daily(self): freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_D.asfreq("A") == ival_D_to_A + assert ival_D.asfreq("Y") == ival_D_to_A - assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN - assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN - assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + assert ival_D_end_of_quarter.asfreq("Y-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("Y-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("Y-DEC") == ival_Deoq_to_ADEC - assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_year.asfreq("Y") == ival_D_to_A assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN @@ -523,7 +523,7 @@ def test_conv_hourly(self): ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) - ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_A = Period(freq="Y", year=2007) ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) ival_H_to_M = Period(freq="M", year=2007, month=1) ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) @@ -544,8 +544,8 @@ def test_conv_hourly(self): freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) - assert ival_H.asfreq("A") == ival_H_to_A - assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Y") == ival_H_to_A + assert ival_H_end_of_year.asfreq("Y") == ival_H_to_A assert ival_H.asfreq("Q") == ival_H_to_Q assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q assert ival_H.asfreq("M") == ival_H_to_M @@ -591,7 +591,7 @@ def test_conv_minutely(self): freq="Min", year=2007, month=1, day=1, hour=0, minute=59 ) - ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_A = Period(freq="Y", year=2007) ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) ival_T_to_M = Period(freq="M", year=2007, month=1) ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) @@ -607,8 +607,8 @@ def test_conv_minutely(self): freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) - assert ival_T.asfreq("A") == ival_T_to_A - assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Y") == ival_T_to_A + 
assert ival_T_end_of_year.asfreq("Y") == ival_T_to_A assert ival_T.asfreq("Q") == ival_T_to_Q assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q assert ival_T.asfreq("M") == ival_T_to_M @@ -657,7 +657,7 @@ def test_conv_secondly(self): freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) - ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_A = Period(freq="Y", year=2007) ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) ival_S_to_M = Period(freq="M", year=2007, month=1) ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) @@ -667,8 +667,8 @@ def test_conv_secondly(self): ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - assert ival_S.asfreq("A") == ival_S_to_A - assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Y") == ival_S_to_A + assert ival_S_end_of_year.asfreq("Y") == ival_S_to_A assert ival_S.asfreq("Q") == ival_S_to_Q assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q assert ival_S.asfreq("M") == ival_S_to_M @@ -707,44 +707,44 @@ def test_conv_microsecond(self): def test_asfreq_mult(self): # normal freq to mult freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq) - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq, how="S") - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # mult freq to normal freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) # ordinal will change because how=E is the default - for freq in ["A", offsets.YearEnd()]: + for freq in ["Y", offsets.YearEnd()]: result = p.asfreq(freq) - expected = Period("2009", freq="A") + expected = Period("2009", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ["A", offsets.YearEnd()]: + for freq in ["Y", offsets.YearEnd()]: result = p.asfreq(freq, how="s") - expected = Period("2007", freq="A") + expected = Period("2007", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2007-12", freq="2M") @@ -760,7 +760,7 @@ def test_asfreq_mult(self): assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2009-12", freq="2M") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 7d07f327e3978..c7d42a83e663c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -61,9 +61,9 @@ def test_construction(self): assert i1 == i2 - i1 = Period("2005", freq="A") + i1 = Period("2005", freq="Y") i2 = Period("2005") - i3 = Period("2005", freq="a") + i3 = 
Period("2005", freq="y") assert i1 == i2 assert i1 == i3 @@ -224,7 +224,7 @@ def test_period_constructor_offsets(self): assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( "1/1/2005", freq="M" ) - assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="Y") assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( @@ -315,13 +315,13 @@ def test_invalid_arguments(self): msg = '^Given date string "-2000" not likely a datetime$' with pytest.raises(ValueError, match=msg): - Period("-2000", "A") + Period("-2000", "Y") msg = "day is out of range for month" with pytest.raises(DateParseError, match=msg): - Period("0", "A") + Period("0", "Y") msg = "Unknown datetime string format, unable to parse" with pytest.raises(DateParseError, match=msg): - Period("1/1/-2000", "A") + Period("1/1/-2000", "Y") def test_constructor_corner(self): expected = Period("2007-01", freq="2M") @@ -331,8 +331,8 @@ def test_constructor_corner(self): p = Period("2007-01-01", freq="D") - result = Period(p, freq="A") - exp = Period("2007", freq="A") + result = Period(p, freq="Y") + exp = Period("2007", freq="Y") assert result == exp def test_constructor_infer_freq(self): @@ -360,11 +360,11 @@ def test_constructor_infer_freq(self): assert p.freq == "us" def test_multiples(self): - result1 = Period("1989", freq="2A") - result2 = Period("1989", freq="A") + result1 = Period("1989", freq="2Y") + result2 = Period("1989", freq="Y") assert result1.ordinal == result2.ordinal - assert result1.freqstr == "2A-DEC" - assert result2.freqstr == "A-DEC" + assert result1.freqstr == "2Y-DEC" + assert result2.freqstr == "Y-DEC" assert result1.freq == offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() @@ -390,7 +390,7 @@ def test_period_cons_quarterly(self, month): @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = f"A-{month}" + freq = f"Y-{month}" exp = Period("1989", freq=freq) stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) @@ -428,7 +428,7 @@ def test_period_from_ordinal(self): assert p == res assert isinstance(res, Period) - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "H"]) def test_construct_from_nat_string_and_freq(self, freq): per = Period("NaT", freq=freq) assert per is NaT @@ -621,7 +621,7 @@ def test_to_timestamp_mult(self): "ignore:Period with BDay freq is deprecated:FutureWarning" ) def test_to_timestamp(self): - p = Period("1982", freq="A") + p = Period("1982", freq="Y") start_ts = p.to_timestamp(how="S") aliases = ["s", "StarT", "BEGIn"] for a in aliases: @@ -635,7 +635,7 @@ def test_to_timestamp(self): assert end_ts == p.to_timestamp("D", how=a) assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "s"] + from_lst = ["Y", "Q", "M", "W", "B", "D", "H", "Min", "s"] def _ex(p): if p.freq == "B": @@ -653,7 +653,7 @@ def _ex(p): # Frequency other than daily - p = Period("1985", freq="A") + p = Period("1985", freq="Y") result = p.to_timestamp("H", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") @@ -732,7 +732,7 @@ def test_to_timestamp_microsecond(self, ts, expected, freq): ("2000-12-15 13:45:26", "s", "2000-12-15 13:45:26", "s"), ("2000-12-15 
13:45:26", "min", "2000-12-15 13:45", "min"), ("2000-12-15 13:45:26", "H", "2000-12-15 13:00", "H"), - ("2000-12-15", "Y", "2000", "A-DEC"), + ("2000-12-15", "Y", "2000", "Y-DEC"), ("2000-12-15", "Q", "2000Q4", "Q-DEC"), ("2000-12-15", "M", "2000-12", "M"), ("2000-12-15", "W", "2000-12-11/2000-12-17", "W-SUN"), @@ -763,7 +763,7 @@ def test_strftime(self): class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "H"]) def test_is_leap_year(self, freq): # GH 13727 p = Period("2000-01-01 00:00:00", freq=freq) @@ -861,7 +861,7 @@ def test_inner_bounds_start_and_end_time(self, bound, offset, period_property): assert getattr(period, period_property).floor("s") == expected def test_start_time(self): - freq_lst = ["A", "Q", "M", "D", "H", "min", "s"] + freq_lst = ["Y", "Q", "M", "D", "H", "min", "s"] xp = datetime(2012, 1, 1) for f in freq_lst: p = Period("2012", freq=f) @@ -871,7 +871,7 @@ def test_start_time(self): assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) def test_end_time(self): - p = Period("2012", freq="A") + p = Period("2012", freq="Y") def _ex(*args): return Timestamp(Timestamp(datetime(*args)).as_unit("ns")._value - 1) @@ -936,7 +936,7 @@ def _ex(*args): def test_properties_annually(self): # Test properties on Periods with annually frequency. - a_date = Period(freq="A", year=2007) + a_date = Period(freq="Y", year=2007) assert a_date.year == 2007 def test_properties_quarterly(self): @@ -1196,11 +1196,11 @@ def test_add_sub_td64_nat(self, unit): nat - per def test_sub_delta(self): - left, right = Period("2011", freq="A"), Period("2007", freq="A") + left, right = Period("2011", freq="Y"), Period("2007", freq="Y") result = left - right assert result == 4 * right.freq - msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): left - Period("2007-01", freq="M") @@ -1316,7 +1316,7 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): def test_add_offset(self): # freq is DateOffset - for freq in ["A", "2A", "3A"]: + for freq in ["Y", "2Y", "3Y"]: p = Period("2011", freq=freq) exp = Period("2013", freq=freq) assert p + offsets.YearEnd(2) == exp @@ -1467,7 +1467,7 @@ def test_sub_offset(self): ] ) - for freq in ["A", "2A", "3A"]: + for freq in ["Y", "2Y", "3Y"]: p = Period("2011", freq=freq) assert p - offsets.YearEnd(2) == Period("2009", freq=freq) @@ -1589,7 +1589,7 @@ def test_small_year_parsing(): def test_negone_ordinals(): - freqs = ["A", "M", "Q", "D", "H", "min", "s"] + freqs = ["Y", "M", "Q", "D", "H", "min", "s"] period = Period(ordinal=-1, freq="D") for freq in freqs: diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index d36fd5335bdfc..eedd8b654f3d0 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -204,7 +204,7 @@ def test_align_dt64tzindex_mismatched_tzs(): def test_align_periodindex(join_type): - rng = period_range("1/1/2000", "1/1/2010", freq="A") + rng = period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) # TODO: assert something? 
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index d40ff6c139653..522b86274a517 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -153,7 +153,7 @@ class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted def test_add_series_with_period_index(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) result = ts + ts[::2] @@ -164,7 +164,7 @@ def test_add_series_with_period_index(self): result = ts + _permute(ts[::2]) tm.assert_series_equal(result, expected) - msg = "Input has different freq=D from Period\\(freq=A-DEC\\)" + msg = "Input has different freq=D from Period\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): ts + ts.asfreq("D", how="end") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d45c655a4c0a2..5a05a1840b644 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1306,7 +1306,7 @@ def test_construct_from_ints_including_iNaT_scalar_period_dtype(self): assert isna(series[2]) def test_constructor_period_incompatible_frequency(self): - data = [Period("2000", "D"), Period("2001", "A")] + data = [Period("2000", "D"), Period("2001", "Y")] result = Series(data) assert result.dtype == object assert result.tolist() == data diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index be68918d2a380..4f5196a400d58 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -247,7 +247,7 @@ def test_index_repr_in_frame_with_nan(self): assert repr(s) == exp def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") + rng = date_range("1/1/1850", "1/1/1950", freq="Y-DEC") rng.format() ts = Series(1, index=rng) repr(ts) diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 417bf0e90201b..ca8db972e2185 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -27,7 +27,7 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): @pytest.mark.parametrize( "freqstr,expected", [ - ("A", "year"), + ("Y", "year"), ("Q", "quarter"), ("M", "month"), ("D", "day"), @@ -99,9 +99,9 @@ def test_compatibility(freqstr, expected): assert ts_np + do == np.datetime64(expected) -@pytest.mark.parametrize("freq", ["T", "S", "L", "N", "U"]) -def test_units_t_l_deprecated_from__attrname_to_abbrevs(freq): - # GH 52536 +@pytest.mark.parametrize("freq", ["A", "T", "S", "L", "U", "N"]) +def test_units_A_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): + # GH#52536 msg = f"'{freq}' is deprecated and will be removed in a future version." 
with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 82cceeac2cd25..0b2978389ea88 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -52,7 +52,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] + + [f"{annual}-{month}" for annual in ["Y", "BA"] for month in MONTHS] + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] @@ -167,7 +167,7 @@ def test_monthly_ambiguous(): def test_annual_ambiguous(): rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) - assert rng.inferred_freq == "A-JAN" + assert rng.inferred_freq == "Y-JAN" @pytest.mark.parametrize("count", range(1, 5)) @@ -359,7 +359,7 @@ def test_not_monotonic(): rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) rng = rng[::-1] - assert rng.inferred_freq == "-1A-JAN" + assert rng.inferred_freq == "-1Y-JAN" def test_non_datetime_index2(): @@ -479,18 +479,18 @@ def test_series_datetime_index(freq): "Q@JAN", "Q@FEB", "Q@MAR", - "A@JAN", - "A@FEB", - "A@MAR", - "A@APR", - "A@MAY", - "A@JUN", - "A@JUL", - "A@AUG", - "A@SEP", - "A@OCT", - "A@NOV", - "A@DEC", + "Y@JAN", + "Y@FEB", + "Y@MAR", + "Y@APR", + "Y@MAY", + "Y@JUN", + "Y@JUL", + "Y@AUG", + "Y@SEP", + "Y@OCT", + "Y@NOV", + "Y@DEC", "Y@JAN", "WOM@1MON", "WOM@2MON", diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index a7e9854c38f18..de44cf3f94d26 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -839,7 +839,7 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + base_lst = ["Y", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -858,7 +858,7 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + month_prefixes = ["Y", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index c1ab0ba0b5e6f..d0f8923f3ad89 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -73,7 +73,7 @@ def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture): _compare_local_to_utc(tz_didx, naive_didx) -@pytest.mark.parametrize("freq", ["D", "A"]) +@pytest.mark.parametrize("freq", ["D", "Y"]) def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): tz = tz_aware_fixture tz_didx = date_range("2018-01-01", "2020-01-01", freq=freq, tz=tz) diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 83f28f6b5dc01..effd3b4b8b4e5 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -16,10 +16,8 @@ (offsets.QuarterEnd(startingMonth=12).freqstr, "DEC"), ("Q-JAN", "JAN"), (offsets.QuarterEnd(startingMonth=1).freqstr, "JAN"), - ("A-DEC", "DEC"), ("Y-DEC", "DEC"), (offsets.YearEnd().freqstr, "DEC"), - ("A-MAY", "MAY"), ("Y-MAY", "MAY"), (offsets.YearEnd(month=5).freqstr, "MAY"), ], diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index ec3579109e7a4..425decc14251a 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -138,8 +138,8 @@ def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): "date_str,freq,expected", [ ("2013Q2", None, datetime(2013, 4, 1)), - ("2013Q2", "A-APR", datetime(2012, 8, 1)), - ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ("2013Q2", "Y-APR", datetime(2012, 8, 1)), + ("2013-Q2", "Y-DEC", datetime(2013, 4, 1)), ], ) def test_parsers_quarterly_with_freq(date_str, freq, expected): @@ -148,7 +148,7 @@ def test_parsers_quarterly_with_freq(date_str, freq, expected): @pytest.mark.parametrize( - "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] + "date_str", ["2Q 2005", "2Q-200Y", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] ) def test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 99f0a82d6711e..ca207e1031653 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -54,7 +54,7 @@ def test_intra_day_conversion_factors(freq1, freq2, expected): @pytest.mark.parametrize( - "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] + "freq,expected", [("Y", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] ) def test_period_ordinal_start_values(freq, expected): # information for Jan. 1, 1970. 
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 7aa245341cbdd..e77f56a9928ae 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -66,7 +66,7 @@ key = f"{_prefix}-{_m}" OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix] -for _prefix in ["A", "Q"]: +for _prefix in ["Y", "Q"]: for _m in MONTHS: _alias = f"{_prefix}-{_m}" OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias @@ -345,7 +345,7 @@ def _get_annual_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) + return {"cs": "AS", "bs": "BAS", "ce": "Y", "be": "BA"}.get(pos_check) def _get_quarterly_rule(self) -> str | None: if len(self.mdiffs) > 1: @@ -574,7 +574,7 @@ def _quarter_months_conform(source: str, target: str) -> bool: def _is_annual(rule: str) -> bool: rule = rule.upper() - return rule == "A" or rule.startswith("A-") + return rule == "Y" or rule.startswith("Y-") def _is_quarterly(rule: str) -> bool: From 494df5c767656a7108dc3100a04f785eaf8dc567 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Oct 2023 09:04:14 -0700 Subject: [PATCH 089/131] REF: remove unused box keyword from Timedelta64Formatter (#55417) --- pandas/io/formats/format.py | 5 ++- pandas/tests/io/formats/test_format.py | 42 +++++++++++++------------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c6c09c2636852..726de09b60d70 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1748,16 +1748,15 @@ def __init__( self, values: TimedeltaArray, nat_rep: str = "NaT", - box: bool = False, **kwargs, ) -> None: + # TODO: nat_rep is never passed, na_rep is. super().__init__(values, **kwargs) self.nat_rep = nat_rep - self.box = box def _format_strings(self) -> list[str]: formatter = self.formatter or get_format_timedelta64( - self.values, nat_rep=self.nat_rep, box=self.box + self.values, nat_rep=self.nat_rep, box=False ) return [formatter(x) for x in self.values] diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index fe6ddd3aeb1d4..51965f753a0fb 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3187,49 +3187,49 @@ def test_all(self): class TestTimedelta64Formatter: def test_days(self): x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values - result = fmt._Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'1 days'" + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" - result = fmt._Timedelta64Formatter(x[1:2], box=True).get_result() - assert result[0].strip() == "'1 days'" + result = fmt._Timedelta64Formatter(x[1:2]).get_result() + assert result[0].strip() == "1 days" - result = fmt._Timedelta64Formatter(x, box=False).get_result() + result = fmt._Timedelta64Formatter(x).get_result() assert result[0].strip() == "0 days" assert result[1].strip() == "1 days" - result = fmt._Timedelta64Formatter(x[1:2], box=False).get_result() + result = fmt._Timedelta64Formatter(x[1:2]).get_result() assert result[0].strip() == "1 days" def test_days_neg(self): x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values - result = fmt._Timedelta64Formatter(-x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'-1 days'" + result = 
fmt._Timedelta64Formatter(-x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "-1 days" def test_subdays(self): y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values - result = fmt._Timedelta64Formatter(y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'0 days 00:00:01'" + result = fmt._Timedelta64Formatter(y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "0 days 00:00:01" def test_subdays_neg(self): y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values - result = fmt._Timedelta64Formatter(-y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'-1 days +23:59:59'" + result = fmt._Timedelta64Formatter(-y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "-1 days +23:59:59" def test_zero(self): x = pd.to_timedelta(list(range(1)) + [NaT], unit="D")._values - result = fmt._Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" x = pd.to_timedelta(list(range(1)), unit="D")._values - result = fmt._Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" -class Test_Datetime64Formatter: +class TestDatetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT])._values result = fmt._Datetime64Formatter(x).get_result() From c18ae3b2e5706031f1b03a00e1038dd4020af767 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Oct 2023 09:05:39 -0700 Subject: [PATCH 090/131] REF: remove unnecessary IntervalIndex._format_native_types (#55416) * REF: make _format_with_header keyword-only * REF: remove IntervalIndex._format_native_types --- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 6 ++++-- pandas/core/indexes/interval.py | 8 +------- pandas/core/indexes/range.py | 2 +- pandas/tests/indexes/interval/test_formats.py | 2 +- 6 files changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11f2cc8ebf1ff..c3d7bfc1998c0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1390,9 +1390,9 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep) + return self._format_with_header(header=header, na_rep=na_rep) - def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]: + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: from pandas.io.formats.format import format_array values = self._values diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index bc6fb61700aec..9cf7e861584d9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -356,7 +356,7 @@ def _format_attrs(self): extra = super()._format_attrs() return attrs + extra - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: result = [ pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep for x in self._values diff --git a/pandas/core/indexes/datetimelike.py 
b/pandas/core/indexes/datetimelike.py index 1b08596c99591..94ad556219b35 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -209,10 +209,12 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header( + header=header, na_rep=na_rep, date_format=date_format + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, *, header: list[str], na_rep: str, date_format: str | None = None ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index eb8d25bcea592..ae027ea68a525 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -846,16 +846,10 @@ def length(self) -> Index: # Rendering Methods # __repr__ associated methods are based on MultiIndex - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # matches base class except for whitespace padding return header + list(self._format_native_types(na_rep=na_rep)) - def _format_native_types( - self, *, na_rep: str = "NaN", quoting=None, **kwargs - ) -> npt.NDArray[np.object_]: - # GH 28210: use base method but with different default na_rep - return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - def _format_data(self, name=None) -> str: # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index aeb7bb1813fef..bb2ccfe930ab8 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -266,7 +266,7 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # Equivalent to Index implementation, but faster if not len(self._range): return header diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f003211abd857..acb330c190d6f 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -104,7 +104,7 @@ def test_repr_floats(self): def test_to_native_types(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index._format_native_types() + result = index._format_native_types(na_rep="NaN") expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) From 40c2d4d8c00a3be74a5a81bc565ae6c6d8e6bebb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Oct 2023 09:07:41 -0700 Subject: [PATCH 091/131] Series.__repr__ with MultiIndex with level name 0 (#55415) * Series.__repr__ with MultiIndex with level name 0 * GH ref --- doc/source/whatsnew/v2.2.0.rst | 2 ++ pandas/io/formats/format.py | 14 ++------------ pandas/tests/series/test_repr.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index dc9c3f661dab1..7fb2e26c3fa38 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -390,6 
+390,8 @@ Styler Other ^^^^^ - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) +- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) +- .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 726de09b60d70..1efd6f18dc0af 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -298,17 +298,6 @@ def _get_footer(self) -> str: return str(footer) - def _get_formatted_index(self) -> tuple[list[str], bool]: - index = self.tr_series.index - - if isinstance(index, MultiIndex): - have_header = any(name for name in index.names) - fmt_index = index.format(names=True) - else: - have_header = index.name is not None - fmt_index = index.format(name=True) - return fmt_index, have_header - def _get_formatted_values(self) -> list[str]: return format_array( self.tr_series._values, @@ -325,7 +314,8 @@ def to_string(self) -> str: if len(series) == 0: return f"{type(self.series).__name__}([], {footer})" - fmt_index, have_header = self._get_formatted_index() + have_header = _has_names(series.index) + fmt_index = self.tr_series.index.format(name=True) fmt_values = self._get_formatted_values() if self.is_truncated_vertically: diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 4f5196a400d58..535a64f303ec2 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -21,6 +21,16 @@ class TestSeriesRepr: + def test_multilevel_name_print_0(self): + # GH#55415 None does not get printed, but 0 does + # (matching DataFrame and flat index behavior) + mi = pd.MultiIndex.from_product([range(2, 3), range(3, 4)], names=[0, None]) + ser = Series(1.5, index=mi) + + res = repr(ser) + expected = "0 \n2 3 1.5\ndtype: float64" + assert res == expected + def test_multilevel_name_print(self, lexsorted_two_level_string_multiindex): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") From a6893c1dd7c2c4a91657742346750628c0afd7e2 Mon Sep 17 00:00:00 2001 From: Philippe THOMY Date: Fri, 6 Oct 2023 18:10:14 +0200 Subject: [PATCH 092/131] add `ntv-pandas` to the ecosystem (#55421) * Create 0007-compact-and-reversible-JSON-interface.md * change PDEP number (7 -> 12) * Add FAQ to the PDEPS 0012 * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * pre-commit codespell * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * delete summary * delete mermaid flowchart * with summary, without mermaid flowchart * rename Annexe -> Appendix * add tableschema specification * add orient="table" * Add Table Schema extension * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * add 'ntv-pandas' in the 'ecosystem' file * Update ecosystem.md * Update ecosystem.md * Update ecosystem.md --- web/pandas/community/ecosystem.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md 
index 561503de416a5..10ac3bccc6137 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -345,6 +345,27 @@ which pandas excels.
 
 ## IO
 
+### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas): *A semantic, compact and reversible JSON-pandas converter*
+
+NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly.
+
+It supports the following data types:
+- pandas data types
+- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm)
+- data types defined in the [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats)
+
+The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema).
+
+*example*
+```python
+import ntv_pandas as npd
+
+jsn = df.npd.to_json(table=False)  # save df as a JSON-value (format Table Schema if table is True, else format NTV)
+df = npd.read_json(jsn)  # load a JSON-value as a DataFrame
+
+df.equals(npd.read_json(df.npd.to_json()))  # True in any case, whether table=True or not
+```
+
 ### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas)
 
 BCPandas provides high performance writes from pandas to Microsoft SQL Server,

From e4635ac3d3982bd9b3872b21ad8390bde312d226 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Fri, 6 Oct 2023 19:14:33 +0300
Subject: [PATCH 093/131] DOC: move resample bug whatsnew notes to 2.2 (#55309)

* move resample bug whatsnew notes to 2.2

* move to correct place

* remove moved note

---
 doc/source/whatsnew/v2.1.2.rst | 3 ---
 doc/source/whatsnew/v2.2.0.rst | 5 ++++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.2.rst
index 87b71eac76d1a..ddd1f95c56aea 100644
--- a/doc/source/whatsnew/v2.1.2.rst
+++ b/doc/source/whatsnew/v2.1.2.rst
@@ -13,7 +13,6 @@
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) @@ -26,8 +25,6 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) -- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7fb2e26c3fa38..017a28ffb573a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -284,6 +284,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) +- Categorical ^^^^^^^^^^^ @@ -363,7 +364,9 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Reshaping From 29834640fc3d1c7a3cd6e255ffaa9126ebae8328 Mon Sep 17 00:00:00 2001 From: caneff Date: Fri, 6 Oct 2023 13:29:31 -0400 Subject: [PATCH 094/131] TYP: Add overload for ser.rename_axis (#55412) * Add overload for rename_axis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/core/series.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a5e692431e890..6403bee8d8915 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5004,8 +5004,44 @@ def reindex( # type: ignore[override] tolerance=tolerance, ) + @overload # type: ignore[override] + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + 
axis: Axis = ..., + copy: bool = ..., + inplace: Literal[True], + ) -> None: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[False] = ..., + ) -> Self: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: bool = ..., + ) -> Self | None: + ... + @doc(NDFrame.rename_axis) - def rename_axis( # type: ignore[override] + def rename_axis( self, mapper: IndexLabel | lib.NoDefault = lib.no_default, *, From 1681cc79d2af88e4f70152a8f8a4caa671fd6b5b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Oct 2023 13:01:48 -0700 Subject: [PATCH 095/131] CLN: assorted (#55359) * CLN: assorted * docstring fixup * docstring fixup * revert doc --- doc/redirects.csv | 1 - .../development/contributing_codebase.rst | 2 +- pandas/core/arrays/base.py | 11 ++++++++ pandas/core/indexes/base.py | 24 +++++++++-------- pandas/core/series.py | 3 ++- pandas/core/sorting.py | 2 +- pandas/tests/indexing/test_loc.py | 13 +++++++++ pandas/tests/tslibs/test_npy_units.py | 27 +++++++++++++++++++ 8 files changed, 68 insertions(+), 15 deletions(-) create mode 100644 pandas/tests/tslibs/test_npy_units.py diff --git a/doc/redirects.csv b/doc/redirects.csv index 97cd20b295e65..bd60cc6a732bd 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -127,7 +127,6 @@ generated/pandas.api.types.is_number,../reference/api/pandas.api.types.is_number generated/pandas.api.types.is_numeric_dtype,../reference/api/pandas.api.types.is_numeric_dtype generated/pandas.api.types.is_object_dtype,../reference/api/pandas.api.types.is_object_dtype generated/pandas.api.types.is_period_dtype,../reference/api/pandas.api.types.is_period_dtype -generated/pandas.api.types.is_period,../reference/api/pandas.api.types.is_period generated/pandas.api.types.is_re_compilable,../reference/api/pandas.api.types.is_re_compilable generated/pandas.api.types.is_re,../reference/api/pandas.api.types.is_re generated/pandas.api.types.is_scalar,../reference/api/pandas.api.types.is_scalar diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 41f4b4d5783ea..e0aa8be066914 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -528,7 +528,7 @@ If a test is known to fail but the manner in which it fails is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. +will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable, please use it only as a last resort. Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` over usage within a test so that the test is appropriately marked during the diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 177c105688d0c..31c143ee012bb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1729,6 +1729,17 @@ def transpose(self, *axes: int) -> ExtensionArray: Because ExtensionArrays are always 1D, this is a no-op. It is included for compatibility with np.ndarray. 
+ + Returns + ------- + ExtensionArray + + Examples + -------- + >>> pd.array([1, 2, 3]).transpose() + + [1, 2, 3] + Length: 3, dtype: Int64 """ return self[:] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c3d7bfc1998c0..147113a1e505e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3501,7 +3501,7 @@ def _intersection(self, other: Index, sort: bool = False): pass else: # TODO: algos.unique1d should preserve DTA/TDA - if is_numeric_dtype(self): + if is_numeric_dtype(self.dtype): # This is faster, because Index.unique() checks for uniqueness # before calculating the unique values. res = algos.unique1d(res_indexer) @@ -5020,7 +5020,10 @@ def _can_use_libjoin(self) -> bool: ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin - # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone + # Subclasses should override to return False if _get_join_target is + # not zero-copy. + # TODO: exclude RangeIndex (which allocates memory)? + # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- @@ -6176,8 +6179,8 @@ def _get_indexer_non_comparable( If doing an inequality check, i.e. method is not None. """ if method is not None: - other = _unpack_nested_dtype(target) - raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") + other_dtype = _unpack_nested_dtype(target) + raise TypeError(f"Cannot compare dtypes {self.dtype} and {other_dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) if unique: @@ -6288,8 +6291,7 @@ def _should_compare(self, other: Index) -> bool: # respectively. return False - other = _unpack_nested_dtype(other) - dtype = other.dtype + dtype = _unpack_nested_dtype(other) return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @@ -7592,7 +7594,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: return names -def _unpack_nested_dtype(other: Index) -> Index: +def _unpack_nested_dtype(other: Index) -> DtypeObj: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. @@ -7603,20 +7605,20 @@ def _unpack_nested_dtype(other: Index) -> Index: Returns ------- - Index + np.dtype or ExtensionDtype """ dtype = other.dtype if isinstance(dtype, CategoricalDtype): # If there is ever a SparseIndex, this could get dispatched # here too. - return dtype.categories + return dtype.categories.dtype elif isinstance(dtype, ArrowDtype): # GH 53617 import pyarrow as pa if pa.types.is_dictionary(dtype.pyarrow_dtype): - other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) - return other + other = other[:0].astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) + return other.dtype def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): diff --git a/pandas/core/series.py b/pandas/core/series.py index 6403bee8d8915..ae9f77a6c0bad 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4004,7 +4004,8 @@ def argsort( if mask.any(): # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 + # self.array.argsort directly, which will close GH#43840 and + # GH#12694 warnings.warn( "The behavior of Series.argsort in the presence of NA values is " "deprecated. 
In a future version, NA values will be ordered " diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d96fc02e16d0d..1b1d9d7640058 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -88,7 +88,7 @@ def get_indexer_indexer( # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") - target = ensure_key_mapped(target, key, levels=level) # type:ignore[assignment] + target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] target = target._sort_levels_monotonic() if level is not None: diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 59e0ca0591ced..2eee506e1feb7 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2166,6 +2166,19 @@ def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): result.loc[df.index, "data"] = ser._values tm.assert_frame_equal(result, df) + def test_loc_setitem_ea_not_full_column(self): + # GH#39163 + df = DataFrame({"A": range(5)}) + + val = date_range("2016-01-01", periods=3, tz="US/Pacific") + + df.loc[[0, 1, 2], "B"] = val + + bex = val.append(DatetimeIndex([pd.NaT, pd.NaT], dtype=val.dtype)) + expected = DataFrame({"A": range(5), "B": bex}) + assert expected.dtypes["B"] == val.dtype + tm.assert_frame_equal(df, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): diff --git a/pandas/tests/tslibs/test_npy_units.py b/pandas/tests/tslibs/test_npy_units.py new file mode 100644 index 0000000000000..6d05dc79fbb2c --- /dev/null +++ b/pandas/tests/tslibs/test_npy_units.py @@ -0,0 +1,27 @@ +import numpy as np + +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.vectorized import is_date_array_normalized + +# a datetime64 ndarray which *is* normalized +day_arr = np.arange(10, dtype="i8").view("M8[D]") + + +class TestIsDateArrayNormalized: + def test_is_date_array_normalized_day(self): + arr = day_arr + abbrev = "D" + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + def test_is_date_array_normalized_seconds(self): + abbrev = "s" + arr = day_arr.astype(f"M8[{abbrev}]") + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + arr[0] += np.timedelta64(1, abbrev) + result2 = is_date_array_normalized(arr.view("i8"), None, unit) + assert result2 is False From 21689e5fabf1a75f66948d5da35aa4cef97af416 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 7 Oct 2023 11:20:46 +0300 Subject: [PATCH 096/131] CI: Exclude benchmarks directory when publishing docs (#55380) --- .github/workflows/docbuild-and-upload.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index deaf2be0a0423..af452363666b5 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -67,7 +67,7 @@ jobs: run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs 
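A quick way to preview what the added --exclude does, before it runs in CI, is
rsync's dry-run mode. This is only a local sanity check sketch; the destination
host and path below are placeholders, not the real documentation server:

    rsync -az --delete --dry-run --itemize-changes \
        --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' \
        web/build/ web@example.com:/var/www/html

With --dry-run, rsync lists what would be transferred or deleted without
touching the destination, so the effect of each --exclude pattern is visible
up front.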
From 43332c279ef013bae8f11992a7e903cd8c90ccb3 Mon Sep 17 00:00:00 2001 From: chris-caballero <46636284+chris-caballero@users.noreply.github.com> Date: Sat, 7 Oct 2023 10:22:52 -0400 Subject: [PATCH 097/131] DOC: Update Issue Triage Instructions (#55373) --- doc/source/development/maintaining.rst | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index b49c9644e1b2a..29cc256f35a4e 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -44,6 +44,9 @@ reading. Issue triage ------------ +Triage is an important first step in addressing issues reported by the community, and even +partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage" +tag once all of the steps below have been completed. Here's a typical workflow for triaging a newly opened issue. @@ -67,9 +70,9 @@ Here's a typical workflow for triaging a newly opened issue. 3. **Is this a duplicate issue?** We have many open issues. If a new issue is clearly a duplicate, label the - new issue as "Duplicate" assign the milestone "No Action", and close the issue - with a link to the original issue. Make sure to still thank the reporter, and - encourage them to chime in on the original issue, and perhaps try to fix it. + new issue as "Duplicate" and close the issue with a link to the original issue. + Make sure to still thank the reporter, and encourage them to chime in on the + original issue, and perhaps try to fix it. If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to @@ -90,6 +93,10 @@ Here's a typical workflow for triaging a newly opened issue. If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag + until all steps have been completed. Add a comment to the issue once you have + verified it exists on the main branch, so others know it has been confirmed. + 5. **Is this a clearly defined feature request?** Generally, pandas prefers to discuss and design new features in issues, before @@ -97,8 +104,9 @@ Here's a typical workflow for triaging a newly opened issue. for the new feature. Having them write a full docstring is a good way to pin down specifics. - We'll need a discussion from several pandas maintainers before deciding whether - the proposal is in scope for pandas. + Tag new feature requests with "Needs Discussion", as we'll need a discussion + from several pandas maintainers before deciding whether the proposal is in + scope for pandas. 6. **Is this a usage question?** @@ -117,10 +125,6 @@ Here's a typical workflow for triaging a newly opened issue. If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". - Typically, new issues will be assigned the "Contributions welcome" milestone, - unless it's know that this issue should be addressed in a specific release (say - because it's a large regression). - Once you have completed the above, make sure to remove the "needs triage" label. .. 
_maintaining.regressions:

From fe5f0c97ef71f2cd7a0657155a3e7d5ee7c65092 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 7 Oct 2023 16:38:30 -0400
Subject: [PATCH 098/131] better cython debugging (#53821)

---
 .gitignore                                    |  2 +-
 .../development/debugging_extensions.rst      | 30 ++++++++++++++--
 pandas/_libs/meson.build                      | 10 +++++-
 pandas/_libs/tslibs/meson.build               | 11 +++++-
 setup.py                                      |  3 ++
 tooling/debug/Dockerfile.pandas-debug         | 35 +++++++++++++++++++
 tooling/debug/README                          | 19 ++++++++++
 7 files changed, 105 insertions(+), 5 deletions(-)
 create mode 100644 tooling/debug/Dockerfile.pandas-debug
 create mode 100644 tooling/debug/README

diff --git a/.gitignore b/.gitignore
index cd22c2bb8cb5b..051a3ec11b794 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,7 +39,7 @@
 .mesonpy-native-file.ini
 MANIFEST
 compile_commands.json
-debug
+.debug

 # Python files #
 ################
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 27f812b65e261..63154369dfd88 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -14,8 +14,8 @@ For Python developers with limited or no C/C++ experience this can seem a daunti
 2. `Fundamental Python Debugging Part 2 - Python Extensions `_
 3. `Fundamental Python Debugging Part 3 - Cython Extensions `_

-Generating debug builds
------------------------
+Debugging locally
+-----------------

 By default building pandas from source will generate a release build. To generate a development build you can type::

@@ -27,6 +27,32 @@ By default building pandas from source will generate a release build. To generat

 By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types.

+Using Docker
+------------
+
+To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally.
+
+You can then mount your pandas repository into this image via:
+
+.. code-block:: sh
+
+    docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug
+
+Inside the image, you can use meson to build/install pandas and place the build artifacts into a ``debug`` folder using a command as follows:
+
+.. code-block:: sh
+
+    python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug"
+
+If you plan to use cygdb, note that the files it requires are placed within the build folder, so you must first ``cd`` to the build folder and then start that application.
+
+.. code-block:: sh
+
+    cd debug
+    cygdb
+
+Within the debugger you can use `cygdb commands `_ to navigate cython extensions.
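+
+For example, a minimal cygdb session could look like the following sketch. The
+breakpoint target here is purely illustrative; substitute whichever Cython
+function you want to inspect, and expect the exact output to depend on your
+build:
+
+.. code-block:: sh
+
+    (gdb) cy break pandas._libs.tslibs.timedeltas.delta_to_nanoseconds
+    (gdb) cy run
+    (gdb) cy bt
+    (gdb) cy locals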
+ Editor support -------------- diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index fd632790546f6..b4662d6bf8dd2 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -101,12 +101,20 @@ libs_sources = { 'writers': {'sources': ['writers.pyx']} } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif foreach ext_name, ext_dict : libs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index a1b0c54d1f48c..85410f771233f 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -19,11 +19,20 @@ tslibs_sources = { 'vectorized': {'sources': ['vectorized.pyx']}, } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif + foreach ext_name, ext_dict : tslibs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', diff --git a/setup.py b/setup.py index 663bbd3952eab..db3717efb738d 100755 --- a/setup.py +++ b/setup.py @@ -418,6 +418,9 @@ def maybe_cythonize(extensions, *args, **kwargs): kwargs["nthreads"] = parsed.parallel build_ext.render_templates(_pxifiles) + if debugging_symbols_requested: + kwargs["gdb_debug"] = True + return cythonize(extensions, *args, **kwargs) diff --git a/tooling/debug/Dockerfile.pandas-debug b/tooling/debug/Dockerfile.pandas-debug new file mode 100644 index 0000000000000..00e10a85d7ab9 --- /dev/null +++ b/tooling/debug/Dockerfile.pandas-debug @@ -0,0 +1,35 @@ +FROM ubuntu:latest + +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y build-essential git valgrind + +# cpython dev install +RUN git clone -b 3.10 --depth 1 https://github.com/python/cpython.git /clones/cpython +RUN apt-get install -y libbz2-dev libffi-dev libssl-dev zlib1g-dev liblzma-dev libsqlite3-dev libreadline-dev +RUN cd /clones/cpython && ./configure --with-pydebug && CFLAGS="-g3" make -s -j$(nproc) && make install + +# gdb installation +RUN apt-get install -y wget libgmp-dev +RUN cd /tmp && wget http://mirrors.kernel.org/sourceware/gdb/releases/gdb-12.1.tar.gz && tar -zxf gdb-12.1.tar.gz +RUN cd /tmp/gdb-12.1 && ./configure --with-python=python3 && make -j$(nproc) && make install +RUN rm -r /tmp/gdb-12.1 + +# pandas dependencies +RUN python3 -m pip install \ + cython \ + hypothesis \ + ninja \ + numpy \ + meson \ + meson-python \ + pytest \ + pytest-asyncio \ + python-dateutil \ + pytz \ + versioneer[toml] + +# At the time this docker image was built, there was a bug/limitation +# with meson where only having a python3 executable and not python +# would cause the build to fail. 
This symlink could be removed if +# users stick to always calling python3 within the container +RUN ln -s /usr/local/bin/python3 /usr/local/bin/python diff --git a/tooling/debug/README b/tooling/debug/README new file mode 100644 index 0000000000000..111a958ff5ef5 --- /dev/null +++ b/tooling/debug/README @@ -0,0 +1,19 @@ +The Docker image here helps to set up an isolated environment containing a debug version of Python and a gdb installation which the Cython debugger can work with. + +If you have internet access, you can pull a pre-built image via + +```sh +docker pull pandas/pandas-debug +``` + +To build the image locally, you can do + +```sh +docker build . -t pandas/pandas-debug -f Dockerfile.pandas-debug +``` + +For pandas developers, you can push a new copy of the image to dockerhub via + +```sh +docker push pandas/pandas-debug +``` From a0babcb2c63dd721ea47e75f6229c5fe727b2395 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Sun, 8 Oct 2023 15:57:45 +0200 Subject: [PATCH 099/131] DEPR: deprecate string H, BH, CBH in offsets frequencies, resolution abbreviations, _attrname_to_abbrevs (#54939) * deprecate H in favour of h, fix tests * correct def _maybe_coerce_freq, fix tests * fix tests * correct cdef parse_iso_format_string, fix tests * deprecate BH, CBH and fix tests * fix docs * correct whatsnew * fix pre-commit errors * fix errors in v1.5.0.rst and docs * fix errors in whatsnew and docs * add tests for FutureWarning, update v2.2.0.rst * correct tables of Offset/Period aliases * correct tests * correct def to_offset, add tests * change the condition to _prefix.startswith(('c')) * correct def to_offset --- doc/source/user_guide/advanced.rst | 2 +- doc/source/user_guide/timedeltas.rst | 2 +- doc/source/user_guide/timeseries.rst | 50 ++++----- doc/source/whatsnew/v0.14.0.rst | 39 ++++++- doc/source/whatsnew/v0.15.0.rst | 72 ++++++++++--- doc/source/whatsnew/v0.18.0.rst | 12 ++- doc/source/whatsnew/v0.18.1.rst | 82 +++++++++++--- doc/source/whatsnew/v0.20.0.rst | 32 ++++-- doc/source/whatsnew/v0.21.0.rst | 30 +++++- doc/source/whatsnew/v0.22.0.rst | 21 +++- doc/source/whatsnew/v1.5.0.rst | 68 ++++++++---- doc/source/whatsnew/v2.2.0.rst | 7 +- pandas/_libs/tslibs/dtypes.pyx | 13 ++- pandas/_libs/tslibs/nattype.pyx | 30 +++--- pandas/_libs/tslibs/offsets.pyx | 51 ++++----- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 26 ++--- pandas/_libs/tslibs/timedeltas.pyx | 12 +-- pandas/_libs/tslibs/timestamps.pyx | 30 +++--- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/datetimelike.py | 24 ++--- pandas/core/arrays/datetimes.py | 18 ++-- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/construction.py | 2 +- pandas/core/dtypes/base.py | 2 +- pandas/core/generic.py | 4 +- pandas/core/indexes/accessors.py | 2 +- pandas/core/indexes/datetimes.py | 8 +- pandas/core/indexes/timedeltas.py | 6 +- pandas/core/resample.py | 18 ++-- pandas/core/tools/timedeltas.py | 8 +- pandas/tests/apply/test_frame_apply.py | 2 +- pandas/tests/apply/test_series_apply.py | 4 +- pandas/tests/arithmetic/conftest.py | 6 +- pandas/tests/arithmetic/test_datetime64.py | 10 +- pandas/tests/arithmetic/test_period.py | 32 +++--- pandas/tests/arithmetic/test_timedelta64.py | 30 +++--- pandas/tests/arrays/categorical/test_repr.py | 38 +++---- .../arrays/datetimes/test_constructors.py | 4 +- .../tests/arrays/period/test_arrow_compat.py | 2 +- pandas/tests/arrays/test_array.py | 16 +-- 
pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/arrays/test_period.py | 4 +- pandas/tests/arrays/test_timedeltas.py | 10 +- .../arrays/timedeltas/test_reductions.py | 16 +-- pandas/tests/base/test_conversion.py | 2 +- pandas/tests/copy_view/test_methods.py | 2 +- .../dtypes/cast/test_find_common_type.py | 2 +- pandas/tests/dtypes/test_dtypes.py | 14 +-- pandas/tests/extension/test_arrow.py | 6 +- pandas/tests/frame/methods/test_align.py | 4 +- pandas/tests/frame/methods/test_asfreq.py | 8 +- pandas/tests/frame/methods/test_asof.py | 2 +- pandas/tests/frame/methods/test_at_time.py | 6 +- .../tests/frame/methods/test_between_time.py | 2 +- pandas/tests/frame/methods/test_describe.py | 2 +- pandas/tests/frame/methods/test_drop.py | 2 +- pandas/tests/frame/methods/test_reindex.py | 10 +- pandas/tests/frame/methods/test_set_index.py | 6 +- pandas/tests/frame/methods/test_shift.py | 34 +++--- pandas/tests/frame/methods/test_to_csv.py | 6 +- .../tests/frame/methods/test_to_timestamp.py | 4 +- pandas/tests/frame/methods/test_transpose.py | 2 +- pandas/tests/frame/methods/test_tz_convert.py | 2 +- .../tests/frame/methods/test_tz_localize.py | 10 +- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/frame/test_repr_info.py | 2 +- pandas/tests/generic/test_finalize.py | 10 +- pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/test_apply.py | 2 +- .../tests/groupby/test_groupby_shift_diff.py | 10 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/groupby/test_quantile.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 8 +- pandas/tests/indexes/conftest.py | 2 +- .../datetimelike_/test_drop_duplicates.py | 2 +- .../indexes/datetimelike_/test_equals.py | 4 +- .../datetimelike_/test_is_monotonic.py | 2 +- .../indexes/datetimelike_/test_sort_values.py | 4 +- .../datetimelike_/test_value_counts.py | 8 +- .../indexes/datetimes/methods/test_astype.py | 4 +- .../datetimes/methods/test_factorize.py | 8 +- .../indexes/datetimes/methods/test_insert.py | 4 +- .../indexes/datetimes/methods/test_shift.py | 22 ++-- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/datetimes/test_constructors.py | 10 +- .../indexes/datetimes/test_date_range.py | 45 ++++---- .../tests/indexes/datetimes/test_datetime.py | 29 ++++- pandas/tests/indexes/datetimes/test_delete.py | 14 +-- .../tests/indexes/datetimes/test_formats.py | 14 +-- .../tests/indexes/datetimes/test_freq_attr.py | 2 +- .../tests/indexes/datetimes/test_indexing.py | 8 +- pandas/tests/indexes/datetimes/test_join.py | 2 +- pandas/tests/indexes/datetimes/test_ops.py | 2 +- .../indexes/datetimes/test_partial_slicing.py | 2 +- .../indexes/datetimes/test_scalar_compat.py | 16 +-- pandas/tests/indexes/datetimes/test_setops.py | 6 +- .../tests/indexes/datetimes/test_timezones.py | 62 +++++------ .../tests/indexes/interval/test_interval.py | 6 +- .../indexes/interval/test_interval_range.py | 8 +- pandas/tests/indexes/multi/test_compat.py | 2 +- .../tests/indexes/multi/test_constructors.py | 8 +- pandas/tests/indexes/multi/test_indexing.py | 6 +- .../indexes/multi/test_partial_indexing.py | 2 +- .../indexes/period/methods/test_asfreq.py | 30 +++--- .../indexes/period/methods/test_fillna.py | 14 +-- .../indexes/period/methods/test_shift.py | 10 +- .../period/methods/test_to_timestamp.py | 4 +- .../tests/indexes/period/test_constructors.py | 16 +-- pandas/tests/indexes/period/test_formats.py | 12 +-- pandas/tests/indexes/period/test_indexing.py | 22 ++-- 
pandas/tests/indexes/period/test_period.py | 13 ++- .../tests/indexes/period/test_resolution.py | 2 +- .../tests/indexes/period/test_searchsorted.py | 4 +- pandas/tests/indexes/period/test_setops.py | 10 +- pandas/tests/indexes/period/test_tools.py | 2 +- .../indexes/timedeltas/methods/test_astype.py | 8 +- .../indexes/timedeltas/methods/test_shift.py | 10 +- .../indexes/timedeltas/test_freq_attr.py | 2 +- .../tests/indexes/timedeltas/test_indexing.py | 4 +- .../indexes/timedeltas/test_scalar_compat.py | 6 +- .../tests/indexes/timedeltas/test_setops.py | 4 +- .../timedeltas/test_timedelta_range.py | 18 +++- pandas/tests/indexing/test_loc.py | 12 +-- pandas/tests/io/formats/test_format.py | 14 +-- pandas/tests/io/formats/test_to_csv.py | 2 +- .../tests/io/json/test_json_table_schema.py | 6 +- pandas/tests/io/json/test_pandas.py | 2 +- .../io/parser/dtypes/test_categorical.py | 4 +- .../io/pytables/test_retain_attributes.py | 8 +- pandas/tests/io/pytables/test_timezones.py | 4 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/plotting/test_datetimelike.py | 32 +++--- .../tests/reductions/test_stat_reductions.py | 2 +- pandas/tests/resample/test_base.py | 40 +++---- pandas/tests/resample/test_datetime_index.py | 78 +++++++------- pandas/tests/resample/test_period_index.py | 70 ++++++------ pandas/tests/resample/test_resample_api.py | 30 +++--- .../tests/resample/test_resampler_grouper.py | 12 +-- pandas/tests/resample/test_time_grouper.py | 6 +- pandas/tests/resample/test_timedelta.py | 8 +- pandas/tests/reshape/concat/test_concat.py | 2 +- pandas/tests/reshape/concat/test_datetimes.py | 10 +- pandas/tests/reshape/merge/test_merge.py | 4 +- pandas/tests/scalar/interval/test_interval.py | 4 +- pandas/tests/scalar/period/test_asfreq.py | 100 +++++++++--------- pandas/tests/scalar/period/test_period.py | 84 +++++++-------- .../scalar/timedelta/test_constructors.py | 4 +- .../tests/scalar/timedelta/test_timedelta.py | 9 +- .../tests/scalar/timestamp/test_unary_ops.py | 14 +-- .../series/accessors/test_dt_accessor.py | 16 +-- pandas/tests/series/indexing/test_datetime.py | 8 +- pandas/tests/series/indexing/test_getitem.py | 2 +- pandas/tests/series/indexing/test_indexing.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 4 +- pandas/tests/series/methods/test_align.py | 2 +- pandas/tests/series/methods/test_asof.py | 4 +- .../series/methods/test_combine_first.py | 2 +- .../tests/series/methods/test_interpolate.py | 2 +- pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/test_arithmetic.py | 6 +- pandas/tests/series/test_constructors.py | 8 +- pandas/tests/series/test_reductions.py | 2 +- pandas/tests/series/test_repr.py | 16 +-- pandas/tests/tools/test_to_datetime.py | 2 +- .../tseries/frequencies/test_freq_code.py | 18 ++-- .../tseries/frequencies/test_inference.py | 18 ++-- .../tseries/offsets/test_business_hour.py | 44 ++++---- .../offsets/test_custom_business_hour.py | 4 +- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tests/tslibs/test_conversion.py | 4 +- pandas/tests/tslibs/test_period_asfreq.py | 12 +-- pandas/tests/tslibs/test_to_offset.py | 16 +-- pandas/tests/window/test_groupby.py | 8 +- pandas/tests/window/test_pairwise.py | 2 +- pandas/tests/window/test_timeseries_window.py | 4 +- pandas/tseries/frequencies.py | 44 ++++---- 180 files changed, 1298 insertions(+), 1045 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 682fa4c9b4fcc..453536098cfbb 
100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -976,7 +976,7 @@ of :ref:`frequency aliases ` with datetime-like inter pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9h") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. Intervals are closed on the right side by default. diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index cd567f8442671..5daf204f39bcf 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -392,7 +392,7 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases = 0.9.3. (:issue:`5945`) - ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index dffb4c7b9ff9e..8dafed1efee97 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -332,16 +332,37 @@ Timezone handling improvements - ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time, previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`) - .. ipython:: python + .. code-block:: ipython + + In [58]: ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') + + In[59]: ts + Out[59]: Timestamp('2014-08-01 09:00:00-0400', tz='US/Eastern') - ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') - ts - ts.tz_localize(None) + In [60]: ts.tz_localize(None) + Out[60]: Timestamp('2014-08-01 09:00:00') - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=10, tz='US/Eastern') - didx - didx.tz_localize(None) + In [61]: didx = pd.date_range(start='2014-08-01 09:00', freq='H', + ....: periods=10, tz='US/Eastern') + ....: + + In [62]: didx + Out[62]: + DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00', + '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00', + '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00', + '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00', + '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'], + dtype='datetime64[ns, US/Eastern]', freq='H') + + In [63]: didx.tz_localize(None) + Out[63]: + DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00', + '2014-08-01 11:00:00', '2014-08-01 12:00:00', + '2014-08-01 13:00:00', '2014-08-01 14:00:00', + '2014-08-01 15:00:00', '2014-08-01 16:00:00', + '2014-08-01 17:00:00', '2014-08-01 18:00:00'], + dtype='datetime64[ns]', freq=None) - ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT, @@ -1050,16 +1071,35 @@ Other: If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``Timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added. - .. ipython:: python + .. 
code-block:: ipython - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx - idx + pd.offsets.Hour(2) - idx + pd.Timedelta('120m') + In [104]: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx = pd.period_range('2014-07', periods=5, freq='M') - idx - idx + pd.offsets.MonthEnd(3) + In [105]: idx + Out[105]: + PeriodIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', + '2014-07-01 12:00', '2014-07-01 13:00'], + dtype='period[H]') + + In [106]: idx + pd.offsets.Hour(2) + Out[106]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [107]: idx + pd.Timedelta('120m') + Out[107]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [108]: idx = pd.period_range('2014-07', periods=5, freq='M') + + In [109]: idx + Out[109]: PeriodIndex(['2014-07', '2014-08', '2014-09', '2014-10', '2014-11'], dtype='period[M]') + + In [110]: idx + pd.offsets.MonthEnd(3) + Out[110]: PeriodIndex(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02'], dtype='period[M]') - Added experimental compatibility with ``openpyxl`` for versions >= 2.0. The ``DataFrame.to_excel`` method ``engine`` keyword now recognizes ``openpyxl1`` and ``openpyxl2`` diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 02e47553cd184..8984109da2a43 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1000,10 +1000,16 @@ Other API changes ^^^^^^^^^^^^^^^^^ - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) - .. ipython:: python + .. code-block:: ipython + + In [107]: s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s.between_time("7:00am", "9:00am") + In [108]: s.between_time("7:00am", "9:00am") + Out[108]: + 2015-01-01 07:00:00 7 + 2015-01-01 08:00:00 8 + 2015-01-01 09:00:00 9 + Freq: H, Length: 3, dtype: int64 This will now raise. diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index ee6a60144bc35..85e0e63016729 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -256,26 +256,78 @@ Partial string indexing on ``DatetimeIndex`` when part of a ``MultiIndex`` Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiIndex`` (:issue:`10331`) -.. ipython:: python +.. code-block:: ipython - dft2 = pd.DataFrame( - np.random.randn(20, 1), - columns=["A"], - index=pd.MultiIndex.from_product( - [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] - ), - ) - dft2 - dft2.loc["2013-01-05"] + In [20]: dft2 = pd.DataFrame( + ....: np.random.randn(20, 1), + ....: columns=["A"], + ....: index=pd.MultiIndex.from_product( + ....: [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ....: ), + ....: ) + ....: + + In [21]: dft2 + Out[21]: + A + 2013-01-01 00:00:00 a 0.469112 + b -0.282863 + 2013-01-01 12:00:00 a -1.509059 + b -1.135632 + 2013-01-02 00:00:00 a 1.212112 + ... ... 
+ 2013-01-04 12:00:00 b 0.271860 + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 + + [20 rows x 1 columns] + + In [22]: dft2.loc["2013-01-05"] + Out[22]: + A + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 + + [4 rows x 1 columns] On other levels -.. ipython:: python +.. code-block:: ipython - idx = pd.IndexSlice - dft2 = dft2.swaplevel(0, 1).sort_index() - dft2 - dft2.loc[idx[:, "2013-01-05"], :] + In [26]: idx = pd.IndexSlice + + In [27]: dft2 = dft2.swaplevel(0, 1).sort_index() + + In [28]: dft2 + Out[28]: + A + a 2013-01-01 00:00:00 0.469112 + 2013-01-01 12:00:00 -1.509059 + 2013-01-02 00:00:00 1.212112 + 2013-01-02 12:00:00 0.119209 + 2013-01-03 00:00:00 -0.861849 + ... ... + b 2013-01-03 12:00:00 1.071804 + 2013-01-04 00:00:00 -0.706771 + 2013-01-04 12:00:00 0.271860 + 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 + + [20 rows x 1 columns] + + In [29]: dft2.loc[idx[:, "2013-01-05"], :] + Out[29]: + A + a 2013-01-05 00:00:00 -0.424972 + 2013-01-05 12:00:00 0.276232 + b 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 + + [4 rows x 1 columns] .. _whatsnew_0181.enhancements.assembling: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 9005fafaf18f7..ae70eb078f6d9 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -614,11 +614,18 @@ New behavior: ``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32`` -.. ipython:: python +.. code-block:: ipython + + In [64]: s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') + ....: .tz_localize('Asia/Tokyo')) + ....: - s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') - .tz_localize('Asia/Tokyo')) - s + In [65]: s + Out[65]: + 0 2011-01-02 00:00:00+09:00 + 1 2011-01-02 01:00:00+09:00 + 2 2011-01-02 02:00:00+09:00 + Length: 3, dtype: datetime64[ns, Asia/Tokyo] Previous behavior: @@ -633,9 +640,14 @@ Previous behavior: New behavior: -.. ipython:: python +.. code-block:: ipython - s.map(lambda x: x.hour) + In [66]: s.map(lambda x: x.hour) + Out[66]: + 0 0 + 1 1 + 2 2 + Length: 3, dtype: int64 .. _whatsnew_0200.api_breaking.index_dt_field: @@ -659,10 +671,12 @@ Previous behaviour: New behavior: -.. ipython:: python +.. code-block:: ipython + + In [67]: idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx.hour + In [68]: idx.hour + Out[68]: Index([0, 10, 20, 6, 16], dtype='int32') This has the advantage that specific ``Index`` methods are still available on the result. On the other hand, this might have backward incompatibilities: e.g. diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index f8eacd28fa795..62296550a472b 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -671,15 +671,35 @@ Previous behavior: New behavior: -.. ipython:: python +.. code-block:: ipython + + In [56]: pi = pd.period_range(start='2000-01-01', freq='D', periods=10) + + In [57]: s = pd.Series(np.arange(10), index=pi) - pi = pd.period_range(start='2000-01-01', freq='D', periods=10) + In [58]: s.resample('H').ohlc() + Out[58]: + open high low close + 2000-01-01 00:00 0.0 0.0 0.0 0.0 + 2000-01-01 01:00 NaN NaN NaN NaN + 2000-01-01 02:00 NaN NaN NaN NaN + 2000-01-01 03:00 NaN NaN NaN NaN + 2000-01-01 04:00 NaN NaN NaN NaN + ... ... ... ... ... 
+ 2000-01-10 19:00 NaN NaN NaN NaN + 2000-01-10 20:00 NaN NaN NaN NaN + 2000-01-10 21:00 NaN NaN NaN NaN + 2000-01-10 22:00 NaN NaN NaN NaN + 2000-01-10 23:00 NaN NaN NaN NaN - s = pd.Series(np.arange(10), index=pi) + [240 rows x 4 columns] - s.resample('H').ohlc() + In [59]: s.resample('M').ohlc() + Out[59]: + open high low close + 2000-01 0 9 0 9 - s.resample('M').ohlc() + [1 rows x 4 columns] .. _whatsnew_0210.api_breaking.pandas_eval: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index c494b4f286662..a33a8f7addeef 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -187,16 +187,27 @@ entirely valid. *pandas 0.22.0* -.. ipython:: python +.. code-block:: ipython - idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) - pd.Series([1, 2], index=idx).resample("12H").sum() + In [14]: idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) + In [15]: pd.Series([1, 2], index=idx).resample("12H").sum() + Out[15]: + 2017-01-01 00:00:00 1 + 2017-01-01 12:00:00 0 + 2017-01-02 00:00:00 2 + Freq: 12H, Length: 3, dtype: int64 Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. -.. ipython:: python +.. code-block:: ipython + + In [16]: pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) + Out[16]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, Length: 3, dtype: float64 - pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) Rolling and expanding ^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 44728e7e552ab..8fa1361cc30c1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -112,14 +112,33 @@ to the index in the resample when :meth:`.Resampler.apply` is used. of pandas, not specifying ``group_keys`` will default to the same behavior as ``group_keys=False``. -.. ipython:: python +.. code-block:: ipython - df = pd.DataFrame( - {'a': range(6)}, - index=pd.date_range("2021-01-01", periods=6, freq="8H") - ) - df.resample("D", group_keys=True).apply(lambda x: x) - df.resample("D", group_keys=False).apply(lambda x: x) + In [11]: df = pd.DataFrame( + ....: {'a': range(6)}, + ....: index=pd.date_range("2021-01-01", periods=6, freq="8H") + ....: ) + ....: + + In [12]: df.resample("D", group_keys=True).apply(lambda x: x) + Out[12]: + a + 2021-01-01 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + + In [13]: df.resample("D", group_keys=False).apply(lambda x: x) + Out[13]: + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 Previously, the resulting index would depend upon the values returned by ``apply``, as seen in the following example. @@ -461,19 +480,20 @@ upon serialization. (Related issue :issue:`12997`) *Old Behavior* -.. ipython:: python +.. code-block:: ipython - index = pd.date_range( - start='2020-12-28 00:00:00', - end='2020-12-28 02:00:00', - freq='1H', - ) - a = pd.Series( - data=range(3), - index=index, - ) + In [32]: index = pd.date_range( + ....: start='2020-12-28 00:00:00', + ....: end='2020-12-28 02:00:00', + ....: freq='1H', + ....: ) + ....: -.. code-block:: ipython + In [33]: a = pd.Series( + ....: data=range(3), + ....: index=index, + ....: ) + ....: In [4]: from io import StringIO @@ -485,12 +505,16 @@ upon serialization. 
(Related issue :issue:`12997`) *New Behavior* -.. ipython:: python +.. code-block:: ipython + + In [34]: from io import StringIO + + In [35]: a.to_json(date_format='iso') + Out[35]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}' - from io import StringIO - a.to_json(date_format='iso') # Roundtripping now works - pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + In [36]: pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + Out[36]: array([ True, True, True]) .. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 017a28ffb573a..04a76b481bfc3 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -232,7 +232,7 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ -- Changed :meth:`Timedelta.resolution_string` to return ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) @@ -252,9 +252,10 @@ Other Deprecations - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. 
denoting annual frequencies with various fiscal year ends (:issue:`52536`) -- Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`) +- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) -- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index ebb6e3a240cbe..86f620beeec3b 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -142,7 +142,7 @@ _period_code_map = { "B": PeriodDtypeCode.B, # Business days "D": PeriodDtypeCode.D, # Daily - "H": PeriodDtypeCode.H, # Hourly + "h": PeriodDtypeCode.H, # Hourly "min": PeriodDtypeCode.T, # Minutely "s": PeriodDtypeCode.S, # Secondly "ms": PeriodDtypeCode.L, # Millisecondly @@ -175,7 +175,7 @@ _attrname_to_abbrevs = { "quarter": "Q", "month": "M", "day": "D", - "hour": "H", + "hour": "h", "minute": "min", "second": "s", "millisecond": "ms", @@ -203,7 +203,7 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "ms": "ms", "us": "us", "ns": "ns", - "H": "H", + "h": "h", "Q": "Q", "Y": "Y", "W": "W", @@ -244,6 +244,9 @@ DEPR_ABBREVS: dict[str, str]= { "A-SEP": "Y-SEP", "A-OCT": "Y-OCT", "A-NOV": "Y-NOV", + "H": "h", + "BH": "bh", + "CBH": "cbh", "T": "min", "t": "min", "S": "s", @@ -339,10 +342,10 @@ class Resolution(Enum): Examples -------- - >>> Resolution.get_reso_from_freqstr('H') + >>> Resolution.get_reso_from_freqstr('h') - >>> Resolution.get_reso_from_freqstr('H') == Resolution.RESO_HR + >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ try: diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index bb497f2e17b93..9f9549b93fefe 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1000,7 +1000,7 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.round(freq='min') # minute @@ -1017,9 +1017,9 @@ timedelta}, default 'raise' >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.round(freq='1H30min') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1032,10 +1032,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1089,7 +1089,7 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') >>> ts.floor(freq='min') # minute @@ -1106,9 +1106,9 @@ timedelta}, default 'raise' >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30min') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1121,10 +1121,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1178,7 +1178,7 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.ceil(freq='min') # minute @@ -1195,9 +1195,9 @@ timedelta}, default 'raise' >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.ceil(freq='1H30min') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -1210,10 +1210,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 534d8e9926b38..042d5dafe3046 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -603,10 +603,10 @@ cdef class BaseOffset: Examples -------- >>> pd.offsets.Hour().name - 'H' + 'h' >>> pd.offsets.Hour(5).name - 'H' + 'h' """ return self.rule_code @@ -629,7 +629,7 @@ cdef class BaseOffset: '<5 * DateOffsets>' >>> pd.offsets.BusinessHour(2).freqstr - '2BH' + '2bh' >>> pd.offsets.Nano().freqstr 'ns' @@ -1166,7 +1166,7 @@ cdef class Hour(Tick): Timestamp('2022-12-09 11:00:00') """ _nanos_inc = 3600 * 1_000_000_000 - _prefix = "H" + _prefix = "h" _period_dtype_code = PeriodDtypeCode.H _creso = NPY_DATETIMEUNIT.NPY_FR_h @@ -1630,7 +1630,7 @@ cdef class BusinessMixin(SingleConstructorOffset): # Older (<0.22.0) versions have offset attribute instead of _offset self._offset = state.pop("offset") - if self._prefix.startswith("C"): + if self._prefix.startswith(("C", "c")): # i.e. this is a Custom class weekmask = state.pop("weekmask") holidays = state.pop("holidays") @@ -1696,7 +1696,7 @@ cdef class BusinessDay(BusinessMixin): s = td.seconds hrs = int(s / 3600) if hrs != 0: - off_str += str(hrs) + "H" + off_str += str(hrs) + "h" s -= hrs * 3600 mts = int(s / 60) if mts != 0: @@ -1893,10 +1893,10 @@ cdef class BusinessHour(BusinessMixin): '2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='BH') + dtype='datetime64[ns]', freq='bh') """ - _prefix = "BH" + _prefix = "bh" _anchor = 0 _attributes = tuple(["n", "normalize", "start", "end", "offset"]) _adjust_dst = False @@ -2016,7 +2016,7 @@ cdef class BusinessHour(BusinessMixin): nb_offset = 1 else: nb_offset = -1 - if self._prefix.startswith("C"): + if self._prefix.startswith(("c")): # CustomBusinessHour return CustomBusinessDay( n=nb_offset, @@ -2176,7 +2176,7 @@ cdef class BusinessHour(BusinessMixin): # adjust by business days first if bd != 0: - if self._prefix.startswith("C"): + if self._prefix.startswith("c"): # GH#30593 this is a Custom offset skip_bd = CustomBusinessDay( n=bd, @@ -2242,7 +2242,7 @@ cdef class BusinessHour(BusinessMixin): dt = datetime( dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond ) - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time return self._is_on_offset(dt) @@ -2252,7 +2252,7 @@ cdef class BusinessHour(BusinessMixin): """ # if self.normalize and not _is_normalized(dt): # return False - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time if self.n >= 0: op = self._prev_opening_time(dt) @@ -4277,7 +4277,7 @@ cdef class CustomBusinessHour(BusinessHour): 
'2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') Business days can be specified by ``weekmask`` parameter. To convert the returned datetime object to its string representation @@ -4306,10 +4306,10 @@ cdef class CustomBusinessHour(BusinessHour): '2022-12-15 11:00:00', '2022-12-15 12:00:00', '2022-12-16 10:00:00', '2022-12-16 11:00:00', '2022-12-16 12:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') """ - _prefix = "CBH" + _prefix = "cbh" _anchor = 0 _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] @@ -4549,11 +4549,11 @@ prefix_mapping = { BusinessMonthEnd, # 'BM' BQuarterEnd, # 'BQ' BQuarterBegin, # 'BQS' - BusinessHour, # 'BH' + BusinessHour, # 'bh' CustomBusinessDay, # 'C' CustomBusinessMonthEnd, # 'CBM' CustomBusinessMonthBegin, # 'CBMS' - CustomBusinessHour, # 'CBH' + CustomBusinessHour, # 'cbh' MonthEnd, # 'ME' MonthBegin, # 'MS' Nano, # 'ns' @@ -4566,7 +4566,7 @@ prefix_mapping = { QuarterEnd, # 'Q' QuarterBegin, # 'QS' Milli, # 'ms' - Hour, # 'H' + Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' FY5253, @@ -4598,7 +4598,7 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = {"MS", "ms", "s", "me"} +_dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s", "me"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4667,7 +4667,7 @@ cpdef to_offset(freq, bint is_period=False): >>> to_offset("5min") <5 * Minutes> - >>> to_offset("1D1H") + >>> to_offset("1D1h") <25 * Hours> >>> to_offset("2W") @@ -4735,17 +4735,18 @@ cpdef to_offset(freq, bint is_period=False): if prefix in c_DEPR_ABBREVS: warnings.warn( - f"\'{prefix}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(prefix)}\' " + f"\'{prefix}\' is deprecated and will be removed " + f"in a future version. Please use " + f"\'{c_DEPR_ABBREVS.get(prefix)}\' " f"instead of \'{prefix}\'.", FutureWarning, stacklevel=find_stack_level(), ) prefix = c_DEPR_ABBREVS[prefix] - if prefix in {"D", "H", "min", "s", "ms", "us", "ns"}: - # For these prefixes, we have something like "3H" or - # "2.5T", so we can construct a Timedelta with the + if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + # For these prefixes, we have something like "3h" or + # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick td = Timedelta(1, unit=prefix) off = delta_to_tick(td) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 71be1d213437a..b23611124ea7c 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -716,7 +716,7 @@ cdef datetime dateutil_parse( elif res.tzoffset: ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) + # dateutil can return a datetime with a tzoffset outside of (-24h, 24h) # bounds, which is invalid (can be constructed, but raises if we call # str(ret)). Check that and raise here if necessary. 
try: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 5fecc77044b4b..cacfe43b236d8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1942,8 +1942,8 @@ cdef class _Period(PeriodMixin): Examples -------- >>> period = pd.Period('2023-1-1', freq='D') - >>> period.asfreq('H') - Period('2023-01-01 23:00', 'H') + >>> period.asfreq('h') + Period('2023-01-01 23:00', 'h') """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) @@ -2054,7 +2054,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') + >>> p = pd.Period("2018-03-11", freq='h') >>> p.day 11 """ @@ -2155,7 +2155,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.weekofyear 10 @@ -2186,7 +2186,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.week 10 @@ -2226,14 +2226,14 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.day_of_week 6 For periods that span over multiple days, the day at the beginning of the period is returned. - >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.day_of_week 6 >>> per.start_time.day_of_week @@ -2277,14 +2277,14 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.dayofweek 6 For periods that span over multiple days, the day at the beginning of the period is returned. - >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.dayofweek 6 >>> per.start_time.dayofweek @@ -2326,7 +2326,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> period = pd.Period("2015-10-23", freq='H') + >>> period = pd.Period("2015-10-23", freq='h') >>> period.day_of_year 296 >>> period = pd.Period("2012-12-31", freq='D') @@ -2447,7 +2447,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') + >>> p = pd.Period("2018-03-11", freq='h') >>> p.daysinmonth 31 """ @@ -2482,8 +2482,8 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> pd.Period.now('H') # doctest: +SKIP - Period('2023-06-12 11:00', 'H') + >>> pd.Period.now('h') # doctest: +SKIP + Period('2023-06-12 11:00', 'h') """ return Period(datetime.now(), freq=freq) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 2f6fa35cae070..e5d81bd5928b9 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -898,8 +898,8 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: elif c in ["W", "D", "H", "M"]: if c in ["H", "M"] and len(number) > 2: raise ValueError(err_msg) - if c == "M": - c = "min" + if c in ["M", "H"]: + c = c.replace("M", "min").replace("H", "h") unit.append(c) r = timedelta_from_spec(number, "0", unit) result += timedelta_as_neg(r, neg) @@ -1442,7 +1442,7 @@ cdef class _Timedelta(timedelta): Resolution: Return value * Days: 'D' - * Hours: 'H' + * Hours: 'h' * Minutes: 'min' * Seconds: 's' * Milliseconds: 'ms' @@ -1484,7 +1484,7 @@ cdef class _Timedelta(timedelta): elif self._m: return "min" elif self._h: - return "H" + return "h" else: return "D" @@ -1725,8 +1725,8 @@ class Timedelta(_Timedelta): .. 
deprecated:: 2.2.0 - Values `T`, `S`, `L`, `U`, and `N` are deprecated in favour of the values - `min`, `s`, `ms`, `us`, and `ns`. + Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour + of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. **kwargs Available kwargs: {days, seconds, microseconds, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8bf1ebb9bf608..edd061fd8cdf1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1982,7 +1982,7 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.round(freq='min') # minute @@ -1999,9 +1999,9 @@ timedelta}, default 'raise' >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.round(freq='1H30min') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2014,10 +2014,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round( @@ -2073,7 +2073,7 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') >>> ts.floor(freq='min') # minute @@ -2090,9 +2090,9 @@ timedelta}, default 'raise' >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30min') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2105,10 +2105,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -2162,7 +2162,7 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.ceil(freq='min') # minute @@ -2179,9 +2179,9 @@ timedelta}, default 'raise' >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.ceil(freq='1H30min') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -2194,10 +2194,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e844c0542df69..60c42c01e9f6f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2533,7 +2533,7 @@ def _round_temporally( "MS": "month", "W": "week", "D": "day", - "H": "hour", + "h": "hour", "min": "minute", "s": "second", "ms": "millisecond", diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a2960a2870882..fd51303ebd55f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1826,14 +1826,14 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='datetime64[ns]', freq='min') """ -_round_example = """>>> rng.round('H') +_round_example = """>>> rng.round('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.round("H") + >>> pd.Series(rng).dt.round("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1844,23 +1844,23 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_floor_example = """>>> rng.floor('H') +_floor_example = """>>> rng.floor('h') DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.floor("H") + >>> pd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1871,23 +1871,23 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_ceil_example = """>>> rng.ceil('H') +_ceil_example = """>>> rng.ceil('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 13:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.ceil("H") + >>> pd.Series(rng).dt.ceil("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 13:00:00 @@ -1898,11 +1898,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = 
pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.ceil("H", ambiguous=False) + >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.ceil("H", ambiguous=True) + >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e0d4098dd1f33..a2742aed31e4c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -857,37 +857,37 @@ def tz_convert(self, tz) -> Self: to other time zones: >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='H', periods=3, tz='Europe/Berlin') + ... freq='h', periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert('US/Central') DatetimeIndex(['2014-08-01 02:00:00-05:00', '2014-08-01 03:00:00-05:00', '2014-08-01 04:00:00-05:00'], - dtype='datetime64[ns, US/Central]', freq='H') + dtype='datetime64[ns, US/Central]', freq='h') With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H', + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', ... periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert(None) DatetimeIndex(['2014-08-01 07:00:00', '2014-08-01 08:00:00', '2014-08-01 09:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ tz = timezones.maybe_get_tz(tz) @@ -1042,7 +1042,7 @@ def tz_localize( 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] @@ -1132,13 +1132,13 @@ def normalize(self) -> Self: Examples -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H', + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', ... 
periods=3, tz='Asia/Calcutta') >>> idx DatetimeIndex(['2014-08-01 10:00:00+05:30', '2014-08-01 11:00:00+05:30', '2014-08-01 12:00:00+05:30'], - dtype='datetime64[ns, Asia/Calcutta]', freq='H') + dtype='datetime64[ns, Asia/Calcutta]', freq='h') >>> idx.normalize() DatetimeIndex(['2014-08-01 00:00:00+05:30', '2014-08-01 00:00:00+05:30', diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2468535397568..f188b73b4fc64 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -477,7 +477,7 @@ def __arrow_array__(self, type=None): Examples -------- - >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='H') + >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') >>> idx.hour Index([10, 11], dtype='int64') """, diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 931a220d7ab29..ca908c11a97bb 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -136,7 +136,7 @@ class TimedeltaArray(dtl.TimelikeOps): Examples -------- - >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1H', '2H'])) + >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1h', '2h'])) ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e661d590ab330..b4b9a4176472d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -203,7 +203,7 @@ def array( ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] - >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') + >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]') ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 6567ca7155b0d..6b00a5284ec5b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -242,7 +242,7 @@ def construct_from_string(cls, string: str) -> Self: This is useful mainly for data types that accept parameters. For example, a period dtype accepts a frequency parameter that - can be set as ``period[H]`` (where H means hourly frequency). + can be set as ``period[h]`` (where H means hourly frequency). By default, in the abstract class, just the name of the type is expected. 
But subclasses can overwrite this method to accept diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e131d689b6a40..b7b01a2097358 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9039,7 +9039,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='12H') + >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A @@ -11380,7 +11380,7 @@ def tz_localize( 2015-03-29 01:59:59.999999999+01:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 2015-03-29 03:30:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index af8fa441f8b3f..d90de383adb48 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -544,7 +544,7 @@ class PeriodProperties(Properties): 1 2000-01-01 01:00 2 2000-01-01 02:00 3 2000-01-01 03:00 - dtype: period[H] + dtype: period[h] >>> hours_series.dt.hour 0 0 1 1 diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 62cdce36ed5fb..f3b2a35f379f4 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -779,11 +779,11 @@ def indexer_between_time( Examples -------- - >>> idx = pd.date_range("2023-01-01", periods=4, freq="H") + >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") >>> idx DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', '2023-01-01 02:00:00', '2023-01-01 03:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') >>> idx.indexer_between_time("00:00", "2:00", include_end=False) array([0, 1]) """ @@ -848,7 +848,7 @@ def date_range( periods : int, optional Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. See + Frequency strings can have multiples, e.g. '5h'. See :ref:`here ` for a list of frequency aliases. tz : str or tzinfo, optional @@ -1040,7 +1040,7 @@ def bdate_range( periods : int, default None Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'B' - Frequency strings can have multiples, e.g. '5H'. The default is + Frequency strings can have multiples, e.g. '5h'. The default is business daily ('B'). tz : str or None Time zone name for returning localized DatetimeIndex, for example diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5ce3dd33eee48..498fe56a7ae7f 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -278,7 +278,7 @@ def timedelta_range( periods : int, default None Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. + Frequency strings can have multiples, e.g. '5h'. name : str, default None Name of the resulting TimedeltaIndex. closed : str, default None @@ -320,10 +320,10 @@ def timedelta_range( Only fixed frequencies can be passed, non-fixed frequencies such as 'M' (month end) will raise. 
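The docstring updates above (Period properties, TimedeltaArray, pd.array, date_range, bdate_range, timedelta_range) all standardize on the lowercase aliases. A combined usage sketch follows; the reprs shown assume this patch is applied, since index reprs echo the new freq strings:

    >>> import pandas as pd
    >>> pd.date_range("2023-01-01", periods=3, freq="5h")
    DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 05:00:00',
                   '2023-01-01 10:00:00'],
                  dtype='datetime64[ns]', freq='5h')
    >>> pd.timedelta_range(start="1 day", end="2 days", freq="12h")
    TimedeltaIndex(['1 days 00:00:00', '1 days 12:00:00', '2 days 00:00:00'],
                   dtype='timedelta64[ns]', freq='12h')
    >>> pd.Period("2018-03-11 13:00", freq="h").hour
    13
    >>> pd.Timestamp("2020-03-14 15:32:52").round("1h30min")
    Timestamp('2020-03-14 15:00:00')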
- >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H') + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', '1 days 18:00:00', '2 days 00:00:00'], - dtype='timedelta64[ns]', freq='6H') + dtype='timedelta64[ns]', freq='6h') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b669028ac60bd..bb5f3ce56b470 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -376,13 +376,13 @@ def transform(self, arg, *args, **kwargs): >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> resampled = s.resample('15min') >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) 2018-01-01 00:00:00 NaN 2018-01-01 01:00:00 NaN - Freq: H, dtype: float64 + Freq: h, dtype: float64 """ return self._selected_obj.groupby(self._timegrouper).transform( arg, *args, **kwargs @@ -612,7 +612,7 @@ def nearest(self, limit: int | None = None): >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('15min').nearest() 2018-01-01 00:00:00 1 @@ -681,7 +681,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('30min').bfill() 2018-01-01 00:00:00 1 @@ -792,7 +792,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 Without filling the missing values you get: @@ -848,7 +848,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN 2018-01-01 02:00:00 3.0 - Freq: H, dtype: float64 + Freq: h, dtype: float64 >>> sm.resample('30min').fillna('backfill') 2018-01-01 00:00:00 1.0 @@ -2339,7 +2339,7 @@ def _get_time_delta_bins(self, ax: TimedeltaIndex): # GH#51896 raise ValueError( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - f"e.g. '24H' or '3D', not {self.freq}" + f"e.g. '24h' or '3D', not {self.freq}" ) if not len(ax): @@ -2544,8 +2544,8 @@ def _get_timestamp_range_edges( origin = Timestamp("1970-01-01", tz=index_tz) if isinstance(freq, Day): - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). + # _adjust_dates_anchored assumes 'D' means 24h, but first/last + # might contain a DST transition (23h, 24h, or 25h). # So "pretend" the dates are naive when adjusting the endpoints first = first.tz_localize(None) last = last.tz_localize(None) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index a9abc0714baa3..8db77725a1aa3 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -111,7 +111,7 @@ def to_timedelta( * 'W' * 'D' / 'days' / 'day' - * 'hours' / 'hour' / 'hr' / 'h' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' * 'm' / 'minute' / 'min' / 'minutes' / 'T' * 's' / 'seconds' / 'sec' / 'second' / 'S' * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' @@ -121,9 +121,9 @@ def to_timedelta( Must not be specified when `arg` context strings and ``errors="raise"``. .. deprecated:: 2.2.0 - Units 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed - in a future version. Please use 'min', 's', 'ms', 'us', and 'ns' instead of - 'T', 'S', 'L', 'U' and 'N'. 
+ Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed + in a future version. Please use 'h', 'min', 's', 'ms', 'us', and 'ns' + instead of 'H', 'T', 'S', 'L', 'U' and 'N'. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 227b72573f979..be988594ebf58 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -839,7 +839,7 @@ def test_with_listlike_columns(): { "a": Series(np.random.default_rng(2).standard_normal(4)), "b": ["a", "list", "of", "words"], - "ts": date_range("2016-10-01", periods=4, freq="H"), + "ts": date_range("2016-10-01", periods=4, freq="h"), } ) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index aeb6a01eb587a..643b9220999f7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -188,13 +188,13 @@ def test_apply_box(): def test_apply_datetimetz(by_row): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + values = pd.date_range("2011-01-01", "2011-01-02", freq="h").tz_localize( "Asia/Tokyo" ) s = Series(values, name="XX") result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 7ec77e5b65b7e..f77b81574e1c1 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -21,13 +21,13 @@ def one(request): Examples -------- - dti = pd.date_range('2016-01-01', periods=2, freq='H') + dti = pd.date_range('2016-01-01', periods=2, freq='h') dti DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') dti + one DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 0c46d22ddcc2e..df6ccda27ab85 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1076,7 +1076,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): # Note: freq here includes both Tick and non-Tick offsets; this is # relevant because historically integer-addition was allowed if we had # a freq. 
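The test changes in this and the neighboring hunks are mechanical alias renames; no behavior under test changes, because the old and new spellings resolve to the same offset objects. A quick hypothetical check (not part of the test suite) illustrating that equivalence:

    >>> import pandas as pd
    >>> pd.tseries.frequencies.to_offset("h") == pd.offsets.Hour()
    True
    >>> hourly = pd.date_range("2016-10-01", periods=4, freq="h")
    >>> hourly.equals(pd.date_range("2016-10-01", periods=4, freq="60min"))
    True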
- @pytest.mark.parametrize("freq", ["H", "D", "W", "2ME", "MS", "Q", "B", None]) + @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "Q", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( self, request, dtype, box_with_array, freq, tz_naive_fixture @@ -1144,7 +1144,7 @@ def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): ) assert_invalid_addsub_type(dtarr, other, msg) - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) @pytest.mark.parametrize("dti_freq", [None, "D"]) def test_dt64arr_add_sub_parr( self, dti_freq, pi_freq, box_with_array, box_with_array2 @@ -1282,10 +1282,10 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): offset = dates + pd.offsets.Hour(5) assert dates[0] + pd.offsets.Hour(5) == offset[0] - dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="h") expected = DatetimeIndex( ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], - freq="H", + freq="h", tz=tz, ) @@ -1953,7 +1953,7 @@ def test_operators_datetimelike_with_timezones(self): dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="h")) td2 = td1.copy() td2.iloc[1] = np.nan assert td2._values.freq is None diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index bee1e1a385672..5af63258921ed 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -637,11 +637,11 @@ def test_pi_sub_pi_with_nat(self): def test_parr_sub_pi_mismatched_freq(self, box_with_array, box_with_array2): rng = period_range("1/1/2000", freq="D", periods=5) - other = period_range("1/6/2000", freq="H", periods=5) + other = period_range("1/6/2000", freq="h", periods=5) rng = tm.box_expected(rng, box_with_array) other = tm.box_expected(other, box_with_array2) - msg = r"Input has different freq=[HD] from PeriodArray\(freq=[DH]\)" + msg = r"Input has different freq=[hD] from PeriodArray\(freq=[Dh]\)" with pytest.raises(IncompatibleFrequency, match=msg): rng - other @@ -696,7 +696,7 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): Timestamp("2016-01-01").to_pydatetime(), Timestamp("2016-01-01").to_datetime64(), # datetime-like arrays - pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, freq="h"), pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), pd.date_range("2016-01-01", periods=3, freq="s")._data, pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, @@ -779,8 +779,8 @@ def test_pi_add_sub_td64_array_tick(self): with pytest.raises(TypeError, match=msg): tdi - rng - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): box = box_with_array xbox = box if box not in [pd.array, tm.to_array] else pd.Index @@ -792,7 +792,7 @@ def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): # TODO: parametrize over box for pi? 
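The updated regex above ("freq=[hD]" / "freq=[Dh]") pins down how IncompatibleFrequency messages spell the hourly freq after this patch. A sketch of the failure mode the test matches; the exact message text is an assumption reconstructed from that regex, so the example is marked as skipped:

    >>> import pandas as pd
    >>> daily = pd.period_range("1/1/2000", freq="D", periods=5)
    >>> hourly = pd.period_range("1/6/2000", freq="h", periods=5)
    >>> daily - hourly  # doctest: +SKIP
    Traceback (most recent call last):
        ...
    IncompatibleFrequency: Input has different freq=h from PeriodArray(freq=D)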
td64obj = tm.box_expected(tdi, box) - if pi_freq == "H": + if pi_freq == "h": result = pi - td64obj expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, xbox) @@ -891,9 +891,9 @@ def test_pi_sub_offset_array(self, box): def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng + one - expected = period_range("2000-01-01 10:00", freq="H", periods=10) + expected = period_range("2000-01-01 10:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng += one tm.assert_index_equal(rng, expected) @@ -903,9 +903,9 @@ def test_pi_sub_isub_int(self, one): PeriodIndex.__sub__ and __isub__ with several representations of the integer 1, e.g. int, np.int64, np.uint8, ... """ - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng - one - expected = period_range("2000-01-01 08:00", freq="H", periods=10) + expected = period_range("2000-01-01 08:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng -= one tm.assert_index_equal(rng, expected) @@ -1131,8 +1131,8 @@ def test_parr_add_sub_timedeltalike_freq_mismatch_daily( def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="h") result = rng + other tm.assert_index_equal(result, expected) @@ -1144,12 +1144,12 @@ def test_parr_add_timedeltalike_mismatched_freq_hourly( self, not_hourly, box_with_array ): other = not_hourly - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") rng = tm.box_expected(rng, box_with_array) msg = "|".join( [ # non-timedelta-like DateOffset - "Input has different freq(=.+)? from Period.*?\\(freq=H\\)", + "Input has different freq(=.+)? 
from Period.*?\\(freq=h\\)", # timedelta/td64/Timedelta but not a multiple of 24H "Cannot add/subtract timedelta-like from PeriodArray that is " "not an integer multiple of the PeriodArray's freq.", @@ -1164,8 +1164,8 @@ def test_parr_add_timedeltalike_mismatched_freq_hourly( def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="h") result = rng - other tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 3d237b3ac4a31..205e6472aaecb 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -70,7 +70,7 @@ def test_compare_timedelta64_zerodim(self, box_with_array): box = box_with_array xbox = box_with_array if box_with_array not in [Index, pd.array] else np.ndarray - tdi = timedelta_range("2H", periods=4) + tdi = timedelta_range("2h", periods=4) other = np.array(tdi.to_numpy()[0]) tdi = tm.box_expected(tdi, box) @@ -276,32 +276,32 @@ class TestTimedelta64ArithmeticUnsorted: def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") + idx = TimedeltaIndex(["2h", "4h", "6h", "8h", "10h"], freq="2h", name="x") for result in [idx * 2, np.multiply(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") + exp = TimedeltaIndex(["4h", "8h", "12h", "16h", "20h"], freq="4h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "4H" + assert result.freq == "4h" for result in [idx / 2, np.divide(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") + exp = TimedeltaIndex(["1h", "2h", "3h", "4h", "5h"], freq="h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "H" + assert result.freq == "h" for result in [-idx, np.negative(idx)]: assert isinstance(result, TimedeltaIndex) exp = TimedeltaIndex( - ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ["-2h", "-4h", "-6h", "-8h", "-10h"], freq="-2h", name="x" ) tm.assert_index_equal(result, exp) - assert result.freq == "-2H" + assert result.freq == "-2h" - idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") + idx = TimedeltaIndex(["-2h", "-1h", "0h", "1h", "2h"], freq="h", name="x") for result in [abs(idx), np.absolute(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") + exp = TimedeltaIndex(["2h", "1h", "0h", "1h", "2h"], freq=None, name="x") tm.assert_index_equal(result, exp) assert result.freq is None @@ -1073,8 +1073,8 @@ def test_td64arr_add_dt64_array(self, box_with_array): # ------------------------------------------------------------------ # Invalid __add__/__sub__ operations - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_td64arr_sub_periodlike( self, box_with_array, box_with_array2, tdi_freq, pi_freq ): @@ -1133,7 +1133,7 
@@ def test_td64arr_addsub_numeric_arr_invalid( def test_td64arr_add_sub_int(self, box_with_array, one): # Variants of `one` for #19012, deprecated GH#22535 - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=10) tdarr = tm.box_expected(rng, box_with_array) msg = "Addition/subtraction of integers" @@ -1152,7 +1152,7 @@ def test_td64arr_add_sub_integer_array(self, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box - rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=3) tdarr = tm.box_expected(rng, box) other = tm.box_expected([4, 3, 2], xbox) @@ -2011,7 +2011,7 @@ def test_td64arr_div_numeric_array( tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) - expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") + expected = Series(["2.95D", "1D 23h 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) xbox = get_upcast_box(tdser, vector) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index cdf5d967d9c3d..dca171bf81047 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -148,7 +148,7 @@ def test_categorical_repr_ordered(self): assert repr(c) == exp def test_categorical_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = ( @@ -176,7 +176,7 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx) exp = ( "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " @@ -210,7 +210,7 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp def test_categorical_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < @@ -225,7 +225,7 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < @@ -254,17 +254,17 @@ def test_categorical_repr_int_with_nan(self): assert repr(s) == s_exp def test_categorical_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 
2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -283,17 +283,17 @@ def test_categorical_repr_period(self): assert repr(c) == exp def test_categorical_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -396,7 +396,7 @@ def test_categorical_index_repr_ordered(self): assert repr(i) == exp def test_categorical_index_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -405,7 +405,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -415,7 +415,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp def test_categorical_index_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -424,7 +424,7 @@ def test_categorical_index_repr_datetime_ordered(self): assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -445,22 +445,22 @@ def 
test_categorical_index_repr_datetime_ordered(self): def test_categorical_index_repr_period(self): # test all length - idx = period_range("2011-01-01 09:00", freq="H", periods=1) + idx = period_range("2011-01-01 09:00", freq="h", periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=2) + idx = period_range("2011-01-01 09:00", freq="h", periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=3) + idx = period_range("2011-01-01 09:00", freq="h", periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -483,7 +483,7 @@ def test_categorical_index_repr_period(self): assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index 30f47e37fedf5..e513457819eb5 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -34,7 +34,7 @@ def test_freq_validation(self): arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 msg = ( - "Inferred frequency H from passed values does not " + "Inferred frequency h from passed values does not " "conform to passed frequency W-SUN" ) with pytest.raises(ValueError, match=msg): @@ -70,7 +70,7 @@ def test_from_pandas_array(self): result = DatetimeArray._from_sequence(arr)._with_freq("infer") - expected = pd.date_range("1970-01-01", periods=5, freq="H")._data + expected = pd.date_range("1970-01-01", periods=5, freq="h")._data tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 6c04d7c603d4c..a1e1e8efe6dee 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -110,7 +110,7 @@ def test_arrow_load_from_zero_chunks(): def test_arrow_table_roundtrip_without_metadata(): - arr = PeriodArray([1, 2, 3], dtype="period[H]") + arr = PeriodArray([1, 2, 3], dtype="period[h]") arr[1] = pd.NaT df = pd.DataFrame({"a": arr}) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 0aeedf4d03919..92536a222296e 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -135,14 
+135,14 @@ def test_dt64_array(dtype_unit): ), # Timedelta ( - ["1H", "2H"], + ["1h", "2h"], np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( np.array([1, 2], dtype="m8[s]"), @@ -150,9 +150,9 @@ def test_dt64_array(dtype_unit): TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[s]")), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), None, - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( # preserve non-nano, i.e. don't cast to NumpyExtensionArray @@ -298,8 +298,8 @@ def test_array_copy(): ), # timedelta ( - [pd.Timedelta("1H"), pd.Timedelta("2H")], - TimedeltaArray._from_sequence(["1H", "2H"]), + [pd.Timedelta("1h"), pd.Timedelta("2h")], + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( np.array([1, 2], dtype="m8[ns]"), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 291c687d84125..3f91b9b03e1de 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -858,7 +858,7 @@ def test_concat_same_type_invalid(self, arr1d): def test_concat_same_type_different_freq(self): # we *can* concatenate DTI with different freqs. a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) - b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) + b = DatetimeArray(pd.date_range("2000", periods=2, freq="h", tz="US/Central")) result = DatetimeArray._concat_same_type([a, b]) expected = DatetimeArray( pd.to_datetime( diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 43a80a92573c5..48453ba19e9a1 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -133,8 +133,8 @@ def test_sub_period_overflow(): @pytest.mark.parametrize( "other", [ - pd.Period("2000", freq="H"), - PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[H]"), + pd.Period("2000", freq="h"), + PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[h]"), ], ) def test_where_different_freq_raises(other): diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 1043c2ee6c9b6..21bc85a4d070e 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -196,7 +196,7 @@ def test_add_timedeltaarraylike(self, tda): class TestTimedeltaArray: @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = TimedeltaArray._from_sequence([Timedelta("1H"), Timedelta("2H")]) + arr = TimedeltaArray._from_sequence([Timedelta("1h"), Timedelta("2h")]) if np.dtype(dtype) != np.int64: with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): @@ -208,8 +208,8 @@ def test_astype_int(self, dtype): tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H")) - a[0] = Timedelta("1H") + a = TimedeltaArray(pd.timedelta_range("1h", periods=2, freq="h")) + a[0] = Timedelta("1h") assert a.freq is None @pytest.mark.parametrize( @@ -222,7 +222,7 @@ def test_setitem_clears_freq(self): ) def test_setitem_objects(self, obj): # make sure we accept timedelta64 
and timedelta in addition to Timedelta - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") arr = TimedeltaArray(tdi, freq=tdi.freq) arr[0] = obj @@ -299,7 +299,7 @@ def test_neg(self): tm.assert_timedelta_array_equal(result2, expected) def test_neg_freq(self): - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") arr = TimedeltaArray(tdi, freq=tdi.freq) expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) diff --git a/pandas/tests/arrays/timedeltas/test_reductions.py b/pandas/tests/arrays/timedeltas/test_reductions.py index 72d45f5b9a78c..3718e7e646ea9 100644 --- a/pandas/tests/arrays/timedeltas/test_reductions.py +++ b/pandas/tests/arrays/timedeltas/test_reductions.py @@ -35,14 +35,14 @@ def test_sum_empty(self, skipna): assert result == Timedelta(0) def test_min_max(self): - arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) + arr = TimedeltaArray._from_sequence(["3h", "3h", "NaT", "2h", "5h", "4h"]) result = arr.min() - expected = Timedelta("2H") + expected = Timedelta("2h") assert result == expected result = arr.max() - expected = Timedelta("5H") + expected = Timedelta("5h") assert result == expected result = arr.min(skipna=False) @@ -52,7 +52,7 @@ def test_min_max(self): assert result is pd.NaT def test_sum(self): - tdi = pd.TimedeltaIndex(["3H", "3H", "NaT", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "NaT", "2h", "5h", "4h"]) arr = tdi.array result = arr.sum(skipna=True) @@ -86,7 +86,7 @@ def test_sum(self): def test_npsum(self): # GH#25282, GH#25335 np.sum should return a Timedelta, not timedelta64 - tdi = pd.TimedeltaIndex(["3H", "3H", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "2h", "5h", "4h"]) arr = tdi.array result = np.sum(tdi) @@ -133,7 +133,7 @@ def test_sum_2d_skipna_false(self): ], ) def test_std(self, add): - tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) + add + tdi = pd.TimedeltaIndex(["0h", "4h", "NaT", "4h", "0h", "2h"]) + add arr = tdi.array result = arr.std(skipna=True) @@ -162,7 +162,7 @@ def test_std(self, add): assert np.isnat(result) def test_median(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi.array result = arr.median(skipna=True) @@ -181,7 +181,7 @@ def test_median(self): assert result is pd.NaT def test_mean(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi._data # manually verified result diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 20ee2d443f340..3e0b0dbeb5624 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -314,7 +314,7 @@ def test_array_multiindex_raises(): ), # Timedelta ( - TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="h"), np.array([0, 3600000000000], dtype="m8[ns]"), ), # GH#26406 tz is preserved in Categorical[dt64tz] diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 76b974330cbf1..45bfc6a6fcf9b 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1311,7 +1311,7 @@ def test_rename_axis(using_copy_on_write, kwargs): def test_tz_convert_localize(using_copy_on_write, func, 
tz): # GH 49473 ser = Series( - [1, 2], index=date_range(start="2014-08-01 09:00", freq="H", periods=2, tz=tz) + [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz) ) ser_orig = ser.copy() ser2 = getattr(ser, func)("US/Central") diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index 8ce05337be70b..83ef7382fbe8a 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -122,7 +122,7 @@ def test_period_dtype_match(): [ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), PeriodDtype(freq="2D"), - PeriodDtype(freq="H"), + PeriodDtype(freq="h"), np.dtype("datetime64[ns]"), object, np.int64, diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 6562074eee634..1f9c371c50ad4 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -432,12 +432,12 @@ def test_construction(self): assert dt.freq == pd.tseries.offsets.Day(3) for s in [ - "period[26H]", - "Period[26H]", - "26H", - "period[1D2H]", - "Period[1D2H]", - "1D2H", + "period[26h]", + "Period[26h]", + "26h", + "period[1D2h]", + "Period[1D2h]", + "1D2h", ]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Hour(26) @@ -533,7 +533,7 @@ def test_basic(self, dtype): with tm.assert_produces_warning(FutureWarning, match=msg): assert is_period_dtype(dtype) - pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H") + pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="h") assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 339e97e735f85..41312f45838a9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2484,10 +2484,10 @@ def test_dt_roundlike_tz_options_not_supported(method): dtype=ArrowDtype(pa.timestamp("ns")), ) with pytest.raises(NotImplementedError, match="ambiguous is not supported."): - getattr(ser.dt, method)("1H", ambiguous="NaT") + getattr(ser.dt, method)("1h", ambiguous="NaT") with pytest.raises(NotImplementedError, match="nonexistent is not supported."): - getattr(ser.dt, method)("1H", nonexistent="NaT") + getattr(ser.dt, method)("1h", nonexistent="NaT") @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) @@ -2506,7 +2506,7 @@ def test_dt_roundlike_unsupported_freq(method): @pytest.mark.xfail( pa_version_under7p0, reason="Methods not supported for pyarrow < 7.0" ) -@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) def test_dt_ceil_year_floor(freq, method): ser = pd.Series( diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 87a56c0736287..25eb2ccb18361 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -23,8 +23,8 @@ def test_align_asfreq_method_raises(self): df.align(df.iloc[::-1], method="asfreq") def test_frame_align_aware(self): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") - idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern") df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1) df2 = 
DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2) new1, new2 = df1.align(df2) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 0527bfb16492c..b3ab11d07bd7e 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -63,8 +63,8 @@ def test_asfreq2(self, frame_or_series): def test_asfreq_datetimeindex_empty(self, frame_or_series): # GH#14320 index = DatetimeIndex(["2016-09-29 11:00"]) - expected = frame_or_series(index=index, dtype=object).asfreq("H") - result = frame_or_series([3], index=index.copy()).asfreq("H") + expected = frame_or_series(index=index, dtype=object).asfreq("h") + result = frame_or_series([3], index=index.copy()).asfreq("h") tm.assert_index_equal(expected.index, result.index) @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) @@ -194,8 +194,8 @@ def test_asfreq_with_date_object_index(self, frame_or_series): ts2 = ts.copy() ts2.index = [x.date() for x in ts2.index] - result = ts2.asfreq("4H", method="ffill") - expected = ts.asfreq("4H", method="ffill") + result = ts2.asfreq("4h", method="ffill") + expected = ts.asfreq("4h", method="ffill") tm.assert_equal(result, expected) def test_asfreq_with_unsorted_index(self, frame_or_series): diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 5683ec60b0d88..4a8adf89b3aef 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -178,7 +178,7 @@ def test_is_copy(self, date_range_frame): def test_asof_periodindex_mismatched_freq(self): N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng) # Mismatched freq diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 67200396f6375..4c1434bd66aff 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -18,7 +18,7 @@ class TestAtTime: def test_localized_at_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = frame_or_series( np.random.default_rng(2).standard_normal(len(rng)), index=rng ) @@ -69,7 +69,7 @@ def test_at_time_nonexistent(self, frame_or_series): ) def test_at_time_errors(self, hour): # GH#24043 - dti = date_range("2018", periods=3, freq="H") + dti = date_range("2018", periods=3, freq="h") df = DataFrame(list(range(len(dti))), index=dti) if getattr(hour, "tzinfo", None) is None: result = df.at_time(hour) @@ -81,7 +81,7 @@ def test_at_time_errors(self, hour): def test_at_time_tz(self): # GH#24043 - dti = date_range("2018", periods=3, freq="H", tz="US/Pacific") + dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) expected = df.iloc[1:2] diff --git a/pandas/tests/frame/methods/test_between_time.py b/pandas/tests/frame/methods/test_between_time.py index 4c1e009b04639..74d6291707e19 100644 --- a/pandas/tests/frame/methods/test_between_time.py +++ b/pandas/tests/frame/methods/test_between_time.py @@ -46,7 +46,7 @@ def test_between_time_formats(self, frame_or_series): def test_localized_between_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = 
date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) if frame_or_series is DataFrame: ts = ts.to_frame() diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index f56a7896c753e..5beb09940acf3 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -204,7 +204,7 @@ def test_describe_datetime_columns(self): def test_describe_timedelta_values(self): # GH#6145 t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + t2 = pd.timedelta_range("1 hours", freq="h", periods=5) df = DataFrame({"t1": t1, "t2": t2}) expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 9a4882f11e961..f72c0594fa1f7 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -57,7 +57,7 @@ def test_drop_with_non_unique_datetime_index_and_invalid_keys(): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=pd.date_range("2012", freq="H", periods=5), + index=pd.date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique datetime index df = df.iloc[[0, 2, 2, 3]].copy() diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bba86b481eadc..fb6e08cd52d97 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -46,7 +46,7 @@ def test_dti_set_index_reindex_datetimeindex(self): def test_dti_set_index_reindex_freq_with_tz(self): # GH#11314 with tz index = date_range( - datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="h", tz="US/Eastern" ) df = DataFrame( np.random.default_rng(2).standard_normal((24, 1)), @@ -54,7 +54,7 @@ def test_dti_set_index_reindex_freq_with_tz(self): index=index, ) new_index = date_range( - datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="h", tz="US/Eastern" ) result = df.set_index(new_index) @@ -389,9 +389,9 @@ def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values) # GH#38566 obj = frame_or_series( [0, 1, 2, 3], - index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"), + index=date_range("2020-01-01 00:00:00", periods=4, freq="h", tz="UTC"), ) - new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC") + new_index = date_range("2020-01-01 00:01:00", periods=4, freq="h", tz="UTC") result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour")) expected = frame_or_series(exp_values, index=new_index) tm.assert_equal(result, expected) @@ -1067,7 +1067,7 @@ def test_reindex_multi_categorical_time(self): midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) df = DataFrame({"a": range(len(midx))}, index=midx) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index f755ef0c2763d..9b87ffb0241ef 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -99,7 +99,7 @@ def 
test_set_index_cast_datetimeindex(self): assert isinstance(idf.index, DatetimeIndex) def test_set_index_dst(self): - di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") + di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific") df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level @@ -491,7 +491,7 @@ def test_set_index_period(self): df = DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011-01-01", periods=3, freq="M") idx1 = idx1.append(idx1) - idx2 = period_range("2013-01-01 09:00", periods=2, freq="H") + idx2 = period_range("2013-01-01 09:00", periods=2, freq="h") idx2 = idx2.append(idx2).append(idx2) idx3 = period_range("2005", periods=6, freq="Y") @@ -500,7 +500,7 @@ def test_set_index_period(self): df = df.set_index(idx3, append=True) expected1 = period_range("2011-01-01", periods=3, freq="M") - expected2 = period_range("2013-01-01 09:00", periods=2, freq="H") + expected2 = period_range("2013-01-01 09:00", periods=2, freq="h") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 6bd441400dc54..201046ebafc35 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -37,7 +37,7 @@ def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): # Can't pass both! obj = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) msg = ( @@ -45,12 +45,12 @@ def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, fill_value=1, freq="H") + obj.shift(1, fill_value=1, freq="h") if frame_or_series is DataFrame: - obj.columns = date_range("1/1/2000", periods=1, freq="H") + obj.columns = date_range("1/1/2000", periods=1, freq="h") with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, axis=1, fill_value=1, freq="H") + obj.shift(1, axis=1, fill_value=1, freq="h") @pytest.mark.parametrize( "input_data, output_data", @@ -77,7 +77,7 @@ def test_shift_non_writable_array(self, input_data, output_data, frame_or_series def test_shift_mismatched_freq(self, frame_or_series): ts = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) result = ts.shift(1, freq="5min") @@ -85,7 +85,7 @@ def test_shift_mismatched_freq(self, frame_or_series): tm.assert_index_equal(result.index, exp_index) # GH#1063, multiple of same base - result = ts.shift(1, freq="4H") + result = ts.shift(1, freq="4h") exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) @@ -93,7 +93,7 @@ def test_shift_mismatched_freq(self, frame_or_series): "obj", [ Series([np.arange(5)]), - date_range("1/1/2011", periods=24, freq="H"), + date_range("1/1/2011", periods=24, freq="h"), Series(range(5), index=date_range("2017", periods=5)), ], ) @@ -145,20 +145,20 @@ def test_shift_preserve_freqstr(self, periods, frame_or_series): # GH#21275 obj = frame_or_series( range(periods), - index=date_range("2016-1-1 00:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 00:00:00", periods=periods, freq="h"), ) - result = obj.shift(1, "2H") + result = obj.shift(1, "2h") expected 
= frame_or_series( range(periods), - index=date_range("2016-1-1 02:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 02:00:00", periods=periods, freq="h"), ) tm.assert_equal(result, expected) def test_shift_dst(self, frame_or_series): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(0) @@ -180,7 +180,7 @@ def test_shift_dst(self, frame_or_series): @pytest.mark.parametrize("ex", [10, -10, 20, -20]) def test_shift_dst_beyond(self, frame_or_series, ex): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(ex) exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") @@ -367,7 +367,7 @@ def test_shift_categorical_fill_value(self, frame_or_series): def test_shift_fill_value(self, frame_or_series): # GH#24128 - dti = date_range("1/1/2000", periods=5, freq="H") + dti = date_range("1/1/2000", periods=5, freq="h") ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti) exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti) @@ -707,7 +707,7 @@ def test_shift_with_iterable_freq_and_fill_value(self): # GH#44424 df = DataFrame( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) tm.assert_frame_equal( @@ -717,8 +717,8 @@ def test_shift_with_iterable_freq_and_fill_value(self): ) tm.assert_frame_equal( - df.shift([1], freq="H").rename(columns=lambda x: int(x[0])), - df.shift(1, freq="H"), + df.shift([1], freq="h").rename(columns=lambda x: int(x[0])), + df.shift(1, freq="h"), ) msg = ( @@ -726,7 +726,7 @@ def test_shift_with_iterable_freq_and_fill_value(self): "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - df.shift([1, 2], fill_value=1, freq="H") + df.shift([1, 2], fill_value=1, freq="h") def test_shift_with_iterable_check_other_arguments(self): # GH#44424 diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 9f45347c31165..94c98ad477cc1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -420,7 +420,7 @@ def test_to_csv_from_csv_w_some_infs(self, float_frame): # test roundtrip with inf, -inf, nan, as full columns and mix float_frame["G"] = np.nan f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5] - float_frame["H"] = float_frame.index.map(f) + float_frame["h"] = float_frame.index.map(f) with tm.ensure_clean() as path: float_frame.to_csv(path) @@ -1077,7 +1077,7 @@ def test_to_csv_with_dst_transitions(self, td): "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) i = times + td @@ -1095,7 +1095,7 @@ def test_to_csv_with_dst_transitions(self, td): def test_to_csv_with_dst_transitions_with_pickle(self): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index 
478708ce90488..aeb65d98d8ab2 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -132,12 +132,12 @@ def test_to_timestamp_invalid_axis(self): obj.to_timestamp(axis=2) def test_to_timestamp_hourly(self, frame_or_series): - index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + index = period_range(freq="h", start="1/1/2001", end="1/2/2001") obj = Series(1, index=index, name="foo") if frame_or_series is not Series: obj = obj.to_frame() - exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h") result = obj.to_timestamp(how="end") exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 8ff6ea37eae18..50fc6fe6984e7 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -70,7 +70,7 @@ def test_transpose_tzaware_2col_mixed_tz(self): @pytest.mark.parametrize("tz", [None, "America/New_York"]) def test_transpose_preserves_dtindex_equality_with_dst(self, tz): # GH#19970 - idx = date_range("20161101", "20161130", freq="4H", tz=tz) + idx = date_range("20161101", "20161130", freq="4h", tz=tz) df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) result = df.T == df.T expected = DataFrame(True, index=list("ab"), columns=idx) diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py index 8a484abaab54c..bcb8e423980fd 100644 --- a/pandas/tests/frame/methods/test_tz_convert.py +++ b/pandas/tests/frame/methods/test_tz_convert.py @@ -120,7 +120,7 @@ def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 obj = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="Europe/Berlin"), + index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"), ) orig = obj.copy() result = obj.tz_convert("UTC", copy=copy) diff --git a/pandas/tests/frame/methods/test_tz_localize.py b/pandas/tests/frame/methods/test_tz_localize.py index ed2b0b247e62c..b167afc17f484 100644 --- a/pandas/tests/frame/methods/test_tz_localize.py +++ b/pandas/tests/frame/methods/test_tz_localize.py @@ -16,7 +16,7 @@ class TestTZLocalize: # test_tz_convert_and_localize in test_tz_convert def test_tz_localize(self, frame_or_series): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) @@ -29,7 +29,7 @@ def test_tz_localize(self, frame_or_series): tm.assert_equal(result, expected) def test_tz_localize_axis1(self): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") df = DataFrame({"a": 1}, index=rng) @@ -43,7 +43,7 @@ def test_tz_localize_axis1(self): def test_tz_localize_naive(self, frame_or_series): # Can't localize if already tz-aware - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") ts = Series(1, index=rng) ts = frame_or_series(ts) @@ -54,13 +54,13 @@ def test_tz_localize_naive(self, frame_or_series): def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 obj = frame_or_series( - np.arange(0, 5), index=date_range("20131027", 
periods=5, freq="1H", tz=None) + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None) ) orig = obj.copy() result = obj.tz_localize("UTC", copy=copy) expected = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="UTC"), + index=date_range("20131027", periods=5, freq="1h", tz="UTC"), ) tm.assert_equal(result, expected) tm.assert_equal(obj, orig) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 2c3b732fe7196..09a5cda4b3458 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1223,7 +1223,7 @@ def test_frame_single_columns_object_sum_axis_1(): class TestFrameArithmeticUnsorted: def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = pd.date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + rng = pd.date_range("1/1/2011", periods=10, freq="h", tz="US/Eastern") df = DataFrame( np.random.default_rng(2).standard_normal(len(rng)), index=rng, columns=["a"] ) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 543a5be9544cc..3d8053703e906 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2471,8 +2471,8 @@ def test_dataframe_constructor_infer_multiindex(self): [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 55c239f7284c1..0634b8268c04c 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -313,7 +313,7 @@ def test_latex_repr(self): def test_repr_categorical_dates_periods(self): # normal DataFrame - dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + dt = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") p = period_range("2011-01", freq="M", periods=5) df = DataFrame({"dt": dt, "p": p}) exp = """ dt p diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 1522b83a4f5d0..0f7ae998a4b2b 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -281,12 +281,12 @@ ( pd.Series, (1, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.DataFrame, ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.Series, @@ -628,9 +628,9 @@ def test_string_method(method): operator.methodcaller("tz_localize", "CET"), operator.methodcaller("normalize"), operator.methodcaller("strftime", "%Y"), - operator.methodcaller("round", "H"), - operator.methodcaller("floor", "H"), - operator.methodcaller("ceil", "H"), + operator.methodcaller("round", "h"), + operator.methodcaller("floor", "h"), + operator.methodcaller("ceil", "h"), operator.methodcaller("month_name"), operator.methodcaller("day_name"), ], diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 7ea107f254104..398e9b09693e6 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -106,7 +106,7 @@ def 
test_agg_dict_parameter_cast_result_dtypes(): df = DataFrame( { "class": ["A", "A", "B", "B", "C", "C", "D", "D"], - "time": date_range("1/1/2011", periods=8, freq="H"), + "time": date_range("1/1/2011", periods=8, freq="h"), } ) df.loc[[0, 1, 2, 5], "time"] = None diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index abcb9f68e0f5c..5331b2e2c5d81 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1306,7 +1306,7 @@ def test_positional_slice_groups_datetimelike(): # GH 21651 expected = DataFrame( { - "date": pd.date_range("2010-01-01", freq="12H", periods=5), + "date": pd.date_range("2010-01-01", freq="12h", periods=5), "vals": range(5), "let": list("abcde"), } diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index bb4b9aa866ac9..f2d40867af03a 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -205,11 +205,11 @@ def test_group_shift_with_multiple_periods_and_freq(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) shifted_df = df.groupby("b")[["a"]].shift( [0, 1], - freq="H", + freq="h", ) expected_df = DataFrame( { @@ -223,7 +223,7 @@ def test_group_shift_with_multiple_periods_and_freq(): 5.0, ], }, - index=date_range("1/1/2000", periods=6, freq="H"), + index=date_range("1/1/2000", periods=6, freq="h"), ) tm.assert_frame_equal(shifted_df, expected_df) @@ -244,11 +244,11 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) msg = ( "Passing a 'freq' together with a 'fill_value' silently ignores the " "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") + df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 5adf9ace255ea..88ee8a35e5c94 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -475,7 +475,7 @@ def test_groupby_with_datetime_key(self): df = DataFrame( { "id": ["a", "b"] * 3, - "b": date_range("2000-01-01", "2000-01-03", freq="9H"), + "b": date_range("2000-01-01", "2000-01-03", freq="9h"), } ) grouper = Grouper(key="b", freq="D") diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 805fef2125fda..4e7c09b70feb0 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -421,7 +421,7 @@ def test_timestamp_groupby_quantile(): { "timestamp": pd.date_range( start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC" - ).floor("1H"), + ).floor("1h"), "category": list(range(1, 101)), "value": list(range(101, 201)), } diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a3dc9e3087c7b..31629ba697e33 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -655,7 +655,7 @@ def test_groupby_groups_periods(self): df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], - "period": [pd.Period(d, freq="H") for d 
in dates], + "period": [pd.Period(d, freq="h") for d in dates], "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } @@ -670,7 +670,7 @@ def test_groupby_groups_periods(self): "2011-07-19 09:00:00", "2011-07-19 09:00:00", ], - freq="H", + freq="h", name="period", ) exp_idx2 = Index(["a", "b"] * 3, name="label") @@ -685,7 +685,7 @@ def test_groupby_groups_periods(self): tm.assert_frame_equal(result, expected) # by level - didx = pd.PeriodIndex(dates, freq="H") + didx = pd.PeriodIndex(dates, freq="h") df = DataFrame( {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, index=didx, @@ -693,7 +693,7 @@ def test_groupby_groups_periods(self): exp_idx = pd.PeriodIndex( ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - freq="H", + freq="h", ) expected = DataFrame( {"value1": [3, 5, 7], "value2": [2, 4, 6]}, diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index fe397e2c7c88e..808a1687390ff 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -25,7 +25,7 @@ def sort(request): return request.param -@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "min", "2min", "s", "-3s"]) +@pytest.fixture(params=["D", "3D", "-3D", "h", "2h", "-2h", "min", "2min", "s", "-3s"]) def freq_sample(request): """ Valid values for 'freq' parameter used to create date_range and diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index c38e24232f181..61a79c4ceabf9 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -68,7 +68,7 @@ def test_drop_duplicates(self, keep, expected, index, idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): - @pytest.fixture(params=["D", "3D", "H", "2H", "min", "2min", "s", "3s"]) + @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): return request.param diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index d85d7103fe381..7845d99614d34 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -65,7 +65,7 @@ def test_equals2(self, freq): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") + idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="h") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -75,7 +75,7 @@ def test_equals2(self, freq): # same internal, different tz idx3 = PeriodIndex._simple_new( - idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("H")) + idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("h")) ) tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) diff --git a/pandas/tests/indexes/datetimelike_/test_is_monotonic.py b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py index 088ccc406cb81..b0e42e660b751 100644 --- a/pandas/tests/indexes/datetimelike_/test_is_monotonic.py +++ b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py @@ -34,7 +34,7 @@ def test_is_monotonic_with_nat(): assert obj.is_unique dti2 = dti.insert(3, NaT) - pi2 = dti2.to_period("H") + pi2 = dti2.to_period("h") tdi2 = Index(dti2.view("timedelta64[ns]")) for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: diff 
--git a/pandas/tests/indexes/datetimelike_/test_sort_values.py b/pandas/tests/indexes/datetimelike_/test_sort_values.py index cf919bfa29d10..a2c349c8b0ef6 100644 --- a/pandas/tests/indexes/datetimelike_/test_sort_values.py +++ b/pandas/tests/indexes/datetimelike_/test_sort_values.py @@ -92,7 +92,7 @@ def check_sort_values_with_freq(self, idx): tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0], dtype=np.intp)) check_freq_ascending(ordered, idx, False) - @pytest.mark.parametrize("freq", ["D", "H"]) + @pytest.mark.parametrize("freq", ["D", "h"]) def test_sort_values_with_freq_timedeltaindex(self, freq): # GH#10295 idx = timedelta_range(start=f"1{freq}", periods=3, freq=freq).rename("idx") @@ -107,7 +107,7 @@ def test_sort_values_with_freq_timedeltaindex(self, freq): ), DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", name="tzidx", tz="Asia/Tokyo", ), diff --git a/pandas/tests/indexes/datetimelike_/test_value_counts.py b/pandas/tests/indexes/datetimelike_/test_value_counts.py index a0f05a1a35d79..069e354a364c9 100644 --- a/pandas/tests/indexes/datetimelike_/test_value_counts.py +++ b/pandas/tests/indexes/datetimelike_/test_value_counts.py @@ -18,15 +18,15 @@ class TestValueCounts: def test_value_counts_unique_datetimeindex(self, tz_naive_fixture): tz = tz_naive_fixture - orig = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) + orig = date_range("2011-01-01 09:00", freq="h", periods=10, tz=tz) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_timedeltaindex(self): - orig = timedelta_range("1 days 09:00:00", freq="H", periods=10) + orig = timedelta_range("1 days 09:00:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_periodindex(self): - orig = period_range("2011-01-01 09:00", freq="H", periods=10) + orig = period_range("2011-01-01 09:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def _check_value_counts_with_repeats(self, orig): @@ -83,7 +83,7 @@ def test_value_counts_unique_periodindex2(self): "2013-01-01 08:00", NaT, ], - freq="H", + freq="h", ) self._check_value_counts_dropna(idx) diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 08a2473f22556..94390acb35624 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -117,7 +117,7 @@ def test_astype_str_tz_and_name(self): def test_astype_str_freq_and_name(self): # test astype string with freqH and name - dti = date_range("1/1/2011", periods=3, freq="H", name="test_name") + dti = date_range("1/1/2011", periods=3, freq="h", name="test_name") result = dti.astype(str) expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], @@ -129,7 +129,7 @@ def test_astype_str_freq_and_name(self): def test_astype_str_freq_and_tz(self): # test astype string with freqH and timezone dti = date_range( - "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + "3/6/2012 00:00", periods=2, freq="h", tz="Europe/London", name="test_name" ) result = dti.astype(str) expected = Index( diff --git a/pandas/tests/indexes/datetimes/methods/test_factorize.py b/pandas/tests/indexes/datetimes/methods/test_factorize.py index 99a3bc910b9ca..41ecf9ee6b823 100644 --- a/pandas/tests/indexes/datetimes/methods/test_factorize.py +++ b/pandas/tests/indexes/datetimes/methods/test_factorize.py @@ -74,7 +74,7 @@ def 
test_factorize_preserves_freq(self): def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 - base = date_range("2016-11-05", freq="H", periods=100, tz=tz) + base = date_range("2016-11-05", freq="h", periods=100, tz=tz) idx = base.repeat(5) exp_arr = np.arange(100, dtype=np.intp).repeat(5) @@ -89,7 +89,7 @@ def test_factorize_tz(self, tz_naive_fixture, index_or_series): def test_factorize_dst(self, index_or_series): # GH#13750 - idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-11-06", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -98,7 +98,7 @@ def test_factorize_dst(self, index_or_series): if index_or_series is Index: assert res.freq == idx.freq - idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-06-13", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -112,7 +112,7 @@ def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort): # GH#51978 case that does not go through the fastpath based on # non-None freq tz = tz_naive_fixture - idx = date_range("2016-11-06", freq="H", periods=5, tz=tz)[[0, 4, 1, 3, 2]] + idx = date_range("2016-11-06", freq="h", periods=5, tz=tz)[[0, 4, 1, 3, 2]] exp_codes, exp_uniques = idx.factorize(sort=sort) res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index 9ef43ace747e2..4ef162913b622 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -128,10 +128,10 @@ def test_insert(self): assert result.freq is None for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") + idx = date_range("1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx") # preserve freq expected = date_range( - "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx" ) for d in [ Timestamp("2000-01-01 15:00", tz=tz), diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index e8661fafc3bb7..064f664a4de10 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -24,30 +24,30 @@ def test_dti_shift_tzaware(self, tz_naive_fixture): # GH#9903 tz = tz_naive_fixture idx = DatetimeIndex([], name="xxx", tz=tz) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) idx = DatetimeIndex( ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = DatetimeIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = DatetimeIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) 
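Note on the alias migration exercised by the hunks above and below: lowercase "h" replaces the deprecated uppercase "H" as the hourly frequency string (likewise "bh"/"cbh" for "BH"/"CBH"). A minimal, self-contained sketch of the behavior these tests now assert, assuming a pandas build that already includes this deprecation; only stable public API (date_range, freqstr) is used:

    import warnings

    import pandas as pd

    # Post-deprecation spelling: lowercase "h" is the hourly alias.
    idx = pd.date_range("2011-01-01 09:00", periods=3, freq="h")
    assert idx.freqstr == "h"

    # The uppercase spelling still resolves to the same Hour offset, but
    # emits a FutureWarning on builds that include this deprecation.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        legacy = pd.date_range("2011-01-01 09:00", periods=3, freq="H")
    assert legacy.equals(idx)
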
def test_dti_shift_freqs(self): # test shift for DatetimeIndex and non DatetimeIndex @@ -101,9 +101,9 @@ def test_dti_shift_localized(self, tzstr): def test_dti_shift_across_dst(self): # GH 8616 - idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") + idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="h") s = Series(index=idx[:-1], dtype=object) - result = s.shift(freq="H") + result = s.shift(freq="h") expected = Series(index=idx[1:], dtype=object) tm.assert_series_equal(result, expected) @@ -120,7 +120,7 @@ def test_dti_shift_near_midnight(self, shift, result_time): dt = datetime(2014, 11, 14, 0) dt_est = pytz.timezone("EST").localize(dt) s = Series(data=[1], index=[dt_est]) - result = s.shift(shift, freq="H") + result = s.shift(shift, freq="h") expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 7712a4166329c..8900c5cdbca14 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -112,7 +112,7 @@ def test_period_dt64_round_trip(self): tm.assert_index_equal(pi.to_timestamp(), dti) dti = date_range("1/1/2000", "1/7/2002", freq="B") - pi = dti.to_period(freq="H") + pi = dti.to_period(freq="h") tm.assert_index_equal(pi.to_timestamp(), dti) def test_to_period_millisecond(self): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 7dee58e63fa88..6da215715482d 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -185,7 +185,7 @@ def test_construction_caching(self): ) def test_construction_with_alt(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} result = DatetimeIndex(i, **kwargs) tm.assert_index_equal(i, result) @@ -196,7 +196,7 @@ def test_construction_with_alt(self, kwargs, tz_aware_fixture): ) def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) i = i._with_freq(None) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} @@ -902,7 +902,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): start = Timestamp("2015-03-29 02:30:00").tz_localize( timezone, nonexistent="shift_forward" ) - result = date_range(start=start, periods=2, freq="H") + result = date_range(start=start, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), @@ -916,7 +916,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): end = Timestamp("2015-03-29 02:30:00").tz_localize( timezone, nonexistent="shift_forward" ) - result = date_range(end=end, periods=2, freq="H") + result = date_range(end=end, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), @@ -1036,7 +1036,7 @@ def test_constructor_int64_nocopy(self): assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["ME", "Q", "Y", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] + "freq", ["ME", "Q", "Y", "D", "B", "bh", 
"min", "s", "ms", "us", "h", "ns", "C"] ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index f0996d7af917d..ededf78621699 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -124,7 +124,7 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges: - @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "H", "D"]) + @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "h", "D"]) def test_date_range_edges(self, freq): # GH#13672 td = Timedelta(f"1{freq}") @@ -206,11 +206,11 @@ def test_date_range_int64_overflow_non_recoverable(self): # case with start later than 1970-01-01, overflow int64 but not uint64 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(start="1970-02-01", periods=106752 * 24, freq="H") + date_range(start="1970-02-01", periods=106752 * 24, freq="h") # case with end before 1970-01-01, overflow int64 but not uint64 with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(end="1969-11-14", periods=106752 * 24, freq="H") + date_range(end="1969-11-14", periods=106752 * 24, freq="h") @pytest.mark.slow @pytest.mark.parametrize( @@ -224,11 +224,11 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs( start = Timestamp(s_ts) end = Timestamp(e_ts) - expected = date_range(start=start, end=end, freq="-1H") + expected = date_range(start=start, end=end, freq="-1h") assert expected[0] == start assert expected[-1] == end - dti = date_range(end=end, periods=len(expected), freq="-1H") + dti = date_range(end=end, periods=len(expected), freq="-1h") tm.assert_index_equal(dti, expected) def test_date_range_out_of_bounds(self): @@ -416,13 +416,13 @@ def test_date_range_businesshour(self): "2014-07-04 15:00", "2014-07-04 16:00", ], - freq="BH", + freq="bh", ) - rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="BH") + rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="bh") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") - rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") + idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="bh") + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="bh") tm.assert_index_equal(idx, rng) idx = DatetimeIndex( @@ -452,9 +452,9 @@ def test_date_range_businesshour(self): "2014-07-08 15:00", "2014-07-08 16:00", ], - freq="BH", + freq="bh", ) - rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="bh") tm.assert_index_equal(idx, rng) def test_date_range_timedelta(self): @@ -481,13 +481,13 @@ def test_range_misspecified(self): date_range(periods=10) with pytest.raises(ValueError, match=msg): - date_range(start="1/1/2000", freq="H") + date_range(start="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(end="1/1/2000", freq="H") + date_range(end="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq="H") + date_range(periods=10, freq="h") with pytest.raises(ValueError, match=msg): date_range() @@ -524,14 +524,14 @@ def test_construct_over_dst(self): pre_dst, pst_dst, ] - expected = DatetimeIndex(expect_data, freq="H") - result = 
date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") + expected = DatetimeIndex(expect_data, freq="h") + result = date_range(start="2010-11-7", periods=3, freq="h", tz="US/Pacific") tm.assert_index_equal(result, expected) def test_construct_with_different_start_end_string_format(self): # GH 12064 result = date_range( - "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="h" ) expected = DatetimeIndex( [ @@ -539,7 +539,7 @@ def test_construct_with_different_start_end_string_format(self): Timestamp("2013-01-01 01:00:00+09:00"), Timestamp("2013-01-01 02:00:00+09:00"), ], - freq="H", + freq="h", ) tm.assert_index_equal(result, expected) @@ -638,7 +638,7 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_closed(self, freq, inclusive_endpoints_fixture): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -653,7 +653,7 @@ def test_range_closed(self, freq, inclusive_endpoints_fixture): tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -674,7 +674,7 @@ def test_range_closed_with_tz_aware_start_end( tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "Y"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -841,6 +841,7 @@ def test_freq_dateoffset_with_relateivedelta_nanos(self): [ ("2Y", "2A"), ("200Y-MAY", "200A-MAY"), + ("h", "H"), ("2min", "2T"), ("1s", "1S"), ("2ms", "2L"), @@ -908,7 +909,7 @@ def test_date_range_with_tz(self, tzstr): stamp = Timestamp("3/11/2012 05:00", tz=tzstr) assert stamp.hour == 5 - rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) + rng = date_range("3/11/2012 04:00", periods=10, freq="h", tz=tzstr) assert stamp == rng[1] diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index d08757a206e67..156075e3fafec 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,3 +1,4 @@ +import datetime as dt from datetime import date import dateutil @@ -96,7 +97,7 @@ def test_append_nondatetimeindex(self): def test_iteration_preserves_tz(self): # see gh-8890 - index = date_range("2012-01-01", periods=3, freq="H", tz="US/Eastern") + index = date_range("2012-01-01", periods=3, freq="h", tz="US/Eastern") for i, ts in enumerate(index): result = ts @@ -104,7 +105,7 @@ def test_iteration_preserves_tz(self): assert result == expected index = date_range( - "2012-01-01", periods=3, freq="H", tz=dateutil.tz.tzoffset(None, -28800) + "2012-01-01", periods=3, freq="h", tz=dateutil.tz.tzoffset(None, -28800) ) for i, ts in enumerate(index): @@ -201,3 +202,27 @@ def test_asarray_tz_aware(self): result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) + + def test_CBH_deprecated(self): + msg = "'CBH' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" + ) + result = DatetimeIndex( + [ + "2022-12-12 09:00:00", + "2022-12-12 10:00:00", + "2022-12-12 11:00:00", + "2022-12-12 12:00:00", + "2022-12-12 13:00:00", + "2022-12-12 14:00:00", + "2022-12-12 15:00:00", + "2022-12-12 16:00:00", + ], + dtype="datetime64[ns]", + freq="cbh", + ) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_delete.py b/pandas/tests/indexes/datetimes/test_delete.py index 3565e516e69d5..90dfdb46cdfa5 100644 --- a/pandas/tests/indexes/datetimes/test_delete.py +++ b/pandas/tests/indexes/datetimes/test_delete.py @@ -42,25 +42,25 @@ def test_delete(self): for tz in [None, "Asia/Tokyo", "US/Pacific"]: idx = date_range( - start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + start="2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz ) expected = date_range( - start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz + start="2000-01-01 10:00", periods=9, freq="h", name="idx", tz=tz ) result = idx.delete(0) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == "H" + assert result.freqstr == "h" assert result.tz == expected.tz expected = date_range( - start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + start="2000-01-01 09:00", periods=9, freq="h", name="idx", tz=tz ) result = idx.delete(-1) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == "H" + assert result.freqstr == "h" assert result.tz == expected.tz def test_delete_slice(self): @@ -105,13 +105,13 @@ def test_delete_slice(self): ts = Series( 1, index=date_range( - "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz ), ) # preserve freq result = ts.drop(ts.index[:5]).index expected = date_range( - "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz + "2000-01-01 14:00", periods=5, freq="h", name="idx", tz=tz ) tm.assert_index_equal(result, expected) assert result.name == expected.name diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 502cb0407bfcd..9fb5db9e034ee 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -87,8 +87,8 @@ def test_dti_repr_short(self): ), ( ["2012-01-01"], - "24H", - "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24H')", + "24h", + "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24h')", ), ], ) @@ -108,7 +108,7 @@ def test_dti_representation(self, method): idxs.append( DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) ) @@ -135,7 +135,7 @@ def test_dti_representation(self, method): exp.append( "DatetimeIndex(['2011-01-01 09:00:00+09:00', " "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" - ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='h')" ) exp.append( "DatetimeIndex(['2011-01-01 09:00:00-05:00', " @@ -161,7 +161,7 @@ def test_dti_representation_to_series(self): idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( @@ -218,7 +218,7 @@ 
def test_dti_summary(self): idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( @@ -236,7 +236,7 @@ def test_dti_summary(self): exp5 = ( "DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " "to 2011-01-01 11:00:00+09:00\n" - "Freq: H" + "Freq: h" ) exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" diff --git a/pandas/tests/indexes/datetimes/test_freq_attr.py b/pandas/tests/indexes/datetimes/test_freq_attr.py index f5821a316358d..5cddf56cd1c73 100644 --- a/pandas/tests/indexes/datetimes/test_freq_attr.py +++ b/pandas/tests/indexes/datetimes/test_freq_attr.py @@ -31,7 +31,7 @@ def test_freq_setter_errors(self): idx._data.freq = "foo" @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48h", Hour(48)]) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_freq_setter(self, values, freq, tz): # GH#20678 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index d877110e72b26..37c580c98b139 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -31,7 +31,7 @@ def test_getitem_slice_keeps_name(self): # GH4226 st = Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") et = Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") - dr = date_range(st, et, freq="H", name="timebucket") + dr = date_range(st, et, freq="h", name="timebucket") assert dr[1:].name == dr.name def test_getitem(self): @@ -301,7 +301,7 @@ def test_take2(self, tz): idx = date_range( start="2010-01-01 09:00", end="2010-02-01 09:00", - freq="H", + freq="h", tz=tz, name="idx", ) @@ -407,7 +407,7 @@ def test_get_loc_key_unit_mismatch_not_castable(self): def test_get_loc_time_obj(self): # time indexing - idx = date_range("2000-01-01", periods=24, freq="H") + idx = date_range("2000-01-01", periods=24, freq="h") result = idx.get_loc(time(12)) expected = np.array([12]) @@ -603,7 +603,7 @@ def test_get_indexer_pad_requires_monotonicity(self): class TestMaybeCastSliceBound: def test_maybe_cast_slice_bounds_empty(self): # GH#14354 - empty_idx = date_range(freq="1H", periods=0, end="2015") + empty_idx = date_range(freq="1h", periods=0, end="2015") right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right") exp = Timestamp("2015-01-02 23:59:59.999999999") diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index ccfdb55fc8119..959fbab0dcec6 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -65,7 +65,7 @@ def test_join_object_index(self): assert isinstance(result[0], Timestamp) def test_join_utc_convert(self, join_type): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") left = rng.tz_convert("US/Eastern") right = rng.tz_convert("Europe/Berlin") diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index ac6d0a97956e4..7eea05c753b8a 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -24,7 +24,7 @@ class TestDatetimeIndexOps: ("Q", "day"), ("ME", 
"day"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 7ad0dfbaf6cb1..c4e23154b7ffc 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -194,7 +194,7 @@ def test_partial_slice(self): s["2004-12-31"] def test_partial_slice_daily(self): - rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500) + rng = date_range(freq="h", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-31"] diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 49597ef885183..7c7e57b51ccc0 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -29,7 +29,7 @@ def test_dti_time(self): assert (result == expected).all() def test_dti_date(self): - rng = date_range("1/1/2000", freq="12H", periods=10) + rng = date_range("1/1/2000", freq="12h", periods=10) result = pd.Index(rng).date expected = [t.date() for t in rng] assert (result == expected).all() @@ -122,8 +122,8 @@ def test_round(self, tz_naive_fixture): ) expected_elt = expected_rng[1] - tm.assert_index_equal(rng.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt + tm.assert_index_equal(rng.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): @@ -198,8 +198,8 @@ def test_no_rounding_occurs(self, tz_naive_fixture): (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), (["2018-01-01 00:15:00"], "ceil", "15min", ["2018-01-01 00:15:00"]), (["2018-01-01 00:15:00"], "floor", "15min", ["2018-01-01 00:15:00"]), - (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), - (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "ceil", "3h", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3h", ["1823-01-01 03:00:00"]), ( ("NaT", "1823-01-01 00:00:01"), "floor", @@ -223,7 +223,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): @pytest.mark.parametrize( "start, index_freq, periods", - [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + [("2018-01-01", "12h", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], ) @pytest.mark.parametrize( "round_freq", @@ -245,7 +245,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): "1s", "2s", "3s", - "12H", + "12h", "1D", ], ) @@ -326,7 +326,7 @@ def test_2000(self): tm.assert_index_equal(r1, r2) def test_hour(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="h") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Index) and r2.dtype == np.float64 diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index ca784948a5d29..6071c7fa8319b 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -98,8 +98,8 @@ def test_union_coverage(self, sort): assert result.freq == ordered.freq def test_union_bug_1730(self, sort): - rng_a = date_range("1/1/2012", periods=4, freq="3H") - rng_b = 
date_range("1/1/2012", periods=4, freq="4H") + rng_a = date_range("1/1/2012", periods=4, freq="3h") + rng_b = date_range("1/1/2012", periods=4, freq="4h") result = rng_a.union(rng_b, sort=sort) exp = list(rng_a) + list(rng_b[1:]) @@ -308,7 +308,7 @@ def test_intersection_empty(self, tz_aware_fixture, freq): def test_intersection_bug_1708(self): from pandas import DateOffset - index_1 = date_range("1/1/2012", periods=4, freq="12H") + index_1 = date_range("1/1/2012", periods=4, freq="12h") index_2 = index_1 + DateOffset(hours=1) result = index_1.intersection(index_2) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 756a72cf1849a..eb54ea8e4316f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -192,7 +192,7 @@ def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): expected = Index([9, 9, 9], dtype=np.int32) tm.assert_index_equal(ut.hour, expected) - @pytest.mark.parametrize("freq, n", [("H", 1), ("min", 60), ("s", 3600)]) + @pytest.mark.parametrize("freq, n", [("h", 1), ("min", 60), ("s", 3600)]) def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. @@ -204,7 +204,7 @@ def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) def test_dti_tz_convert_dst(self): - for freq, n in [("H", 1), ("min", 60), ("s", 3600)]: + for freq, n in [("h", 1), ("min", 60), ("s", 3600)]: # Start DST idx = date_range( "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" @@ -278,8 +278,8 @@ def test_tz_convert_roundtrip(self, tz_aware_fixture): idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") - idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") - exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="h", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="h") idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="min", tz="UTC") exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="min") @@ -314,7 +314,7 @@ def test_dti_tz_convert_tzlocal(self): ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) # Values are unmodified @@ -324,7 +324,7 @@ def test_dti_tz_convert_utc_to_local_no_modify(self, tz): @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_convert_unsorted(self, tzstr): - dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") + dr = date_range("2012-03-09", freq="h", periods=100, tz="utc") dr = dr.tz_convert(tzstr) result = dr[::-1].hour @@ -504,10 +504,10 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tm.assert_index_equal(reset, expected) def test_dti_tz_localize_naive(self): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") conv = rng.tz_localize("US/Pacific") - exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="h", tz="US/Pacific") tm.assert_index_equal(conv, 
exp._with_freq(None)) @@ -613,11 +613,11 @@ def test_dti_construction_ambiguous_endpoint(self, tz): with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): date_range( - "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" ) times = date_range( - "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" + "2013-10-26 23:00", "2013-10-27 01:00", freq="h", tz=tz, ambiguous="infer" ) assert times[0] == Timestamp("2013-10-26 23:00", tz=tz) assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz) @@ -637,11 +637,11 @@ def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): date_range( - "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" ) times = date_range( - "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option + "2019-03-10 00:00", "2019-03-10 02:00", freq="h", tz=tz, nonexistent=option ) assert times[-1] == Timestamp(expected, tz=tz) @@ -820,7 +820,7 @@ def test_dti_tz_constructors(self, tzstr): arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) + idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) idx2 = idx2._with_freq(None) # the others all have freq=None idx3 = DatetimeIndex(arr, tz=tzstr) idx4 = DatetimeIndex(np.array(arr), tz=tzstr) @@ -877,7 +877,7 @@ def test_dti_drop_dont_lose_tz(self): def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="h") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="min") assert t4.tz_convert(tz="UTC").freq == t4.freq @@ -927,9 +927,9 @@ def test_drop_dst_boundary(self): tm.assert_index_equal(result, expected) def test_date_range_localize(self): - rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern") rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") - rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + rng3 = date_range("3/11/2012 03:00", periods=15, freq="h") rng3 = rng3.tz_localize("US/Eastern") tm.assert_index_equal(rng._with_freq(None), rng3) @@ -944,9 +944,9 @@ def test_date_range_localize(self): tm.assert_index_equal(rng[:2], rng2) # Right before the DST transition - rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern") rng2 = DatetimeIndex( - ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="H" + ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="h" ) tm.assert_index_equal(rng, rng2) exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") @@ -956,7 +956,7 @@ def test_date_range_localize(self): assert exp.hour == 1 assert rng[1] == exp - rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern") assert rng[2].hour == 3 def test_timestamp_equality_different_timezones(self): @@ -974,7 +974,7 @@ def test_timestamp_equality_different_timezones(self): assert 
(berlin_range == eastern_range).all() def test_dti_intersection(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") left = rng[10:90][::-1] right = rng[20:80][::-1] @@ -984,8 +984,8 @@ def test_dti_intersection(self): assert result.tz == left.tz def test_dti_equals_with_tz(self): - left = date_range("1/1/2011", periods=100, freq="H", tz="utc") - right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") + left = date_range("1/1/2011", periods=100, freq="h", tz="utc") + right = date_range("1/1/2011", periods=100, freq="h", tz="US/Eastern") assert not left.equals(right) @@ -1036,7 +1036,7 @@ def test_dti_take_dont_lose_meta(self, tzstr): def test_utc_box_timestamp_and_localize(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) expected = rng[-1].astimezone(tz) @@ -1046,7 +1046,7 @@ def test_utc_box_timestamp_and_localize(self, tzstr): assert stamp.tzinfo == expected.tzinfo # right tzinfo - rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") + rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) # test not valid for dateutil timezones. # assert 'EDT' in repr(rng_eastern[0].tzinfo) @@ -1148,9 +1148,9 @@ def test_dti_convert_tz_aware_datetime_datetime(self, tz): def test_dti_setop_aware(self, setop): # non-overlapping # GH#39328 as of 2.0 we cast these to UTC instead of object - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") + rng = date_range("2012-11-15 00:00:00", periods=6, freq="h", tz="US/Central") - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="h", tz="US/Eastern") result = getattr(rng, setop)(rng2) @@ -1195,14 +1195,14 @@ def test_tz_localize_invalidates_freq(): # we only preserve freq in unambiguous cases # if localized to US/Eastern, this crosses a DST transition - dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="H") - assert dti.freq == "H" + dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="h") + assert dti.freq == "h" result = dti.tz_localize(None) # no-op - assert result.freq == "H" + assert result.freq == "h" result = dti.tz_localize("UTC") # unambiguous freq preservation - assert result.freq == "H" + assert result.freq == "h" result = dti.tz_localize("US/Eastern", nonexistent="shift_forward") assert result.freq is None @@ -1211,4 +1211,4 @@ def test_tz_localize_invalidates_freq(): # Case where we _can_ keep freq because we're length==1 dti2 = dti[:1] result = dti2.tz_localize("US/Eastern") - assert result.freq == "H" + assert result.freq == "h" diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index aff4944e7bd55..fcf297fd1b092 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -87,7 +87,7 @@ def test_properties(self, closed): [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf], pd.to_datetime(["20170101", "20170202", "20170303", "20170404"]), - pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5H", "6D"]), + pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5h", "6D"]), ], ) def test_length(self, closed, breaks): @@ -689,13 +689,13 @@ def 
test_datetime(self, tz): # test get_indexer start = Timestamp("1999-12-31T12:00", tz=tz) - target = date_range(start=start, periods=7, freq="12H") + target = date_range(start=start, periods=7, freq="12h") actual = index.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") tm.assert_numpy_array_equal(actual, expected) start = Timestamp("2000-01-08T18:00", tz=tz) - target = date_range(start=start, periods=7, freq="6H") + target = date_range(start=start, periods=7, freq="6h") actual = index.get_indexer(target) expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") tm.assert_numpy_array_equal(actual, expected) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 499220b39279d..6c531fb0428a3 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -58,7 +58,7 @@ def test_constructor_numeric(self, closed, name, freq, periods): @pytest.mark.parametrize("tz", [None, "US/Eastern"]) @pytest.mark.parametrize( - "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("ME", 11)] + "freq, periods", [("D", 364), ("2D", 182), ("22D18h", 16), ("ME", 11)] ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) @@ -93,7 +93,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + "freq, periods", [("D", 100), ("2D12h", 40), ("5D", 20), ("25D", 4)] ) def test_constructor_timedelta(self, closed, name, freq, periods): start, end = Timedelta("0 days"), Timedelta("100 days") @@ -130,7 +130,7 @@ def test_constructor_timedelta(self, closed, name, freq, periods): (0, 10, 3, 9), (0, 10, 1.5, 9), (0.5, 10, 3, 9.5), - (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + (Timedelta("0D"), Timedelta("10D"), "2D4h", Timedelta("8D16h")), ( Timestamp("2018-01-01"), Timestamp("2018-02-09"), @@ -140,7 +140,7 @@ def test_constructor_timedelta(self, closed, name, freq, periods): ( Timestamp("2018-01-01", tz="US/Eastern"), Timestamp("2018-01-20", tz="US/Eastern"), - "5D12H", + "5D12h", Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), ), ], diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index f91856c3948a0..27a8c6e9b7158 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -87,7 +87,7 @@ def test_inplace_mutation_resets_values(): def test_boxable_categorical_values(): - cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="H")) + cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="h")) result = MultiIndex.from_product([["a", "b", "c"], cat]).values expected = pd.Series( [ diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index cce6c98d71b47..c51dcb395c795 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -203,15 +203,15 @@ def test_from_arrays_tuples(idx): [ ( pd.period_range("2011-01-01", freq="D", periods=3), - pd.period_range("2015-01-01", freq="H", periods=3), + pd.period_range("2015-01-01", freq="h", periods=3), ), ( date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern"), - date_range("2015-01-01 10:00", freq="H", 
periods=3, tz="Asia/Tokyo"), + date_range("2015-01-01 10:00", freq="h", periods=3, tz="Asia/Tokyo"), ), ( pd.timedelta_range("1 days", freq="D", periods=3), - pd.timedelta_range("2 hours", freq="H", periods=3), + pd.timedelta_range("2 hours", freq="h", periods=3), ), ], ) @@ -229,7 +229,7 @@ def test_from_arrays_index_series_period_datetimetz_and_timedelta(idx1, idx2): def test_from_arrays_index_datetimelike_mixed(): idx1 = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") - idx2 = date_range("2015-01-01 10:00", freq="H", periods=3) + idx2 = date_range("2015-01-01 10:00", freq="h", periods=3) idx3 = pd.timedelta_range("1 days", freq="D", periods=3) idx4 = pd.period_range("2011-01-01", freq="D", periods=3) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d86692477f381..a65677bba35e4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -261,7 +261,7 @@ def test_get_indexer_categorical_time(self): midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) result = midx.get_indexer(midx) @@ -854,7 +854,7 @@ def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 idx = MultiIndex.from_product( [ - date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"), + date_range("2019-01-01T00:15:33", periods=100, freq="h", name="date"), ["x"], [3], ] @@ -866,7 +866,7 @@ def test_timestamp_multiindex_indexer(): date_range( start="2019-01-02T00:15:33", end="2019-01-05T03:15:33", - freq="H", + freq="h", name="date", ), ["x"], diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 66163dad3deae..64cc1fa621b31 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -28,7 +28,7 @@ def df(): # 2016-01-03 00:00:00 a 12 # b 13 # c 14 - dr = date_range("2016-01-01", "2016-01-03", freq="12H") + dr = date_range("2016-01-01", "2016-01-03", freq="12h") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) frame = DataFrame({"c1": range(15)}, index=mi) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index f9838ce272296..ed078a3e8fb8b 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -14,7 +14,7 @@ def test_asfreq(self): pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") - pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi5 = period_range(freq="h", start="1/1/2001", end="1/1/2001 00:00") pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") @@ -22,28 +22,28 @@ def test_asfreq(self): assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("M", "start") == pi3 assert pi1.asfreq("D", "StarT") == pi4 - assert pi1.asfreq("H", "beGIN") == pi5 + assert pi1.asfreq("h", "beGIN") == pi5 assert pi1.asfreq("Min", "s") == pi6 assert pi1.asfreq("s", "s") == pi7 assert pi2.asfreq("Y", "s") == pi1 assert pi2.asfreq("M", "s") == pi3 assert pi2.asfreq("D", "s") == pi4 - assert pi2.asfreq("H", "s") 
== pi5 + assert pi2.asfreq("h", "s") == pi5 assert pi2.asfreq("Min", "s") == pi6 assert pi2.asfreq("s", "s") == pi7 assert pi3.asfreq("Y", "s") == pi1 assert pi3.asfreq("Q", "s") == pi2 assert pi3.asfreq("D", "s") == pi4 - assert pi3.asfreq("H", "s") == pi5 + assert pi3.asfreq("h", "s") == pi5 assert pi3.asfreq("Min", "s") == pi6 assert pi3.asfreq("s", "s") == pi7 assert pi4.asfreq("Y", "s") == pi1 assert pi4.asfreq("Q", "s") == pi2 assert pi4.asfreq("M", "s") == pi3 - assert pi4.asfreq("H", "s") == pi5 + assert pi4.asfreq("h", "s") == pi5 assert pi4.asfreq("Min", "s") == pi6 assert pi4.asfreq("s", "s") == pi7 @@ -58,14 +58,14 @@ def test_asfreq(self): assert pi6.asfreq("Q", "s") == pi2 assert pi6.asfreq("M", "s") == pi3 assert pi6.asfreq("D", "s") == pi4 - assert pi6.asfreq("H", "s") == pi5 + assert pi6.asfreq("h", "s") == pi5 assert pi6.asfreq("s", "s") == pi7 assert pi7.asfreq("Y", "s") == pi1 assert pi7.asfreq("Q", "s") == pi2 assert pi7.asfreq("M", "s") == pi3 assert pi7.asfreq("D", "s") == pi4 - assert pi7.asfreq("H", "s") == pi5 + assert pi7.asfreq("h", "s") == pi5 assert pi7.asfreq("Min", "s") == pi6 msg = "How must be one of S or E" @@ -100,23 +100,23 @@ def test_asfreq_mult_pi(self, freq): assert result.freq == exp.freq def test_asfreq_combined_pi(self): - pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["S", "E"]): result = pi.asfreq(freq, how=how) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - for freq in ["1D1H", "1H1D"]: + for freq in ["1D1h", "1h1D"]: pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) - result = pi.asfreq("H") - exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") + result = pi.asfreq("h") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) - result = pi.asfreq("H", how="S") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + result = pi.asfreq("h", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq diff --git a/pandas/tests/indexes/period/methods/test_fillna.py b/pandas/tests/indexes/period/methods/test_fillna.py index 12a07bac25a59..ed6b4686a06de 100644 --- a/pandas/tests/indexes/period/methods/test_fillna.py +++ b/pandas/tests/indexes/period/methods/test_fillna.py @@ -10,19 +10,19 @@ class TestFillNA: def test_fillna_period(self): # GH#11343 - idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="h") exp = PeriodIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="h" ) - result = idx.fillna(Period("2011-01-01 10:00", freq="H")) + result = idx.fillna(Period("2011-01-01 10:00", freq="h")) tm.assert_index_equal(result, exp) exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), "x", - Period("2011-01-01 
11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) @@ -31,9 +31,9 @@ def test_fillna_period(self): exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), Period("2011-01-01", freq="D"), - Period("2011-01-01 11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) diff --git a/pandas/tests/indexes/period/methods/test_shift.py b/pandas/tests/indexes/period/methods/test_shift.py index d649dd3da0864..fca3e3a559e1f 100644 --- a/pandas/tests/indexes/period/methods/test_shift.py +++ b/pandas/tests/indexes/period/methods/test_shift.py @@ -64,12 +64,12 @@ def test_shift(self): def test_shift_corner_cases(self): # GH#9903 - idx = PeriodIndex([], name="xxx", freq="H") + idx = PeriodIndex([], name="xxx", freq="h") msg = "`freq` argument is not supported for PeriodIndex.shift" with pytest.raises(TypeError, match=msg): # period shift doesn't accept freq - idx.shift(1, freq="H") + idx.shift(1, freq="h") tm.assert_index_equal(idx.shift(0), idx) tm.assert_index_equal(idx.shift(3), idx) @@ -77,19 +77,19 @@ def test_shift_corner_cases(self): idx = PeriodIndex( ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(0), idx) exp = PeriodIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(3), exp) exp = PeriodIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(-3), exp) diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 462f66eef7269..2394efb353ab6 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -107,7 +107,7 @@ def test_to_timestamp_pi_mult(self): tm.assert_index_equal(result, expected) def test_to_timestamp_pi_combined(self): - idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") + idx = period_range(start="2011", periods=2, freq="1D1h", name="idx") result = idx.to_timestamp() expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") @@ -120,7 +120,7 @@ def test_to_timestamp_pi_combined(self): expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how="E", freq="H") + result = idx.to_timestamp(how="E", freq="h") expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index ac4edb10d9352..f1db5ab28be30 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -397,9 +397,9 @@ def test_constructor_freq_mult(self): ) tm.assert_index_equal(pidx, expected) - pidx = period_range(end="2014-01-01 17:00", freq="4H", periods=3) + pidx = period_range(end="2014-01-01 17:00", freq="4h", periods=3) expected = PeriodIndex( - ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4H" + ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4h" ) tm.assert_index_equal(pidx, expected) @@ -444,12 +444,12 @@ def test_constructor_freq_mult_dti_compat_month(self, mult): tm.assert_index_equal(pidx, 
expected) def test_constructor_freq_combined(self): - for freq in ["1D1H", "1H1D"]: + for freq in ["1D1h", "1h1D"]: pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) - expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25H") - for freq in ["1D1H", "1H1D"]: + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25h") + for freq in ["1D1h", "1h1D"]: pidx = period_range(start="2016-01-01", periods=2, freq=freq) - expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25h") tm.assert_index_equal(pidx, expected) def test_constructor(self): @@ -470,7 +470,7 @@ def test_constructor(self): pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") assert len(pi) == 261 * 9 - pi = period_range(freq="H", start="1/1/2001", end="12/31/2001 23:00") + pi = period_range(freq="h", start="1/1/2001", end="12/31/2001 23:00") assert len(pi) == 365 * 24 pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") @@ -526,7 +526,7 @@ def test_constructor(self): Period("2006-12-31", ("w", 1)) @pytest.mark.parametrize( - "freq", ["M", "Q", "Y", "D", "B", "min", "s", "ms", "us", "ns", "H"] + "freq", ["M", "Q", "Y", "D", "B", "min", "s", "ms", "us", "ns", "h"] ) @pytest.mark.filterwarnings( r"ignore:Period with BDay freq is deprecated:FutureWarning" diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 67deeccff4e2a..9441f56a75f03 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -56,7 +56,7 @@ def test_representation(self, method): idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") - idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") idx9 = pd.period_range("2013Q1", periods=3, freq="Q") @@ -77,7 +77,7 @@ def test_representation(self, method): exp6 = ( "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='period[H]')" + "dtype='period[h]')" ) exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]')" @@ -102,7 +102,7 @@ def test_representation_to_series(self): idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") - idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") @@ -130,7 +130,7 @@ def test_representation_to_series(self): exp6 = """0 2011-01-01 09:00 1 2012-02-01 10:00 2 NaT -dtype: period[H]""" +dtype: period[h]""" exp7 = """0 2013Q1 dtype: period[Q-DEC]""" @@ -158,7 +158,7 @@ def test_summary(self): idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") - idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") 
idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") @@ -180,7 +180,7 @@ def test_summary(self): Freq: Y-DEC""" exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT -Freq: H""" +Freq: h""" exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1 Freq: Q-DEC""" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index f5af550d94ab1..2683e25eda618 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -183,7 +183,7 @@ def test_getitem_seconds(self): "2014", "2013/02", "2013/01/02", - "2013/02/01 9H", + "2013/02/01 9h", "2013/02/01 09:00", ] for val in values: @@ -195,7 +195,7 @@ def test_getitem_seconds(self): ser = Series(np.random.default_rng(2).random(len(idx)), index=idx) tm.assert_series_equal(ser["2013/01/01 10:00"], ser[3600:3660]) - tm.assert_series_equal(ser["2013/01/01 9H"], ser[:3600]) + tm.assert_series_equal(ser["2013/01/01 9h"], ser[:3600]) for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(ser[d], ser) @@ -215,7 +215,7 @@ def test_getitem_day(self, idx_range): "2014", "2013/02", "2013/01/02", - "2013/02/01 9H", + "2013/02/01 9h", "2013/02/01 09:00", ] for val in values: @@ -230,7 +230,7 @@ def test_getitem_day(self, idx_range): tm.assert_series_equal(ser["2013/02"], ser[31:59]) tm.assert_series_equal(ser["2014"], ser[365:]) - invalid = ["2013/02/01 9H", "2013/02/01 09:00"] + invalid = ["2013/02/01 9h", "2013/02/01 09:00"] for val in invalid: with pytest.raises(KeyError, match=val): ser[val] @@ -479,13 +479,13 @@ def test_get_indexer_non_unique(self): # TODO: This method came from test_period; de-dup with version above def test_get_indexer2(self): - idx = period_range("2000-01-01", periods=3).asfreq("H", how="start") + idx = period_range("2000-01-01", periods=3).asfreq("h", how="start") tm.assert_numpy_array_equal( idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) ) target = PeriodIndex( - ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="H" + ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="h" ) tm.assert_numpy_array_equal( idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) @@ -501,7 +501,7 @@ def test_get_indexer2(self): np.array([0, -1, 1], dtype=np.intp), ) - msg = "Input has different freq=None from PeriodArray\\(freq=H\\)" + msg = "Input has different freq=None from PeriodArray\\(freq=h\\)" with pytest.raises(ValueError, match=msg): idx.get_indexer(target, "nearest", tolerance="1 minute") @@ -714,7 +714,7 @@ def test_take_fill_value(self): class TestGetValue: - @pytest.mark.parametrize("freq", ["H", "D"]) + @pytest.mark.parametrize("freq", ["h", "D"]) def test_get_value_datetime_hourly(self, freq): # get_loc and get_value should treat datetime objects symmetrically # TODO: this test used to test get_value, which is removed in 2.0. 
@@ -730,7 +730,7 @@ def test_get_value_datetime_hourly(self, freq): assert ser.loc[ts] == 7 ts2 = ts + Timedelta(hours=3) - if freq == "H": + if freq == "h": with pytest.raises(KeyError, match="2016-01-01 03:00"): pi.get_loc(ts2) with pytest.raises(KeyError, match="2016-01-01 03:00"): @@ -795,7 +795,7 @@ class TestAsOfLocs: def test_asof_locs_mismatched_type(self): dti = date_range("2016-01-01", periods=3) pi = dti.to_period("D") - pi2 = dti.to_period("H") + pi2 = dti.to_period("h") mask = np.array([0, 1, 0], dtype=bool) @@ -810,6 +810,6 @@ def test_asof_locs_mismatched_type(self): # TimedeltaIndex pi.asof_locs(dti - dti, mask) - msg = "Input has different freq=H" + msg = "Input has different freq=h" with pytest.raises(libperiod.IncompatibleFrequency, match=msg): pi.asof_locs(pi2, mask) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 0c445b4cdf770..22bb63d67f57f 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -161,7 +161,7 @@ def test_period_index_length(self): period_range(freq="Q", start="1/1/2001", end="12/1/2002"), period_range(freq="M", start="1/1/2001", end="1/1/2002"), period_range(freq="D", start="12/1/2001", end="6/1/2001"), - period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00"), + period_range(freq="h", start="12/31/2001", end="1/1/2002 23:00"), period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20"), period_range( freq="s", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" @@ -255,7 +255,7 @@ def test_iteration(self): def test_with_multi_index(self): # #1705 - index = date_range("1/1/2012", periods=4, freq="12H") + index = date_range("1/1/2012", periods=4, freq="12h") index_as_arrays = [index.to_period(freq="D"), index.hour] s = Series([0, 1, 2, 3], index_as_arrays) @@ -284,6 +284,15 @@ def test_period_index_frequency_ME_error_message(self): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + def test_H_deprecated_from_time_series(self): + # GH#52536 + msg = "'H' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + index = period_range(freq="2H", start="1/1/2001", end="12/1/2009") + series = Series(1, index=index) + assert isinstance(series, Series) + @pytest.mark.parametrize("freq", ["2A", "A-DEC", "200A-AUG"]) def test_a_deprecated_from_time_series(self, freq): # GH#52536 diff --git a/pandas/tests/indexes/period/test_resolution.py b/pandas/tests/indexes/period/test_resolution.py index 98ccfe6569798..680bdaa2e2a44 100644 --- a/pandas/tests/indexes/period/test_resolution.py +++ b/pandas/tests/indexes/period/test_resolution.py @@ -11,7 +11,7 @@ class TestResolution: ("Q", "quarter"), ("M", "month"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py index b9863d1bb019a..9b02a2f35fd01 100644 --- a/pandas/tests/indexes/period/test_searchsorted.py +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -27,9 +27,9 @@ def test_searchsorted(self, freq): assert pidx.searchsorted(NaT) == 5 - msg = "Input has different freq=H from PeriodArray" + msg = "Input has different freq=h from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): - pidx.searchsorted(Period("2014-01-01", freq="H")) + pidx.searchsorted(Period("2014-01-01", freq="h")) msg = "Input has different freq=5D from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 9610db5f0336b..b9a5940795a5b 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -43,8 +43,8 @@ def test_union(self, sort): other3 = PeriodIndex([], freq="D") expected3 = period_range("1/1/2000", freq="D", periods=5) - rng4 = period_range("2000-01-01 09:00", freq="H", periods=5) - other4 = period_range("2000-01-02 09:00", freq="H", periods=5) + rng4 = period_range("2000-01-01 09:00", freq="h", periods=5) + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) expected4 = PeriodIndex( [ "2000-01-01 09:00", @@ -58,7 +58,7 @@ def test_union(self, sort): "2000-01-02 12:00", "2000-01-02 13:00", ], - freq="H", + freq="h", ) rng5 = PeriodIndex( @@ -269,8 +269,8 @@ def test_difference(self, sort): "2000-01-01 11:00", "2000-01-01 13:00", ] - rng4 = PeriodIndex(period_rng, freq="H") - other4 = period_range("2000-01-02 09:00", freq="H", periods=5) + rng4 = PeriodIndex(period_rng, freq="h") + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) expected4 = rng4 rng5 = PeriodIndex( diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 2a9149844a353..f507e64d88b06 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -20,7 +20,7 @@ class TestPeriodRepresentation: ("W-THU", "1970-01-01"), ("D", "1970-01-01"), ("B", "1970-01-01"), - ("H", "1970-01-01"), + ("h", "1970-01-01"), ("min", "1970-01-01"), ("s", "1970-01-01"), ("ms", "1970-01-01"), diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index f69f0fd3d78e2..4f5ece61fc30c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -69,7 +69,7 @@ def test_astype(self): tm.assert_numpy_array_equal(rng.asi8, result.values) def test_astype_uint(self): - arr = 
timedelta_range("1H", periods=2) + arr = timedelta_range("1h", periods=2) with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): arr.astype("uint64") @@ -104,10 +104,10 @@ def test_astype_raises(self, dtype): idx.astype(dtype) def test_astype_category(self): - obj = timedelta_range("1H", periods=2, freq="H") + obj = timedelta_range("1h", periods=2, freq="h") result = obj.astype("category") - expected = pd.CategoricalIndex([Timedelta("1H"), Timedelta("2H")]) + expected = pd.CategoricalIndex([Timedelta("1h"), Timedelta("2h")]) tm.assert_index_equal(result, expected) result = obj._data.astype("category") @@ -115,7 +115,7 @@ def test_astype_category(self): tm.assert_categorical_equal(result, expected) def test_astype_array_fallback(self): - obj = timedelta_range("1H", periods=2) + obj = timedelta_range("1h", periods=2) result = obj.astype(bool) expected = Index(np.array([True, True])) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index e33b8de3e6594..a0986d1496881 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -14,17 +14,17 @@ class TestTimedeltaIndexShift: def test_tdi_shift_empty(self): # GH#9903 idx = TimedeltaIndex([], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) def test_tdi_shift_hours(self): # GH#9903 idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) def test_tdi_shift_minutes(self): # GH#9903 diff --git a/pandas/tests/indexes/timedeltas/test_freq_attr.py b/pandas/tests/indexes/timedeltas/test_freq_attr.py index 868da4329dccf..1912c49d3000f 100644 --- a/pandas/tests/indexes/timedeltas/test_freq_attr.py +++ b/pandas/tests/indexes/timedeltas/test_freq_attr.py @@ -12,7 +12,7 @@ class TestFreq: @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48h", Hour(48)]) def test_freq_setter(self, values, freq): # GH#20678 idx = TimedeltaIndex(values) diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 31cc8e18f58ce..397f9d9e18331 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -21,7 +21,7 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): # GH#4226 - tdi = timedelta_range("1d", "5d", freq="H", name="timebucket") + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +230,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="H", name="idx") + idx = 
timedelta_range(start="1d", end="2d", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index fe3ff1799e763..63db5c1b9c91d 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -63,8 +63,8 @@ def test_tdi_round(self): ) expected_elt = expected_rng[1] - tm.assert_index_equal(td.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt + tm.assert_index_equal(td.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): @@ -121,7 +121,7 @@ def test_round(self): ), ), ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), ]: r1 = t1.round(freq) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 6cdd6944e90ea..727b4eee00566 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -52,8 +52,8 @@ def test_union_coverage(self): assert result.freq == ordered.freq def test_union_bug_1730(self): - rng_a = timedelta_range("1 day", periods=4, freq="3H") - rng_b = timedelta_range("1 day", periods=4, freq="4H") + rng_a = timedelta_range("1 day", periods=4, freq="3h") + rng_b = timedelta_range("1 day", periods=4, freq="4h") result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 03531547ef042..f22bdb7a90516 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -46,6 +46,7 @@ def test_timedelta_range(self): @pytest.mark.parametrize( "depr_unit, unit", [ + ("H", "hour"), ("T", "minute"), ("t", "minute"), ("S", "second"), @@ -57,7 +58,8 @@ def test_timedelta_range(self): ("n", "nanosecond"), ], ) - def test_timedelta_units_T_S_L_U_N_deprecated(self, depr_unit, unit): + def test_timedelta_units_H_T_S_L_U_N_deprecated(self, depr_unit, unit): + # GH#52536 depr_msg = ( f"'{depr_unit}' is deprecated and will be removed in a future version." ) @@ -68,7 +70,7 @@ def test_timedelta_units_T_S_L_U_N_deprecated(self, depr_unit, unit): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12min"), (7, "16H"), (9, "12H")] + "periods, freq", [(3, "2D"), (5, "D"), (6, "19h12min"), (7, "16h"), (9, "12h")] ) def test_linspace_behavior(self, periods, freq): # GH 20976 @@ -76,6 +78,16 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("msg_freq, freq", [("H", "19H12min"), ("T", "19h12T")]) + def test_timedelta_range_H_T_deprecated(self, freq, msg_freq): + # GH#52536 + msg = f"'{msg_freq}' is deprecated and will be removed in a future version." 
+ + result = timedelta_range(start="0 days", end="4 days", periods=6) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + def test_errors(self): # not enough params msg = ( @@ -96,7 +108,7 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - timedelta_range(start="0 days", end="5 days", periods=10, freq="H") + timedelta_range(start="0 days", end="5 days", periods=10, freq="h") @pytest.mark.parametrize( "start, end, freq, expected_periods", diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2eee506e1feb7..70eada188f3c8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1464,7 +1464,7 @@ def test_loc_setitem_datetime_coercion(self): def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH#11365 tz = tz_naive_fixture - idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + idx = date_range(start="2015-07-12", periods=3, freq="h", tz=tz) expected = DataFrame(1.2, index=idx, columns=["var"]) # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype @@ -2057,7 +2057,7 @@ def test_loc_setitem_with_expansion_and_existing_dst(self): start = Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") end = Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") ts = Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") - idx = date_range(start, end, inclusive="left", freq="H") + idx = date_range(start, end, inclusive="left", freq="h") assert ts not in idx # i.e. result.loc setitem is with-expansion result = DataFrame(index=idx, columns=["value"]) @@ -2330,7 +2330,7 @@ def test_loc_getitem_partial_string_slicing_with_periodindex(self): tm.assert_series_equal(result, expected) def test_loc_getitem_partial_string_slicing_with_timedeltaindex(self): - ix = timedelta_range(start="1 day", end="2 days", freq="1H") + ix = timedelta_range(start="1 day", end="2 days", freq="1h") ser = ix.to_series() result = ser.loc[:"1 days"] expected = ser.iloc[:-1] @@ -2432,7 +2432,7 @@ def test_loc_getitem_label_slice_across_dst(self): "index", [ pd.period_range(start="2017-01-01", end="2018-01-01", freq="M"), - timedelta_range(start="1 day", end="2 days", freq="1H"), + timedelta_range(start="1 day", end="2 days", freq="1h"), ], ) def test_loc_getitem_label_slice_period_timedelta(self, index): @@ -2588,7 +2588,7 @@ def test_loc_setitem_mask_and_label_with_datetimeindex(self): df = DataFrame( np.arange(6.0).reshape(3, 2), columns=list("AB"), - index=date_range("1/1/2000", periods=3, freq="1H"), + index=date_range("1/1/2000", periods=3, freq="1h"), ) expected = df.copy() expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] @@ -2887,7 +2887,7 @@ def test_loc_datetimelike_mismatched_dtypes(): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=date_range("2012", freq="H", periods=5), + index=date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique DatetimeIndex df = df.iloc[[0, 2, 2, 3]].copy() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 51965f753a0fb..585958765f0b6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2166,7 +2166,7 @@ def test_period(self): "B": [ pd.Period("2011-01", freq="M"), pd.Period("2011-02-01", freq="D"), - 
pd.Period("2011-03-01 09:00", freq="H"), + pd.Period("2011-03-01 09:00", freq="h"), pd.Period("2011-04", freq="M"), ], "C": list("abcd"), @@ -2703,7 +2703,7 @@ def test_period(self): [ pd.Period("2011-01", freq="M"), pd.Period("2011-02-01", freq="D"), - pd.Period("2011-03-01 09:00", freq="H"), + pd.Period("2011-03-01 09:00", freq="h"), ] ) exp = ( @@ -3330,7 +3330,7 @@ def test_str(self): class TestPeriodIndexFormat: def test_period_format_and_strftime_default(self): - per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="H") + per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="h") # Default formatting formatted = per.format() @@ -3377,13 +3377,13 @@ def test_period_tz(self): # Converting to a period looses the timezone information # Since tz is currently set as utc, we'll see 2012 with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") + per = dt.to_period(freq="h") assert per.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") + per = dt.to_period(freq="h") assert per.format()[0] == "2013-01-01 00:00" @pytest.mark.parametrize( @@ -3406,7 +3406,7 @@ def test_period_non_ascii_fmt(self, locale_str): # Change locale temporarily for this test. with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") + per = pd.Period("2018-03-11 13:00", freq="h") assert per.strftime("%y é") == "18 é" # Index @@ -3438,7 +3438,7 @@ def test_period_custom_locale_directive(self, locale_str): am_local, pm_local = get_local_am_pm() # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") + per = pd.Period("2018-03-11 13:00", freq="h") assert per.strftime("%p") == pm_local # Index diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 822bd14610388..613f609320f31 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,7 +285,7 @@ def test_to_csv_different_datetime_formats(self): df = DataFrame( { "date": pd.to_datetime("1970-01-01"), - "datetime": pd.date_range("1970-01-01", periods=2, freq="H"), + "datetime": pd.date_range("1970-01-01", periods=2, freq="h"), } ) expected_rows = [ diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index d7a4dfca90b5e..943515acd33b5 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -32,7 +32,7 @@ def df_schema(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="min"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), ) @@ -45,7 +45,7 @@ def df_table(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="min"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], @@ -695,7 +695,7 @@ def test_read_json_table_orient(self, index_nm, vals, recwarn): @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) @pytest.mark.parametrize( "vals", - 
[{"timedeltas": pd.timedelta_range("1H", periods=4, freq="min")}], + [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}], ) def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b3c2e67f7c318..4280f8617517f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -521,7 +521,7 @@ def test_v12_compat(self, datapath): tm.assert_frame_equal(df_iso, df_unser_iso) def test_blocks_compat_GH9037(self): - index = pd.date_range("20000101", periods=10, freq="H") + index = pd.date_range("20000101", periods=10, freq="h") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 8671bccbc1bbd..8640c17a1349f 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -275,9 +275,9 @@ def test_categorical_coerces_timestamp(all_parsers): def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))} - data = "b\n1H\n2H\n3H" + data = "b\n1h\n2h\n3h" expected = DataFrame({"b": Categorical(dtype["b"].categories)}) result = parser.read_csv(StringIO(data), dtype=dtype) diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py index 799c2f22a9dc6..aef6fc0460cd9 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -21,7 +21,7 @@ def test_retain_index_attributes(setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} ) with ensure_clean_store(setup_path) as store: @@ -76,7 +76,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): with tm.assert_produces_warning(errors.AttributeConflictWarning): df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} ) df.to_hdf(path, key="data", mode="w", append=True) df2 = DataFrame( @@ -85,7 +85,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): df2.to_hdf(path, key="data", append=True) - idx = date_range("2000-1-1", periods=3, freq="H") + idx = date_range("2000-1-1", periods=3, freq="h") idx.name = "foo" df = DataFrame({"A": Series(range(3), index=idx)}) df.to_hdf(path, key="data", mode="w", append=True) @@ -93,7 +93,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): assert read_hdf(path, key="data").index.name == "foo" with tm.assert_produces_warning(errors.AttributeConflictWarning): - idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2 = date_range("2001-1-1", periods=3, freq="h") idx2.name = "bar" df2 = DataFrame({"A": Series(range(3), index=idx2)}) df2.to_hdf(path, key="data", append=True) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1eb7a34bead56..676b9374514e8 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -131,7 +131,7 @@ def 
test_append_with_timezones(setup_path, gettz): def test_append_with_timezones_as_index(setup_path, gettz): # GH#4098 example - dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) + dti = date_range("2000-1-1", periods=3, freq="h", tz=gettz("US/Eastern")) dti = dti._with_freq(None) # freq doesn't round-trip df = DataFrame({"A": Series(range(3), index=dti)}) @@ -332,7 +332,7 @@ def test_dst_transitions(setup_path): "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) times = times._with_freq(None) # freq doesn't round-trip diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e1839fc1b0a67..82fb98615100f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2411,7 +2411,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index f488ee7da87ac..db7c0cec09e6c 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -49,7 +49,7 @@ class TestTSPlot: def test_ts_plot_with_tz(self, tz_aware_fixture): # GH2877, GH17173, GH31205, GH31580 tz = tz_aware_fixture - index = date_range("1/1/2011", periods=2, freq="H", tz=tz) + index = date_range("1/1/2011", periods=2, freq="h", tz=tz) ts = Series([188.5, 328.25], index=index) _check_plot_works(ts.plot) ax = ts.plot() @@ -117,7 +117,7 @@ def test_nonnumeric_exclude_error(self): with pytest.raises(TypeError, match=msg): df["A"].plot() - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "Y"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) @@ -125,7 +125,7 @@ def test_tsplot_period(self, freq): _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -187,14 +187,14 @@ def check_format_of_first_point(ax, expected_string): daily.plot(ax=ax) check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "Y"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3Y"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the @@ -204,14 +204,14 @@ def 
test_line_plot_period_mlt_series(self, frqncy): _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "Y"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "Q", "Y"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -222,7 +222,7 @@ def test_line_plot_period_frame(self, freq): _check_plot_works(df.plot, df.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3Y"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) @@ -240,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -254,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -382,7 +382,7 @@ def test_freq_with_no_period_alias(self): def test_nonzero_base(self): # GH2571 - idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) + idx = date_range("2012-12-20", periods=24, freq="h") + timedelta(minutes=30) df = DataFrame(np.arange(24), index=idx) _, ax = mpl.pyplot.subplots() df.plot(ax=ax) @@ -551,13 +551,13 @@ def test_finder_minutely(self): def test_finder_hourly(self): nhours = 23 - rng = date_range("1/1/1999", freq="H", periods=nhours) + rng = date_range("1/1/1999", freq="h", periods=nhours) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period("1/1/1999", freq="H").ordinal + xp = Period("1/1/1999", freq="h").ordinal assert rs == xp @@ -814,7 +814,7 @@ def test_mixed_freq_hf_first(self): assert PeriodIndex(data=line.get_xdata()).freq == "D" def test_mixed_freq_alignment(self): - ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H") + ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="h") ts_data = np.random.default_rng(2).standard_normal(12) ts = Series(ts_data, index=ts_ind) @@ -842,7 +842,7 @@ def test_mixed_freq_lf_first(self): def test_mixed_freq_lf_first_hourly(self): idxh = date_range("1/1/1999", periods=240, freq="min") - idxl = date_range("1/1/1999", periods=4, freq="H") + idxl = date_range("1/1/1999", periods=4, freq="h") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = 
mpl.pyplot.subplots() @@ -1517,7 +1517,7 @@ def test_timedelta_short_period(self): def test_hist(self): # https://github.com/matplotlib/matplotlib/issues/8459 - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") x = rng w1 = np.arange(0, 1, 0.1) w2 = np.arange(0, 1, 0.1)[::-1] diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 3cea39fa75ece..74e521ab71f41 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -41,7 +41,7 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean(skipna=False) is pd.NaT @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - @pytest.mark.parametrize("freq", ["s", "H", "D", "W", "B"]) + @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"]) def test_period_mean(self, box, freq): # GH#24757 dti = pd.date_range("2001-01-01", periods=11) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index ad6ff70b14d25..42e741119b0a1 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -44,7 +44,7 @@ def _create_index(*args, **kwargs): return _create_index -@pytest.mark.parametrize("freq", ["2D", "1H"]) +@pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) @@ -65,16 +65,16 @@ def test_asfreq_fill_value(series, create_index): ser = series - result = ser.resample("1H").asfreq() - new_index = create_index(ser.index[0], ser.index[-1], freq="1H") + result = ser.resample("1h").asfreq() + new_index = create_index(ser.index[0], ser.index[-1], freq="1h") expected = ser.reindex(new_index) tm.assert_series_equal(result, expected) # Explicit cast to float to avoid implicit cast when setting None frame = ser.astype("float").to_frame("value") frame.iloc[1] = None - result = frame.resample("1H").asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], frame.index[-1], freq="1H") + result = frame.resample("1h").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1h") expected = frame.reindex(new_index, fill_value=4.0) tm.assert_frame_equal(result, expected) @@ -100,7 +100,7 @@ def test_raises_on_non_datetimelike_index(): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 @@ -108,7 +108,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) @@ -140,7 +140,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): [ pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")), "D", - "H", + "h", ], ) def test_resample_nat_index_series(freq, series, resample_method): @@ -164,7 +164,7 @@ def test_resample_nat_index_series(freq, series, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 @@ -172,7 +172,7 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) @@ -192,7 +192,7 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti @@ -200,7 +200,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): df.resample(freq, group_keys=False) @@ -234,7 +234,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 @@ -243,7 +243,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) @@ -261,7 +261,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 @@ -270,7 +270,7 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) @@ -308,7 +308,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti @@ -316,7 +316,7 @@ def test_apply_to_empty_series(empty_series_dti, freq): if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. 
'24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_series_dti.resample(freq) @@ -333,7 +333,7 @@ def test_apply_to_empty_series(empty_series_dti, freq): @all_ts def test_resampler_is_iterable(series): # GH 15314 - freq = "H" + freq = "h" tg = Grouper(freq=freq, convention="start") grouped = series.groupby(tg) resampled = series.resample(freq) @@ -347,7 +347,7 @@ def test_resample_quantile(series): # GH 15023 ser = series q = 0.75 - freq = "H" + freq = "h" result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index b40ae7d2bd8c0..63f9de2da733d 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -516,8 +516,8 @@ def test_upsample_with_limit(unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10s"]) -@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30H", "15Min", "30s"]) +@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"]) +@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit( @@ -604,9 +604,9 @@ def test_resample_ohlc_dataframe(unit): ).reindex(["VOLUME", "PRICE"], axis=1) df.index = df.index.as_unit(unit) df.columns.name = "Cols" - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp = pd.concat( - [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()], axis=1, keys=df.columns, ) @@ -614,7 +614,7 @@ def test_resample_ohlc_dataframe(unit): tm.assert_frame_equal(exp, res) df.columns = [["a", "b"], ["c", "d"]] - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp.columns = pd.MultiIndex.from_tuples( [ ("a", "c", "open"), @@ -659,7 +659,7 @@ def test_resample_reresample(unit): ).as_unit(unit) s = Series(np.random.default_rng(2).random(len(dti)), dti) bs = s.resample("B", closed="right", label="right").mean() - result = bs.resample("8H").mean() + result = bs.resample("8h").mean() assert len(result) == 25 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) @@ -954,13 +954,13 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(unit): ts_1 = Series(random_values, index=rng) result_1 = ts_1.resample("D", origin="epoch").mean() - result_2 = ts_1.resample("24H", origin="epoch").mean() + result_2 = ts_1.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_2) # check that we have the same behavior with epoch even if we are not timezone aware ts_no_tz = ts_1.tz_localize(None) result_3 = ts_no_tz.resample("D", origin="epoch").mean() - result_4 = ts_no_tz.resample("24H", origin="epoch").mean() + result_4 = ts_no_tz.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_3.tz_localize(rng.tz), check_freq=False) tm.assert_series_equal(result_1, result_4.tz_localize(rng.tz), check_freq=False) @@ -969,7 +969,7 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(unit): rng = date_range(start, end, freq="7min").as_unit(unit) ts_2 = Series(random_values, index=rng) result_5 = ts_2.resample("D", origin="epoch").mean() - result_6 = 
ts_2.resample("24H", origin="epoch").mean() + result_6 = ts_2.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1.tz_localize(None), result_5.tz_localize(None)) tm.assert_series_equal(result_1.tz_localize(None), result_6.tz_localize(None)) @@ -1005,27 +1005,27 @@ def _create_series(values, timestamps, freq="D"): expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] expected = _create_series([23.0, 2.0], expected_ts) - result = ts.resample("D", origin="start", offset="-2H").sum() + result = ts.resample("D", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"] - expected = _create_series([22.0, 3.0], expected_ts, freq="24H") - result = ts.resample("24H", origin="start", offset="-2H").sum() + expected = _create_series([22.0, 3.0], expected_ts, freq="24h") + result = ts.resample("24h", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 02:00-05:00", "2013-11-03 02:00-06:00"] expected = _create_series([3.0, 22.0], expected_ts) - result = ts.resample("D", origin="start", offset="2H").sum() + result = ts.resample("D", origin="start", offset="2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 23:00-05:00", "2013-11-03 23:00-06:00"] expected = _create_series([24.0, 1.0], expected_ts) - result = ts.resample("D", origin="start", offset="-1H").sum() + result = ts.resample("D", origin="start", offset="-1h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 01:00-05:00", "2013-11-03 01:00:00-0500"] expected = _create_series([1.0, 24.0], expected_ts) - result = ts.resample("D", origin="start", offset="1H").sum() + result = ts.resample("D", origin="start", offset="1h").sum() tm.assert_series_equal(result, expected) @@ -1054,7 +1054,7 @@ def test_period_with_agg(): # aggregate a period resampler with a lambda s2 = Series( np.random.default_rng(2).integers(0, 5, 50), - index=period_range("2012-01-01", freq="H", periods=50), + index=period_range("2012-01-01", freq="h", periods=50), dtype="float64", ) @@ -1114,12 +1114,12 @@ def test_resample_dtype_coercion(unit): df = {"a": [1, 3, 1, 4]} df = DataFrame(df, index=date_range("2017-01-01", "2017-01-04").as_unit(unit)) - expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") + expected = df.astype("float64").resample("h").mean()["a"].interpolate("cubic") - result = df.resample("H")["a"].mean().interpolate("cubic") + result = df.resample("h")["a"].mean().interpolate("cubic") tm.assert_series_equal(result, expected) - result = df.resample("H").mean()["a"].interpolate("cubic") + result = df.resample("h").mean()["a"].interpolate("cubic") tm.assert_series_equal(result, expected) @@ -1564,11 +1564,11 @@ def test_resample_across_dst(): pd.to_datetime(df2.ts, unit="s") .dt.tz_localize("UTC") .dt.tz_convert("Europe/Madrid"), - freq="H", + freq="h", ) df = DataFrame([5, 5], index=dti1) - result = df.resample(rule="H").sum() + result = df.resample(rule="h").sum() expected = DataFrame([5, 5], index=dti2) tm.assert_frame_equal(result, expected) @@ -1694,11 +1694,11 @@ def test_downsample_across_dst(unit): # GH 8531 tz = pytz.timezone("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2H").as_unit(unit) - result = Series(5, index=dates).resample("H").mean() + dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + result = Series(5, 
index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="H").as_unit(unit), + index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -1724,7 +1724,7 @@ def test_downsample_across_dst_weekly(unit): def test_downsample_across_dst_weekly_2(unit): # GH 9119, GH 21459 - idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H").as_unit( + idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="h").as_unit( unit ) s = Series(index=idx, dtype=np.float64) @@ -1742,7 +1742,7 @@ def test_downsample_dst_at_midnight(unit): # GH 25758 start = datetime(2018, 11, 3, 12) end = datetime(2018, 11, 5, 12) - index = date_range(start, end, freq="1H").as_unit(unit) + index = date_range(start, end, freq="1h").as_unit(unit) index = index.tz_localize("UTC").tz_convert("America/Havana") data = list(range(len(index))) dataframe = DataFrame(data, index=index) @@ -1844,14 +1844,14 @@ def f(data, add_arg): [ (30, "s", 0.5, "Min"), (60, "s", 1, "Min"), - (3600, "s", 1, "H"), - (60, "Min", 1, "H"), + (3600, "s", 1, "h"), + (60, "Min", 1, "h"), (21600, "s", 0.25, "D"), (86400, "s", 1, "D"), (43200, "s", 0.5, "D"), (1440, "Min", 1, "D"), - (12, "H", 0.5, "D"), - (24, "H", 1, "D"), + (12, "h", 0.5, "D"), + (24, "h", 1, "D"), ], ) def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): @@ -1871,7 +1871,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): [ ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), - ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), + ("19910905 06:00", "19920406 06:00", "h", "19910905 06:00", "19920406 07:00"), ("19910906", "19920406", "ME", "19910831", "19920430"), ("19910831", "19920430", "ME", "19910831", "19920531"), ("1991-08", "1992-04", "ME", "19910831", "19920531"), @@ -1922,32 +1922,32 @@ def test_resample_apply_product(duplicates, unit): "2020-03-28", "2020-03-31", "D", - "24H", + "24h", "2020-03-30 01:00", ), # includes transition into DST ( "2020-03-28", "2020-10-27", "D", - "24H", + "24h", "2020-10-27 00:00", ), # includes transition into and out of DST ( "2020-10-25", "2020-10-27", "D", - "24H", + "24h", "2020-10-26 23:00", ), # includes transition out of DST ( "2020-03-28", "2020-03-31", - "24H", + "24h", "D", "2020-03-30 00:00", ), # same as above, but from 24H to D - ("2020-03-28", "2020-10-27", "24H", "D", "2020-10-27 00:00"), - ("2020-10-25", "2020-10-27", "24H", "D", "2020-10-26 00:00"), + ("2020-03-28", "2020-10-27", "24h", "D", "2020-10-27 00:00"), + ("2020-10-25", "2020-10-27", "24h", "D", "2020-10-26 00:00"), ], ) def test_resample_calendar_day_with_dst( @@ -1981,7 +1981,7 @@ def test_resample_aggregate_functions_min_count(func, unit): def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit): # gh-43329 df = DataFrame( - index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12H").as_unit( + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12h").as_unit( unit ), columns=["x"], @@ -2098,7 +2098,7 @@ def test_resample_c_b_closed_right(freq: str): def test_resample_b_55282(): # https://github.com/pandas-dev/pandas/issues/55282 s = Series( - [1, 2, 3, 4, 5, 6], index=date_range("2023-09-26", periods=6, freq="12H") + [1, 2, 3, 4, 5, 6], index=date_range("2023-09-26", periods=6, freq="12h") ) result = s.resample("B", closed="right", 
label="right").mean() expected = Series( diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index f3e2eecf63d6b..d214e1b4ae4ae 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -41,7 +41,7 @@ def _series_name(): class TestPeriodIndex: - @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 @@ -65,23 +65,23 @@ def test_asfreq_fill_value(self, series): new_index = date_range( s.index[0].to_timestamp(how="start"), (s.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) + result = s.resample("1h", kind="timestamp").asfreq(fill_value=4.0) tm.assert_series_equal(result, expected) frame = s.to_frame("value") new_index = date_range( frame.index[0].to_timestamp(how="start"), (frame.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) + result = frame.resample("1h", kind="timestamp").asfreq(fill_value=3.0) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + @pytest.mark.parametrize("freq", ["h", "12h", "2D", "W"]) @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) @pytest.mark.parametrize("kwargs", [{"on": "date"}, {"level": "d"}]) def test_selection(self, index, freq, kind, kwargs): @@ -271,7 +271,7 @@ def test_with_local_timezone_pytz(self): # 1 day later end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) - index = date_range(start, end, freq="H") + index = date_range(start, end, freq="h") series = Series(1, index=index) series = series.tz_convert(local_timezone) @@ -287,7 +287,7 @@ def test_with_local_timezone_pytz(self): def test_resample_with_pytz(self): # GH 13238 s = Series( - 2, index=date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") + 2, index=date_range("2017-01-01", periods=48, freq="h", tz="US/Eastern") ) result = s.resample("D").mean() expected = Series( @@ -312,7 +312,7 @@ def test_with_local_timezone_dateutil(self): year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() ) - index = date_range(start, end, freq="H", name="idx") + index = date_range(start, end, freq="h", name="idx") series = Series(1, index=index) series = series.tz_convert(local_timezone) @@ -337,7 +337,7 @@ def test_resample_nonexistent_time_bin_edge(self): tm.assert_series_equal(result, expected) # GH 23742 - index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = date_range(start="2017-10-10", end="2017-10-20", freq="1h") index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") df = DataFrame(data=list(range(len(index))), index=index) result = df.groupby(pd.Grouper(freq="1D")).count() @@ -461,9 +461,9 @@ def test_upsample_daily_business_daily(self, simple_period_range_series): tm.assert_series_equal(result, expected) ts = simple_period_range_series("1/1/2000", "2/1/2000") - result = ts.resample("H", convention="s").asfreq() - exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") - expected = ts.asfreq("H", how="s").reindex(exp_rng) + result = ts.resample("h", 
convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="h") + expected = ts.asfreq("h", how="s").reindex(exp_rng) tm.assert_series_equal(result, expected) def test_resample_irregular_sparse(self): @@ -530,7 +530,7 @@ def test_resample_tz_localized(self): # GH 6397 # comparing an offset that doesn't propagate tz's - rng = date_range("1/1/2011", periods=20000, freq="H") + rng = date_range("1/1/2011", periods=20000, freq="h") rng = rng.tz_localize("EST") ts = DataFrame(index=rng) ts["first"] = np.random.default_rng(2).standard_normal(len(rng)) @@ -660,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("H", "D"), ("min", "H")], + [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("h", "D"), ("min", "h")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -752,7 +752,7 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample("7D").sum() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("freq, period_mult", [("h", 24), ("12h", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 @@ -829,19 +829,19 @@ def test_resample_with_only_nat(self): @pytest.mark.parametrize( "start,end,start_freq,end_freq,offset", [ - ("19910905", "19910909 03:00", "H", "24H", "10H"), - ("19910905", "19910909 12:00", "H", "24H", "10H"), - ("19910905", "19910909 23:00", "H", "24H", "10H"), - ("19910905 10:00", "19910909", "H", "24H", "10H"), - ("19910905 10:00", "19910909 10:00", "H", "24H", "10H"), - ("19910905", "19910909 10:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909", "H", "24H", "10H"), - ("19910905 12:00", "19910909 03:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), - ("19910905", "19910913 06:00", "2H", "24H", "10H"), + ("19910905", "19910909 03:00", "h", "24h", "10h"), + ("19910905", "19910909 12:00", "h", "24h", "10h"), + ("19910905", "19910909 23:00", "h", "24h", "10h"), + ("19910905 10:00", "19910909", "h", "24h", "10h"), + ("19910905 10:00", "19910909 10:00", "h", "24h", "10h"), + ("19910905", "19910909 10:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909", "h", "24h", "10h"), + ("19910905 12:00", "19910909 03:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "34h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", "3h"), + ("19910905", "19910913 06:00", "2h", "24h", "10h"), ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), ], @@ -858,11 +858,11 @@ def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): def test_resample_with_offset_month(self): # GH 23882 & 31809 - pi = period_range("19910905 12:00", "19910909 1:00", freq="H") + pi = period_range("19910905 12:00", "19910909 1:00", freq="h") ser = Series(np.arange(len(pi)), index=pi) - result = ser.resample("M", offset="3H").mean() + result = ser.resample("M", offset="3h").mean() result = result.to_timestamp("M") - 
expected = ser.to_timestamp().resample("ME", offset="3H").mean() + expected = ser.to_timestamp().resample("ME", offset="3h").mean() # TODO: is non-tick the relevant characteristic? (GH 33815) expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @@ -875,8 +875,8 @@ def test_resample_with_offset_month(self): ( "19910905 06:00", "19920406 06:00", - "H", - "H", + "h", + "h", "19910905 06:00", "19920406 06:00", ), @@ -912,7 +912,7 @@ def test_sum_min_count(self): tm.assert_series_equal(result, expected) def test_resample_t_l_deprecated(self): - # GH 52536 + # GH#52536 msg_t = "'T' is deprecated and will be removed in a future version." msg_l = "'L' is deprecated and will be removed in a future version." diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 86a7439410d8b..ff7b129c52f71 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -33,13 +33,13 @@ def test_frame(dti, _test_series): def test_str(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=start_day]" in str(r) ) - r = _test_series.resample("H", origin="2000-01-01") + r = _test_series.resample("h", origin="2000-01-01") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) @@ -47,12 +47,12 @@ def test_str(_test_series): def test_api(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") result = r.mean() assert isinstance(result, Series) assert len(result) == 217 - r = _test_series.to_frame().resample("H") + r = _test_series.to_frame().resample("h") result = r.mean() assert isinstance(result, DataFrame) assert len(result) == 217 @@ -127,36 +127,36 @@ def test_pipe(test_frame, _test_series): # GH17905 # series - r = _test_series.resample("H") + r = _test_series.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_series_equal(result, expected) # dataframe - r = test_frame.resample("H") + r = test_frame.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_frame_equal(result, expected) def test_getitem(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) - r = test_frame.resample("H")["B"] + r = test_frame.resample("h")["B"] assert r._selected_obj.name == test_frame.columns[1] # technically this is allowed - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) @pytest.mark.parametrize("key", [["D"], ["A", "D"]]) def test_select_bad_cols(key, test_frame): - g = test_frame.resample("H") + g = test_frame.resample("h") # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! 
msg = r"^\"Columns not found: 'D'\"$" @@ -165,7 +165,7 @@ def test_select_bad_cols(key, test_frame): def test_attribute_access(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_series_equal(r.A.sum(), r["A"].sum()) @@ -188,7 +188,7 @@ def test_api_compat_before_use(attr): def tests_raises_on_nuisance(test_frame): df = test_frame df["D"] = "foo" - r = df.resample("H") + r = df.resample("h") result = r[["A", "B"]].mean() expected = pd.concat([r.A.mean(), r.B.mean()], axis=1) tm.assert_frame_equal(result, expected) @@ -1041,7 +1041,7 @@ def test_series_axis_param_depr(_test_series): "deprecated and will be removed in a future version." ) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - _test_series.resample("H", axis=0) + _test_series.resample("h", axis=0) def test_resample_empty(): @@ -1061,5 +1061,5 @@ def test_resample_empty(): ] ) ) - result = df.resample("8H").mean() + result = df.resample("8h").mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index f394e3a25dc0f..55365d14a68ca 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -327,10 +327,10 @@ def test_apply_columns_multilevel(): ind = date_range(start="2017-01-01", freq="15Min", periods=8) df = DataFrame(np.array([0] * 16).reshape(8, 2), index=ind, columns=cols) agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} - result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) + result = df.resample("h").apply(lambda x: agg_dict[x.name](x)) expected = DataFrame( 2 * [[0, 0.0]], - index=date_range(start="2017-01-01", freq="1H", periods=2), + index=date_range(start="2017-01-01", freq="1h", periods=2), columns=pd.MultiIndex.from_tuples( [("A", "a", "", "one"), ("B", "b", "i", "two")] ), @@ -418,14 +418,14 @@ def test_apply_to_one_column_of_df(): ) # access "col" via getattr -> make sure we handle AttributeError - result = df.resample("H").apply(lambda group: group.col.sum()) + result = df.resample("h").apply(lambda group: group.col.sum()) expected = Series( - [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="H") + [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="h") ) tm.assert_series_equal(result, expected) # access "col" via _getitem__ -> make sure we handle KeyErrpr - result = df.resample("H").apply(lambda group: group["col"].sum()) + result = df.resample("h").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) @@ -608,7 +608,7 @@ def test_groupby_resample_size_all_index_same(): # GH 46826 df = DataFrame( {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, - index=date_range("31/12/2000 18:00", freq="H", periods=12), + index=date_range("31/12/2000 18:00", freq="h", periods=12), ) msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index c00366b2e28ce..e5593302625ec 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -273,7 +273,7 @@ def test_aggregate_with_nat_size(): def test_repr(): # GH18203 - result = repr(Grouper(key="A", freq="H")) + result = repr(Grouper(key="A", freq="h")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', 
label='left', how='mean', " @@ -281,7 +281,7 @@ def test_repr(): ) assert result == expected - result = repr(Grouper(key="A", freq="H", origin="2000-01-01")) + result = repr(Grouper(key="A", freq="h", origin="2000-01-01")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', label='left', how='mean', " @@ -304,7 +304,7 @@ def test_repr(): ], ) def test_upsample_sum(method, method_args, expected_values): - s = Series(1, index=date_range("2017", periods=2, freq="H")) + s = Series(1, index=date_range("2017", periods=2, freq="h")) resampled = s.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 79b13673e70c6..606403ba56494 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -128,13 +128,13 @@ def test_resample_timedelta_values(): @pytest.mark.parametrize( "start, end, freq, resample_freq", [ - ("8H", "21h59min50s", "10s", "3H"), # GH 30353 example - ("3H", "22H", "1H", "5H"), + ("8h", "21h59min50s", "10s", "3h"), # GH 30353 example + ("3h", "22h", "1h", "5h"), ("527D", "5006D", "3D", "10D"), ("1D", "10D", "1D", "2D"), # GH 13022 example # tests that worked before GH 33498: - ("8H", "21h59min50s", "10s", "2H"), - ("0H", "21h59min50s", "10s", "3H"), + ("8h", "21h59min50s", "10s", "2h"), + ("0h", "21h59min50s", "10s", "3h"), ("10D", "85D", "D", "2D"), ], ) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index aa1a74aadae12..74c79d20a3fb3 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -272,7 +272,7 @@ def test_concat_mixed_objs(self): # G2385 # axis 1 - index = date_range("01-Jan-2013", periods=10, freq="H") + index = date_range("01-Jan-2013", periods=10, freq="h") arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 12d28c388d508..51398acd6ec57 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -46,8 +46,8 @@ def test_concat_datetime_datetime64_frame(self): def test_concat_datetime_timezone(self): # GH 18523 - idx1 = date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") - idx2 = date_range(start=idx1[0], end=idx1[-1], freq="H") + idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris") + idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h") df1 = DataFrame({"a": [1, 2, 3]}, index=idx1) df2 = DataFrame({"b": [1, 2, 3]}, index=idx2) result = concat([df1, df2], axis=1) @@ -59,7 +59,7 @@ def test_concat_datetime_timezone(self): "2011-01-01 01:00:00+01:00", "2011-01-01 02:00:00+01:00", ], - freq="H", + freq="h", ) .tz_convert("UTC") .tz_convert("Europe/Paris") @@ -71,7 +71,7 @@ def test_concat_datetime_timezone(self): tm.assert_frame_equal(result, expected) - idx3 = date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) result = concat([df1, df3], axis=1) @@ -102,7 +102,7 @@ def test_concat_datetime_timezone(self): tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample - result = concat([df1.resample("H").mean(), df2.resample("H").mean()], sort=True) + 
result = concat([df1.resample("h").mean(), df2.resample("h").mean()], sort=True) expected = DataFrame( {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, index=idx1.append(idx1), diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d203a04a7fffc..4d779349b5c14 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -901,7 +901,7 @@ def test_merge_on_datetime64tz_empty(self): def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 df1 = DataFrame( - pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + pd.date_range("2017-10-29 01:00", periods=4, freq="h", tz="Europe/Madrid"), columns=["date"], ) df1["value"] = 1 @@ -922,7 +922,7 @@ def test_merge_datetime64tz_with_dst_transition(self): expected = DataFrame( { "date": pd.date_range( - "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + "2017-10-29 01:00", periods=7, freq="h", tz="Europe/Madrid" ), "value_x": [1] * 4 + [np.nan] * 3, "value_y": [np.nan] * 4 + [2] * 3, diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index a02dbf0a0413f..4841c488a5768 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -80,8 +80,8 @@ def test_hash(self, interval): (-np.inf, np.inf, np.inf), (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), - (Timedelta("1H10min"), Timedelta("5H5min"), Timedelta("3H55min")), - (Timedelta("5s"), Timedelta("1H"), Timedelta("59min55s")), + (Timedelta("1h10min"), Timedelta("5h5min"), Timedelta("3h55min")), + (Timedelta("5s"), Timedelta("1h"), Timedelta("59min55s")), ], ) def test_length(self, left, right, expected): diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4287a69823aef..597282e10052e 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -78,8 +78,8 @@ def test_conv_annual(self): ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="h", year=2007, month=12, day=31, hour=23) ival_A_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -111,8 +111,10 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - assert ival_A.asfreq("H", "s") == ival_A_to_H_start - assert ival_A.asfreq("H", "E") == ival_A_to_H_end + msg = "'H' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("H", "s") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end msg = "'T' is deprecated and will be removed in a future version." 
@@ -154,8 +156,8 @@ def test_conv_quarterly(self): ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="h", year=2007, month=3, day=31, hour=23) ival_Q_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -187,8 +189,8 @@ def test_conv_quarterly(self): assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end assert ival_Q.asfreq("D", "s") == ival_Q_to_D_start assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end - assert ival_Q.asfreq("H", "s") == ival_Q_to_H_start - assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("h", "s") == ival_Q_to_H_start + assert ival_Q.asfreq("h", "E") == ival_Q_to_H_end assert ival_Q.asfreq("Min", "s") == ival_Q_to_T_start assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end assert ival_Q.asfreq("s", "s") == ival_Q_to_S_start @@ -216,8 +218,8 @@ def test_conv_monthly(self): ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="h", year=2007, month=1, day=31, hour=23) ival_M_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -243,8 +245,8 @@ def test_conv_monthly(self): assert ival_M.asfreq("B", "E") == ival_M_to_B_end assert ival_M.asfreq("D", "s") == ival_M_to_D_start assert ival_M.asfreq("D", "E") == ival_M_to_D_end - assert ival_M.asfreq("H", "s") == ival_M_to_H_start - assert ival_M.asfreq("H", "E") == ival_M_to_H_end + assert ival_M.asfreq("h", "s") == ival_M_to_H_start + assert ival_M.asfreq("h", "E") == ival_M_to_H_end assert ival_M.asfreq("Min", "s") == ival_M_to_T_start assert ival_M.asfreq("Min", "E") == ival_M_to_T_end assert ival_M.asfreq("s", "s") == ival_M_to_S_start @@ -306,8 +308,8 @@ def test_conv_weekly(self): ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="h", year=2007, month=1, day=7, hour=23) ival_W_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -352,8 +354,8 @@ def test_conv_weekly(self): assert ival_WMON.asfreq("D", "s") == ival_WMON_to_D_start assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end - assert ival_W.asfreq("H", "s") == ival_W_to_H_start - assert ival_W.asfreq("H", "E") == ival_W_to_H_end + assert ival_W.asfreq("h", "s") == ival_W_to_H_start + assert ival_W.asfreq("h", "E") == ival_W_to_H_end assert ival_W.asfreq("Min", "s") == ival_W_to_T_start assert ival_W.asfreq("Min", "E") == ival_W_to_T_end assert ival_W.asfreq("s", "s") == ival_W_to_S_start @@ -399,8 +401,8 
@@ def test_conv_business(self): ival_B_to_M = Period(freq="M", year=2007, month=1) ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) ival_B_to_D = Period(freq="D", year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_B_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -425,8 +427,8 @@ def test_conv_business(self): assert ival_B.asfreq("D") == ival_B_to_D - assert ival_B.asfreq("H", "s") == ival_B_to_H_start - assert ival_B.asfreq("H", "E") == ival_B_to_H_end + assert ival_B.asfreq("h", "s") == ival_B_to_H_start + assert ival_B.asfreq("h", "E") == ival_B_to_H_end assert ival_B.asfreq("Min", "s") == ival_B_to_T_start assert ival_B.asfreq("Min", "E") == ival_B_to_T_end assert ival_B.asfreq("s", "s") == ival_B_to_S_start @@ -465,8 +467,8 @@ def test_conv_daily(self): ival_D_to_M = Period(freq="M", year=2007, month=1) ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) - ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_D_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -503,8 +505,8 @@ def test_conv_daily(self): assert ival_D_sunday.asfreq("B", "s") == ival_B_friday assert ival_D_sunday.asfreq("B", "E") == ival_B_monday - assert ival_D.asfreq("H", "s") == ival_D_to_H_start - assert ival_D.asfreq("H", "E") == ival_D_to_H_end + assert ival_D.asfreq("h", "s") == ival_D_to_H_start + assert ival_D.asfreq("h", "E") == ival_D_to_H_end assert ival_D.asfreq("Min", "s") == ival_D_to_T_start assert ival_D.asfreq("Min", "E") == ival_D_to_T_end assert ival_D.asfreq("s", "s") == ival_D_to_S_start @@ -515,13 +517,13 @@ def test_conv_daily(self): def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" - ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq="H", year=2007, month=12, day=31, hour=23) - ival_H_end_of_quarter = Period(freq="H", year=2007, month=3, day=31, hour=23) - ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) - ival_H_end_of_week = Period(freq="H", year=2007, month=1, day=7, hour=23) - ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) - ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="h", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = Period(freq="h", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="h", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="h", year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="h", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_H_to_A = Period(freq="Y", year=2007) ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) @@ -563,7 +565,7 @@ def test_conv_hourly(self): assert ival_H.asfreq("s", "s") == ival_H_to_S_start assert ival_H.asfreq("s", "E") == ival_H_to_S_end - assert 
ival_H.asfreq("H") == ival_H + assert ival_H.asfreq("h") == ival_H def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" @@ -598,7 +600,7 @@ def test_conv_minutely(self): ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_T_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_T_to_S_start = Period( freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 @@ -620,8 +622,8 @@ def test_conv_minutely(self): with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_T.asfreq("B") == ival_T_to_B assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B - assert ival_T.asfreq("H") == ival_T_to_H - assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + assert ival_T.asfreq("h") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("h") == ival_T_to_H assert ival_T.asfreq("s", "s") == ival_T_to_S_start assert ival_T.asfreq("s", "E") == ival_T_to_S_end @@ -664,7 +666,7 @@ def test_conv_secondly(self): ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) assert ival_S.asfreq("Y") == ival_S_to_A @@ -680,8 +682,8 @@ def test_conv_secondly(self): with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_S.asfreq("B") == ival_S_to_B assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B - assert ival_S.asfreq("H") == ival_S_to_H - assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("h") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("h") == ival_S_to_H assert ival_S.asfreq("Min") == ival_S_to_T assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T @@ -778,24 +780,24 @@ def test_asfreq_mult(self): def test_asfreq_combined(self): # normal freq to combined freq - p = Period("2007", freq="H") + p = Period("2007", freq="h") # ordinal will not change - expected = Period("2007", freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["E", "S"]): + expected = Period("2007", freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["E", "S"]): result = p.asfreq(freq, how=how) assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # combined freq to normal freq - p1 = Period(freq="1D1H", year=2007) - p2 = Period(freq="1H1D", year=2007) + p1 = Period(freq="1D1h", year=2007) + p2 = Period(freq="1h1D", year=2007) # ordinal will change because how=E is the default - result1 = p1.asfreq("H") - result2 = p2.asfreq("H") - expected = Period("2007-01-02", freq="H") + result1 = p1.asfreq("h") + result2 = p2.asfreq("h") + expected = Period("2007-01-02", freq="h") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -804,9 +806,9 @@ def test_asfreq_combined(self): assert result2.freq == expected.freq # ordinal will not change - result1 = p1.asfreq("H", how="S") - result2 = p2.asfreq("H", how="S") - expected = Period("2007-01-01", freq="H") + result1 = p1.asfreq("h", how="S") + result2 = p2.asfreq("h", how="S") + expected = Period("2007-01-01", freq="h") assert result1 == expected assert 
result1.ordinal == expected.ordinal assert result1.freq == expected.freq diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index c7d42a83e663c..6c27881e44b56 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -340,7 +340,7 @@ def test_constructor_infer_freq(self): assert p.freq == "D" p = Period("2007-01-01 07") - assert p.freq == "H" + assert p.freq == "h" p = Period("2007-01-01 07:10") assert p.freq == "min" @@ -428,7 +428,7 @@ def test_period_from_ordinal(self): assert p == res assert isinstance(res, Period) - @pytest.mark.parametrize("freq", ["Y", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_construct_from_nat_string_and_freq(self, freq): per = Period("NaT", freq=freq) assert per is NaT @@ -449,7 +449,7 @@ def test_period_cons_nat(self): p = Period(iNaT, freq="3D") assert p is NaT - p = Period(iNaT, freq="1D1H") + p = Period(iNaT, freq="1D1h") assert p is NaT p = Period("NaT") @@ -491,14 +491,14 @@ def test_period_cons_mult(self): def test_period_cons_combined(self): p = [ ( - Period("2011-01", freq="1D1H"), - Period("2011-01", freq="1H1D"), - Period("2011-01", freq="H"), + Period("2011-01", freq="1D1h"), + Period("2011-01", freq="1h1D"), + Period("2011-01", freq="h"), ), ( - Period(ordinal=1, freq="1D1H"), - Period(ordinal=1, freq="1H1D"), - Period(ordinal=1, freq="H"), + Period(ordinal=1, freq="1D1h"), + Period(ordinal=1, freq="1h1D"), + Period(ordinal=1, freq="h"), ), ] @@ -507,49 +507,49 @@ def test_period_cons_combined(self): assert p2.ordinal == p3.ordinal assert p1.freq == offsets.Hour(25) - assert p1.freqstr == "25H" + assert p1.freqstr == "25h" assert p2.freq == offsets.Hour(25) - assert p2.freqstr == "25H" + assert p2.freqstr == "25h" assert p3.freq == offsets.Hour() - assert p3.freqstr == "H" + assert p3.freqstr == "h" result = p1 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p1 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" - msg = "Frequency must be positive, because it represents span: -25H" + msg = "Frequency must be positive, because it represents span: -25h" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1D1H") + Period("2011-01", freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1H1D") + Period("2011-01", freq="-1h1D") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1D1H") + Period(ordinal=1, freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1H1D") + Period(ordinal=1, freq="-1h1D") msg = "Frequency must be positive, because it represents span: 0D" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="0D0H") + Period("2011-01", freq="0D0h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="0D0H") + Period(ordinal=1, freq="0D0h") # You can only combine together day and intraday offsets msg = "Invalid frequency: 1W1D" @@ -584,7 +584,7 @@ def test_period_constructor_nanosecond(self, day, 
hour, sec_float, expected): def test_period_large_ordinal(self, hour): # Issue #36430 # Integer overflow for Period over the maximum timestamp - p = Period(ordinal=2562048 + hour, freq="1H") + p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour @@ -635,7 +635,7 @@ def test_to_timestamp(self): assert end_ts == p.to_timestamp("D", how=a) assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ["Y", "Q", "M", "W", "B", "D", "H", "Min", "s"] + from_lst = ["Y", "Q", "M", "W", "B", "D", "h", "Min", "s"] def _ex(p): if p.freq == "B": @@ -655,10 +655,10 @@ def _ex(p): p = Period("1985", freq="Y") - result = p.to_timestamp("H", how="end") + result = p.to_timestamp("h", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp("3H", how="end") + result = p.to_timestamp("3h", how="end") assert result == expected result = p.to_timestamp("min", how="end") @@ -672,13 +672,13 @@ def _ex(p): assert result == expected expected = datetime(1985, 1, 1) - result = p.to_timestamp("H", how="start") + result = p.to_timestamp("h", how="start") assert result == expected result = p.to_timestamp("min", how="start") assert result == expected result = p.to_timestamp("s", how="start") assert result == expected - result = p.to_timestamp("3H", how="start") + result = p.to_timestamp("3h", how="start") assert result == expected result = p.to_timestamp("5s", how="start") assert result == expected @@ -731,7 +731,7 @@ def test_to_timestamp_microsecond(self, ts, expected, freq): ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "ms"), ("2000-12-15 13:45:26", "s", "2000-12-15 13:45:26", "s"), ("2000-12-15 13:45:26", "min", "2000-12-15 13:45", "min"), - ("2000-12-15 13:45:26", "H", "2000-12-15 13:00", "H"), + ("2000-12-15 13:45:26", "h", "2000-12-15 13:00", "h"), ("2000-12-15", "Y", "2000", "Y-DEC"), ("2000-12-15", "Q", "2000Q4", "Q-DEC"), ("2000-12-15", "M", "2000-12", "M"), @@ -763,7 +763,7 @@ def test_strftime(self): class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" - @pytest.mark.parametrize("freq", ["Y", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_is_leap_year(self, freq): # GH 13727 p = Period("2000-01-01 00:00:00", freq=freq) @@ -808,7 +808,7 @@ def test_period_deprecated_freq(self): "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], - "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "h": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], "min": ["minute", "MINUTE", "MINUTELY", "minutely"], "s": ["sec", "SEC", "SECOND", "SECONDLY", "second"], "ms": ["MILLISECOND", "MILLISECONDLY", "millisecond"], @@ -861,7 +861,7 @@ def test_inner_bounds_start_and_end_time(self, bound, offset, period_property): assert getattr(period, period_property).floor("s") == expected def test_start_time(self): - freq_lst = ["Y", "Q", "M", "D", "H", "min", "s"] + freq_lst = ["Y", "Q", "M", "D", "h", "min", "s"] xp = datetime(2012, 1, 1) for f in freq_lst: p = Period("2012", freq=f) @@ -891,7 +891,7 @@ def _ex(*args): xp = _ex(2012, 1, 2) assert xp == p.end_time - p = Period("2012", freq="H") + p = Period("2012", freq="h") xp = _ex(2012, 1, 1, 1) assert xp == p.end_time @@ -909,11 +909,11 @@ def _ex(*args): xp = _ex(2012, 1, 16) assert xp == p.end_time - p = Period("2012", freq="1D1H") + p = Period("2012", freq="1D1h") xp = _ex(2012, 1, 2, 1) assert xp 
== p.end_time - p = Period("2012", freq="1H1D") + p = Period("2012", freq="1h1D") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time @@ -1023,8 +1023,8 @@ def test_properties_daily(self): def test_properties_hourly(self): # Test properties on Periods with hourly frequency. - h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) + h_date1 = Period(freq="h", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2h", year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: assert h_date.year == 2007 @@ -1036,7 +1036,7 @@ def test_properties_hourly(self): assert h_date.hour == 0 assert h_date.days_in_month == 31 assert ( - Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + Period(freq="h", year=2012, month=2, day=1, hour=0).days_in_month == 29 ) def test_properties_minutely(self): @@ -1411,7 +1411,7 @@ def test_add_offset(self): with pytest.raises(IncompatibleFrequency, match=msg): o + p - for freq in ["H", "2H", "3H"]: + for freq in ["h", "2h", "3h"]: p = Period("2011-04-01 09:00", freq=freq) exp = Period("2011-04-03 09:00", freq=freq) @@ -1516,7 +1516,7 @@ def test_sub_offset(self): with pytest.raises(IncompatibleFrequency, match=msg): p - o - for freq in ["H", "2H", "3H"]: + for freq in ["h", "2h", "3h"]: p = Period("2011-04-01 09:00", freq=freq) assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) @@ -1589,7 +1589,7 @@ def test_small_year_parsing(): def test_negone_ordinals(): - freqs = ["Y", "M", "Q", "D", "H", "min", "s"] + freqs = ["Y", "M", "Q", "D", "h", "min", "s"] period = Period(ordinal=-1, freq="D") for freq in freqs: diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 7bd9e5fc5e293..0d876fbb9bde8 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -208,8 +208,8 @@ def test_construction(): assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2) # GH#11995: unicode - expected = Timedelta("1H") - result = Timedelta("1H") + expected = Timedelta("1h") + result = Timedelta("1h") assert result == expected assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index b1483342eb6e4..ef780a090c12d 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -673,7 +673,7 @@ def test_to_numpy_alias(self): ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), - ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("h", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), ("d", Timedelta("1 days"), Timedelta("-1 days")), ], ) @@ -991,7 +991,7 @@ def test_total_seconds_precision(self): def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" - assert Timedelta(days=1, hours=6).resolution_string == "H" + assert Timedelta(days=1, hours=6).resolution_string == "h" assert Timedelta(days=1, minutes=6).resolution_string == "min" assert Timedelta(days=1, seconds=6).resolution_string == "s" assert Timedelta(days=1, milliseconds=6).resolution_string == "ms" @@ -1043,6 +1043,7 @@ def 
test_timedelta_attribute_precision(): @pytest.mark.parametrize( "unit,unit_depr", [ + ("h", "H"), ("min", "T"), ("s", "S"), ("ms", "L"), @@ -1050,8 +1051,8 @@ def test_timedelta_attribute_precision(): ("us", "U"), ], ) -def test_units_t_l_u_n_deprecated(unit, unit_depr): - # GH 52536 +def test_units_H_T_S_L_N_U_deprecated(unit, unit_depr): + # GH#52536 msg = f"'{unit_depr}' is deprecated and will be removed in a future version." expected = Timedelta(1, unit=unit) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index e501bd93bc1c6..b7d5bbe71269a 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -45,7 +45,7 @@ def test_round_division_by_zero_raises(self): ("20130201 12:00:00", "D", "20130202"), ("20130104 12:00:00", "D", "20130105"), ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), - ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "h", "2000-01-05 05:00:00"), ("2000-01-05 05:09:15.13", "s", "2000-01-05 05:09:15"), ], ) @@ -141,7 +141,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): ("2018-01-01 00:04:00", "4min", "2018-01-01 00:04:00"), ("2018-01-01 00:15:00", "15min", "2018-01-01 00:15:00"), ("2018-01-01 00:20:00", "20min", "2018-01-01 00:20:00"), - ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), + ("2018-01-01 03:00:00", "3h", "2018-01-01 03:00:00"), ], ) @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) @@ -181,30 +181,30 @@ def test_round_dst_border_ambiguous(self, method, unit): ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") ts = ts.as_unit(unit) # - result = getattr(ts, method)("H", ambiguous=True) + result = getattr(ts, method)("h", ambiguous=True) assert result == ts assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = getattr(ts, method)("H", ambiguous=False) + result = getattr(ts, method)("h", ambiguous=False) expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( "Europe/Madrid" ) assert result == expected assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = getattr(ts, method)("H", ambiguous="NaT") + result = getattr(ts, method)("h", ambiguous="NaT") assert result is NaT msg = "Cannot infer dst time" with pytest.raises(pytz.AmbiguousTimeError, match=msg): - getattr(ts, method)("H", ambiguous="raise") + getattr(ts, method)("h", ambiguous="raise") @pytest.mark.parametrize( "method, ts_str, freq", [ ["ceil", "2018-03-11 01:59:00-0600", "5min"], ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], + ["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) @pytest.mark.parametrize( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index fba16749c026e..d7d33ae058af8 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -345,30 +345,30 @@ def test_dt_round_tz_ambiguous(self, method): ) df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") # infer - result = getattr(df1.date.dt, method)("H", ambiguous="infer") + result = getattr(df1.date.dt, method)("h", ambiguous="infer") expected = df1["date"] tm.assert_series_equal(result, expected) # bool-array - result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) + result = getattr(df1.date.dt, method)("h", ambiguous=[True, False, False]) 
tm.assert_series_equal(result, expected) # NaT - result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + result = getattr(df1.date.dt, method)("h", ambiguous="NaT") expected = df1["date"].copy() expected.iloc[0:2] = pd.NaT tm.assert_series_equal(result, expected) # raise with tm.external_error_raised(pytz.AmbiguousTimeError): - getattr(df1.date.dt, method)("H", ambiguous="raise") + getattr(df1.date.dt, method)("h", ambiguous="raise") @pytest.mark.parametrize( "method, ts_str, freq", [ ["ceil", "2018-03-11 01:59:00-0600", "5min"], ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], + ["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): @@ -598,7 +598,7 @@ def test_strftime_dt64_microsecond_resolution(self): tm.assert_series_equal(result, expected) def test_strftime_period_hours(self): - ser = Series(period_range("20130101", periods=4, freq="H")) + ser = Series(period_range("20130101", periods=4, freq="h")) result = ser.dt.strftime("%Y/%m/%d %H:%M:%S") expected = Series( [ @@ -776,8 +776,8 @@ class TestSeriesPeriodValuesDtAccessor: [Period("2016-01", freq="M"), Period("2016-02", freq="M")], [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], [ - Period("2016-01-01 00:00:00", freq="H"), - Period("2016-01-01 01:00:00", freq="H"), + Period("2016-01-01 00:00:00", freq="h"), + Period("2016-01-01 01:00:00", freq="h"), ], [ Period("2016-01-01 00:00:00", freq="M"), diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 317967bcbb7ff..84cf80fa1ffce 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -76,7 +76,7 @@ def test_getitem_setitem_datetime_tz(tz_source): N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz=tzget("US/Eastern")) + rng = date_range("1/1/1990", periods=N, freq="h", tz=tzget("US/Eastern")) ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -107,7 +107,7 @@ def test_getitem_setitem_datetime_tz(tz_source): def test_getitem_setitem_datetimeindex(): N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") + rng = date_range("1/1/1990", periods=N, freq="h", tz="US/Eastern") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04:00:00"] @@ -213,7 +213,7 @@ def test_getitem_setitem_datetimeindex(): def test_getitem_setitem_periodindex(): N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04"] @@ -450,7 +450,7 @@ def test_indexing(): def test_getitem_str_month_with_datetimeindex(): # GH3546 (not including times on the last day) - idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="h") ts = Series(range(len(idx)), index=idx) expected = ts["2013-05"] tm.assert_series_equal(expected, ts) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 458988491aae8..479e74703bc0e 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -137,7 +137,7 @@ def test_getitem_pydatetime_tz(self, tzstr): tz = 
timezones.maybe_get_tz(tzstr) index = date_range( - start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="h", tz=tzstr ) ts = Series(index=index, data=index.hour) time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 0fa28920d41bd..a52d87b1a0457 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -327,7 +327,7 @@ def test_multilevel_preserve_name(lexsorted_two_level_string_multiindex, indexer [ date_range("2014-01-01", periods=20, freq="MS"), period_range("2014-01", periods=20, freq="M"), - timedelta_range("0", periods=20, freq="H"), + timedelta_range("0", periods=20, freq="h"), ], ) def test_slice_with_negative_step(index): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index f419ff9384042..5fcd3a19dcaa4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -79,7 +79,7 @@ def test_setitem_tuple_with_datetimetz_values(self): @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(self, tz, indexer_sli): - orig = Series(date_range("2016-01-01", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-01-01", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( @@ -117,7 +117,7 @@ def test_setitem_with_tz(self, tz, indexer_sli): def test_setitem_with_tz_dst(self, indexer_sli): # GH#14146 trouble setting values near DST boundary tz = "US/Eastern" - orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-11-06", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index eedd8b654f3d0..cb60cd2e5bcf3 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -193,7 +193,7 @@ def test_align_with_dataframe_method(method): def test_align_dt64tzindex_mismatched_tzs(): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") ser = Series(np.random.default_rng(2).standard_normal(len(idx1)), index=idx1) ser_central = ser.tz_convert("US/Central") # different timezones convert to UTC diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 31c264d74d063..2acc2921e5efc 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -118,7 +118,7 @@ def test_with_nan(self): def test_periodindex(self): # array or list or dates N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) ts.iloc[15:30] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="37min") @@ -133,7 +133,7 @@ def test_periodindex(self): lb = ts.index[14] ub = ts.index[30] - pix = PeriodIndex(result.index.values, freq="H") + pix = PeriodIndex(result.index.values, freq="h") mask = (pix >= lb) & (pix < ub) rs = result[mask] assert (rs == ts[lb]).all() diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index aabed794ac557..89b6f9b01bc66 
100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -107,7 +107,7 @@ def test_combine_first_timezone_series_with_empty_series(self): time_index = date_range( datetime(2021, 1, 1, 1), datetime(2021, 1, 1, 10), - freq="H", + freq="h", tz="Europe/Rome", ) s1 = Series(range(10), index=time_index) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 549f429f09d35..f8bbd4c25a4c0 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -780,7 +780,7 @@ def test_series_interpolate_intraday(self): exp = ts.reindex(new_index).interpolate(method="time") - index = date_range("1/1/2012", periods=4, freq="12H") + index = date_range("1/1/2012", periods=4, freq="12h") ts = Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() result = ts.reindex(new_index).interpolate(method="time") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 783e18e541ad8..ae6c62e95f696 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -494,14 +494,14 @@ def test_map_categorical_na_action(na_action, expected): def test_map_datetimetz(): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + values = pd.date_range("2011-01-01", "2011-01-02", freq="h").tz_localize( "Asia/Tokyo" ) s = Series(values, name="XX") # keep tz result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 9d6611cd53068..0923a2d42ce10 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -329,7 +329,7 @@ def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_m def test_reindex_datetimeindexes_tz_naive_and_aware(): # GH 8306 idx = date_range("20131101", tz="America/Chicago", periods=7) - newidx = date_range("20131103", periods=10, freq="H") + newidx = date_range("20131103", periods=10, freq="h") s = Series(range(7), index=idx) msg = ( r"Cannot compare dtypes datetime64\[ns, America/Chicago\] " diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 522b86274a517..e9eb906a9cf10 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -457,7 +457,7 @@ def test_ser_flex_cmp_return_dtypes_empty(self, opname): def test_ser_cmp_result_names(self, names, comparison_op): # datetime64 dtype op = comparison_op - dti = date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) + dti = date_range("1949-06-07 03:00:00", freq="h", periods=5, name=names[0]) ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] @@ -713,7 +713,7 @@ def test_compare_series_interval_keyword(self): class TestTimeSeriesArithmetic: def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") perm = np.random.default_rng(2).permutation(100)[:90] ser1 = Series( @@ -737,7 +737,7 @@ def 
test_series_add_tz_mismatch_converts_to_utc(self): tm.assert_series_equal(result, expected) def test_series_add_aware_naive_raises(self): - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ser_utc = ser.tz_localize("utc") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5a05a1840b644..4f9050be100ca 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -335,8 +335,8 @@ def test_constructor_index_dtype(self, dtype): [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) @@ -2158,7 +2158,7 @@ def test_constructor_no_pandas_array(self, using_array_manager): @td.skip_array_manager_invalid_test def test_from_array(self): - result = Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) + result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False result = Series(pd.array(["2015"], dtype="datetime64[ns]")) @@ -2166,7 +2166,7 @@ def test_from_array(self): @td.skip_array_manager_invalid_test def test_from_list_dtype(self): - result = Series(["1H", "2H"], dtype="timedelta64[ns]") + result = Series(["1h", "2h"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False result = Series(["2015"], dtype="datetime64[ns]") diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 1e1ac100b21bf..fbdf843a998bb 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -50,7 +50,7 @@ def test_td64_sum_empty(skipna): def test_td64_summation_overflow(): # GH#9442 - ser = Series(pd.date_range("20130101", periods=100000, freq="H")) + ser = Series(pd.date_range("20130101", periods=100000, freq="h")) ser[0] += pd.Timedelta("1s 1ms") # mean diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 535a64f303ec2..86474a38d29fb 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -389,7 +389,7 @@ def test_categorical_series_repr_ordered(self): assert repr(s) == exp def test_categorical_series_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -402,7 +402,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -417,7 +417,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp def test_categorical_series_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -430,7 +430,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp - idx = 
date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -445,7 +445,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp def test_categorical_series_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -453,7 +453,7 @@ def test_categorical_series_repr_period(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(s) == exp @@ -471,7 +471,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp def test_categorical_series_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -479,7 +479,7 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(s) == exp diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 93fe9b05adb4f..b0406dbfa3469 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2817,7 +2817,7 @@ class TestToDatetimeInferFormat: def test_to_datetime_infer_datetime_format_consistent_format( self, cache, test_format ): - ser = Series(date_range("20000101", periods=50, freq="H")) + ser = Series(date_range("20000101", periods=50, freq="h")) s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index ca8db972e2185..fa92590fb5ec1 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -13,7 +13,7 @@ @pytest.mark.parametrize( "freqstr,exp_freqstr", - [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("H", "s")], + [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("h", "s")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): off = to_offset(freqstr) @@ -31,7 +31,7 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): ("Q", "quarter"), ("M", "month"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), @@ -43,7 +43,7 @@ def test_get_attrname_from_abbrev(freqstr, expected): assert Resolution.get_reso_from_freqstr(freqstr).attrname == expected -@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) def test_get_freq_roundtrip2(freq): obj = Resolution.get_reso_from_freqstr(freq) result = _attrname_to_abbrevs[obj.attrname] @@ -55,9 +55,9 @@ def test_get_freq_roundtrip2(freq): [ 
((1.5, "min"), (90, "s")), ((62.4, "min"), (3744, "s")), - ((1.04, "H"), (3744, "s")), + ((1.04, "h"), (3744, "s")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "us")), + ((0.342931, "h"), (1234551600, "us")), ((1.2345, "D"), (106660800, "ms")), ], ) @@ -73,7 +73,7 @@ def test_resolution_bumping(args, expected): [ (0.5, "ns"), # Too much precision in the input can prevent. - (0.3429324798798269273987982, "H"), + (0.3429324798798269273987982, "h"), ], ) def test_cat(args): @@ -86,7 +86,7 @@ def test_cat(args): @pytest.mark.parametrize( "freqstr,expected", [ - ("1H", "2021-01-01T09:00:00"), + ("1h", "2021-01-01T09:00:00"), ("1D", "2021-01-02T08:00:00"), ("1W", "2021-01-03T08:00:00"), ("1ME", "2021-01-31T08:00:00"), @@ -99,8 +99,8 @@ def test_compatibility(freqstr, expected): assert ts_np + do == np.datetime64(expected) -@pytest.mark.parametrize("freq", ["A", "T", "S", "L", "U", "N"]) -def test_units_A_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): +@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) +def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): # GH#52536 msg = f"'{freq}' is deprecated and will be removed in a future version." diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 0b2978389ea88..51d0dd298f841 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -38,7 +38,7 @@ @pytest.fixture( params=[ (timedelta(1), "D"), - (timedelta(hours=1), "H"), + (timedelta(hours=1), "h"), (timedelta(minutes=1), "min"), (timedelta(seconds=1), "s"), (np.timedelta64(1, "ns"), "ns"), @@ -220,7 +220,7 @@ def test_infer_freq_index(freq, expected): "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], - "H": [ + "h": [ "2011-12-31 22:00", "2011-12-31 23:00", "2012-01-01 00:00", @@ -255,7 +255,7 @@ def test_infer_freq_tz_series(tz_naive_fixture): ) @pytest.mark.parametrize( "freq", - ["H", "3H", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], + ["h", "3h", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], ) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 @@ -265,7 +265,7 @@ def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): def test_infer_freq_tz_transition_custom(): - index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + index = date_range("2013-11-03", periods=5, freq="3h").tz_localize( "America/Chicago" ) assert index.inferred_freq is None @@ -274,7 +274,7 @@ def test_infer_freq_tz_transition_custom(): @pytest.mark.parametrize( "data,expected", [ - # Hourly freq in a day must result in "H" + # Hourly freq in a day must result in "h" ( [ "2014-07-01 09:00", @@ -284,7 +284,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-01 13:00", "2014-07-01 14:00", ], - "H", + "h", ), ( [ @@ -300,7 +300,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-02 10:00", "2014-07-02 11:00", ], - "BH", + "bh", ), ( [ @@ -316,7 +316,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-07 10:00", "2014-07-07 11:00", ], - "BH", + "bh", ), ( [ @@ -345,7 +345,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-08 15:00", "2014-07-08 16:00", ], - "BH", + "bh", ), ], ) diff --git a/pandas/tests/tseries/offsets/test_business_hour.py 
b/pandas/tests/tseries/offsets/test_business_hour.py
index 319cc053d5d7d..f0d065f8bb7ef 100644
--- a/pandas/tests/tseries/offsets/test_business_hour.py
+++ b/pandas/tests/tseries/offsets/test_business_hour.py
@@ -148,17 +148,17 @@ def test_repr(
         offset9,
         offset10,
     ):
-        assert repr(offset1) == "<BusinessHour: BH=09:00-17:00>"
-        assert repr(offset2) == "<3 * BusinessHours: BH=09:00-17:00>"
-        assert repr(offset3) == "<-1 * BusinessHour: BH=09:00-17:00>"
-        assert repr(offset4) == "<-4 * BusinessHours: BH=09:00-17:00>"
-
-        assert repr(offset5) == "<BusinessHour: BH=11:00-14:30>"
-        assert repr(offset6) == "<BusinessHour: BH=20:00-05:00>"
-        assert repr(offset7) == "<-2 * BusinessHours: BH=21:30-06:30>"
-        assert repr(offset8) == "<BusinessHour: BH=09:00-12:00,13:00-17:00>"
-        assert repr(offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>"
-        assert repr(offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>"
+        assert repr(offset1) == "<BusinessHour: bh=09:00-17:00>"
+        assert repr(offset2) == "<3 * BusinessHours: bh=09:00-17:00>"
+        assert repr(offset3) == "<-1 * BusinessHour: bh=09:00-17:00>"
+        assert repr(offset4) == "<-4 * BusinessHours: bh=09:00-17:00>"
+
+        assert repr(offset5) == "<BusinessHour: bh=11:00-14:30>"
+        assert repr(offset6) == "<BusinessHour: bh=20:00-05:00>"
+        assert repr(offset7) == "<-2 * BusinessHours: bh=21:30-06:30>"
+        assert repr(offset8) == "<BusinessHour: bh=09:00-12:00,13:00-17:00>"
+        assert repr(offset9) == "<3 * BusinessHours: bh=09:00-13:00,22:00-03:00>"
+        assert repr(offset10) == "<-1 * BusinessHour: bh=13:00-17:00,23:00-02:00>"
 
     def test_with_offset(self, dt):
         expected = Timestamp("2014-07-01 13:00")
@@ -947,9 +947,9 @@ def test_apply_nanoseconds(self):
             assert_offset_equal(offset, base, expected)
 
     def test_datetimeindex(self):
-        idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH")
-        idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH")
-        idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH")
+        idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="bh")
+        idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="bh")
+        idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="bh")
         expected = DatetimeIndex(
             [
                 "2014-07-04 15:00",
                 "2014-07-04 16:00",
                 "2014-07-07 09:00",
                 "2014-07-07 10:00",
                 "2014-07-07 11:00",
                 "2014-07-07 12:00",
                 "2014-07-07 13:00",
                 "2014-07-07 14:00",
                 "2014-07-07 15:00",
                 "2014-07-07 16:00",
                 "2014-07-08 09:00",
                 "2014-07-08 10:00",
             ],
-            freq="BH",
+            freq="bh",
         )
         for idx in [idx1, idx2, idx3]:
             tm.assert_index_equal(idx, expected)
 
-        idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH")
-        idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH")
-        idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH")
+        idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="bh")
+        idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="bh")
+        idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="bh")
 
         expected = idx1
         for idx in [idx1, idx2, idx3]:
@@ -980,13 +980,13 @@ def test_short_datetimeindex_creation(self):
         # gh-49835
-        idx4 = date_range(start="2014-07-01 10:00", freq="BH", periods=1)
-        expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="BH")
+        idx4 = date_range(start="2014-07-01 10:00", freq="bh", periods=1)
+        expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="bh")
         tm.assert_index_equal(idx4, expected4)
 
     def test_bday_ignores_timedeltas(self):
-        idx = date_range("2010/02/01", "2010/02/10", freq="12H")
-        t1 = idx + BDay(offset=Timedelta(3, unit="H"))
+        idx = date_range("2010/02/01", "2010/02/10", freq="12h")
+        t1 = idx + BDay(offset=Timedelta(3, unit="h"))
         expected = DatetimeIndex(
             [
diff --git a/pandas/tests/tseries/offsets/test_custom_business_hour.py b/pandas/tests/tseries/offsets/test_custom_business_hour.py
index 38b5d74fe170f..55a184f95c2d8 100644
--- a/pandas/tests/tseries/offsets/test_custom_business_hour.py
+++ b/pandas/tests/tseries/offsets/test_custom_business_hour.py
@@ -69,8 +69,8 @@ def test_different_normalize_equals(self, _offset):
         assert offset != offset2
 
     def test_repr(self, offset1, offset2):
-        assert repr(offset1) == "<CustomBusinessHour: CBH=09:00-17:00>"
-        assert repr(offset2) == "<CustomBusinessHour: CBH=09:00-17:00>"
+        assert repr(offset1) == "<CustomBusinessHour: cbh=09:00-17:00>"
+        assert repr(offset2) == "<CustomBusinessHour: cbh=09:00-17:00>"
 
     def test_with_offset(self, dt):
         expected = Timestamp("2014-07-01 13:00")
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index de44cf3f94d26..7f96ea98fa047 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -811,7 +811,7 @@ def test_alias_equality(self):
             assert k == v.copy()
 
     def test_rule_code(self):
-        lst = ["ME", "MS", "BM", "BMS", "D", "B", "H", "min", "s", "ms", "us"]
+        lst = ["ME", "MS", "BM", "BMS", "D", "B", "h", "min", "s", "ms", "us"]
         for k in lst:
             assert k == _get_offset(k).rule_code
             # should be cached - this is kind of an internals test...
diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py
index d0f8923f3ad89..cefe449f3484d 100644
--- a/pandas/tests/tslibs/test_conversion.py
+++ b/pandas/tests/tslibs/test_conversion.py
@@ -66,8 +66,8 @@ def test_tz_localize_to_utc_copies():
 
 def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture):
     tz = tz_aware_fixture
-    tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz)
-    naive_didx = date_range("2014-03-01", "2015-01-10", freq="H")
+    tz_didx = date_range("2014-03-01", "2015-01-10", freq="h", tz=tz)
+    naive_didx = date_range("2014-03-01", "2015-01-10", freq="h")
 
     _compare_utc_to_local(tz_didx)
     _compare_local_to_utc(tz_didx, naive_didx)
diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py
index ca207e1031653..149817357fbd6 100644
--- a/pandas/tests/tslibs/test_period_asfreq.py
+++ b/pandas/tests/tslibs/test_period_asfreq.py
@@ -24,17 +24,17 @@ def get_freq_code(freqstr: str) -> int:
 @pytest.mark.parametrize(
     "freq1,freq2,expected",
     [
-        ("D", "H", 24),
+        ("D", "h", 24),
         ("D", "min", 1440),
         ("D", "s", 86400),
         ("D", "ms", 86400000),
         ("D", "us", 86400000000),
         ("D", "ns", 86400000000000),
-        ("H", "min", 60),
-        ("H", "s", 3600),
-        ("H", "ms", 3600000),
-        ("H", "us", 3600000000),
-        ("H", "ns", 3600000000000),
+        ("h", "min", 60),
+        ("h", "s", 3600),
+        ("h", "ms", 3600000),
+        ("h", "us", 3600000000),
+        ("h", "ns", 3600000000000),
         ("min", "s", 60),
         ("min", "ms", 60000),
         ("min", "us", 60000000),
diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py
index bc3e06646b235..82b0c78002972 100644
--- a/pandas/tests/tslibs/test_to_offset.py
+++ b/pandas/tests/tslibs/test_to_offset.py
@@ -53,7 +53,7 @@ def test_to_offset_negative(freqstr, expected):
         "-us",
         "3us1",
         "-2-3us",
-        "-2D:3H",
+        "-2D:3h",
         "1.5.0s",
         "2SMS-15-15",
         "2SMS-15D",
@@ -105,12 +105,12 @@ def test_to_offset_tuple_unsupported():
 @pytest.mark.parametrize(
     "freqstr,expected",
     [
-        ("2D 3H", offsets.Hour(51)),
-        ("2 D3 H", offsets.Hour(51)),
-        ("2 D 3 H", offsets.Hour(51)),
-        (" 2 D 3 H ", offsets.Hour(51)),
-        (" H ", offsets.Hour()),
-        (" 3 H ", offsets.Hour(3)),
+        ("2D 3h", offsets.Hour(51)),
+        ("2 D3 h", offsets.Hour(51)),
+        ("2 D 3 h", offsets.Hour(51)),
+        (" 2 D 3 h ", offsets.Hour(51)),
+        (" h ", offsets.Hour()),
+        (" 3 h ", offsets.Hour(3)),
     ],
 )
 def test_to_offset_whitespace(freqstr, expected):
@@ -119,7 +119,7 @@
@pytest.mark.parametrize( - "freqstr,expected", [("00H 00min 01s", 1), ("-00H 03min 14s", -194)] + "freqstr,expected", [("00h 00min 01s", 1), ("-00h 03min 14s", -194)] ) def test_to_offset_leading_zero(freqstr, expected): result = to_offset(freqstr) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b8e0173ee131f..a23c91df5eef6 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1239,15 +1239,15 @@ def test_dont_mutate_obj_after_slicing(self): df = DataFrame( { "id": ["a", "a", "b", "b", "b"], - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": range(5), } ) - grp = df.groupby("id").rolling("1H", on="timestamp") + grp = df.groupby("id").rolling("1h", on="timestamp") result = grp.count() expected_df = DataFrame( { - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": [1.0] * 5, }, index=MultiIndex.from_arrays( @@ -1262,7 +1262,7 @@ def test_dont_mutate_obj_after_slicing(self): index=MultiIndex.from_arrays( [ ["a", "a", "b", "b", "b"], - date_range("2021-9-1", periods=5, freq="H"), + date_range("2021-9-1", periods=5, freq="h"), ], names=["id", "timestamp"], ), diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index b6f2365afb457..2258a4106fe92 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -392,7 +392,7 @@ def test_pairwise_with_series(self, pairwise_frames, pairwise_target_frame, f): def test_corr_freq_memory_error(self): # GH 31789 s = Series(range(5), index=date_range("2020", periods=5)) - result = s.rolling("12H").corr(s) + result = s.rolling("12h").corr(s) expected = Series([np.nan] * 5, index=date_range("2020", periods=5)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index caea3e98f262f..482c8992feb13 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -599,12 +599,12 @@ def test_all2(self, arithmetic_win_operators): # more sophisticated comparison of integer vs. # time-based windowing df = DataFrame( - {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="h") ) # in-range data dft = df.between_time("09:00", "16:00") - r = dft.rolling(window="5H") + r = dft.rolling(window="5h") result = getattr(r, f)() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e77f56a9928ae..0ed0fe4b87576 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -56,7 +56,7 @@ TimedeltaIndex, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -# --------------------------------------------------------------------- +# -------------------------------------------------------------------- # Offset related functions _need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] @@ -229,7 +229,7 @@ def get_freq(self) -> str | None: # Business hourly, maybe. 17: one day / 65: one weekend if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return "BH" + return "bh" # Possibly intraday frequency. 
Here we use the # original .asi8 values as the modified values @@ -243,7 +243,7 @@ def get_freq(self) -> str | None: pps = ppm // 60 if _is_multiple(delta, pph): # Hours - return _maybe_add_count("H", delta / pph) + return _maybe_add_count("h", delta / pph) elif _is_multiple(delta, ppm): # Minutes return _maybe_add_count("min", delta / ppm) @@ -457,21 +457,21 @@ def is_subperiod(source, target) -> bool: return _quarter_months_conform( get_rule_month(source), get_rule_month(target) ) - return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(target): - return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(target): - return source in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(target): - return source in {target, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return source in {target, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif target == "B": - return source in {"B", "H", "min", "s", "ms", "us", "ns"} + return source in {"B", "h", "min", "s", "ms", "us", "ns"} elif target == "C": - return source in {"C", "H", "min", "s", "ms", "us", "ns"} + return source in {"C", "h", "min", "s", "ms", "us", "ns"} elif target == "D": - return source in {"D", "H", "min", "s", "ms", "us", "ns"} - elif target == "H": - return source in {"H", "min", "s", "ms", "us", "ns"} + return source in {"D", "h", "min", "s", "ms", "us", "ns"} + elif target == "h": + return source in {"h", "min", "s", "ms", "us", "ns"} elif target == "min": return source in {"min", "s", "ms", "us", "ns"} elif target == "s": @@ -515,21 +515,21 @@ def is_superperiod(source, target) -> bool: smonth = get_rule_month(source) tmonth = get_rule_month(target) return _quarter_months_conform(smonth, tmonth) - return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(source): - return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(source): - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(source): - return target in {source, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {source, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "B": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "C": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "D": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} - elif source == "H": - return target in {"H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} + elif source == "h": + return target in {"h", "min", "s", "ms", "us", "ns"} elif source == "min": return target in {"min", "s", "ms", "us", "ns"} elif source == "s": @@ -560,7 +560,7 @@ def _maybe_coerce_freq(code) -> str: assert code is not None if isinstance(code, DateOffset): code = freq_to_period_freqstr(1, 
code.name) - if code in {"min", "s", "ms", "us", "ns"}: + if code in {"h", "min", "s", "ms", "us", "ns"}: return code else: return code.upper() From 3fac6f26927a67718afafc42facfb199987c8548 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 8 Oct 2023 14:49:57 -0400 Subject: [PATCH 100/131] BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical (#55268) --- .github/workflows/code-checks.yml | 2 +- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/generic.py | 2 +- pandas/core/groupby/generic.py | 40 +----- pandas/core/groupby/groupby.py | 119 +++++++++++++++++- pandas/tests/groupby/test_categorical.py | 52 ++++++-- pandas/tests/groupby/test_function.py | 33 +++++ pandas/tests/groupby/test_groupby.py | 31 +++-- pandas/tests/groupby/test_groupby_dropna.py | 27 ++-- pandas/tests/groupby/test_raises.py | 55 ++++---- .../tests/groupby/transform/test_transform.py | 16 +++ 11 files changed, 274 insertions(+), 104 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 3bd68c07dcbc3..4260c0836bbea 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 04a76b481bfc3..d10b00e37f808 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -365,6 +365,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b7b01a2097358..6527dcaa2de59 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11910,7 +11910,7 @@ def _logical_func( def any( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 904ab9bdfc6dd..a2f556eba08a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,15 +1185,13 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return 
self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)
 
     @doc(Series.corr.__doc__)
     def corr(
@@ -2187,22 +2185,9 @@ def idxmax(
         Beef            co2_emissions
         dtype: object
         """
-        if axis is not lib.no_default:
-            if axis is None:
-                axis = self.axis
-            axis = self.obj._get_axis_number(axis)
-            self._deprecate_axis(axis, "idxmax")
-        else:
-            axis = self.axis
-
-        def func(df):
-            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
-        func.__name__ = "idxmax"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
+        return self._idxmax_idxmin(
+            "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
         )
-        return result.astype(self.obj.index.dtype) if result.empty else result
 
     def idxmin(
         self,
@@ -2282,22 +2267,9 @@ def idxmin(
         Beef            consumption
         dtype: object
         """
-        if axis is not lib.no_default:
-            if axis is None:
-                axis = self.axis
-            axis = self.obj._get_axis_number(axis)
-            self._deprecate_axis(axis, "idxmin")
-        else:
-            axis = self.axis
-
-        def func(df):
-            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
-        func.__name__ = "idxmin"
-        result = self._python_apply_general(
-            func, self._obj_with_exclusions, not_indexed_same=True
+        return self._idxmax_idxmin(
+            "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
         )
-        return result.astype(self.obj.index.dtype) if result.empty else result
 
     boxplot = boxplot_frame_groupby
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index a022bfd1bd9bc..e33c4b3579c69 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2015,10 +2015,14 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
         with com.temp_setattr(self, "as_index", True):
             # GH#49834 - result needs groups in the index for
             # _wrap_transform_fast_result
-            if engine is not None:
-                kwargs["engine"] = engine
-                kwargs["engine_kwargs"] = engine_kwargs
-            result = getattr(self, func)(*args, **kwargs)
+            if func in ["idxmin", "idxmax"]:
+                func = cast(Literal["idxmin", "idxmax"], func)
+                result = self._idxmax_idxmin(func, True, *args, **kwargs)
+            else:
+                if engine is not None:
+                    kwargs["engine"] = engine
+                    kwargs["engine_kwargs"] = engine_kwargs
+                result = getattr(self, func)(*args, **kwargs)
 
         return self._wrap_transform_fast_result(result)
 
@@ -5720,6 +5724,113 @@ def sample(
         sampled_indices = np.concatenate(sampled_indices)
         return self._selected_obj.take(sampled_indices, axis=self.axis)
 
+    def _idxmax_idxmin(
+        self,
+        how: Literal["idxmax", "idxmin"],
+        ignore_unobserved: bool = False,
+        axis: Axis | None | lib.NoDefault = lib.no_default,
+        skipna: bool = True,
+        numeric_only: bool = False,
+    ):
+        """Compute idxmax/idxmin.
+
+        Parameters
+        ----------
+        how : {"idxmin", "idxmax"}
+            Whether to compute idxmin or idxmax.
+        axis : {{0 or 'index', 1 or 'columns'}}, default None
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+            If axis is not provided, grouper's axis is used.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ignore_unobserved : bool, default False
+            When True and an unobserved group is encountered, do not raise. This is
+            used for transform, where unobserved groups have no impact on the result.
+
+        Returns
+        -------
+        Series or DataFrame
+            idxmax or idxmin for the groupby operation.
+ """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + else: + raise_err = False + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) + else: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) from None + raise + + result = result.astype(self.obj.index.dtype) if result.empty else result + + if not skipna: + has_na_value = result.isnull().any(axis=None) + if has_na_value: + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. 
In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + @doc(GroupBy) def get_groupby( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b11240c841420..11291bb89b604 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1883,14 +1910,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if ( - reduction_func in ("idxmax", "idxmin") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmax/min fail with unused categories" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index: msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.node.add_marker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: @@ -1912,6 +1932,15 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], 
*args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a92880c87b847..08372541988d0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize( + "func, values, expected_values, warn", + [ + ("idxmin", [0, 1, 2], [0, 2], None), + ("idxmax", [0, 1, 2], [1, 2], None), + ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), + ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): + # GH#54234 + df = DataFrame( + { + "a": [1, 1, 2], + "b": values, + } + ) + gb = df.groupby("a") + index = Index([1, 2], name="a") + expected = DataFrame({"b": expected_values}, index=index) + if test_series: + gb = gb["b"] + expected = expected["b"] + klass = "Series" if test_series else "DataFrame" + msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, func)(skipna=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4ca8b0e317bd2..e3e4a7efaa307 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby( - columns, keys, values, method, op, request, using_array_manager, dropna -): +def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected(): is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: - msg = f"Cannot perform {op} with non-ordered Categorical" - with pytest.raises(TypeError, match=msg): + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): + if op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + klass = TypeError + else: + msg = f"Can't get {op} of an empty group due to unobserved categories" + klass = ValueError + with pytest.raises(klass, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max"] and isinstance(columns, list): # i.e. 
DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d82278c277d48..8065aa63dff81 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -503,18 +503,7 @@ def test_null_is_null_for_dtype( @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, index_kind -): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -544,6 +533,17 @@ def test_categorical_reducers( args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].replace(4, None) @@ -573,9 +573,6 @@ def test_categorical_reducers( if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9a2b3d44b117..46bf324fad1d7 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -97,22 +97,24 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -233,8 +235,7 @@ def test_groupby_raises_string_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) 
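
# A short, self-contained sketch of the behavior this patch pins down. The frame
# below is hypothetical (not taken from the tests above), and the outputs assume
# a pandas build that includes this change.
import pandas as pd

df = pd.DataFrame(
    {
        # category "b" is declared but never observed
        "key": pd.Categorical(["a", "a"], categories=["a", "b"]),
        "val": [1, 2],
    }
)

# With observed=False the unobserved category "b" forms an empty group; idxmax
# has no row to point at, so it now raises a consistent ValueError instead of
# the old "attempt to get argmax of an empty sequence".
try:
    df.groupby("key", observed=False)["val"].idxmax()
except ValueError as err:
    print(err)  # Can't get idxmax of an empty group due to unobserved categories...

# With observed=True the empty group is dropped and idxmax succeeds.
print(df.groupby("key", observed=True)["val"].idxmax())  # key "a" -> index 1
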
@pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -297,13 +298,11 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -342,8 +341,7 @@ def test_groupby_raises_datetime_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -540,8 +538,7 @@ def test_groupby_raises_category_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -572,6 +569,16 @@ def test_groupby_raises_category_on_category( return empty_groups = any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True klass, msg = { "all": (None, ""), @@ -617,10 +624,10 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") + "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") + "idxmin": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), "last": (None, ""), diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cf3f41e04902c..4a493ef3fd52c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1637,3 +1637,19 @@ def test_as_index_no_change(keys, df, groupby_func): result = gb_as_index_true.transform(groupby_func, *args) expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna 
else FutureWarning + msg = f"The behavior of DataFrame.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) From cbac7cdecd45e2fc07a0a187b8591bada7e75736 Mon Sep 17 00:00:00 2001 From: Issam Date: Sun, 8 Oct 2023 18:56:48 -0400 Subject: [PATCH 101/131] DOC: Reorganize whatsnew 2.2.0 bug fixes into appropriate subsections (#55443) --- doc/source/whatsnew/v2.2.0.rst | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d10b00e37f808..cfcc99bf5bda0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -279,13 +279,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) -- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) -- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) -- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) -- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) -- Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) -- Categorical ^^^^^^^^^^^ @@ -304,7 +297,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Numeric @@ -319,6 +312,7 @@ Conversion Strings ^^^^^^^ +- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - - @@ -352,6 +346,7 @@ I/O - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) Period ^^^^^^ @@ -365,16 +360,17 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`) -- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- 
Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Sparse @@ -395,6 +391,7 @@ Styler Other ^^^^^ - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) - From 288ab31ce5696c4207dce9f48b41de2a4ba9bb9d Mon Sep 17 00:00:00 2001 From: Philippe THOMY Date: Mon, 9 Oct 2023 18:57:10 +0200 Subject: [PATCH 102/131] WEB: Fix rendering of PDEP-12 (#55441) Fix rendering (with markdownlint analysis) --- ...2-compact-and-reversible-JSON-interface.md | 106 ++++++++++++------ 1 file changed, 74 insertions(+), 32 deletions(-) diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md index 4fe4b935f144b..3ae34f4bb6ebb 100644 --- a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md +++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md @@ -8,38 +8,41 @@ - Author: [Philippe THOMY](https://github.com/loco-philippe) - Revision: 3 +##### Summary -#### Summary - [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract) - - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description) - - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description) + - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description) + - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description) - [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope) - [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation) - - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?) - - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?) - - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?) + - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?) + - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?) 
+ - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?)
- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description)
-  - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing)
-  - [Correspondence between TableSchema and pandas](./panda0012-compact-and-reversible-JSON-interfaces_PDEP.md/#Correspondence-between-TableSchema-and-pandas)
-  - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format)
-  - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion)
+  - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing)
+  - [Correspondence between TableSchema and pandas](./0012-compact-and-reversible-JSON-interface.md/#Correspondence-between-TableSchema-and-pandas)
+  - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format)
+  - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion)
- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact)
-  - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage)
-  - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility)
-  - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework)
-  - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do)
+  - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage)
+  - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility)
+  - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework)
+  - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do)
- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation)
-  - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules)
-  - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options)
+  - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules)
+  - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options)
- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.)
- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/#Synthesis)
- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision)
- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline)
- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history)
+
-------------------------
+
## Abstract

### Problem description
+
The `dtype` and "Python type" are not explicitly taken into account in the current JSON interface. So, the JSON interface is not always reversible and has inconsistencies related to the consideration of the `dtype`.
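A minimal sketch of the reversibility gap described above, assuming current pandas defaults (the column names are illustrative):

```python
from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": pd.to_datetime(["2021-01-01"]), "b": [1.0]})
jsn = df.to_json()  # datetimes are serialized as epoch milliseconds
df2 = pd.read_json(StringIO(jsn))

print(df2["a"].dtype)  # int64, not datetime64[ns]: the dtype is lost
print(df2.equals(df))  # False: the round trip is not reversible
```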
@@ -48,15 +51,17 @@ Another consequence is the partial application of the Table Schema specification Some JSON-interface problems are detailed in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_json_pandas.ipynb#Current-Json-interface) - ### Feature Description + To have a simple, compact and reversible solution, I propose to use the [JSON-NTV format (Named and Typed Value)](https://github.com/loco-philippe/NTV#readme) - which integrates the notion of type - and its JSON-TAB variation for tabular data (the JSON-NTV format is defined in an [IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/) (not yet an RFC !!) ). This solution allows to include a large number of types (not necessarily pandas `dtype`) which allows to have: + - a Table Schema JSON interface (`orient="table"`) which respects the Table Schema specification (going from 6 types to 20 types), - a global JSON interface for all pandas data formats. #### Global JSON interface example + In the example below, a DataFrame with several data types is converted to JSON. The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility). @@ -65,7 +70,8 @@ With the existing JSON interface, this conversion is not possible. This example uses `ntv_pandas` module defined in the [ntv-pandas repository](https://github.com/loco-philippe/ntv-pandas#readme). -*data example* +Data example: + ```python In [1]: from shapely.geometry import Point from datetime import date @@ -94,7 +100,7 @@ Out[4]: dates::date value value32 res coord::point names unique 600 2022-01-21 30 32 30 POINT (5 6) maria True ``` -*JSON representation* +JSON representation ```python In [5]: df_to_json = npd.to_json(df) @@ -109,16 +115,18 @@ Out[5]: {':tab': {'coord::point': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0 'value32::int32': [12, 12, 22, 22, 32, 32]}} ``` -*Reversibility* +Reversibility ```python In [5]: df_from_json = npd.read_json(df_to_json) print('df created from JSON is equal to initial df ? ', df_from_json.equals(df)) Out[5]: df created from JSON is equal to initial df ? True ``` + Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb) #### Table Schema JSON interface example + In the example below, a DataFrame with several Table Schema data types is converted to JSON. The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility). @@ -142,7 +150,7 @@ Out[3]: end february::date coordinates::point contact::email 2 2025-02-28 POINT (4.9 45.8) walter.white@breaking.com ``` -*JSON representation* +JSON representation ```python In [4]: df_to_table = npd.to_json(df, table=True) @@ -158,16 +166,18 @@ Out[4]: {'schema': {'fields': [{'name': 'index', 'type': 'integer'}, {'index': 2, 'end february': '2025-02-28', 'coordinates': [4.9, 45.8], 'contact': 'walter.white@breaking.com'}]} ``` -*Reversibility* +Reversibility ```python In [5]: df_from_table = npd.read_json(df_to_table) print('df created from JSON is equal to initial df ? ', df_from_table.equals(df)) Out[5]: df created from JSON is equal to initial df ? True ``` + Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_table_pandas.ipynb) ## Scope + The objective is to make available the proposed JSON interface for any type of data and for `orient="table"` option or a new option `orient="ntv"`. 
The proposed interface is compatible with existing data. @@ -175,32 +185,38 @@ The proposed interface is compatible with existing data. ## Motivation ### Why extend the `orient=table` option to other data types? + - The Table Schema specification defines 24 data types, 6 are taken into account in the pandas interface ### Why is it important to have a compact and reversible JSON interface ? + - a reversible interface provides an exchange format. - a textual exchange format facilitates exchanges between platforms (e.g. OpenData) - a JSON exchange format can be used at API level ### Is it relevant to take an extended type into account ? + - it avoids the addition of an additional data schema - it increases the semantic scope of the data processed by pandas - it is an answer to several issues (e.g. #12997, #14358, #16492, #35420, #35464, #36211, #39537, #49585, #50782, #51375, #52595, #53252) - the use of a complementary type avoids having to modify the pandas data model ### Is this only useful for pandas ? + - the JSON-TAB format is applicable to tabular data and multi-dimensional data. - this JSON interface can therefore be used for any application using tabular or multi-dimensional data. This would allow for example reversible data exchanges between pandas - DataFrame and Xarray - DataArray (Xarray issue under construction) [see example DataFrame / DataArray](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Multidimensional-data). ## Description The proposed solution is based on several key points: + - data typing - correspondence between TableSchema and pandas - JSON format for tabular data - conversion to and from JSON format ### Data typing + Data types are defined and managed in the NTV project (name, JSON encoder and decoder). Pandas `dtype` are compatible with NTV types : @@ -217,6 +233,7 @@ Pandas `dtype` are compatible with NTV types : | boolean | boolean | Note: + - datetime with timezone is a single NTV type (string ISO8601) - `CategoricalDtype` and `SparseDtype` are included in the tabular JSON format - `object` `dtype` is depending on the context (see below) @@ -234,12 +251,14 @@ JSON types (implicit or explicit) are converted in `dtype` following pandas JSON | null | NaT / NaN / None | Note: + - if an NTV type is defined, the `dtype` is adjusted accordingly - the consideration of null type data needs to be clarified The other NTV types are associated with `object` `dtype`. ### Correspondence between TableSchema and pandas + The TableSchema typing is carried by two attributes `format` and `type`. The table below shows the correspondence between TableSchema format / type and pandas NTVtype / dtype: @@ -264,10 +283,12 @@ The table below shows the correspondence between TableSchema format / type and p | default / geojson | geojson / object | Note: + - other TableSchema format are defined and are to be studied (uuid, binary, topojson, specific format for geopoint and datation) - the first six lines correspond to the existing ### JSON format + The JSON format for the TableSchema interface is the existing. The JSON format for the Global interface is defined in [JSON-TAB](https://github.com/loco-philippe/NTV/blob/main/documentation/JSON-TAB-standard.pdf) specification. @@ -275,22 +296,27 @@ It includes the naming rules originally defined in the [JSON-ND project](https:/ The specification have to be updated to include sparse data. ### Conversion + When data is associated with a non-`object` `dtype`, pandas conversion methods are used. 
Otherwise, NTV conversion is used.

#### pandas -> JSON
+
- `NTV type` is not defined : use `to_json()`
- `NTV type` is defined and `dtype` is not `object` : use `to_json()`
- `NTV type` is defined and `dtype` is `object` : use NTV conversion (if pandas conversion does not exist)

#### JSON -> pandas
+
- `NTV type` is compatible with a `dtype` : use `read_json()`
- `NTV type` is not compatible with a `dtype` : use NTV conversion (if pandas conversion does not exist)

## Usage and Impact

### Usage
+
It seems to me that this proposal responds to important issues:
+
- having an efficient text format for data exchange

    The alternative CSV format is not reversible and obsolete (last revision in 2005). Current CSV tools do not comply with the standard.
@@ -300,21 +326,26 @@ It seems to me that this proposal responds to important issues:
- having a complete Table Schema interface

### Compatibility
+
Interface can be used without NTV type (compatibility with existing data - [see examples](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb#Appendix-:-Series-tests))

If the interface is available, through a new `orient` option in the JSON interface, the use of the feature is decoupled from the other features.

### Impacts on the pandas framework
+
Initially, the impacts are very limited:
+
- modification of the `name` of `Series` or `DataFrame columns` (no functional impact),
- added an option in the Json interface (e.g. `orient='ntv'`) and added associated methods (no functional interference with the other methods)

In later stages, several developments could be considered:
+
- validation of the `name` of `Series` or `DataFrame columns`,
- management of the NTV type as a "complementary-object-dtype"
- functional extensions depending on the NTV type

### Risk to do / risk not to do
+
The JSON-NTV format and the JSON-TAB format are not (yet) recognized and used formats. The risk for pandas is that this function is not used (no functional impacts).

On the other hand, the early use by pandas will allow a better consideration of the expectations and needs of pandas as well as a reflection on the evolution of the types supported by pandas.
@@ -322,6 +353,7 @@ On the other hand, the early use by pandas will allow a better consideration of

## Implementation

### Modules
+
Two modules are defined for NTV:

- json-ntv
@@ -335,6 +367,7 @@ Two modules are defined for NTV:
The pandas integration of the JSON interface requires importing only the json-ntv module.

### Implementation options
+
The interface can be implemented as NTV connector (`SeriesConnector` and `DataFrameConnector`) and as a new pandas JSON interface `orient` option.

Several pandas implementations are possible:
@@ -366,26 +399,28 @@ Several pandas implementations are possible:

**A**: In principle, yes, this option takes into account the notion of type.
But this is very limited (see examples added in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)) :
+
- **Types and Json interface**
-    - the only way to keep the types in the json interface is to use the `orient='table'` option
-    - few dtypes are not allowed in json-table interface : period, timedelta64, interval
-    - allowed types are not always kept in json-table interface
-    - data with 'object' dtype is kept only id data is string
-    - with categorical dtype, the underlying dtype is not included in json interface
+  - the only way to keep the types in the json interface is to use the `orient='table'` option
+  - few dtypes are not allowed in json-table interface : period, timedelta64, interval
+  - allowed types are not always kept in json-table interface
+  - data with 'object' dtype is kept only if data is string
+  - with categorical dtype, the underlying dtype is not included in json interface
- **Data compactness**
-    - json-table interface is not compact (in the example in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#data-compactness))the size is triple or quadruple the size of the compact format
+  - json-table interface is not compact (in the example in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#data-compactness)) the size is triple or quadruple the size of the compact format
- **Reversibility**
-    - Interface is reversible only with few dtypes : int64, float64, bool, string, datetime64 and partially categorical
+  - Interface is reversible only with few dtypes : int64, float64, bool, string, datetime64 and partially categorical
- **External types**
-    - the interface does not accept external types
-    - Table-schema defines 20 data types but the `orient="table"` interface takes into account 5 data types (see [table](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Converting-table-schema-type-to-pandas-dtype))
-    - to integrate external types, it is necessary to first create ExtensionArray and ExtensionDtype objects
+  - the interface does not accept external types
+  - Table-schema defines 20 data types but the `orient="table"` interface takes into account 5 data types (see [table](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Converting-table-schema-type-to-pandas-dtype))
+  - to integrate external types, it is necessary to first create ExtensionArray and ExtensionDtype objects

The current interface is not compatible with the data structure defined by table-schema. For this to be possible, it is necessary to integrate a "type extension" like the one proposed (this has moreover been partially achieved with the notion of `extDtype` found in the interface for several formats).

**Q: In general, we should only have 1 `"table"` format for pandas in read_json/to_json. There is also the issue of backwards compatibility if we do change the format. The fact that the table interface is buggy is not a reason to add a new interface (I'd rather fix those bugs). 
Can the existing format be adapted in a way that fixes the type issues/issues with roundtripping?**

**A**: I will add two additional remarks:
+
- the types defined in Tableschema are partially taken into account (examples of types not taken into account in the interface: string-uri, array, date, time, year, geopoint, string-email):
- the `read_json()` interface works too with the following data: `{'simple': [1,2,3] }` (contrary to what is indicated in the documentation) but it is impossible with `to_json()` to recreate this simple json.
@@ -405,15 +440,20 @@ Regarding the underlying JSON-NTV format, its impact is quite low for tabular da
Nevertheless, the question is relevant: The JSON-NTV format ([IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/)) is a shared, documented, supported and implemented format, but indeed the community support is for the moment reduced but it only asks to expand !!

## Synthesis
+
To conclude,
+
- if it is important (or strategic) to have a reversible JSON interface for any type of data, the proposal can be allowed,
- if not, a third-party package listed in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) that reads/writes this format to/from pandas DataFrames should be considered

## Core team decision
+
Vote was open from september-11 to september-26:
+
- Final tally is 0 approvals, 5 abstentions, 7 disapprove. The quorum has been met. The PDEP fails.

**Disapprove comments** :
+
- 1 Given the newness of the proposed JSON NTV format, I would support (as described in the PDEP): "if not, a third-party package listed in the ecosystem that reads/writes this format to/from pandas DataFrames should be considered"
- 2 Same reason as -1-, this should be a third party package for now
- 3 Not mature enough, and not clear what the market size would be.
@@ -423,10 +463,12 @@ Vote was open from september-11 to september-26:
- 7 while I do think having a more comprehensive JSON format would be worthwhile, making a new format part of pandas means an implicit endorsement of a standard that is still being reviewed by the broader community. 
**Decision**: + - add the `ntv-pandas` package in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) - revisit again this PDEP at a later stage, for example in 1/2 to 1 year (based on the evolution of the Internet draft [JSON semantic format (JSON-NTV)](https://www.ietf.org/archive/id/draft-thomy-json-ntv-01.html) and the usage of the [ntv-pandas](https://github.com/loco-philippe/ntv-pandas#readme)) ## Timeline + Not applicable ## PDEP History From a984fed0d734bfb1f10104b0c1f9de737cd70013 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Oct 2023 09:59:02 -0700 Subject: [PATCH 103/131] REF: avoid accessing internals in io.formats.csv (#55446) * REF: to_native_types->get_values_for_csv, make kwargs explicit * REF: avoid internals in to_csv --- pandas/core/frame.py | 19 +++++++++++++++++++ pandas/core/internals/array_manager.py | 15 ++++++++++++--- pandas/core/internals/blocks.py | 23 +++++++++++++++++------ pandas/core/internals/managers.py | 13 +++++++++++-- pandas/io/formats/csvs.py | 4 ++-- 5 files changed, 61 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 28641031f7b21..7802657269044 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1323,6 +1323,25 @@ def to_string( line_width=line_width, ) + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> Self: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + return self._constructor_from_mgr(mgr, axes=mgr.axes) + # ---------------------------------------------------------------------- @property diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b4e3fdb78b77b..99af4f51661b1 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -79,9 +79,9 @@ ensure_block_shape, external_values, extract_pandas_array, + get_values_for_csv, maybe_coerce_values, new_block, - to_native_types, ) from pandas.core.internals.managers import make_na_array @@ -343,8 +343,17 @@ def _convert(arr): return self.apply(_convert) - def to_native_types(self, **kwargs) -> Self: - return self.apply(to_native_types, **kwargs) + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + return self.apply( + get_values_for_csv, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 30f6507d02484..f0c14eec81c3c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -692,9 +692,18 @@ def astype( return newb @final - def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Block: """convert to our native types format""" - result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs) + result = get_values_for_csv( + self.values, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) return self.make_block(result) @final @@ -2593,14 +2602,14 @@ def 
ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def to_native_types( +def get_values_for_csv( values: ArrayLike, *, + date_format, na_rep: str = "nan", quoting=None, float_format=None, decimal: str = ".", - **kwargs, ) -> npt.NDArray[np.object_]: """convert to our native types format""" if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": @@ -2615,14 +2624,16 @@ def to_native_types( if isinstance(values, (DatetimeArray, TimedeltaArray)): if values.ndim == 1: - result = values._format_native_types(na_rep=na_rep, **kwargs) + result = values._format_native_types(na_rep=na_rep, date_format=date_format) result = result.astype(object, copy=False) return result # GH#21734 Process every column separately, they might have different formats results_converted = [] for i in range(len(values)): - result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs) + result = values[i, :]._format_native_types( + na_rep=na_rep, date_format=date_format + ) results_converted.append(result.astype(object, copy=False)) return np.vstack(results_converted) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b1db2d2e708e8..86cef032ec6e4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -430,12 +430,21 @@ def convert(self, copy: bool | None) -> Self: return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) - def to_native_types(self, **kwargs) -> Self: + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: """ Convert values to native types (strings / python objects) that are used in formatting (repr / csv). """ - return self.apply("to_native_types", **kwargs) + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 569c8aaf6cef1..717dae6eea97c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -314,8 +314,8 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - res = df._mgr.to_native_types(**self._number_format) - data = [res.iget_values(i) for i in range(len(res.items))] + res = df._get_values_for_csv(**self._number_format) + data = list(res._iter_column_arrays()) ix = self.data_index[slicer]._format_native_types(**self._number_format) libwriters.write_csv_rows( From 88af8871caa80d90a4417264b9fbadc551240388 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Oct 2023 10:26:42 -0700 Subject: [PATCH 104/131] REF: privatize/annotate in io.formats (#55438) * REF: privatize in formats.info * privatize in formats.xml * REF: privatize in formats.xml * TYP: annotate in style_render * lint fixup --- pandas/io/formats/info.py | 52 +++++++-------- pandas/io/formats/style_render.py | 8 +-- pandas/io/formats/xml.py | 103 +++++++++++++++++------------- 3 files changed, 90 insertions(+), 73 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d20c2a62c61e2..552affbd053f2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -356,7 +356,7 @@ def _initialize_memory_usage( return memory_usage -class BaseInfo(ABC): +class _BaseInfo(ABC): """ Base class for DataFrameInfo and SeriesInfo. 
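The renamed builders stay behind the public ``DataFrame.info`` entry point, so the user-facing surface is unchanged; a quick sketch of the call that dispatches through these (now private) printer and builder classes, for illustration only:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
# routed through _DataFrameInfoPrinter and the _DataFrameTableBuilder* classes
df.info(verbose=True, show_counts=True)
```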
@@ -439,7 +439,7 @@ def render( pass -class DataFrameInfo(BaseInfo): +class DataFrameInfo(_BaseInfo): """ Class storing dataframe-specific info. """ @@ -503,7 +503,7 @@ def render( verbose: bool | None, show_counts: bool | None, ) -> None: - printer = DataFrameInfoPrinter( + printer = _DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -512,7 +512,7 @@ def render( printer.to_buffer(buf) -class SeriesInfo(BaseInfo): +class SeriesInfo(_BaseInfo): """ Class storing series-specific info. """ @@ -538,7 +538,7 @@ def render( "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" ) - printer = SeriesInfoPrinter( + printer = _SeriesInfoPrinter( info=self, verbose=verbose, show_counts=show_counts, @@ -572,7 +572,7 @@ def memory_usage_bytes(self) -> int: return self.data.memory_usage(index=True, deep=deep) -class InfoPrinterAbstract: +class _InfoPrinterAbstract: """ Class for printing dataframe or series info. """ @@ -586,11 +586,11 @@ def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None: fmt.buffer_put_lines(buf, lines) @abstractmethod - def _create_table_builder(self) -> TableBuilderAbstract: + def _create_table_builder(self) -> _TableBuilderAbstract: """Create instance of table builder.""" -class DataFrameInfoPrinter(InfoPrinterAbstract): +class _DataFrameInfoPrinter(_InfoPrinterAbstract): """ Class for printing dataframe info. @@ -650,27 +650,27 @@ def _initialize_show_counts(self, show_counts: bool | None) -> bool: else: return show_counts - def _create_table_builder(self) -> DataFrameTableBuilder: + def _create_table_builder(self) -> _DataFrameTableBuilder: """ Create instance of table builder based on verbosity and display settings. """ if self.verbose: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) elif self.verbose is False: # specifically set to False, not necessarily None - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) elif self.exceeds_info_cols: - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) else: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) -class SeriesInfoPrinter(InfoPrinterAbstract): +class _SeriesInfoPrinter(_InfoPrinterAbstract): """Class for printing series info. Parameters @@ -694,17 +694,17 @@ def __init__( self.verbose = verbose self.show_counts = self._initialize_show_counts(show_counts) - def _create_table_builder(self) -> SeriesTableBuilder: + def _create_table_builder(self) -> _SeriesTableBuilder: """ Create instance of table builder based on verbosity. """ if self.verbose or self.verbose is None: - return SeriesTableBuilderVerbose( + return _SeriesTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) else: - return SeriesTableBuilderNonVerbose(info=self.info) + return _SeriesTableBuilderNonVerbose(info=self.info) def _initialize_show_counts(self, show_counts: bool | None) -> bool: if show_counts is None: @@ -713,13 +713,13 @@ def _initialize_show_counts(self, show_counts: bool | None) -> bool: return show_counts -class TableBuilderAbstract(ABC): +class _TableBuilderAbstract(ABC): """ Abstract builder for info table. 
""" _lines: list[str] - info: BaseInfo + info: _BaseInfo @abstractmethod def get_lines(self) -> list[str]: @@ -769,7 +769,7 @@ def add_dtypes_line(self) -> None: self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") -class DataFrameTableBuilder(TableBuilderAbstract): +class _DataFrameTableBuilder(_TableBuilderAbstract): """ Abstract builder for dataframe info table. @@ -820,7 +820,7 @@ def add_memory_usage_line(self) -> None: self._lines.append(f"memory usage: {self.memory_usage_string}") -class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): +class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder): """ Dataframe info table builder for non-verbose output. """ @@ -838,7 +838,7 @@ def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) -class TableBuilderVerboseMixin(TableBuilderAbstract): +class _TableBuilderVerboseMixin(_TableBuilderAbstract): """ Mixin for verbose info output. """ @@ -931,7 +931,7 @@ def _gen_dtypes(self) -> Iterator[str]: yield pprint_thing(dtype) -class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): +class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin): """ Dataframe info table builder for verbose output. """ @@ -997,7 +997,7 @@ def _gen_columns(self) -> Iterator[str]: yield pprint_thing(col) -class SeriesTableBuilder(TableBuilderAbstract): +class _SeriesTableBuilder(_TableBuilderAbstract): """ Abstract builder for series info table. @@ -1029,7 +1029,7 @@ def _fill_non_empty_info(self) -> None: """Add lines to the info table, pertaining to non-empty series.""" -class SeriesTableBuilderNonVerbose(SeriesTableBuilder): +class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder): """ Series info table builder for non-verbose output. """ @@ -1043,7 +1043,7 @@ def _fill_non_empty_info(self) -> None: self.add_memory_usage_line() -class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin): +class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin): """ Series info table builder for verbose output. """ diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 90e9b1f0486db..829ed4a33f6a4 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -2357,7 +2357,7 @@ def color(value, user_arg, command, comm_arg): return latex_styles -def _escape_latex(s): +def _escape_latex(s: str) -> str: r""" Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, ``~``, ``^``, and ``\`` in the string with LaTeX-safe sequences. @@ -2392,7 +2392,7 @@ def _escape_latex(s): ) -def _math_mode_with_dollar(s): +def _math_mode_with_dollar(s: str) -> str: r""" All characters in LaTeX math mode are preserved. @@ -2425,7 +2425,7 @@ def _math_mode_with_dollar(s): return "".join(res).replace(r"rt8§=§7wz", r"\$") -def _math_mode_with_parentheses(s): +def _math_mode_with_parentheses(s: str) -> str: r""" All characters in LaTeX math mode are preserved. @@ -2461,7 +2461,7 @@ def _math_mode_with_parentheses(s): return "".join(res) -def _escape_latex_math(s): +def _escape_latex_math(s: str) -> str: r""" All characters in LaTeX math mode are preserved. 
diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index a6ee8407988ec..f56fca8d7ef44 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -8,11 +8,15 @@ from typing import ( TYPE_CHECKING, Any, + final, ) import warnings from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + cache_readonly, + doc, +) from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.missing import isna @@ -41,7 +45,7 @@ storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", ) -class BaseXMLFormatter: +class _BaseXMLFormatter: """ Subclass for formatting data in XML. @@ -138,14 +142,14 @@ def __init__( self.storage_options = storage_options self.orig_cols = self.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() + self.frame_dicts = self._process_dataframe() - self.validate_columns() - self.validate_encoding() - self.prefix_uri = self.get_prefix_uri() - self.handle_indexes() + self._validate_columns() + self._validate_encoding() + self.prefix_uri = self._get_prefix_uri() + self._handle_indexes() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. @@ -154,7 +158,8 @@ def build_tree(self) -> bytes: """ raise AbstractMethodError(self) - def validate_columns(self) -> None: + @final + def _validate_columns(self) -> None: """ Validate elems_cols and attrs_cols. @@ -175,7 +180,8 @@ def validate_columns(self) -> None: f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" ) - def validate_encoding(self) -> None: + @final + def _validate_encoding(self) -> None: """ Validate encoding. @@ -189,7 +195,8 @@ def validate_encoding(self) -> None: codecs.lookup(self.encoding) - def process_dataframe(self) -> dict[int | str, dict[str, Any]]: + @final + def _process_dataframe(self) -> dict[int | str, dict[str, Any]]: """ Adjust Data Frame to fit xml output. @@ -213,7 +220,8 @@ def process_dataframe(self) -> dict[int | str, dict[str, Any]]: return df.to_dict(orient="index") - def handle_indexes(self) -> None: + @final + def _handle_indexes(self) -> None: """ Handle indexes. @@ -234,7 +242,7 @@ def handle_indexes(self) -> None: if self.elem_cols: self.elem_cols = indexes + self.elem_cols - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: """ Get uri of namespace prefix. @@ -248,7 +256,8 @@ def get_prefix_uri(self) -> str: raise AbstractMethodError(self) - def other_namespaces(self) -> dict: + @final + def _other_namespaces(self) -> dict: """ Define other namespaces. @@ -267,7 +276,8 @@ def other_namespaces(self) -> dict: return nmsp_dict - def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: + @final + def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: """ Create attributes of row. @@ -287,6 +297,7 @@ def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: raise KeyError(f"no valid column, {col}") return elem_row + @final def _get_flat_col_name(self, col: str | tuple) -> str: flat_col = col if isinstance(col, tuple): @@ -297,17 +308,20 @@ def _get_flat_col_name(self, col: str | tuple) -> str: ) return f"{self.prefix_uri}{flat_col}" - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): + raise AbstractMethodError(self) + + @final + def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None: """ Create child elements of row. 
This method adds child elements using elem_cols to row element and works with tuples for multindex or hierarchical columns. """ + sub_element_cls = self._sub_element_cls - raise AbstractMethodError(self) - - def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None: if not self.elem_cols: return @@ -319,8 +333,9 @@ def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> Non except KeyError: raise KeyError(f"no valid column, {col}") + @final def write_output(self) -> str | None: - xml_doc = self.build_tree() + xml_doc = self._build_tree() if self.path_or_buffer is not None: with get_handle( @@ -337,13 +352,13 @@ def write_output(self) -> str | None: return xml_doc.decode(self.encoding).rstrip() -class EtreeXMLFormatter(BaseXMLFormatter): +class EtreeXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. """ - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: from xml.etree.ElementTree import ( Element, SubElement, @@ -351,7 +366,7 @@ def build_tree(self) -> bytes: ) self.root = Element( - f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() + f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces() ) for d in self.frame_dicts.values(): @@ -359,11 +374,11 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -373,7 +388,7 @@ def build_tree(self) -> bytes: ) if self.pretty_print: - self.out_xml = self.prettify_tree() + self.out_xml = self._prettify_tree() if self.stylesheet is not None: raise ValueError( @@ -382,7 +397,7 @@ def build_tree(self) -> bytes: return self.out_xml - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: from xml.etree.ElementTree import register_namespace uri = "" @@ -402,12 +417,13 @@ def get_prefix_uri(self) -> str: return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from xml.etree.ElementTree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def prettify_tree(self) -> bytes: + def _prettify_tree(self) -> bytes: """ Output tree for pretty print format. @@ -421,7 +437,7 @@ def prettify_tree(self) -> bytes: return dom.toprettyxml(indent=" ", encoding=self.encoding) -class LxmlXMLFormatter(BaseXMLFormatter): +class LxmlXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. @@ -430,9 +446,9 @@ class LxmlXMLFormatter(BaseXMLFormatter): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.convert_empty_str_key() + self._convert_empty_str_key() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. 
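The refactor above replaces per-subclass element-building with one shared loop plus a cached factory attribute; a standalone sketch of the pattern, simplified and with hypothetical names (``cached_property`` stands in for pandas' ``cache_readonly``):

```python
from functools import cached_property
from xml.etree.ElementTree import Element, SubElement

class BaseFormatter:
    @cached_property
    def _sub_element_cls(self):
        raise NotImplementedError

    def build_elems(self, parent: Element, row: dict) -> None:
        # the loop lives once on the base class; subclasses only pick the factory
        sub_element = self._sub_element_cls
        for col, val in row.items():
            sub_element(parent, col).text = str(val)

class EtreeFormatter(BaseFormatter):
    @cached_property
    def _sub_element_cls(self):
        return SubElement
```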
@@ -452,11 +468,11 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -467,11 +483,11 @@ def build_tree(self) -> bytes: ) if self.stylesheet is not None: - self.out_xml = self.transform_doc() + self.out_xml = self._transform_doc() return self.out_xml - def convert_empty_str_key(self) -> None: + def _convert_empty_str_key(self) -> None: """ Replace zero-length string in `namespaces`. @@ -482,7 +498,7 @@ def convert_empty_str_key(self) -> None: if self.namespaces and "" in self.namespaces.keys(): self.namespaces[None] = self.namespaces.pop("", "default") - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: uri = "" if self.namespaces: if self.prefix: @@ -497,12 +513,13 @@ def get_prefix_uri(self) -> str: return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from lxml.etree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def transform_doc(self) -> bytes: + def _transform_doc(self) -> bytes: """ Parse stylesheet from file or buffer and run it. From 69cd075fa3016c86b5741dc39ad0e6731235094b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:44:48 -0700 Subject: [PATCH 105/131] Bump pypa/cibuildwheel from 2.16.1 to 2.16.2 (#55450) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.16.1 to 2.16.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.16.1...v2.16.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index efa14dd966eb1..6647bc03df17f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.1 + uses: pypa/cibuildwheel@v2.16.2 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From fa96a52d7cb7ad44267db57fa0803c7e4fd52b7d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Oct 2023 10:51:50 -0700 Subject: [PATCH 106/131] REF: remove redundant Index formatting methods (#55432) * REF: remove IntervalIndex._format_data, _format_space * improve check, remove outdated comment --- pandas/core/arrays/interval.py | 8 -------- pandas/core/indexes/base.py | 15 +++------------ pandas/core/indexes/interval.py | 6 ------ 3 files changed, 3 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 58ade1ee935ec..b55a40f5488ea 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1235,7 +1235,6 @@ def value_counts(self, dropna: bool = True) -> Series: def _format_data(self) -> str: # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) @@ -1266,19 +1265,12 @@ def _format_data(self) -> str: return summary def __repr__(self) -> str: - # the short repr has no trailing newline, while the truncated - # repr does. So we include a newline in our template, and strip - # any trailing newlines from format_object_summary data = self._format_data() class_name = f"<{type(self).__name__}>\n" template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" return template - def _format_space(self) -> str: - space = " " * (len(type(self).__name__) + 1) - return f"\n{space}" - # --------------------------------------------------------------------- # Vectorized Interval Properties/Attributes diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 147113a1e505e..c3cab965041e0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1284,25 +1284,16 @@ def __repr__(self) -> str_t: klass_name = type(self).__name__ data = self._format_data() attrs = self._format_attrs() - space = self._format_space() attrs_str = [f"{k}={v}" for k, v in attrs] - prepr = f",{space}".join(attrs_str) + prepr = ", ".join(attrs_str) # no data provided, just attributes if data is None: + # i.e. 
RangeIndex data = "" return f"{klass_name}({data}{prepr})" - def _format_space(self) -> str_t: - # using space here controls if the attributes - # are line separated or not (the default) - - # max_seq_items = get_option('display.max_seq_items') - # if len(self) > max_seq_items: - # space = "\n%s" % (' ' * (len(klass) + 1)) - return " " - @property def _formatter_func(self): """ @@ -1319,7 +1310,7 @@ def _format_data(self, name=None) -> str_t: if self.inferred_type == "string": is_justify = False - elif self.inferred_type == "categorical": + elif isinstance(self.dtype, CategoricalDtype): self = cast("CategoricalIndex", self) if is_object_dtype(self.categories.dtype): is_justify = False diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ae027ea68a525..209ac84869e85 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -844,17 +844,11 @@ def length(self) -> Index: # -------------------------------------------------------------------- # Rendering Methods - # __repr__ associated methods are based on MultiIndex def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # matches base class except for whitespace padding return header + list(self._format_native_types(na_rep=na_rep)) - def _format_data(self, name=None) -> str: - # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical - return f"{self._data._format_data()},{self._format_space()}" - # -------------------------------------------------------------------- # Set Operations From 42282ce514e1dd265aad0c80b4a4610e4254be97 Mon Sep 17 00:00:00 2001 From: Philippe THOMY Date: Mon, 9 Oct 2023 20:27:37 +0200 Subject: [PATCH 107/131] WEB: Fix rendering of ntv-pandas ecosystem page (#55430) * Create 0007-compact-and-reversible-JSON-interface.md * change PDEP number (7 -> 12) * Add FAQ to the PDEPS 0012 * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * pre-commit codespell * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * delete summary * delete mermaid flowchart * with summary, without mermaid flowchart * rename Annexe -> Appendix * add tableschema specification * add orient="table" * Add Table Schema extension * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * Update 0012-compact-and-reversible-JSON-interface.md * add 'ntv-pandas' in the 'ecosystem' file * Update ecosystem.md * Update ecosystem.md * Update ecosystem.md * Update ecosystem.md * Update ecosystem.md (NTV-pandas) --- web/pandas/community/ecosystem.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 10ac3bccc6137..e6a6b3a8531ca 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -345,25 +345,27 @@ which pandas excels. 
## IO

-### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas)*: A semantic, compact and reversible JSON-pandas converter*
+### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas)

NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly.

It supports the following data types:
+
- pandas data types
- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm)
- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats)

The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema).

-*example*
+Example:
+
```python
import ntv_pandas as npd

-jsn = df.npd.to_json(table=False)  # save df as a JSON-value (format Table Schema if table is True else format NTV )
-df = npd.read_json(jsn)  # load a JSON-value as a DataFrame
+jsn = df.npd.to_json(table=False)  # save df as a JSON-value (format Table Schema if table is True else format NTV)
+df = npd.read_json(jsn)  # load a JSON-value as a `DataFrame`

-df.equals(npd.read_json(df.npd.to_json(df)))  # True in any case, whether table=True or not
+df.equals(npd.read_json(df.npd.to_json(df)))  # `True` in any case, whether `table=True` or not
```

### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas)

From dc3035da0aee23089a66b7f01bb2f1ee76d3f7be Mon Sep 17 00:00:00 2001
From: cobalt <61329810+RedGuy12@users.noreply.github.com>
Date: Mon, 9 Oct 2023 13:29:15 -0500
Subject: [PATCH 108/131] DOC: Remove outdated docs about NumPy's broadcasting
 (#55427)

Remove outdated docs about NumPy's broadcasting

Signed-off-by: RedGuy12 <61329810+RedGuy12@users.noreply.github.com>
---
 doc/source/user_guide/basics.rst | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
index 2e299da5e5794..d3c9d83b943ce 100644
--- a/doc/source/user_guide/basics.rst
+++ b/doc/source/user_guide/basics.rst
@@ -408,20 +408,6 @@ raise a ValueError:

    pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo'])

-Note that this is different from the NumPy behavior where a comparison can
-be broadcast:
-
-.. ipython:: python
-
-   np.array([1, 2, 3]) == np.array([2])
-
-or it can return False if broadcasting can not be done:
-
-.. ipython:: python
-   :okwarning:
-
-   np.array([1, 2, 3]) == np.array([1, 2])
-
 Combining overlapping data sets
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 361b62ecc2327c8cf0b6173482e1edd1641c54bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Lucas=20Mayer?=
Date: Mon, 9 Oct 2023 15:30:35 -0300
Subject: [PATCH 109/131] TST: GroupBy.ffill on a multi-index dataframe
 (#55408)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add new automated test with ffill on a multi-index dataframe

This test checks whether the ffill method works correctly on a multi-index dataframe. In it, we define a dataframe with indexes [0, 1, 2, 0, 1, 2] and column "a" with values [1, 2, NaN, 3, 4, 5]. We group this dataframe by columns (level = 0) and shift it by one, so we have:

0: [1, 3] -> [NaN, 1]
1: [2, 4] -> [NaN, 2]
2: [NaN, 5] -> [NaN, NaN]

Then, since index order remains the same, if we apply the ffill method, it should give us a dataframe with column "a" equal to [NaN, NaN, NaN, 1, 2, 2]. 
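The final version of the test (in the diff below) drops the shift step; a condensed sketch of the behavior it checks:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2])
result = df.groupby(level=0).ffill()
# group for index 1 is [2, NaN] -> [2, 2]; group for index 2 is [3, NaN] -> [3, 3]
print(result["a"].tolist())  # [1.0, 2.0, 3.0, 4.0, 2.0, 3.0]
```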
Co-authored-by: José Lucas Silva Mayer Co-authored-by: Willian Wang * remove shift and add cast dataframe dtype to float Signed-off-by: José Lucas Silva Mayer * remove check of dataframe dtype on assert Signed-off-by: José Lucas Silva Mayer --------- Signed-off-by: José Lucas Silva Mayer Co-authored-by: Willian Wang --- pandas/tests/groupby/test_groupby.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e3e4a7efaa307..7297d049587e6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3242,3 +3242,12 @@ def test_get_group_axis_1(): } ) tm.assert_frame_equal(result, expected) + + +def test_groupby_ffill_with_duplicated_index(): + # GH#43412 + df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2]) + + result = df.groupby(level=0).ffill() + expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2]) + tm.assert_frame_equal(result, expected, check_dtype=False) From e74138cb1cd02211ecfa68e8ca187dcfc381f5f5 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Oct 2023 21:33:13 +0300 Subject: [PATCH 110/131] Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)" (#55077) * Revert "BUG: Timestamp origin takes no effect in resample for 'MS' frequency (#53938)" This reverts commit dfc4c3900a181e8a9dff724217d5a14e59ca2e4f. * note that origin only takes effect for tick frequencies * fixup doctest * move whatsnew to v2.1.2 --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/generic.py | 16 +++-- pandas/core/groupby/grouper.py | 12 ++-- pandas/core/resample.py | 13 +--- pandas/tests/resample/test_datetime_index.py | 67 +++++-------------- .../tests/resample/test_resampler_grouper.py | 32 ++++----- 6 files changed, 48 insertions(+), 93 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ddd1f95c56aea..569864ba71122 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) +- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6527dcaa2de59..a7183a9d9498a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9221,6 +9221,10 @@ def resample( .. versionadded:: 1.3.0 + .. note:: + + Only takes effect for Tick-frequencies (i.e. fixed frequencies like + days, hours, and minutes, rather than months or quarters). offset : Timedelta or str, default is None An offset timedelta added to the origin. 
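A short illustration of the restored behavior, consistent with the docstring examples in the next hunk: with a Tick frequency the ``origin`` anchors the bins, while non-Tick frequencies such as ``'MS'`` ignore it again.

```python
import numpy as np
import pandas as pd

idx = pd.date_range("2000-10-01 23:30:00", "2000-10-02 00:30:00", freq="7min")
ts = pd.Series(np.arange(len(idx)), index=idx)

# bins are anchored on the Unix epoch because '17min' is a Tick frequency
print(ts.resample("17min", origin="epoch").sum())
```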
@@ -9491,12 +9495,12 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17W', origin='2000-01-01').sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c51c17e04796a..06e6755079a22 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -207,12 +207,12 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bb5f3ce56b470..59e6a20915c18 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2528,16 +2528,8 @@ def _get_timestamp_range_edges( """ if isinstance(freq, Tick): index_tz = first.tz - - if isinstance(origin, Timestamp) and origin.tz != index_tz: + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): raise ValueError("The origin must have the same timezone as the index.") - - elif isinstance(origin, Timestamp): - if origin <= first: - first = origin - elif origin >= last: - last = origin - if origin == "epoch": # set the epoch based on the timezone to have similar bins results when # resampling on the same kind of indexes on different timezones @@ -2559,9 +2551,6 @@ def _get_timestamp_range_edges( first = first.tz_localize(index_tz) last = last.tz_localize(index_tz) else: - if isinstance(origin, Timestamp): - first = origin - first = first.normalize() last = last.normalize() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 63f9de2da733d..28d02576156a0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -796,34 +796,24 @@ def test_resample_offset(unit): @pytest.mark.parametrize( - "kwargs, expected", + "kwargs", [ - ( - {"origin": "1999-12-31 23:57:00"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": Timestamp("1970-01-01 00:02:00")}, - ["1970-01-01 00:02:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": "epoch", "offset": "2m"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), + {"origin": "1999-12-31 23:57:00"}, + {"origin": Timestamp("1970-01-01 00:02:00")}, + {"origin": "epoch", "offset": "2m"}, # origin of '1999-31-12 12:02:00' should be equivalent for this case - ( - {"origin": "1999-12-31 12:02:00"}, - ["1999-12-31 12:02:00", "2000-01-01 01:57:00"], - ), - ({"offset": "-3m"}, ["1999-12-31 23:57:00", "2000-01-01 01:57:00"]), + {"origin": "1999-12-31 12:02:00"}, + {"offset": "-3m"}, ], ) -def test_resample_origin(kwargs, unit, expected): +def test_resample_origin(kwargs, unit): # GH 31809 rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit) ts 
= Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit) + exp_rng = date_range( + "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min" + ).as_unit(unit) resampled = ts.resample("5min", **kwargs).mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit): ts.resample("5min", offset=offset) -def test_resample_monthstart_origin(): - # GH 53662 - df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]}) - result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum() - excepted = Series( - [10.0], - index=DatetimeIndex( - ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS" - ), - ) - tm.assert_index_equal(result.index, excepted.index) - - df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]}) - result = df.resample( - "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1) - )["values"].sum() - expected = Series( - [0, 10.0], - index=DatetimeIndex( - ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS" - ), - ) - tm.assert_index_equal(result.index, expected.index) - - def test_resample_origin_prime_freq(unit): # GH 31809 start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" @@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit): tm.assert_index_equal(resampled.index, exp_rng) exp_rng = date_range( - "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min" + "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min" ).as_unit(unit) resampled = ts.resample("17min", origin="2000-01-01").mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit): exp_rng = date_range( "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz ).as_unit(unit) - resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() tm.assert_index_equal(resampled.index, exp_rng) - resampled = ts.resample( - "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz) - ).mean() + # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() tm.assert_index_equal(resampled.index, exp_rng) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 55365d14a68ca..b85ccdc70068f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -151,19 +151,6 @@ def test_groupby_with_origin(): start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" middle = "1/15/2000 00:00:00" - # test origin on 1970-01-01 00:00:00 - rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] - - origin = Timestamp(0) - adjusted_grouper = pd.Grouper(freq=freq, origin=origin) - adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") - adjusted_count_ts = adjusted_count_ts[middle_ts:end] - adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") - tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end]) - rng = 
date_range(start, end, freq="1231min")  # prime number
    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
    ts2 = ts[middle:end]
@@ -177,19 +164,26 @@ def test_groupby_with_origin():
     with pytest.raises(AssertionError, match="Index are different"):
         tm.assert_index_equal(count_ts.index, count_ts2.index)
 
-    # test origin on 2049-10-18 20:00:00
+    # test origin on 1970-01-01 00:00:00
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
 
-    rng = date_range(start, "2049-10-18 20:00:00", freq="1231min")  # prime number
-    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
-    middle_ts = rng[len(rng) // 2]
-    ts2 = ts[middle_ts:end]
+    # test origin on 2049-10-18 20:00:00
     origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
     adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
     adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
-    adjusted2_count_ts = adjusted2_count_ts[middle_ts:end]
+    adjusted2_count_ts = adjusted2_count_ts[middle:end]
     adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
     tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
 
+    # both groupers use an adjusted timestamp that is a multiple of 1399 min
+    # they should be equal even if the adjusted_timestamp is in the future
+    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
+
 
 def test_nearest():
     # GH 17496
From 1ab06f85be101a0e9ea7e5b129398a8f5f6ef6da Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Oct 2023 12:39:48 -1000
Subject: [PATCH 111/131] COMPAT: Update old numpy aliases (#55460)

* COMPAT: Update old numpy aliases

* change more usages
---
 pandas/io/stata.py                                    |  2 +-
 pandas/tests/frame/constructors/test_from_records.py |  2 +-
 pandas/tests/frame/methods/test_select_dtypes.py     |  4 +---
 pandas/tests/frame/methods/test_to_records.py        |  2 +-
 pandas/tests/indexes/base_class/test_setops.py       | 10 +++++-----
 pandas/tests/io/json/test_pandas.py                  |  2 +-
 pandas/tests/reshape/concat/test_append.py           |  4 ++--
 pandas/tests/series/methods/test_astype.py           |  4 ++--
 8 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index d630a5ff8a41c..d00f1c666d5d6 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -977,7 +977,7 @@ def __init__(self) -> None:
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
+            [(i, np.dtype(f"S{i}")) for i in range(1, 245)]
             + [
                 (251, np.dtype(np.int8)),
                 (252, np.dtype(np.int16)),
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
index 7dffa7bb242d5..55d4a6c3b39fa 100644
--- a/pandas/tests/frame/constructors/test_from_records.py
+++ b/pandas/tests/frame/constructors/test_from_records.py
@@ -281,7 +281,7 @@ def test_frame_from_records_utc(self):
 
     def test_from_records_to_records(self):
         # from numpy documentation
-        arr = np.zeros((2,), dtype=("i4,f4,a10"))
+        arr = np.zeros((2,), dtype=("i4,f4,S10"))
         arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
 
         DataFrame.from_records(arr)
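For context, a minimal sketch of the alias change these hunks apply: the legacy "a10" spelling and the surviving "S10" spelling name the same 10-byte bytestring dtype, so only the spelling changes. The record values below are illustrative only.

    import numpy as np

    # "S10" is a 10-byte bytestring dtype; older code spelled it "a10",
    # an alias that newer NumPy releases deprecate.
    rec = np.zeros((2,), dtype="i4,f4,S10")
    rec[:] = [(1, 2.0, b"Hello"), (2, 3.0, b"World")]
    print(rec.dtype)  # dtype([('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')])

diff --git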
a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 67dd5b6217187..a38d2c6fd016a 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self): expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"] - ) + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index fa8c4e4811ea6..fab90b112fa94 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -253,7 +253,7 @@ def test_to_records_with_categorical(self): ), # Pass in a dtype instance. ( - {"column_dtypes": np.dtype("unicode")}, + {"column_dtypes": np.dtype(np.str_)}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 21d1630af9de2..488f79eea0d11 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -182,7 +182,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), False, ), @@ -190,7 +190,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (1, "B"), (2, "A"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -198,7 +198,7 @@ def test_symmetric_difference(self): "union", np.array( [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort): index1 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) index2 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4280f8617517f..2767078674632 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -604,7 +604,7 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype("unicode") + df_mixed.columns = df_mixed.columns.astype(np.str_) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 386f363c81557..81ca227fb7afb 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort): tm.assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1 = np.zeros((2,), dtype=("i4,f4,S10")) arr1[:] = [(1, 
2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2 = np.zeros((3,), dtype=("i4,f4,S10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b6c409397c9fb..03fc6cba2902a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -403,12 +403,12 @@ def test_astype_unicode(self): # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" ser = Series([item.encode()]) - result = ser.astype("unicode") + result = ser.astype(np.str_) expected = Series([item]) tm.assert_series_equal(result, expected) for ser in test_series: - res = ser.astype("unicode") + res = ser.astype(np.str_) expec = ser.map(str) tm.assert_series_equal(res, expec) From 2a65fdd227734cbda5d8e33171e857ab4aaeb9d5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Oct 2023 17:37:38 -0700 Subject: [PATCH 112/131] TST: fix file left behind (#55464) --- pandas/tests/io/pytables/test_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 0ff84bcf136cd..f031ac46c670c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -303,8 +303,10 @@ def test_store_dropna(tmp_path, setup_path): tm.assert_frame_equal(df_without_missing, reloaded) -def test_keyword_deprecation(): +def test_keyword_deprecation(tmp_path, setup_path): # GH 54229 + path = tmp_path / setup_path + msg = ( "Starting with pandas version 3.0 all arguments of to_hdf except for the " "argument 'path_or_buf' will be keyword-only." @@ -312,7 +314,7 @@ def test_keyword_deprecation(): df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_hdf("example", "key") + df.to_hdf(path, "key") def test_to_hdf_with_min_itemsize(tmp_path, setup_path): From f7b49d8eac88f9db365def1cd2babe03b84dadf4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 10 Oct 2023 11:41:20 -0400 Subject: [PATCH 113/131] TST: sort index with sliced MultiIndex (#55473) * TST: sort index with sliced MultiIndex * whatsnew --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/tests/frame/methods/test_sort_index.py | 39 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 569864ba71122..cc51e22265d7c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) - Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) - Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) +- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.bug_fixes: diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 985a9e3602410..f0222e5cec9b5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): ) tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) From 1025151f71cb4f0efa1fd935e41d943428bd3d33 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Oct 2023 08:50:43 -0700 Subject: [PATCH 114/131] REF: de-duplicate IntervalArray formatting functions (#55469) --- pandas/core/arrays/interval.py | 43 ++++------------------------------ 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b55a40f5488ea..4bc30d6dbd029 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -15,8 +15,6 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( VALID_CLOSED, @@ -1233,43 +1231,10 @@ def value_counts(self, dropna: bool = True) -> Series: # --------------------------------------------------------------------- # Rendering Methods - def _format_data(self) -> str: - # TODO: integrate with categorical and make generic - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_str = ", ".join(head) - tail_str = ", ".join(tail) - summary = f"[{head_str} ... {tail_str}]" - else: - tail = [formatter(x) for x in self] - tail_str = ", ".join(tail) - summary = f"[{tail_str}]" - - return summary - - def __repr__(self) -> str: - data = self._format_data() - class_name = f"<{type(self).__name__}>\n" - - template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" - return template + def _formatter(self, boxed: bool = False): + # returning 'str' here causes us to render as e.g. 
"(0, 1]" instead of + # "Interval(0, 1, closed='right')" + return str # --------------------------------------------------------------------- # Vectorized Interval Properties/Attributes From e22bfb3e1721d602086c0ca0b3af311cbe84cd9b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Oct 2023 08:53:19 -0700 Subject: [PATCH 115/131] REF: avoid circular imports in formats (#55467) * REF: avoid circular import of get_adjustment * revert --- pandas/core/indexes/multi.py | 7 ++- pandas/io/common.py | 6 +- pandas/io/formats/format.py | 73 ++-------------------- pandas/io/formats/printing.py | 79 ++++++++++++++++++++++-- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/formats/test_printing.py | 11 ++-- 6 files changed, 92 insertions(+), 86 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8955329a7afbf..c5cab225fa7b1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -107,7 +107,10 @@ lexsort_indexer, ) -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + get_adjustment, + pprint_thing, +) if TYPE_CHECKING: from pandas import ( @@ -1437,8 +1440,6 @@ def format( ) if adjoin: - from pandas.io.formats.format import get_adjustment - adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: diff --git a/pandas/io/common.py b/pandas/io/common.py index f255ea8197304..d08612f4f09f6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -68,8 +68,8 @@ is_integer, is_list_like, ) +from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -91,6 +91,8 @@ WriteBuffer, ) + from pandas import MultiIndex + @dataclasses.dataclass class IOArgs: @@ -1228,7 +1230,7 @@ def is_potential_multi_index( return bool( len(columns) - and not isinstance(columns, MultiIndex) + and not isinstance(columns, ABCMultiIndex) and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1efd6f18dc0af..cac83e2a48972 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -7,7 +7,6 @@ from collections.abc import ( Generator, Hashable, - Iterable, Mapping, Sequence, ) @@ -26,7 +25,6 @@ Final, cast, ) -from unicodedata import east_asian_width import numpy as np @@ -226,7 +224,7 @@ def __init__( float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = get_adjustment() + self.adj = printing.get_adjustment() self._chk_truncate() @@ -347,69 +345,6 @@ def to_string(self) -> str: return str("".join(result)) -class _TextAdjustment: - def __init__(self) -> None: - self.encoding = get_option("display.encoding") - - def len(self, text: str) -> int: - return len(text) - - def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: - return printing.justify(texts, max_len, mode=mode) - - def adjoin(self, space: int, *lists, **kwargs) -> str: - return printing.adjoin( - space, *lists, strlen=self.len, justfunc=self.justify, **kwargs - ) - - -class _EastAsianTextAdjustment(_TextAdjustment): - def __init__(self) -> None: - super().__init__() - if get_option("display.unicode.ambiguous_as_wide"): - self.ambiguous_width = 2 - else: - self.ambiguous_width = 1 - - # Definition of East Asian Width - # https://unicode.org/reports/tr11/ - # Ambiguous width 
can be changed by option - self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} - - def len(self, text: str) -> int: - """ - Calculate display width considering unicode East Asian Width - """ - if not isinstance(text, str): - return len(text) - - return sum( - self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text - ) - - def justify( - self, texts: Iterable[str], max_len: int, mode: str = "right" - ) -> list[str]: - # re-calculate padding space per str considering East Asian Width - def _get_pad(t): - return max_len - self.len(t) + len(t) - - if mode == "left": - return [x.ljust(_get_pad(x)) for x in texts] - elif mode == "center": - return [x.center(_get_pad(x)) for x in texts] - else: - return [x.rjust(_get_pad(x)) for x in texts] - - -def get_adjustment() -> _TextAdjustment: - use_east_asian_width = get_option("display.unicode.east_asian_width") - if use_east_asian_width: - return _EastAsianTextAdjustment() - else: - return _TextAdjustment() - - def get_dataframe_repr_params() -> dict[str, Any]: """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string. @@ -529,7 +464,7 @@ def __init__( self.tr_frame = self.frame self.truncate() - self.adj = get_adjustment() + self.adj = printing.get_adjustment() def get_strcols(self) -> list[list[str]]: """ @@ -1789,13 +1724,13 @@ def _make_fixed_width( strings: list[str], justify: str = "right", minimum: int | None = None, - adj: _TextAdjustment | None = None, + adj: printing._TextAdjustment | None = None, ) -> list[str]: if len(strings) == 0 or justify == "all": return strings if adj is None: - adjustment = get_adjustment() + adjustment = printing.get_adjustment() else: adjustment = adj diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index b57797b7ec717..2cc9368f8846a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -15,11 +15,14 @@ TypeVar, Union, ) +from unicodedata import east_asian_width from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +from pandas.io.formats.console import get_console_size + EscapeChars = Union[Mapping[str, str], Iterable[str]] _KT = TypeVar("_KT") _VT = TypeVar("_VT") @@ -42,7 +45,7 @@ def adjoin(space: int, *lists: list[str], **kwargs) -> str: function used to justify str. Needed for unicode handling. 
""" strlen = kwargs.pop("strlen", len) - justfunc = kwargs.pop("justfunc", justify) + justfunc = kwargs.pop("justfunc", _adj_justify) newLists = [] lengths = [max(map(strlen, x)) + space for x in lists[:-1]] @@ -57,7 +60,7 @@ def adjoin(space: int, *lists: list[str], **kwargs) -> str: return "\n".join("".join(lines) for lines in toJoin) -def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: +def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: """ Perform ljust, center, rjust against string or list-like """ @@ -314,9 +317,6 @@ def format_object_summary( ------- summary string """ - from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import get_adjustment - display_width, _ = get_console_size() if display_width is None: display_width = get_option("display.width") or 80 @@ -501,3 +501,72 @@ class PrettyDict(dict[_KT, _VT]): def __repr__(self) -> str: return pprint_thing(self) + + +class _TextAdjustment: + def __init__(self) -> None: + self.encoding = get_option("display.encoding") + + def len(self, text: str) -> int: + return len(text) + + def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + def adjoin(self, space: int, *lists, **kwargs) -> str: + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) + + +class _EastAsianTextAdjustment(_TextAdjustment): + def __init__(self) -> None: + super().__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + # Definition of East Asian Width + # https://unicode.org/reports/tr11/ + # Ambiguous width can be changed by option + self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} + + def len(self, text: str) -> int: + """ + Calculate display width considering unicode East Asian Width + """ + if not isinstance(text, str): + return len(text) + + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) + + def justify( + self, texts: Iterable[str], max_len: int, mode: str = "right" + ) -> list[str]: + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == "left": + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == "center": + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] + + +def get_adjustment() -> _TextAdjustment: + use_east_asian_width = get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return _EastAsianTextAdjustment() + else: + return _TextAdjustment() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 585958765f0b6..57f1f082708ae 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -212,7 +212,7 @@ def test_repr_truncation(self): r = repr(df) r = r[r.find("\n") + 1 :] - adj = fmt.get_adjustment() + adj = printing.get_adjustment() for line, value in zip(r.split("\n"), df["B"]): if adj.len(value) + 1 > max_len: diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 492657587ae0e..78198bce71460 100644 --- 
a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -8,7 +8,6 @@ import pandas as pd from pandas.io.formats import printing -import pandas.io.formats.format as fmt def test_adjoin(): @@ -48,7 +47,7 @@ def test_adjoin_unicode(self): adjoined = printing.adjoin(2, *data) assert adjoined == expected - adj = fmt._EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() expected = """あ dd ggg b ええ hhh @@ -73,7 +72,7 @@ def test_adjoin_unicode(self): assert adj.len(cols[2]) == 26 def test_justify(self): - adj = fmt._EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() def just(x, *args, **kwargs): # wrapper to test single str @@ -95,7 +94,7 @@ def just(x, *args, **kwargs): assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): - adj = fmt._EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("abc") == 3 assert adj.len("abc") == 3 @@ -106,11 +105,11 @@ def test_east_asian_len(self): assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): - adj = fmt._EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 4 with cf.option_context("display.unicode.ambiguous_as_wide", True): - adj = fmt._EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 6 data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] From 66a54a37d843cb641bfd4f694946e63a49124b3d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Oct 2023 08:58:41 -0700 Subject: [PATCH 116/131] DEPR: Series constructor fastpath keyword (#55466) --- asv_bench/benchmarks/series_methods.py | 3 --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/frame.py | 7 +------ pandas/core/series.py | 15 +++++++++++++-- pandas/tests/copy_view/test_constructors.py | 8 ++++++-- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 288369145576e..f52f7a4bef37a 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -28,9 +28,6 @@ def time_constructor_dict(self): def time_constructor_no_data(self): Series(data=None, index=self.idx) - def time_constructor_fastpath(self): - Series(self.array, index=self.idx2, name="name", fastpath=True) - class ToFrame: params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index cfcc99bf5bda0..09067d541d01b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -256,6 +256,7 @@ Other Deprecations - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) +- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of 
:meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7802657269044..09c43822e11e4 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5168,12 +5168,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
 
     @property
     def _series(self):
-        return {
-            item: Series(
-                self._mgr.iget(idx), index=self.index, name=item, fastpath=True
-            )
-            for idx, item in enumerate(self.columns)
-        }
+        return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
 
     # ----------------------------------------------------------------------
     # Reindexing and alignment
diff --git a/pandas/core/series.py b/pandas/core/series.py
index ae9f77a6c0bad..c2eea371ddef3 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -374,8 +374,18 @@ def __init__(
         dtype: Dtype | None = None,
         name=None,
         copy: bool | None = None,
-        fastpath: bool = False,
+        fastpath: bool | lib.NoDefault = lib.no_default,
     ) -> None:
+        if fastpath is not lib.no_default:
+            warnings.warn(
+                "The 'fastpath' keyword in pd.Series is deprecated and will "
+                "be removed in a future version.",
+                DeprecationWarning,
+                stacklevel=find_stack_level(),
+            )
+        else:
+            fastpath = False
+
         if (
             isinstance(data, (SingleBlockManager, SingleArrayManager))
             and index is None
@@ -1009,7 +1019,8 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Series:
         # axis kwarg is retained for compat with NDFrame method
         # _slice is *always* positional
         mgr = self._mgr.get_slice(slobj, axis=axis)
-        out = self._constructor(mgr, fastpath=True)
+        out = self._constructor_from_mgr(mgr, axes=mgr.axes)
+        out._name = self._name
         return out.__finalize__(self)
 
     def __getitem__(self, key):
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
index af7e759902f9f..31f80d300ccca 100644
--- a/pandas/tests/copy_view/test_constructors.py
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -99,7 +99,9 @@ def test_series_from_series_with_reindex(using_copy_on_write):
 def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
     if idx is None or dtype is not None:
         fastpath = False
-    ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
+    msg = "The 'fastpath' keyword in pd.Series is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
     ser_orig = ser.copy()
     data = getattr(arr, "_data", arr)
     if using_copy_on_write:
@@ -157,7 +159,9 @@ def test_series_from_index_different_dtypes(using_copy_on_write):
 def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath):
     ser = Series([1, 2, 3], dtype="int64")
     ser_orig = ser.copy()
-    ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx)
+    msg = "The 'fastpath' keyword in pd.Series is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx)
     assert np.shares_memory(get_array(ser), get_array(ser2))
     if using_copy_on_write:
         assert not ser2._mgr._has_no_reference(0)
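For context, a small sketch of what this deprecation means for callers; the values and name are illustrative. Dropping the keyword is the entire migration, since ``fastpath`` now defaults to "not passed".

    import warnings

    import pandas as pd

    with warnings.catch_warnings():
        warnings.simplefilter("error", DeprecationWarning)
        # Fine: no fastpath keyword passed.
        ser = pd.Series([1, 2, 3], name="x")
        # On pandas 2.2+ this line would raise here instead:
        # pd.Series([1, 2, 3], fastpath=True)

From b284101fbcfede2ce805324407a2a3072f640fa6 Mon Sep 17 00:00:00 2001
From: Xiao Yuan
Date: Wed, 11 Oct 2023 02:57:46 +0800
Subject: [PATCH 117/131] ENH: read_spss stores the metadata in df.attrs (#55472)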
* ENH: read_spss stores the metadata in df.attrs

* filter warning

* Make separate variable
---
 doc/source/whatsnew/v2.2.0.rst |  1 +
 pandas/io/spss.py              |  3 ++-
 pandas/tests/io/test_spss.py   | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 09067d541d01b..29a2d5c0b5877 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@ Other enhancements
 - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
+- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
 - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 58487c6cd721b..db31a07df79e6 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -63,9 +63,10 @@ def read_spss(
             raise TypeError("usecols must be list-like.")
         usecols = list(usecols)  # pyreadstat requires a list
 
-    df, _ = pyreadstat.read_sav(
+    df, metadata = pyreadstat.read_sav(
         stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
     )
+    df.attrs = metadata.__dict__
     if dtype_backend is not lib.no_default:
         df = df.convert_dtypes(dtype_backend=dtype_backend)
     return df
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
index 5d8dce72d69c9..b612b64e3b020 100644
--- a/pandas/tests/io/test_spss.py
+++ b/pandas/tests/io/test_spss.py
@@ -116,3 +116,35 @@ def test_invalid_dtype_backend():
     )
     with pytest.raises(ValueError, match=msg):
         pd.read_spss("test", dtype_backend="numpy")
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_metadata(datapath):
+    # GH 54264
+    fname = datapath("io", "data", "spss", "labelled-num.sav")
+
+    df = pd.read_spss(fname)
+    metadata = {
+        "column_names": ["VAR00002"],
+        "column_labels": [None],
+        "column_names_to_labels": {"VAR00002": None},
+        "file_encoding": "UTF-8",
+        "number_columns": 1,
+        "number_rows": 1,
+        "variable_value_labels": {"VAR00002": {1.0: "This is one"}},
+        "value_labels": {"labels0": {1.0: "This is one"}},
+        "variable_to_label": {"VAR00002": "labels0"},
+        "notes": [],
+        "original_variable_types": {"VAR00002": "F8.0"},
+        "readstat_variable_types": {"VAR00002": "double"},
+        "table_name": None,
+        "missing_ranges": {},
+        "missing_user_values": {},
+        "variable_storage_width": {"VAR00002": 8},
+        "variable_display_width": {"VAR00002": 8},
+        "variable_alignment": {"VAR00002": "unknown"},
+        "variable_measure": {"VAR00002": "unknown"},
+        "file_label": None,
+        "file_format": "sav/zsav",
+    }
+    assert df.attrs == metadata
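For context, a short usage sketch of the new behavior. The file name "survey.sav" is hypothetical (pyreadstat must be installed); the attrs keys shown mirror the pyreadstat metadata fields asserted in the test above.

    import pandas as pd

    # Hypothetical path; any .sav file readable by pyreadstat works.
    df = pd.read_spss("survey.sav")

    # The pyreadstat metadata now travels with the frame.
    print(df.attrs["column_names"])
    print(df.attrs.get("variable_value_labels"))

From 6786846764cd9cf9d5041e5bdc86430be38eeb8e Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 11 Oct 2023 10:54:23 -0700
Subject: [PATCH 118/131] ENH: EA._get_repr_footer (#55478)

---
 pandas/core/arrays/base.py        | 12 ++++++++++--
 pandas/core/arrays/categorical.py |  4 ++--
 pandas/io/formats/format.py       | 13 +++++++------
 3 files changed, 19 insertions(+), 10 deletions(-)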
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 31c143ee012bb..e7b7ecba60e0b 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1663,7 +1663,14 @@ def __repr__(self) -> str:
             self, self._formatter(), indent_for_name=False
         ).rstrip(", \n")
         class_name = f"<{type(self).__name__}>\n"
-        return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
+        footer = self._get_repr_footer()
+        return f"{class_name}{data}\n{footer}"
+
+    def _get_repr_footer(self) -> str:
+        # GH#24278
+        if self.ndim > 1:
+            return f"Shape: {self.shape}, dtype: {self.dtype}"
+        return f"Length: {len(self)}, dtype: {self.dtype}"
 
     def _repr_2d(self) -> str:
         from pandas.io.formats.printing import format_object_summary
@@ -1679,7 +1686,8 @@ def _repr_2d(self) -> str:
         ]
         data = ",\n".join(lines)
         class_name = f"<{type(self).__name__}>"
-        return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
+        footer = self._get_repr_footer()
+        return f"{class_name}\n[\n{data}\n]\n{footer}"
 
     def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
         """
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 5059f5d000ccd..e19635af6ab0b 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2177,7 +2177,7 @@ def _repr_categories(self) -> list[str]:
             category_strs = [x.strip() for x in category_strs]
         return category_strs
 
-    def _repr_categories_info(self) -> str:
+    def _get_repr_footer(self) -> str:
         """
         Returns a string representation of the footer.
         """
@@ -2229,7 +2229,7 @@ def __repr__(self) -> str:
         """
         String representation.
         """
-        footer = self._repr_categories_info()
+        footer = self._get_repr_footer()
         length = len(self)
         max_len = 10
         if length > max_len:
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index cac83e2a48972..322f7f88494de 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -259,11 +259,12 @@ def _get_footer(self) -> str:
             name = self.series.name
         footer = ""
 
-        if getattr(self.series.index, "freq", None) is not None:
-            assert isinstance(
-                self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)
-            )
-            footer += f"Freq: {self.series.index.freqstr}"
+        index = self.series.index
+        if (
+            isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex))
+            and index.freq is not None
+        ):
+            footer += f"Freq: {index.freqstr}"
 
         if self.name is not False and name is not None:
             if footer:
@@ -289,7 +290,7 @@ def _get_footer(self) -> str:
         # level infos are added to the end and in a new line, like it is done
         # for Categoricals
         if isinstance(self.tr_series.dtype, CategoricalDtype):
-            level_info = self.tr_series._values._repr_categories_info()
+            level_info = self.tr_series._values._get_repr_footer()
             if footer:
                 footer += "\n"
             footer += level_info
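For context, a sketch of the footers this hook standardizes; the output was transcribed from a pandas 2.x session, so treat it as indicative rather than exact.

    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")
    print(repr(arr))
    # <IntegerArray>
    # [1, 2, <NA>]
    # Length: 3, dtype: Int64

    # Categorical overrides the footer to show its categories instead.
    print(repr(pd.Categorical(["a", "b", "a"])))
    # ['a', 'b', 'a']
    # Categories (2, object): ['a', 'b']

From 943c3cb38d23e60905891724f878b91483597c83 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 11 Oct 2023 10:54:57 -0700
Subject: [PATCH 119/131] BUG: repr of inf values with use_inf_as_na (#55483)

* BUG: repr of inf values with use_inf_as_na

* GH ref
---
 doc/source/whatsnew/v2.2.0.rst       |  1 +
 pandas/_libs/missing.pyi             |  1 -
 pandas/_libs/missing.pyx             | 25 -------------------------
 pandas/core/indexes/base.py          | 13 ++-----------
 pandas/io/formats/format.py          | 22 ++++++++++------------
 pandas/tests/frame/test_repr_info.py |  2 +-
 6 files changed, 14 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst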
index 29a2d5c0b5877..eec82ae26afcc 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -394,6 +394,7 @@ Other
 ^^^^^
 - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
 - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
+- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
 - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
 -
diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi
index d5c9f1342a089..282dcee3ed6cf 100644
--- a/pandas/_libs/missing.pyi
+++ b/pandas/_libs/missing.pyi
@@ -14,4 +14,3 @@ def isneginf_scalar(val: object) -> bool: ...
 def checknull(val: object, inf_as_na: bool = ...) -> bool: ...
 def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
 def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
-def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index e3e7d8daa03e1..8ef59b46ca25f 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -255,31 +255,6 @@ cdef bint checknull_with_nat_and_na(object obj):
     return checknull_with_nat(obj) or obj is C_NA
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_float_nan(values: ndarray) -> ndarray:
-    """
-    True for elements which correspond to a float nan
-
-    Returns
-    -------
-    ndarray[bool]
-    """
-    cdef:
-        ndarray[uint8_t] result
-        Py_ssize_t i, N
-        object val
-
-    N = len(values)
-    result = np.zeros(N, dtype=np.uint8)
-
-    for i in range(N):
-        val = values[i]
-        if util.is_nan(val):
-            result[i] = True
-    return result.view(bool)
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def is_numeric_na(values: ndarray) -> ndarray:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index c3cab965041e0..d749235e2cd2c 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -37,7 +37,6 @@
     is_datetime_array,
     no_default,
 )
-from pandas._libs.missing import is_float_nan
 from pandas._libs.tslibs import (
     IncompatibleFrequency,
     OutOfBoundsDatetime,
@@ -1390,16 +1389,8 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str
 
         if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
             values = np.asarray(values)
-            values = lib.maybe_convert_objects(values, safe=True)
-
-            result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]
-
-            # could have nans
-            mask = is_float_nan(values)
-            if mask.any():
-                result_arr = np.array(result)
-                result_arr[mask] = na_rep
-                result = result_arr.tolist()
+            # TODO: why do we need different justify for these cases?
+            result = trim_front(format_array(values, None, justify="all"))
         else:
             result = trim_front(format_array(values, None, justify="left"))
         return header + result
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 322f7f88494de..bb976b3a0208e 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -1216,18 +1216,16 @@ def _format_strings(self) -> list[str]:
 
         def _format(x):
             if self.na_rep is not None and is_scalar(x) and isna(x):
-                try:
-                    # try block for np.isnat specifically
-                    # determine na_rep if x is None or NaT-like
-                    if x is None:
-                        return "None"
-                    elif x is NA:
-                        return str(NA)
-                    elif x is NaT or np.isnat(x):
-                        return "NaT"
-                except (TypeError, ValueError):
-                    # np.isnat only handles datetime or timedelta objects
-                    pass
+                if x is None:
+                    return "None"
+                elif x is NA:
+                    return str(NA)
+                elif lib.is_float(x) and np.isinf(x):
+                    # TODO(3.0): this will be unreachable when use_inf_as_na
+                    # deprecation is enforced
+                    return str(x)
+                elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)):
+                    return "NaT"
                 return self.na_rep
             elif isinstance(x, PandasObject):
                 return str(x)
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 0634b8268c04c..63ecdfa5e001b 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -411,7 +411,7 @@ def test_to_records_with_na_record(self):
     def test_to_records_with_inf_as_na_record(self):
         # GH 48526
         expected = """   NaN  inf         record
-0  NaN    b    [0, inf, b]
+0  inf    b    [0, inf, b]
 1  NaN  NaN  [1, nan, nan]
 2    e    f      [2, e, f]"""
         msg = "use_inf_as_na option is deprecated"
From c9a98f0f0fe3a1ff2ce549b2f2c7551cc9afc58a Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Wed, 11 Oct 2023 19:55:10 +0200
Subject: [PATCH 120/131] =?UTF-8?q?DEPR:=20=20=E2=80=98AS=E2=80=99,=20?=
 =?UTF-8?q?=E2=80=98BA=E2=80=99=20and=20=E2=80=98BAS=E2=80=99=20in=20favou?=
 =?UTF-8?q?r=20of=20=E2=80=98YS=E2=80=99,=20=E2=80=98BY=E2=80=99=20and=20?=
 =?UTF-8?q?=E2=80=98BYS=E2=80=99=20(#55479)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* deprecate AS/BA/BAS in favour of YS/BY/BYS, fix tests

* correct def get_start_end_field, correct docstrings and v0.20.0.rst

* fix pre-commit error

* add deprecated frequencies to dict DEPR_ABBREVS, add tests
---
 doc/source/user_guide/timeseries.rst          | 14 +++---
 doc/source/whatsnew/v0.20.0.rst               | 20 +++++++--
 pandas/_libs/tslibs/dtypes.pyx                | 44 +++++++++++++++++--
 pandas/_libs/tslibs/fields.pyx                |  4 +-
 pandas/_libs/tslibs/offsets.pyx               | 21 ++++-----
 pandas/core/arrays/arrow/array.py             |  2 +-
 pandas/core/indexes/datetimes.py              |  4 +-
 pandas/core/resample.py                       |  4 +-
 pandas/core/series.py                         |  2 +-
 pandas/tests/arithmetic/test_datetime64.py    |  6 +--
 .../tests/frame/methods/test_to_timestamp.py  | 10 ++---
 pandas/tests/groupby/test_grouping.py         |  2 +-
 .../datetimes/methods/test_to_period.py       |  2 +-
 .../indexes/datetimes/test_constructors.py    |  6 +--
 .../indexes/datetimes/test_date_range.py      | 14 +++---
 .../tests/indexes/datetimes/test_datetime.py  | 38 ++++++++++++++++
 pandas/tests/indexes/datetimes/test_misc.py   |  2 +-
 .../period/methods/test_to_timestamp.py       |  2 +-
 pandas/tests/resample/test_datetime_index.py  |  4 +-
 pandas/tests/resample/test_period_index.py    |  2 +-
 .../tseries/frequencies/test_inference.py     |  4 +-
 pandas/tests/tseries/offsets/test_offsets.py  |  4 +-
 pandas/tseries/frequencies.py                 |  4 +-
 23 files changed, 148 insertions(+), 67 deletions(-)
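Before the diffs, a quick sketch of the rename this patch carries out; the new spellings come from the patch itself, and the old ones keep working for now but emit a FutureWarning.

    import pandas as pd

    # "YS" (year start) replaces the deprecated "AS"; "BY" (business year
    # end) replaces "BA"; "BYS" replaces "BAS".
    idx = pd.date_range("2013-01-01", periods=3, freq="YS")
    print(idx.freqstr)   # YS-JAN

    bidx = pd.date_range("2013-01-01", periods=3, freq="BY")
    print(bidx.freqstr)  # BY-DEC

diff --git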
a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index e1415e2ca23ca..9f3077e266e98 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -896,9 +896,9 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" :class:`~pandas.tseries.offsets.YearEnd`, ``'Y'``, "calendar year end" - :class:`~pandas.tseries.offsets.YearBegin`, ``'AS'`` or ``'BYS'``,"calendar year begin" - :class:`~pandas.tseries.offsets.BYearEnd`, ``'BA'``, "business year end" - :class:`~pandas.tseries.offsets.BYearBegin`, ``'BAS'``, "business year begin" + :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin" + :class:`~pandas.tseries.offsets.BYearEnd`, ``'BY'``, "business year end" + :class:`~pandas.tseries.offsets.BYearBegin`, ``'BYS'``, "business year begin" :class:`~pandas.tseries.offsets.FY5253`, ``'RE'``, "retail (aka 52-53 week) year" :class:`~pandas.tseries.offsets.Easter`, None, "Easter holiday" :class:`~pandas.tseries.offsets.BusinessHour`, ``'bh'``, "business hour" @@ -1259,9 +1259,9 @@ frequencies. We will refer to these aliases as *offset aliases*. "QS", "quarter start frequency" "BQS", "business quarter start frequency" "Y", "year end frequency" - "BA, BY", "business year end frequency" - "AS, YS", "year start frequency" - "BAS, BYS", "business year start frequency" + "BY", "business year end frequency" + "YS", "year start frequency" + "BYS", "business year start frequency" "h", "hourly frequency" "bh", "business hour frequency" "cbh", "custom business hour frequency" @@ -1692,7 +1692,7 @@ the end of the interval. .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BY', 'BQ', and 'W' which all have a default of 'right'. This might unintendedly lead to looking ahead, where the value for a later diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ae70eb078f6d9..09bf5428d0432 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -886,11 +886,23 @@ This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622 This is *unchanged* from prior versions, but shown for illustration purposes: -.. ipython:: python +.. code-block:: python - df = pd.DataFrame(np.arange(6), columns=['value'], - index=pd.MultiIndex.from_product([list('BA'), range(3)])) - df + In [81]: df = pd.DataFrame(np.arange(6), columns=['value'], + ....: index=pd.MultiIndex.from_product([list('BA'), range(3)])) + ....: + In [82]: df + + Out[82]: + value + B 0 0 + 1 1 + 2 2 + A 0 3 + 1 4 + 2 5 + + [6 rows x 1 columns] .. 
code-block:: python diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 86f620beeec3b..26181d8f15518 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -192,9 +192,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "BQS": "Q", "QS": "Q", "BQ": "Q", - "BA": "Y", - "AS": "Y", - "BAS": "Y", "MS": "M", "D": "D", "B": "B", @@ -205,9 +202,9 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "ns": "ns", "h": "h", "Q": "Q", - "Y": "Y", "W": "W", "ME": "M", + "Y": "Y", "BY": "Y", "YS": "Y", "BYS": "Y", @@ -244,6 +241,45 @@ DEPR_ABBREVS: dict[str, str]= { "A-SEP": "Y-SEP", "A-OCT": "Y-OCT", "A-NOV": "Y-NOV", + "BA": "BY", + "BA-DEC": "BY-DEC", + "BA-JAN": "BY-JAN", + "BA-FEB": "BY-FEB", + "BA-MAR": "BY-MAR", + "BA-APR": "BY-APR", + "BA-MAY": "BY-MAY", + "BA-JUN": "BY-JUN", + "BA-JUL": "BY-JUL", + "BA-AUG": "BY-AUG", + "BA-SEP": "BY-SEP", + "BA-OCT": "BY-OCT", + "BA-NOV": "BY-NOV", + "AS": "YS", + "AS-DEC": "YS-DEC", + "AS-JAN": "YS-JAN", + "AS-FEB": "YS-FEB", + "AS-MAR": "YS-MAR", + "AS-APR": "YS-APR", + "AS-MAY": "YS-MAY", + "AS-JUN": "YS-JUN", + "AS-JUL": "YS-JUL", + "AS-AUG": "YS-AUG", + "AS-SEP": "YS-SEP", + "AS-OCT": "YS-OCT", + "AS-NOV": "YS-NOV", + "BAS": "BYS", + "BAS-DEC": "BYS-DEC", + "BAS-JAN": "BYS-JAN", + "BAS-FEB": "BYS-FEB", + "BAS-MAR": "BYS-MAR", + "BAS-APR": "BYS-APR", + "BAS-MAY": "BYS-MAY", + "BAS-JUN": "BYS-JUN", + "BAS-JUL": "BYS-JUL", + "BAS-AUG": "BYS-AUG", + "BAS-SEP": "BYS-SEP", + "BAS-OCT": "BYS-OCT", + "BAS-NOV": "BYS-NOV", "H": "h", "BH": "bh", "CBH": "cbh", diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index ad37add17967d..a726c735bf9a1 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -253,8 +253,8 @@ def get_start_end_field( # month of year. Other offsets use month, startingMonth as ending # month of year. 
- if (freqstr[0:2] in ["MS", "QS", "AS"]) or ( - freqstr[1:3] in ["MS", "QS", "AS"]): + if (freqstr[0:2] in ["MS", "QS", "YS"]) or ( + freqstr[1:3] in ["MS", "QS", "YS"]): end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw else: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 042d5dafe3046..6a6f30de8dade 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2414,7 +2414,7 @@ cdef class BYearEnd(YearOffset): _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = "BA" + _prefix = "BY" _day_opt = "business_end" @@ -2453,7 +2453,7 @@ cdef class BYearBegin(YearOffset): _outputName = "BusinessYearBegin" _default_month = 1 - _prefix = "BAS" + _prefix = "BYS" _day_opt = "business_start" @@ -2552,7 +2552,7 @@ cdef class YearBegin(YearOffset): """ _default_month = 1 - _prefix = "AS" + _prefix = "YS" _day_opt = "start" @@ -4540,10 +4540,10 @@ CDay = CustomBusinessDay prefix_mapping = { offset._prefix: offset for offset in [ - YearBegin, # 'AS' + YearBegin, # 'YS' YearEnd, # 'Y' - BYearBegin, # 'BAS' - BYearEnd, # 'BA' + BYearBegin, # 'BYS' + BYearEnd, # 'BY' BusinessDay, # 'B' BusinessMonthBegin, # 'BMS' BusinessMonthEnd, # 'BM' @@ -4584,12 +4584,9 @@ _lite_rule_alias = { "Q": "Q-DEC", "Y": "Y-DEC", # YearEnd(month=12), - "AS": "AS-JAN", # YearBegin(month=1), - "YS": "AS-JAN", - "BA": "BA-DEC", # BYearEnd(month=12), - "BY": "BA-DEC", - "BAS": "BAS-JAN", # BYearBegin(month=1), - "BYS": "BAS-JAN", + "YS": "YS-JAN", # YearBegin(month=1), + "BY": "BY-DEC", # BYearEnd(month=12), + "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", "min": "min", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 60c42c01e9f6f..c91f892936640 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2526,7 +2526,7 @@ def _round_temporally( raise ValueError(f"Must specify a valid frequency: {freq}") pa_supported_unit = { "Y": "year", - "AS": "year", + "YS": "year", "Q": "quarter", "QS": "quarter", "M": "month", diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f3b2a35f379f4..12f93cf482a1d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -992,11 +992,11 @@ def date_range( **Specify a unit** - >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s") + >>> pd.date_range(start="2017-01-01", periods=10, freq="100YS", unit="s") DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01', '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01', '2817-01-01', '2917-01-01'], - dtype='datetime64[s]', freq='100AS-JAN') + dtype='datetime64[s]', freq='100YS-JAN') """ if freq is None and com.any_none(periods, start, end): freq = "D" diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 59e6a20915c18..8b3071a6f8582 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2101,7 +2101,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "Y", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "Y", "Q", "BM", "BY", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2299,7 +2299,7 @@ def _adjust_bin_edges( if self.freq.name in ("BM", "ME", "W") or self.freq.name.split("-")[0] in ( "BQ", - "BA", + "BY", "Q", "Y", "W", diff --git a/pandas/core/series.py b/pandas/core/series.py index c2eea371ddef3..fdd03debf6de4 100644 --- a/pandas/core/series.py +++ 
b/pandas/core/series.py @@ -5729,7 +5729,7 @@ def to_timestamp( 2023-01-01 1 2024-01-01 2 2025-01-01 3 - Freq: AS-JAN, dtype: int64 + Freq: YS-JAN, dtype: int64 Using `freq` which is the offset that the Timestamps will have diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index df6ccda27ab85..693b8d9483407 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1594,7 +1594,7 @@ def test_dt64arr_add_sub_offset_array( Timestamp("2016-04-01"), Timestamp("2017-04-01"), ], - "AS-APR", + "YS-APR", ), ( "__sub__", @@ -1616,7 +1616,7 @@ def test_dt64arr_add_sub_offset_array( Timestamp("2015-10-01"), Timestamp("2016-10-01"), ], - "AS-OCT", + "YS-OCT", ), ], ) @@ -1625,7 +1625,7 @@ def test_dti_add_sub_nonzero_mth_offset( ): # GH 26258 tz = tz_aware_fixture - date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="YS", tz=tz) date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index aeb65d98d8ab2..859dc56de4a25 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -44,7 +44,7 @@ def test_to_timestamp(self, frame_or_series): if frame_or_series is Series: assert result.name == "A" - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = obj.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) @@ -88,7 +88,7 @@ def test_to_timestamp_columns(self): tm.assert_index_equal(result.columns, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = df.to_timestamp("D", "start", axis=1) tm.assert_index_equal(result.columns, exp_index) @@ -112,14 +112,14 @@ def test_to_timestamp_columns(self): result1 = df.to_timestamp("5min", axis=1) result2 = df.to_timestamp("min", axis=1) - expected = date_range("2001-01-01", "2009-01-01", freq="AS") + expected = date_range("2001-01-01", "2009-01-01", freq="YS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) # PeriodIndex.to_timestamp always use 'infer' - assert result1.columns.freqstr == "AS-JAN" - assert result2.columns.freqstr == "AS-JAN" + assert result1.columns.freqstr == "YS-JAN" + assert result2.columns.freqstr == "YS-JAN" def test_to_timestamp_invalid_axis(self): index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 88ee8a35e5c94..76a543050097d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -734,7 +734,7 @@ def test_list_grouper_with_nat(self): # GH 14715 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT - grouper = Grouper(key="date", freq="AS") + grouper = Grouper(key="date", freq="YS") # Grouper in a list grouping result = df.groupby([grouper]) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py 
b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 8900c5cdbca14..6839fafcdc114 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -56,7 +56,7 @@ def test_to_period_quarterlyish(self, off): prng = rng.to_period() assert prng.freq == "Q-DEC" - @pytest.mark.parametrize("off", ["BA", "AS", "BAS"]) + @pytest.mark.parametrize("off", ["BY", "YS", "BYS"]) def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 6da215715482d..077b4fa5a0696 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -648,7 +648,7 @@ def test_constructor_coverage(self): with pytest.raises(ValueError, match=msg): date_range(periods=10, freq="D") - @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) + @pytest.mark.parametrize("freq", ["YS", "W-SUN"]) def test_constructor_datetime64_tzformat(self, freq): # see GH#6572: ISO 8601 format results in stdlib timezone object idx = date_range( @@ -981,8 +981,8 @@ def test_dti_constructor_years_only(self, tz_naive_fixture): rng3 = date_range("2014", "2020", freq="Y", tz=tz) expected3 = date_range("2014-12-31", "2019-12-31", freq="Y", tz=tz) - rng4 = date_range("2014", "2020", freq="AS", tz=tz) - expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) + rng4 = date_range("2014", "2020", freq="YS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="YS", tz=tz) for rng, expected in [ (rng1, expected1), diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ededf78621699..a74d31747fbb0 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -243,13 +243,12 @@ def test_date_range_gen_error(self): rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") assert len(rng) == 4 - @pytest.mark.parametrize("freq", ["AS", "YS"]) - def test_begin_year_alias(self, freq): + def test_begin_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="YS") exp = DatetimeIndex( ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], - freq=freq, + freq="YS", ) tm.assert_index_equal(rng, exp) @@ -261,12 +260,11 @@ def test_end_year_alias(self): ) tm.assert_index_equal(rng, exp) - @pytest.mark.parametrize("freq", ["BA", "BY"]) - def test_business_end_year_alias(self, freq): + def test_business_end_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="BY") exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq="BY" ) tm.assert_index_equal(rng, exp) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 156075e3fafec..a18501a193b60 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,5 +1,6 @@ import datetime as dt from datetime import date +import re import dateutil import numpy as np @@ -226,3 +227,40 @@ def test_CBH_deprecated(self): ) tm.assert_index_equal(result, expected) + + 
@pytest.mark.parametrize( + "freq_depr, expected_values, expected_freq", + [ + ( + "2BA", + ["2020-12-31", "2022-12-30"], + "2BY-DEC", + ), + ( + "AS-AUG", + ["2021-08-01", "2022-08-01", "2023-08-01"], + "YS-AUG", + ), + ( + "1BAS-MAY", + ["2021-05-03", "2022-05-02", "2023-05-01"], + "1BYS-MAY", + ), + ], + ) + def test_AS_BA_BAS_deprecated(self, freq_depr, expected_values, expected_freq): + # GH#55479 + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2020, 12, 1), dt.datetime(2023, 12, 1), freq=freq_depr + ) + result = DatetimeIndex( + expected_values, + dtype="datetime64[ns]", + freq=expected_freq, + ) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 185134af165f4..0a5287d154adc 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -146,7 +146,7 @@ def test_datetimeindex_accessors5(self): qsfeb = to_offset("QS-FEB") bq = to_offset("BQ") bqs_apr = to_offset("BQS-APR") - as_nov = to_offset("AS-NOV") + as_nov = to_offset("YS-NOV") tests = [ (freq_m.is_month_start(Timestamp("2013-06-01")), 1), diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 2394efb353ab6..977ad8b26a369 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -49,7 +49,7 @@ def test_to_timestamp_non_contiguous(self): def test_to_timestamp_freq(self): idx = period_range("2017", periods=12, freq="Y-DEC") result = idx.to_timestamp() - expected = date_range("2017", periods=12, freq="AS-JAN") + expected = date_range("2017", periods=12, freq="YS-JAN") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_nat(self): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 28d02576156a0..f66f5bf50974e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1172,7 +1172,7 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): assert len(resampled) == 1 -@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "AS-DEC", "AS-JUN"]) +@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "YS-DEC", "YS-JUN"]) def test_resample_anchored_monthstart(simple_date_range_series, freq, unit): ts = simple_date_range_series("1/1/2000", "12/31/2002") ts.index = ts.index.as_unit(unit) @@ -1320,7 +1320,7 @@ def test_resample_unequal_times(unit): df = DataFrame({"close": 1}, index=bad_ind) # it works! 
- df.resample("AS").sum() + df.resample("YS").sum() def test_resample_consistency(unit): diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index d214e1b4ae4ae..6ad09f12525b4 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -660,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("h", "D"), ("min", "h")], + [("D", "MS"), ("Q", "YS"), ("ME", "QS"), ("h", "D"), ("min", "h")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 51d0dd298f841..22ff7f8405a40 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -52,7 +52,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["Y", "BA"] for month in MONTHS] + + [f"{annual}-{month}" for annual in ["Y", "BY"] for month in MONTHS] + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] @@ -215,7 +215,7 @@ def test_infer_freq_index(freq, expected): "expected,dates", list( { - "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "YS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 7f96ea98fa047..9389f78c9e672 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -839,7 +839,7 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["Y", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + base_lst = ["Y", "YS", "BY", "BYS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -858,7 +858,7 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["Y", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + month_prefixes = ["Y", "YS", "BY", "BYS", "Q", "BQ", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 0ed0fe4b87576..db4fdf0d24465 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -59,7 +59,7 @@ # -------------------------------------------------------------------- # Offset related functions -_need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] +_need_suffix = ["QS", "BQ", "BQS", "YS", "BY", "BYS"] for _prefix in _need_suffix: for _m in MONTHS: @@ -345,7 +345,7 @@ def _get_annual_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "AS", "bs": "BAS", "ce": "Y", "be": "BA"}.get(pos_check) + return {"cs": "YS", "bs": "BYS", "ce": "Y", "be": "BY"}.get(pos_check) def _get_quarterly_rule(self) -> str | None: if len(self.mdiffs) > 1: From 7e8148f4ff44482b8541959d5e095fef00adad23 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 12 Oct 2023 08:08:25 -0700 Subject: [PATCH 121/131] REF: De-special-case _format_with_header (#55491) * REF: De-special-case _format_with_header * simplify --- pandas/core/indexes/base.py | 16 ++++++++++++---- pandas/core/indexes/category.py | 10 ---------- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/interval.py | 7 ------- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d749235e2cd2c..515f750f11219 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1387,12 +1387,20 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str values = self._values - if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): - values = np.asarray(values) + if ( + is_object_dtype(values.dtype) + or is_string_dtype(values.dtype) + or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) + ): # TODO: why do we need different justify for these cases? 
-            result = trim_front(format_array(values, None, justify="all"))
+            justify = "all"
         else:
-            result = trim_front(format_array(values, None, justify="left"))
+            justify = "left"
+        # passing leading_space=False breaks test_format_missing,
+        # test_index_repr_in_frame_with_nan, but would otherwise make
+        # trim_front unnecessary
+        formatted = format_array(values, None, justify=justify)
+        result = trim_front(formatted)
         return header + result
 
     def _format_native_types(
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 9cf7e861584d9..b307be004ad6e 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -21,7 +21,6 @@
 from pandas.core.dtypes.missing import (
     is_valid_na_for_dtype,
     isna,
-    notna,
 )
 
 from pandas.core.arrays.categorical import (
@@ -38,8 +37,6 @@
     inherit_names,
 )
 
-from pandas.io.formats.printing import pprint_thing
-
 if TYPE_CHECKING:
     from collections.abc import Hashable
 
@@ -356,13 +353,6 @@ def _format_attrs(self):
         extra = super()._format_attrs()
         return attrs + extra
 
-    def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]:
-        result = [
-            pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
-            for x in self._values
-        ]
-        return header + result
-
     # --------------------------------------------------------------------
 
     @property
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 94ad556219b35..f02f7dcb65251 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -216,6 +216,7 @@ def format(
     def _format_with_header(
         self, *, header: list[str], na_rep: str, date_format: str | None = None
     ) -> list[str]:
+        # TODO: not reached in tests 2023-10-11
         # matches base class except for whitespace padding and date_format
         return header + list(
             self._format_native_types(na_rep=na_rep, date_format=date_format)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 209ac84869e85..f73e2b5512262 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -842,13 +842,6 @@ def mid(self) -> Index:
     def length(self) -> Index:
         return Index(self._data.length, copy=False)
 
-    # --------------------------------------------------------------------
-    # Rendering Methods
-
-    def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]:
-        # matches base class except for whitespace padding
-        return header + list(self._format_native_types(na_rep=na_rep))
-
     # --------------------------------------------------------------------
     # Set Operations
 
From 7ef617e88e1e51bbfd5d0cf71f57ef4502b91b38 Mon Sep 17 00:00:00 2001
From: shiersansi <143710553+shiersansi@users.noreply.github.com>
Date: Thu, 12 Oct 2023 23:10:59 +0800
Subject: [PATCH 122/131] DOC: Add DataFrame.index.levels (#55437)

* modified: pandas/core/indexes/multi.py
* Update pandas/core/indexes/multi.py

---------

Co-authored-by: Marc Garcia
---
 pandas/core/indexes/multi.py | 47 ++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c5cab225fa7b1..0c3593eca178d 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -843,6 +843,53 @@ def size(self) -> int:
 
     @cache_readonly
     def levels(self) -> FrozenList:
+        """
+        Levels of the MultiIndex.
+
+        Levels refer to the different hierarchical levels or layers in a MultiIndex.
+        In a MultiIndex, each level represents a distinct dimension or category of
+        the index.
+
+        To access the levels, you can use the levels attribute of the MultiIndex,
+        which returns a FrozenList of Index objects. Each Index object represents
+        a level in the MultiIndex and contains the unique values found in that
+        specific level.
+
+        If a MultiIndex is created with levels A, B, C, and the DataFrame using
+        it filters out all rows of the level C, MultiIndex.levels will still
+        return A, B, C.
+
+        Examples
+        --------
+        >>> index = pd.MultiIndex.from_product([['mammal'],
+        ...     ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals'])
+        >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs'])
+        >>> leg_num
+                          Legs
+        Category Animals
+        mammal   goat        4
+                 human       2
+                 cat         4
+                 dog         4
+
+        >>> leg_num.index.levels
+        FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']])
+
+        MultiIndex levels will not change even if the DataFrame using the MultiIndex
+        does not contain all of them anymore.
+ See how "human" is not in the DataFrame, but it is still in levels: + + >>> large_leg_num = leg_num[leg_num.Legs > 2] + >>> large_leg_num + Legs + Category Animals + mammal goat 4 + cat 4 + dog 4 + + >>> large_leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + """ # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine # https://github.com/pandas-dev/pandas/issues/31648 From ae177e88472a0f71e4fa8d41e773f6fb7029a8dc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Oct 2023 05:12:37 -1000 Subject: [PATCH 123/131] TST: Close ad-hoc db connections (#55445) * TST: Close ad-hoc db connections * Add runtime import --- pandas/tests/io/test_sql.py | 227 +++++++++++++++++------------------- 1 file changed, 105 insertions(+), 122 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 82fb98615100f..11f95ff104767 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1,20 +1,3 @@ -"""SQL io tests - -The SQL tests are broken down in different classes: - -- `PandasSQLTest`: base class with common methods for all test classes -- Tests for the public API (only tests with sqlite3) - - `_TestSQLApi` base class - - `TestSQLApi`: test the public API with sqlalchemy engine - - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI - connection -- Tests for the different SQL flavors (flavor specific type conversions) - - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with - common methods. The different tested flavors (sqlite3, MySQL, - PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback`) - -""" from __future__ import annotations import contextlib @@ -29,6 +12,7 @@ from io import StringIO from pathlib import Path import sqlite3 +from typing import TYPE_CHECKING import uuid import numpy as np @@ -69,13 +53,9 @@ read_sql_table, ) -try: +if TYPE_CHECKING: import sqlalchemy - SQLALCHEMY_INSTALLED = True -except ImportError: - SQLALCHEMY_INSTALLED = False - @pytest.fixture def sql_strings(): @@ -426,13 +406,16 @@ def drop_table( conn.commit() else: with conn.begin() as con: - sql.SQLDatabase(con).drop_table(table_name) + with sql.SQLDatabase(con) as db: + db.drop_table(table_name) def drop_view( view_name: str, conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, ): + import sqlalchemy + if isinstance(conn, sqlite3.Connection): conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() @@ -1151,10 +1134,9 @@ def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): conn = request.getfixturevalue(conn) query = sql_strings["read_parameters"][flavor(conn_name)] params = ("Iris-setosa", 5.1) - pandasSQL = pandasSQL_builder(conn) - - with pandasSQL.run_transaction(): - iris_frame = pandasSQL.read_query(query, params=params) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) @@ -1179,9 +1161,9 @@ def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, fla conn = request.getfixturevalue(conn) query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)] - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - iris_frame = pandasSQL.read_query(query, params=None) + with pandasSQL_builder(conn) as pandasSQL: + with 
pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=None) check_iris_frame(iris_frame) @@ -1914,7 +1896,8 @@ def test_warning_case_insensitive_table_name(conn, request, test_frame1): r"sensitivity issues. Consider using lower case table names." ), ): - sql.SQLDatabase(conn).check_case_sensitive("TABLE1", "") + with sql.SQLDatabase(conn) as db: + db.check_case_sensitive("TABLE1", "") # Test that the warning is certainly NOT triggered in a normal case. with tm.assert_produces_warning(None): @@ -1930,10 +1913,10 @@ def test_sqlalchemy_type_mapping(conn, request): df = DataFrame( {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) - db = sql.SQLDatabase(conn) - table = sql.SQLTable("test_type", db, frame=df) - # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c["time"].type, TIMESTAMP) + with sql.SQLDatabase(conn) as db: + table = sql.SQLTable("test_type", db, frame=df) + # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones + assert isinstance(table.table.c["time"].type, TIMESTAMP) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -1961,10 +1944,10 @@ def test_sqlalchemy_integer_mapping(conn, request, integer, expected): # GH35076 Map pandas integer to optimal SQLAlchemy integer type conn = request.getfixturevalue(conn) df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(conn) - table = sql.SQLTable("test_type", db, frame=df) + with sql.SQLDatabase(conn) as db: + table = sql.SQLTable("test_type", db, frame=df) - result = str(table.table.c.a.type) + result = str(table.table.c.a.type) assert result == expected @@ -1974,16 +1957,16 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer): conn = request.getfixturevalue(conn) # GH35076 Map pandas integer to optimal SQLAlchemy integer type df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(conn) - with pytest.raises( - ValueError, match="Unsigned 64 bit integer datatype is not supported" - ): - sql.SQLTable("test_type", db, frame=df) + with sql.SQLDatabase(conn) as db: + with pytest.raises( + ValueError, match="Unsigned 64 bit integer datatype is not supported" + ): + sql.SQLTable("test_type", db, frame=df) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") @pytest.mark.parametrize("conn", all_connectable) def test_database_uri_string(conn, request, test_frame1): + pytest.importorskip("sqlalchemy") conn = request.getfixturevalue(conn) # Test read_sql and .to_sql method with a database URI (GH10654) # db_uri = 'sqlite:///:memory:' # raises @@ -2003,9 +1986,9 @@ def test_database_uri_string(conn, request, test_frame1): @td.skip_if_installed("pg8000") -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") @pytest.mark.parametrize("conn", all_connectable) def test_pg8000_sqlalchemy_passthrough_error(conn, request): + pytest.importorskip("sqlalchemy") conn = request.getfixturevalue(conn) # using driver that will not be installed on CI to trigger error # in sqlalchemy.create_engine -> test passing of this error to user @@ -2076,7 +2059,7 @@ def test_sql_open_close(test_frame3): tm.assert_frame_equal(test_frame3, result) -@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +@td.skip_if_installed("sqlalchemy") def test_con_string_import_error(): conn = "mysql://root@localhost/pandas" msg = "Using URI string without sqlalchemy installed" @@ -2084,7 +2067,7 @@ def 
test_con_string_import_error(): sql.read_sql("SELECT * FROM iris", conn) -@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +@td.skip_if_installed("sqlalchemy") def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed(): class MockSqliteConnection: def __init__(self, *args, **kwargs) -> None: @@ -2167,20 +2150,20 @@ def test_drop_table(conn, request): from sqlalchemy import inspect temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) - pandasSQL = sql.SQLDatabase(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - insp = inspect(conn) - assert insp.has_table("temp_frame") + insp = inspect(conn) + assert insp.has_table("temp_frame") - with pandasSQL.run_transaction(): - pandasSQL.drop_table("temp_frame") - try: - insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior - except AttributeError: - pass - assert not insp.has_table("temp_frame") + with pandasSQL.run_transaction(): + pandasSQL.drop_table("temp_frame") + try: + insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior + except AttributeError: + pass + assert not insp.has_table("temp_frame") @pytest.mark.parametrize("conn", all_connectable) @@ -2206,9 +2189,10 @@ def test_roundtrip(conn, request, test_frame1): def test_execute_sql(conn, request): conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - iris_results = pandasSQL.execute("SELECT * FROM iris") - row = iris_results.fetchone() + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_results = pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) @@ -2616,10 +2600,10 @@ def test_to_sql_save_index(conn, request): [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] ) - pandasSQL = pandasSQL_builder(conn) tbl_name = "test_to_sql_saves_index" - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(df, tbl_name) == 2 + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(df, tbl_name) == 2 if conn_name in {"sqlite_buildin", "sqlite_str"}: ixs = sql.read_sql_query( @@ -2648,55 +2632,55 @@ def test_transactions(conn, request): conn = request.getfixturevalue(conn) stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - pandasSQL = pandasSQL_builder(conn) if conn_name != "sqlite_buildin": from sqlalchemy import text stmt = text(stmt) - with pandasSQL.run_transaction() as trans: - trans.execute(stmt) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + trans.execute(stmt) @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction() as trans: - stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(pandasSQL, SQLiteDatabase): - trans.execute(stmt) - else: - from sqlalchemy import text + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if isinstance(pandasSQL, SQLiteDatabase): + trans.execute(stmt) + else: + from sqlalchemy import text - stmt = text(stmt) - trans.execute(stmt) + stmt = 
text(stmt) + trans.execute(stmt) - class DummyException(Exception): - pass + class DummyException(Exception): + pass - # Make sure when transaction is rolled back, no rows get inserted - ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" - if isinstance(pandasSQL, SQLDatabase): - from sqlalchemy import text + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + if isinstance(pandasSQL, SQLDatabase): + from sqlalchemy import text + + ins_sql = text(ins_sql) + try: + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + with pandasSQL.run_transaction(): + res = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 - ins_sql = text(ins_sql) - try: + # Make sure when transaction is committed, rows do get inserted with pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - raise DummyException("error") - except DummyException: - # ignore raised exception - pass - with pandasSQL.run_transaction(): - res = pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res) == 0 - - # Make sure when transaction is committed, rows do get inserted - with pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - res2 = pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res2) == 1 + res2 = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2975,9 +2959,9 @@ def test_invalid_engine(conn, request, test_frame1): conn = request.getfixturevalue(conn) msg = "engine must be one of 'auto', 'sqlalchemy'" - pandasSQL = pandasSQL_builder(conn) - with pytest.raises(ValueError, match=msg): - pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") + with pandasSQL_builder(conn) as pandasSQL: + with pytest.raises(ValueError, match=msg): + pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") @pytest.mark.parametrize("conn", all_connectable) @@ -2985,10 +2969,10 @@ def test_to_sql_with_sql_engine(conn, request, test_frame1): """`to_sql` with the `engine` param""" # mostly copied from this class's `_to_sql()` method conn = request.getfixturevalue(conn) - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") @@ -3000,10 +2984,10 @@ def test_options_sqlalchemy(conn, request, test_frame1): # use the set option conn = request.getfixturevalue(conn) with pd.option_context("io.sql.engine", "sqlalchemy"): - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") @@ -3015,18 +2999,18 @@ def test_options_auto(conn, request, test_frame1): # use the set option conn = request.getfixturevalue(conn) with 
pd.option_context("io.sql.engine", "auto"): - pandasSQL = pandasSQL_builder(conn) - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 - assert pandasSQL.has_table("test_frame1") + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") num_entries = len(test_frame1) num_rows = count_rows(conn, "test_frame1") assert num_rows == num_entries -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") def test_options_get_engine(): + pytest.importorskip("sqlalchemy") assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) with pd.option_context("io.sql.engine", "sqlalchemy"): @@ -3463,17 +3447,16 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): def test_create_and_drop_table(sqlite_sqlalchemy_memory_engine): conn = sqlite_sqlalchemy_memory_engine temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) - pandasSQL = sql.SQLDatabase(conn) - - with pandasSQL.run_transaction(): - assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - assert pandasSQL.has_table("drop_test_frame") + assert pandasSQL.has_table("drop_test_frame") - with pandasSQL.run_transaction(): - pandasSQL.drop_table("drop_test_frame") + with pandasSQL.run_transaction(): + pandasSQL.drop_table("drop_test_frame") - assert not pandasSQL.has_table("drop_test_frame") + assert not pandasSQL.has_table("drop_test_frame") def test_sqlite_datetime_date(sqlite_buildin): From 9de2a19b9651336cc14a2830c8460e9ad1e2d505 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:31:34 -0400 Subject: [PATCH 124/131] REF: Add tests.groupby.methods (#55312) * REF: Add tests.groupby.methods * Merge cleanup * Refactor * Refactor * Show value of ymin * fixup * Revert * Revert --- pandas/tests/groupby/methods/__init__.py | 0 pandas/tests/groupby/methods/test_corrwith.py | 24 + pandas/tests/groupby/methods/test_describe.py | 221 +++++ .../{ => methods}/test_groupby_shift_diff.py | 0 .../groupby/methods/test_is_monotonic.py | 78 ++ .../methods/test_nlargest_nsmallest.py | 115 +++ .../tests/groupby/{ => methods}/test_nth.py | 0 .../groupby/{ => methods}/test_quantile.py | 0 .../tests/groupby/{ => methods}/test_rank.py | 0 .../groupby/{ => methods}/test_sample.py | 0 .../tests/groupby/{ => methods}/test_size.py | 0 .../tests/groupby/{ => methods}/test_skew.py | 0 .../{ => methods}/test_value_counts.py | 0 pandas/tests/groupby/test_any_all.py | 188 ---- pandas/tests/groupby/test_cumulative.py | 291 ++++++ pandas/tests/groupby/test_function.py | 916 ------------------ pandas/tests/groupby/test_min_max.py | 272 ------ pandas/tests/groupby/test_nunique.py | 190 ---- pandas/tests/groupby/test_reductions.py | 838 ++++++++++++++++ 19 files changed, 1567 insertions(+), 1566 deletions(-) create mode 100644 pandas/tests/groupby/methods/__init__.py create mode 100644 pandas/tests/groupby/methods/test_corrwith.py create mode 100644 pandas/tests/groupby/methods/test_describe.py rename pandas/tests/groupby/{ => methods}/test_groupby_shift_diff.py (100%) create mode 100644 pandas/tests/groupby/methods/test_is_monotonic.py create mode 100644 pandas/tests/groupby/methods/test_nlargest_nsmallest.py rename pandas/tests/groupby/{ => 
methods}/test_nth.py (100%) rename pandas/tests/groupby/{ => methods}/test_quantile.py (100%) rename pandas/tests/groupby/{ => methods}/test_rank.py (100%) rename pandas/tests/groupby/{ => methods}/test_sample.py (100%) rename pandas/tests/groupby/{ => methods}/test_size.py (100%) rename pandas/tests/groupby/{ => methods}/test_skew.py (100%) rename pandas/tests/groupby/{ => methods}/test_value_counts.py (100%) delete mode 100644 pandas/tests/groupby/test_any_all.py create mode 100644 pandas/tests/groupby/test_cumulative.py delete mode 100644 pandas/tests/groupby/test_min_max.py delete mode 100644 pandas/tests/groupby/test_nunique.py create mode 100644 pandas/tests/groupby/test_reductions.py diff --git a/pandas/tests/groupby/methods/__init__.py b/pandas/tests/groupby/methods/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/groupby/methods/test_corrwith.py b/pandas/tests/groupby/methods/test_corrwith.py new file mode 100644 index 0000000000000..53e8bdc4534dc --- /dev/null +++ b/pandas/tests/groupby/methods/test_corrwith.py @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_corrwith_with_1_axis(): + # GH 47723 + df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) + gb = df.groupby("a") + + msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.corrwith(df, axis=1) + index = Index( + data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], + name=("a", None), + ) + expected = Series([np.nan] * 6, index=index) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py new file mode 100644 index 0000000000000..f38de8faddb59 --- /dev/null +++ b/pandas/tests/groupby/methods/test_describe.py @@ -0,0 +1,221 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Timestamp, +) +import pandas._testing as tm + + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level="first") + grouped.describe() # it works! 
+ + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack(future_stack=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + { + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = MultiIndex( + levels=[[col], group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = 
df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = DataFrame( + data, + index=Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): + # GH 35314 + df = DataFrame( + { + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a1", "a2", "b", "b"], + copy=False, + ) + if keys == ["a1"]: + df = df.drop(columns="a2") + + expected = ( + DataFrame.from_records( + [ + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) + else: + expected.index = Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() + + result = df.groupby(keys, as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py similarity index 100% rename from pandas/tests/groupby/test_groupby_shift_diff.py rename to pandas/tests/groupby/methods/test_groupby_shift_diff.py diff --git a/pandas/tests/groupby/methods/test_is_monotonic.py b/pandas/tests/groupby/methods/test_is_monotonic.py new file mode 100644 index 0000000000000..3428fc90f6e51 --- /dev/null +++ b/pandas/tests/groupby/methods/test_is_monotonic.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, 
np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_nlargest_nsmallest.py b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py new file mode 100644 index 0000000000000..bf983f04a3f3f --- /dev/null +++ b/pandas/tests/groupby/methods/test_nlargest_nsmallest.py @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +from pandas import ( + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + tm.assert_series_equal(gb.nlargest(3, keep="last"), e) + + +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.default_rng(2) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, names=["first", "second"]) + s = Series(npr.standard_normal(20), index=idx) + + result = s.groupby("first").nlargest(1) + + exp_idx = MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "one"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 0.18905338179353307, + -0.41306354339189344, + 1.799707382720902, + 0.7738065867276614, + 0.28121066979764925, + 0.9775674511260357, + -0.3288239040579627, + 
0.45495807124085547, + 0.5452887139646817, + 0.12682784711186987, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, dtype, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if dtype is not None: + data = np.array(data, dtype=dtype) + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups + expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/methods/test_nth.py similarity index 100% rename from pandas/tests/groupby/test_nth.py rename to pandas/tests/groupby/methods/test_nth.py diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py similarity index 100% rename from pandas/tests/groupby/test_quantile.py rename to pandas/tests/groupby/methods/test_quantile.py diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/methods/test_rank.py similarity index 100% rename from pandas/tests/groupby/test_rank.py rename to pandas/tests/groupby/methods/test_rank.py diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/methods/test_sample.py similarity index 100% rename from pandas/tests/groupby/test_sample.py rename to pandas/tests/groupby/methods/test_sample.py diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/methods/test_size.py similarity index 100% rename from pandas/tests/groupby/test_size.py rename to pandas/tests/groupby/methods/test_size.py diff --git a/pandas/tests/groupby/test_skew.py b/pandas/tests/groupby/methods/test_skew.py similarity index 100% rename from pandas/tests/groupby/test_skew.py rename to pandas/tests/groupby/methods/test_skew.py diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py similarity index 100% rename from pandas/tests/groupby/test_value_counts.py rename to pandas/tests/groupby/methods/test_value_counts.py diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/test_any_all.py deleted file mode 100644 index 57a83335be849..0000000000000 --- a/pandas/tests/groupby/test_any_all.py +++ /dev/null @@ -1,188 +0,0 @@ -import builtins - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, - isna, -) -import pandas._testing as tm - - -@pytest.mark.parametrize("agg_func", ["any", "all"]) 
-@pytest.mark.parametrize( - "vals", - [ - ["foo", "bar", "baz"], - ["foo", "", ""], - ["", "", ""], - [1, 2, 3], - [1, 0, 0], - [0, 0, 0], - [1.0, 2.0, 3.0], - [1.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [True, True, True], - [True, False, False], - [False, False, False], - [np.nan, np.nan, np.nan], - ], -) -def test_groupby_bool_aggs(skipna, agg_func, vals): - df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) - - # Figure out expectation using Python builtin - exp = getattr(builtins, agg_func)(vals) - - # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == "any": - exp = False - - expected = DataFrame( - [exp] * 2, columns=["val"], index=Index(["a", "b"], name="key") - ) - result = getattr(df.groupby("key"), agg_func)(skipna=skipna) - tm.assert_frame_equal(result, expected) - - -def test_any(): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - expected = DataFrame( - [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] - ) - expected.index.name = "A" - result = df.groupby("A").any() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): - # GH#21668 - df = DataFrame([[True, True]], columns=["a", "a"]) - grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() - - expected = df.set_axis(np.array([0])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize( - "data", - [ - [False, False, False], - [True, True, True], - [pd.NA, pd.NA, pd.NA], - [False, pd.NA, False], - [True, pd.NA, True], - [True, pd.NA, False], - ], -) -def test_masked_kleene_logic(bool_agg_func, skipna, data): - # GH#37506 - ser = Series(data, dtype="boolean") - - # The result should match aggregating on the whole series. 
Correctness - # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic - expected_data = getattr(ser, bool_agg_func)(skipna=skipna) - expected = Series(expected_data, index=np.array([0]), dtype="boolean") - - result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype1,dtype2,exp_col1,exp_col2", - [ - ( - "float", - "Float64", - np.array([True], dtype=bool), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Int64", - "float", - pd.array([pd.NA], dtype="boolean"), - np.array([True], dtype=bool), - ), - ( - "Int64", - "Int64", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Float64", - "boolean", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ], -) -def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): - # GH#37506 - data = [1.0, np.nan] - df = DataFrame( - {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} - ) - result = df.groupby([1, 1]).agg("all", skipna=False) - - expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): - # GH#40585 - obj = frame_or_series([pd.NA, 1], dtype=dtype) - expected_res = True - if not skipna and bool_agg_func == "all": - expected_res = pd.NA - expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") - - result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "bool_agg_func,data,expected_res", - [ - ("any", [pd.NA, np.nan], False), - ("any", [pd.NA, 1, np.nan], True), - ("all", [pd.NA, pd.NaT], True), - ("all", [pd.NA, False, pd.NaT], False), - ], -) -def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): - # GH#37501 - obj = frame_or_series(data, dtype=object) - result = obj.groupby([1] * len(data)).agg(bool_agg_func) - expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_object_NA_raises_with_skipna_false(bool_agg_func): - # GH#37501 - ser = Series([pd.NA], dtype=object) - with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): - ser.groupby([1]).agg(bool_agg_func, skipna=False) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_empty(frame_or_series, bool_agg_func): - # GH 45231 - kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} - obj = frame_or_series(**kwargs, dtype=object) - result = getattr(obj.groupby(obj.index), bool_agg_func)() - expected = frame_or_series(**kwargs, dtype=bool) - tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py new file mode 100644 index 0000000000000..eecb82cd5050b --- /dev/null +++ b/pandas/tests/groupby/test_cumulative.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[np.int32, np.int64, np.float32, 
np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], +) +def dtypes_for_minmax(request): + """ + Fixture of dtypes with min and max values used for testing + cummin and cummax + """ + dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + + min_val = ( + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min + ) + max_val = ( + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max + ) + + return (dtype, min_val, max_val) + + +def test_groupby_cumprod(): + # GH 4095 + df = DataFrame({"key": ["b"] * 10, "value": 2}) + + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + df = DataFrame({"key": ["b"] * 100, "value": 2}) + df["value"] = df["value"].astype(float) + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + +def test_groupby_cumprod_nan_influences_other_columns(): + # GH#48064 + df = DataFrame( + { + "a": 1, + "b": [1, np.nan, 2], + "c": [1, 2, 3.0], + } + ) + result = df.groupby("a").cumprod(numeric_only=True, skipna=False) + expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) + tm.assert_frame_equal(result, expected) + + +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ min value for dtype + df.loc[[2, 6], "B"] = min_val + df.loc[[1, 5], "B"] = min_val + 1 + expected.loc[[2, 3, 6, 7], "B"] = min_val + expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected, check_exact=True) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected, check_exact=True) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: 
x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummin() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) + result = df.groupby("a").b.cummin() + expected = Series([1, 2, 1], name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, dtype): + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") + + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() + tm.assert_frame_equal(expected, result) + + result = getattr(grouped["B"], method)().to_frame() + tm.assert_frame_equal(expected, result) + + +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ max value for dtype + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummax() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) + result = df.groupby("a").b.cummax() + expected = Series([2, 1, 2], name="b") + tm.assert_series_equal(result, expected) + + +def test_cummax_i8_at_implementation_bound(): + # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT + # for int64 dtype GH#46382 + ser = Series([pd.NaT._value + n for n in range(5)]) + df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) + gb = df.groupby("A") + + res = gb.cummax() + exp = df[["B", "C"]] + tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, 
expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) +def test_numpy_compat(func): + # see gh-12811 + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") + + msg = "numpy operations are not valid with groupby" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(foo=1) + + +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") + + result = grouped.transform(method) + expected = DataFrame({"b": data}, dtype=dtype) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 08372541988d0..4876267c72f12 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,12 +1,10 @@ import builtins -from io import StringIO import re import numpy as np import pytest from pandas._libs import lib -from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -22,37 +20,6 @@ from pandas.util import _test_decorators as td -@pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], - ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], -) -def dtypes_for_minmax(request): - """ - Fixture of dtypes with min and max values used for testing - cummin and cummax - """ - dtype = request.param - - np_type = dtype - if dtype == "Int64": - np_type = np.int64 - elif dtype == "Float64": - np_type = np.float64 - - min_val = ( - np.iinfo(np_type).min - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).min - ) - max_val = ( - np.iinfo(np_type).max - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).max - ) - - return (dtype, min_val, max_val) - - def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) @@ -372,39 +339,6 @@ def test_cython_api2(): tm.assert_frame_equal(result, expected) -def test_cython_median(): - arr = np.random.default_rng(2).standard_normal(1000) - arr[::2] = np.nan - df = DataFrame(arr) - - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.groupby(labels).agg(np.nanmedian) - 
tm.assert_frame_equal(result, exp) - - df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - tm.assert_frame_equal(rs, xp) - - -def test_median_empty_bins(observed): - df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins, observed=observed).median() - expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] ) @@ -478,105 +412,6 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize( - "func, values", - [ - ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), - ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): - # GH 25444 - df = DataFrame( - { - "name": ["A", "A", "B", "B"], - "c_int": [1, 2, 3, 4], - "c_float": [4.02, 3.03, 2.04, 1.05], - "c_date": ["2019", "2018", "2016", "2017"], - } - ) - df["c_date"] = pd.to_datetime(df["c_date"]) - df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") - df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] - df["c_period"] = df["c_date"].dt.to_period("W") - df["c_Integer"] = df["c_int"].astype("Int64") - df["c_Floating"] = df["c_float"].astype("Float64") - - result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) - - expected = DataFrame(values, index=Index(["A", "B"], name="name")) - if numeric_only: - expected = expected.drop(columns=["c_date"]) - else: - expected["c_date_tz"] = expected["c_date"] - expected["c_timedelta"] = expected["c_date"] - expected["c_period"] = expected["c_date"] - expected["c_Integer"] = expected["c_int"] - expected["c_Floating"] = expected["c_float"] - - tm.assert_frame_equal(result, expected) - - -def test_idxmin_idxmax_axis1(): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - - gb = df.groupby("A") - - warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = gb.idxmax(axis=1) - - alt = df.iloc[:, 1:].idxmax(axis=1) - indexer = res.index.get_level_values(1) - - tm.assert_series_equal(alt[indexer], res.droplevel("A")) - - df["E"] = date_range("2016-01-01", periods=10) - gb2 = df.groupby("A") - - msg = "'>' not supported between instances of 'Timestamp' and 'float'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - gb2.idxmax(axis=1) - - -@pytest.mark.parametrize( - "func, values, expected_values, warn", - [ - ("idxmin", [0, 1, 2], [0, 2], None), - ("idxmax", [0, 1, 2], [1, 2], None), - ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), - ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), - ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), - ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), - ], -) -@pytest.mark.parametrize("test_series", [True, False]) -def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): - # GH#54234 - df = 
DataFrame( - { - "a": [1, 1, 2], - "b": values, - } - ) - gb = df.groupby("a") - index = Index([1, 2], name="a") - expected = DataFrame({"b": expected_values}, index=index) - if test_series: - gb = gb["b"] - expected = expected["b"] - klass = "Series" if test_series else "DataFrame" - msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(gb, func)(skipna=False) - tm.assert_equal(result, expected) - - @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): @@ -658,54 +493,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): tm.assert_equal(result, expected) -def test_groupby_cumprod(): - # GH 4095 - df = DataFrame({"key": ["b"] * 10, "value": 2}) - - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - df = DataFrame({"key": ["b"] * 100, "value": 2}) - df["value"] = df["value"].astype(float) - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - -def test_groupby_cumprod_overflow(): - # GH#37493 if we overflow we return garbage consistent with numpy - df = DataFrame({"key": ["b"] * 4, "value": 100_000}) - actual = df.groupby("key")["value"].cumprod() - expected = Series( - [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], - name="value", - ) - tm.assert_series_equal(actual, expected) - - numpy_result = df.groupby("key", group_keys=False)["value"].apply( - lambda x: x.cumprod() - ) - numpy_result.name = "value" - tm.assert_series_equal(actual, numpy_result) - - -def test_groupby_cumprod_nan_influences_other_columns(): - # GH#48064 - df = DataFrame( - { - "a": 1, - "b": [1, np.nan, 2], - "c": [1, 2, 3.0], - } - ) - result = df.groupby("a").cumprod(numeric_only=True, skipna=False) - expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) - tm.assert_frame_equal(result, expected) - - def scipy_sem(*args, **kwargs): from scipy.stats import sem @@ -741,627 +528,12 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) -def test_max_nan_bug(): - raw = """,Date,app,File --04-23,2013-04-23 00:00:00,,log080001.log --05-06,2013-05-06 00:00:00,,log.log --05-07,2013-05-07 00:00:00,OE,xlsx""" - - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby("Date") - r = gb[["File"]].max() - e = gb["File"].max().to_frame() - tm.assert_frame_equal(r, e) - assert not r["File"].isna().any() - - -def test_nlargest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series( - [7, 5, 3, 10, 9, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [3, 2, 1, 3, 3, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), - ) - tm.assert_series_equal(gb.nlargest(3, keep="last"), e) - - -def test_nlargest_mi_grouper(): - # see gh-21411 - npr = np.random.default_rng(2) - - dts = date_range("20180101", periods=10) - iterables = [dts, 
["one", "two"]] - - idx = MultiIndex.from_product(iterables, names=["first", "second"]) - s = Series(npr.standard_normal(20), index=idx) - - result = s.groupby("first").nlargest(1) - - exp_idx = MultiIndex.from_tuples( - [ - (dts[0], dts[0], "one"), - (dts[1], dts[1], "one"), - (dts[2], dts[2], "one"), - (dts[3], dts[3], "two"), - (dts[4], dts[4], "one"), - (dts[5], dts[5], "one"), - (dts[6], dts[6], "one"), - (dts[7], dts[7], "one"), - (dts[8], dts[8], "one"), - (dts[9], dts[9], "one"), - ], - names=["first", "first", "second"], - ) - - exp_values = [ - 0.18905338179353307, - -0.41306354339189344, - 1.799707382720902, - 0.7738065867276614, - 0.28121066979764925, - 0.9775674511260357, - -0.3288239040579627, - 0.45495807124085547, - 0.5452887139646817, - 0.12682784711186987, - ] - - expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) - - -def test_nsmallest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series( - [1, 2, 3, 0, 4, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [0, 1, 1, 0, 1, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), - ) - tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) - - -@pytest.mark.parametrize( - "data, groups", - [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], -) -@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) -@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) -def test_nlargest_and_smallest_noop(data, groups, dtype, method): - # GH 15272, GH 16345, GH 29129 - # Test nlargest/smallest when it results in a noop, - # i.e. 
input is sorted and group size <= n - if dtype is not None: - data = np.array(data, dtype=dtype) - if method == "nlargest": - data = list(reversed(data)) - ser = Series(data, name="a") - result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups - expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) -def test_numpy_compat(func): - # see gh-12811 - df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) - g = df.groupby("A") - - msg = "numpy operations are not valid with groupby" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(foo=1) - - -def test_cummin(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_mins}).astype(dtype) - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ min value for dtype - df.loc[[2, 6], "B"] = min_val - df.loc[[1, 5], "B"] = min_val + 1 - expected.loc[[2, 3, 6, 7], "B"] = min_val - expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected, check_exact=True) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected, check_exact=True) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) - result = base_df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummin() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) - result = df.groupby("a").b.cummin() - expected = Series([1, 2, 1], name="b") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) -def test_cummin_max_all_nan_column(method, dtype): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - base_df["B"] = base_df["B"].astype(dtype) - grouped = base_df.groupby("A") - - expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) - result = getattr(grouped, method)() - tm.assert_frame_equal(expected, result) - - result = getattr(grouped["B"], method)().to_frame() - tm.assert_frame_equal(expected, result) - - -def test_cummax(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 
2, 2, 3, 2, 1]}) - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_maxs}).astype(dtype) - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ max value for dtype - df.loc[[2, 6], "B"] = max_val - expected.loc[[2, 3, 6, 7], "B"] = max_val - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummax() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) - result = df.groupby("a").b.cummax() - expected = Series([2, 1, 2], name="b") - tm.assert_series_equal(result, expected) - - -def test_cummax_i8_at_implementation_bound(): - # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT - # for int64 dtype GH#46382 - ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) - gb = df.groupby("A") - - res = gb.cummax() - exp = df[["B", "C"]] - tm.assert_frame_equal(res, exp) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) -@pytest.mark.parametrize( - "groups,expected_data", - [ - ([1, 1, 1], [1, None, None]), - ([1, 2, 3], [1, None, 2]), - ([1, 3, 3], [1, None, None]), - ], -) -def test_cummin_max_skipna(method, dtype, groups, expected_data): - # GH-34047 - df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) - orig = df.copy() - gb = df.groupby(groups)["a"] - - result = getattr(gb, method)(skipna=False) - expected = Series(expected_data, dtype=dtype, name="a") - - # check we didn't accidentally alter df - tm.assert_frame_equal(df, orig) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -def test_cummin_max_skipna_multiple_cols(method): - # Ensure missing value in "a" doesn't cause "b" to be nan-filled - df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) - gb = df.groupby([1, 1, 1])[["a", "b"]] - - result = getattr(gb, method)(skipna=False) - expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) - - tm.assert_frame_equal(result, expected) - - -@td.skip_if_32bit -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize( - "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] -) -def test_nullable_int_not_cast_as_float(method, dtype, val): - data = [val, pd.NA] - df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) - grouped = df.groupby("grp") - - result = grouped.transform(method) - expected = DataFrame({"b": data}, 
dtype=dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_increasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_increasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_decreasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_decreasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - -# describe -# -------------------------------- - - -def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level="first") - grouped.describe() # it works! 
- - -def test_series_describe_multikey(): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) - tm.assert_series_equal(result["std"], grouped.std(), check_names=False) - tm.assert_series_equal(result["min"], grouped.min(), check_names=False) - - -def test_series_describe_single(): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(future_stack=True) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) -def test_series_describe_as_index(as_index, keys): - # GH#49256 - df = DataFrame( - { - "key1": ["one", "two", "two", "three", "two"], - "key2": ["one", "two", "two", "three", "two"], - "foo2": [1, 2, 4, 4, 6], - } - ) - gb = df.groupby(keys, as_index=as_index)["foo2"] - result = gb.describe() - expected = DataFrame( - { - "key1": ["one", "three", "two"], - "count": [1.0, 1.0, 3.0], - "mean": [1.0, 4.0, 4.0], - "std": [np.nan, np.nan, 2.0], - "min": [1.0, 4.0, 2.0], - "25%": [1.0, 4.0, 3.0], - "50%": [1.0, 4.0, 4.0], - "75%": [1.0, 4.0, 5.0], - "max": [1.0, 4.0, 6.0], - } - ) - if len(keys) == 2: - expected.insert(1, "key2", expected["key1"]) - if as_index: - expected = expected.set_index(keys) - tm.assert_frame_equal(result, expected) - - def test_series_index_name(df): grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) assert result.index.name == "A" -def test_frame_describe_multikey(tsframe): - grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = MultiIndex( - levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))], - ) - group = DataFrame(group.values, columns=group_col, index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = groupedT.describe() - expected = tsframe.describe().T - # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ - expected.index = MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) - tm.assert_frame_equal(result, expected) - - -def test_frame_describe_tupleindex(): - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame( - { - "x": [1, 2, 3, 4, 5] * 3, - "y": [10, 20, 30, 40, 50] * 3, - "z": [100, 200, 300, 400, 500] * 3, - } - ) - df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={"k": "key"}) - msg = "Names should be list-like for a MultiIndex" - with pytest.raises(ValueError, match=msg): - df1.groupby("k").describe() - with pytest.raises(ValueError, match=msg): - df2.groupby("key").describe() - - -def test_frame_describe_unstacked_format(): - # GH 4792 - prices = { - Timestamp("2011-01-06 10:59:05", tz=None): 24990, - Timestamp("2011-01-06 12:43:33", tz=None): 25499, - Timestamp("2011-01-06 12:54:09", tz=None): 25499, - } - volumes = { - Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, - Timestamp("2011-01-06 12:43:33", tz=None): 
5000000000, - Timestamp("2011-01-06 12:54:09", tz=None): 100000000, - } - df = DataFrame({"PRICE": prices, "VOLUME": volumes}) - result = df.groupby("PRICE").VOLUME.describe() - data = [ - df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist(), - ] - expected = DataFrame( - data, - index=Index([24990, 25499], name="PRICE"), - columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings( - "ignore:" - "indexing past lexsort depth may impact performance:" - "pandas.errors.PerformanceWarning" -) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_describe_with_duplicate_output_column_names(as_index, keys): - # GH 35314 - df = DataFrame( - { - "a1": [99, 99, 99, 88, 88, 88], - "a2": [99, 99, 99, 88, 88, 88], - "b": [1, 2, 3, 4, 5, 6], - "c": [10, 20, 30, 40, 50, 60], - }, - columns=["a1", "a2", "b", "b"], - copy=False, - ) - if keys == ["a1"]: - df = df.drop(columns="a2") - - expected = ( - DataFrame.from_records( - [ - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ], - ) - .set_index([0, 1]) - .T - ) - expected.columns.names = [None, None] - if len(keys) == 2: - expected.index = MultiIndex( - levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] - ) - else: - expected.index = Index([88, 99], name="a1") - - if not as_index: - expected = expected.reset_index() - - result = df.groupby(keys, as_index=as_index).describe() - - tm.assert_frame_equal(result, expected) - - -def test_describe_duplicate_columns(): - # GH#50806 - df = DataFrame([[0, 1, 2, 3]]) - df.columns = [0, 1, 2, 0] - gb = df.groupby(df[1]) - result = gb.describe(percentiles=[]) - - columns = ["count", "mean", "std", "min", "50%", "max"] - frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) - for val in (0.0, 2.0, 3.0) - ] - expected = pd.concat(frames, axis=1) - expected.columns = MultiIndex( - levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], - ) - expected.index.names = [1] - tm.assert_frame_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - @pytest.mark.parametrize( "values", [ @@ -1393,78 +565,6 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("min_count", [0, 10]) -def test_groupby_sum_mincount_boolean(min_count): - b = True - a = False - na = np.nan - dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") - - df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) - result = df.groupby("A").sum(min_count=min_count) - if min_count == 0: - expected = DataFrame( - {"B": pd.array([3, 0, 0], dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - else: - expected = 
DataFrame( - {"B": pd.array([pd.NA] * 3, dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sum_below_mincount_nullable_integer(): - # https://github.com/pandas-dev/pandas/issues/32861 - df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") - grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a", dtype="Int64") - - result = grouped["b"].sum(min_count=2) - expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") - tm.assert_series_equal(result, expected) - - result = grouped.sum(min_count=2) - expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) - tm.assert_frame_equal(result, expected) - - -def test_mean_on_timedelta(): - # GH 17382 - df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) - result = df.groupby("cat")["time"].mean() - expected = Series( - pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") - ) - tm.assert_series_equal(result, expected) - - -def test_groupby_sum_timedelta_with_nat(): - # GH#42659 - df = DataFrame( - { - "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], - } - ) - td3 = pd.Timedelta(days=3) - - gb = df.groupby("a") - - res = gb.sum() - expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) - tm.assert_frame_equal(res, expected) - - res = gb["b"].sum() - tm.assert_series_equal(res, expected["b"]) - - res = gb["b"].sum(min_count=2) - expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) - tm.assert_series_equal(res, expected) - - @pytest.mark.parametrize( "kernel, has_arg", [ @@ -1706,22 +806,6 @@ def test_groupby_empty_dataset(dtype, kwargs): tm.assert_frame_equal(result, expected) -def test_corrwith_with_1_axis(): - # GH 47723 - df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) - gb = df.groupby("a") - - msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.corrwith(df, axis=1) - index = Index( - data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], - name=("a", None), - ) - expected = Series([np.nan] * 6, index=index) - tm.assert_series_equal(result, expected) - - def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py deleted file mode 100644 index 30c7e1df1e691..0000000000000 --- a/pandas/tests/groupby/test_min_max.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -import pytest - -from pandas._libs.tslibs import iNaT - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -def test_max_min_non_numeric(): - # #2700 - aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - - result = aa.groupby("nn").max() - assert "ss" in result - - result = aa.groupby("nn").max(numeric_only=False) - assert "ss" in result - - result = aa.groupby("nn").min() - assert "ss" in result - - result = aa.groupby("nn").min(numeric_only=False) - assert "ss" in result - - -def test_max_min_object_multiple_columns(using_array_manager): - # GH#41111 case where the aggregation is valid for some columns but not - # others; we split object blocks column-wise, consistent with - # DataFrame._reduce - - df = DataFrame( - { - "A": [1, 1, 2, 2, 3], - "B": [1, "foo", 2, "bar", 
False], - "C": ["a", "b", "c", "d", "e"], - } - ) - df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: - assert len(df._mgr.blocks) == 2 - - gb = df.groupby("A") - - result = gb[["C"]].max() - # "max" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - result = gb[["C"]].min() - # "min" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - -def test_min_date_with_nans(): - # GH26321 - dates = pd.to_datetime( - Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" - ).dt.date - df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) - - result = df.groupby("b", as_index=False)["c"].min()["c"] - expected = pd.to_datetime( - Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" - ).dt.date - tm.assert_series_equal(result, expected) - - result = df.groupby("b")["c"].min() - expected.index.name = "b" - tm.assert_series_equal(result, expected) - - -def test_max_inat(): - # GH#40767 dont interpret iNaT as NaN - ser = Series([1, iNaT]) - key = np.array([1, 1], dtype=np.int64) - gb = ser.groupby(key) - - result = gb.max(min_count=2) - expected = Series({1: 1}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - result = gb.min(min_count=2) - expected = Series({1: iNaT}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - # not enough entries -> gets masked to NaN - result = gb.min(min_count=3) - expected = Series({1: np.nan}) - tm.assert_series_equal(result, expected, check_exact=True) - - -def test_max_inat_not_all_na(): - # GH#40767 dont interpret iNaT as NaN - - # make sure we dont round iNaT+1 to iNaT - ser = Series([1, iNaT, 2, iNaT + 1]) - gb = ser.groupby([1, 2, 3, 3]) - result = gb.min(min_count=2) - - # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy - expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(int) - tm.assert_series_equal(result, expected, check_exact=True) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_column(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a")["b"], func)() - idx = Index([1, 2], name="a") - expected = Series(periods, index=idx, name="b") - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_frame(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a"), func)() - idx = Index([1, 2], name="a") - expected = DataFrame({"b": periods}, index=idx) - - tm.assert_frame_equal(result, expected) - - -def test_aggregate_numeric_object_dtype(): - # https://github.com/pandas-dev/pandas/issues/39329 - # simplified case: multiple object columns where one is all-NaN - # -> gets split as the all-NaN is inferred as float - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, - ) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - # same but with numbers - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_aggregate_categorical_lost_index(func: str): - # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = Series(["b"], dtype="category").cat.as_ordered() - df = DataFrame({"A": [1997], "B": ds}) - result = df.groupby("A").agg({"B": func}) - expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) - - # ordered categorical dtype should be preserved - expected["B"] = expected["B"].astype(ds.dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) -def test_groupby_min_max_nullable(dtype): - if dtype == "Int64": - # GH#41743 avoid precision loss - ts = 1618556707013635762 - elif dtype == "boolean": - ts = 0 - else: - ts = 4.0 - - df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) - df["ts"] = df["ts"].astype(dtype) - - gb = df.groupby("id") - - result = gb.min() - expected = df.iloc[:1].set_index("id") - tm.assert_frame_equal(result, expected) - - res_max = gb.max() - expected_max = df.iloc[1:].set_index("id") - tm.assert_frame_equal(res_max, expected_max) - - result2 = gb.min(min_count=3) - expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) - tm.assert_frame_equal(result2, expected2) - - res_max2 = gb.max(min_count=3) - tm.assert_frame_equal(res_max2, expected2) - - # Case with NA values - df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) - df2["ts"] = df2["ts"].astype(dtype) - gb2 = df2.groupby("id") - - result3 = gb2.min() - tm.assert_frame_equal(result3, expected) - - res_max3 = gb2.max() - 
tm.assert_frame_equal(res_max3, expected_max) - - result4 = gb2.min(min_count=100) - tm.assert_frame_equal(result4, expected2) - - res_max4 = gb2.max(min_count=100) - tm.assert_frame_equal(res_max4, expected2) - - -def test_min_max_nullable_uint64_empty_group(): - # don't raise NotImplementedError from libgroupby - cat = pd.Categorical([0] * 10, categories=[0, 1]) - df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) - gb = df.groupby("A", observed=False) - - res = gb.min() - - idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") - expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) - tm.assert_frame_equal(res, expected) - - res = gb.max() - expected.iloc[0, 0] = 9 - tm.assert_frame_equal(res, expected) - - -@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) -def test_groupby_min_max_categorical(func): - # GH: 52151 - df = DataFrame( - { - "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), - "col2": pd.Categorical([1], categories=[1, 2], ordered=True), - "value": 0.1, - } - ) - result = getattr(df.groupby("col1", observed=False), func)() - - idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) - expected = DataFrame( - { - "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), - "value": [0.1, None], - }, - index=idx, - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py deleted file mode 100644 index 9c9e32d9ce226..0000000000000 --- a/pandas/tests/groupby/test_nunique.py +++ /dev/null @@ -1,190 +0,0 @@ -import datetime as dt -from string import ascii_lowercase - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - MultiIndex, - NaT, - Series, - Timestamp, - date_range, -) -import pandas._testing as tm - - -@pytest.mark.slow -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("dropna", [False, True]) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("with_nan", [True, False]) -@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) -def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): - n = 100 - m = 10 - days = date_range("2015-08-23", periods=10) - df = DataFrame( - { - "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), - "joe": np.random.default_rng(2).choice(days, n), - "julie": np.random.default_rng(2).integers(0, m, n), - } - ) - if with_nan: - df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below - df.loc[1::17, "jim"] = None - df.loc[3::37, "joe"] = None - df.loc[7::19, "julie"] = None - df.loc[8::19, "julie"] = None - df.loc[9::19, "julie"] = None - original_df = df.copy() - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr["julie"].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr["julie"].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - if as_index: - tm.assert_series_equal(left, right, check_names=False) - else: - tm.assert_frame_equal(left, right, check_names=False) - tm.assert_frame_equal(df, original_df) - - -def test_nunique(): - df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) - - expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) - result = df.groupby("A", as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list("abc") 
- expected.index.name = "A" - expected = expected.drop(columns="A") - result = df.groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({"x": None}).groupby("A").nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) - expected.index.name = "A" - result = df.replace({"x": None}).groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - -def test_nunique_with_object(): - # GH 11077 - data = DataFrame( - [ - [100, 1, "Alice"], - [200, 2, "Bob"], - [300, 3, "Charlie"], - [-400, 4, "Dan"], - [500, 5, "Edith"], - ], - columns=["amount", "id", "name"], - ) - - result = data.groupby(["id", "amount"])["name"].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = Series([1] * 5, name="name", index=index) - tm.assert_series_equal(result, expected) - - -def test_nunique_with_empty_series(): - # GH 12553 - data = Series(name="name", dtype=object) - result = data.groupby(level=0).nunique() - expected = Series(name="name", dtype="int64") - tm.assert_series_equal(result, expected) - - -def test_nunique_with_timegrouper(): - # GH 13453 - test = DataFrame( - { - "time": [ - Timestamp("2016-06-28 09:35:35"), - Timestamp("2016-06-28 16:09:30"), - Timestamp("2016-06-28 16:46:28"), - ], - "data": ["1", "2", "3"], - } - ).set_index("time") - result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() - expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "key, data, dropna, expected", - [ - ( - ["x", "x", "x"], - [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "y", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "x", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ], -) -def test_nunique_with_NaT(key, data, dropna, expected): - # GH 27951 - df = DataFrame({"key": key, "data": data}) - result = df.groupby(["key"])["data"].nunique(dropna=dropna) - tm.assert_series_equal(result, expected) - - -def test_nunique_preserves_column_level_names(): - # GH 23222 - test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) - result = test.groupby([0, 0, 0]).nunique() - expected = DataFrame([2], index=np.array([0]), columns=test.columns) - tm.assert_frame_equal(result, expected) - - -def test_nunique_transform_with_datetime(): - # GH 35109 - transform with nunique on datetimes results in integers - df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) - result = df.groupby([0, 0, 1])["date"].transform("nunique") - expected = Series([2, 2, 1], name="date") - tm.assert_series_equal(result, expected) - - -def test_empty_categorical(observed): - # GH#21334 - cat = Series([1]).astype("category") - ser = cat[:0] - gb = ser.groupby(ser, observed=observed) - result = gb.nunique() - if observed: - expected = Series([], index=cat[:0], dtype="int64") - else: - expected = Series([0], index=cat, dtype="int64") - 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py new file mode 100644 index 0000000000000..fdfb211ac2269 --- /dev/null +++ b/pandas/tests/groupby/test_reductions.py @@ -0,0 +1,838 @@ +import builtins +import datetime as dt +from io import StringIO +from string import ascii_lowercase + +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) +def test_groupby_bool_aggs(skipna, agg_func, vals): + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == "any": + exp = False + + expected = DataFrame( + [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key") + ) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) + tm.assert_frame_equal(result, expected) + + +def test_any(): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" + result = df.groupby("A").any() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # GH#21668 + df = DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df.set_axis(np.array([0])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "data", + [ + [False, False, False], + [True, True, True], + [pd.NA, pd.NA, pd.NA], + [False, pd.NA, False], + [True, pd.NA, True], + [True, pd.NA, False], + ], +) +def test_masked_kleene_logic(bool_agg_func, skipna, data): + # GH#37506 + ser = Series(data, dtype="boolean") + + # The result should match aggregating on the whole series. 
Correctness + # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic + expected_data = getattr(ser, bool_agg_func)(skipna=skipna) + expected = Series(expected_data, index=np.array([0]), dtype="boolean") + + result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype1,dtype2,exp_col1,exp_col2", + [ + ( + "float", + "Float64", + np.array([True], dtype=bool), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Int64", + "float", + pd.array([pd.NA], dtype="boolean"), + np.array([True], dtype=bool), + ), + ( + "Int64", + "Int64", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Float64", + "boolean", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ], +) +def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): + # GH#37506 + data = [1.0, np.nan] + df = DataFrame( + {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} + ) + result = df.groupby([1, 1]).agg("all", skipna=False) + + expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): + # GH#40585 + obj = frame_or_series([pd.NA, 1], dtype=dtype) + expected_res = True + if not skipna and bool_agg_func == "all": + expected_res = pd.NA + expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") + + result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "bool_agg_func,data,expected_res", + [ + ("any", [pd.NA, np.nan], False), + ("any", [pd.NA, 1, np.nan], True), + ("all", [pd.NA, pd.NaT], True), + ("all", [pd.NA, False, pd.NaT], False), + ], +) +def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): + # GH#37501 + obj = frame_or_series(data, dtype=object) + result = obj.groupby([1] * len(data)).agg(bool_agg_func) + expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_object_NA_raises_with_skipna_false(bool_agg_func): + # GH#37501 + ser = Series([pd.NA], dtype=object) + with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): + ser.groupby([1]).agg(bool_agg_func, skipna=False) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_empty(frame_or_series, bool_agg_func): + # GH 45231 + kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} + obj = frame_or_series(**kwargs, dtype=object) + result = getattr(obj.groupby(obj.index), bool_agg_func)() + expected = frame_or_series(**kwargs, dtype=bool) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): + # GH 25444 + df = DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] 
= pd.to_datetime(df["c_date"]) + df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") + df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] + df["c_period"] = df["c_date"].dt.to_period("W") + df["c_Integer"] = df["c_int"].astype("Int64") + df["c_Floating"] = df["c_float"].astype("Float64") + + result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) + + expected = DataFrame(values, index=pd.Index(["A", "B"], name="name")) + if numeric_only: + expected = expected.drop(columns=["c_date"]) + else: + expected["c_date_tz"] = expected["c_date"] + expected["c_timedelta"] = expected["c_date"] + expected["c_period"] = expected["c_date"] + expected["c_Integer"] = expected["c_int"] + expected["c_Floating"] = expected["c_float"] + + tm.assert_frame_equal(result, expected) + + +def test_idxmin_idxmax_axis1(): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + ) + df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + + gb = df.groupby("A") + + warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = gb.idxmax(axis=1) + + alt = df.iloc[:, 1:].idxmax(axis=1) + indexer = res.index.get_level_values(1) + + tm.assert_series_equal(alt[indexer], res.droplevel("A")) + + df["E"] = date_range("2016-01-01", periods=10) + gb2 = df.groupby("A") + + msg = "'>' not supported between instances of 'Timestamp' and 'float'" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + gb2.idxmax(axis=1) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +def test_mean_on_timedelta(): + # GH 17382 + df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) + result = df.groupby("cat")["time"].mean() + expected = Series( + pd.to_timedelta([4, 5]), name="time", index=pd.Index(["A", "B"], name="cat") + ) + tm.assert_series_equal(result, expected) + + +def test_cython_median(): + arr = np.random.default_rng(2).standard_normal(1000) + arr[::2] = np.nan + df = DataFrame(arr) + + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.groupby(labels).agg(np.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(observed): + df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) + + result = aa.groupby("nn").max() + assert "ss" in result + + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result + + result = 
aa.groupby("nn").min() + assert "ss" in result + + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result + + +def test_max_min_object_multiple_columns(using_array_manager): + # GH#41111 case where the aggregation is valid for some columns but not + # others; we split object blocks column-wise, consistent with + # DataFrame._reduce + + df = DataFrame( + { + "A": [1, 1, 2, 2, 3], + "B": [1, "foo", 2, "bar", False], + "C": ["a", "b", "c", "d", "e"], + } + ) + df._consolidate_inplace() # should already be consolidate, but double-check + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + gb = df.groupby("A") + + result = gb[["C"]].max() + # "max" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + result = gb[["C"]].min() + # "min" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + +def test_min_date_with_nans(): + # GH26321 + dates = pd.to_datetime( + Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + ).dt.date + df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + + result = df.groupby("b", as_index=False)["c"].min()["c"] + expected = pd.to_datetime( + Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" + ).dt.date + tm.assert_series_equal(result, expected) + + result = df.groupby("b")["c"].min() + expected.index.name = "b" + tm.assert_series_equal(result, expected) + + +def test_max_inat(): + # GH#40767 dont interpret iNaT as NaN + ser = Series([1, iNaT]) + key = np.array([1, 1], dtype=np.int64) + gb = ser.groupby(key) + + result = gb.max(min_count=2) + expected = Series({1: 1}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + result = gb.min(min_count=2) + expected = Series({1: iNaT}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + # not enough entries -> gets masked to NaN + result = gb.min(min_count=3) + expected = Series({1: np.nan}) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_max_inat_not_all_na(): + # GH#40767 dont interpret iNaT as NaN + + # make sure we dont round iNaT+1 to iNaT + ser = Series([1, iNaT, 2, iNaT + 1]) + gb = ser.groupby([1, 2, 3, 3]) + result = gb.min(min_count=2) + + # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy + expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) + expected.index = expected.index.astype(int) + tm.assert_series_equal(result, expected, check_exact=True) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_column(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a")["b"], func)() + idx = pd.Index([1, 2], name="a") + expected = Series(periods, index=idx, name="b") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_frame(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a"), func)() + idx = pd.Index([1, 2], name="a") + expected = DataFrame({"b": periods}, index=idx) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_numeric_object_dtype(): + # https://github.com/pandas-dev/pandas/issues/39329 + # simplified case: multiple object columns where one is all-NaN + # -> gets split as the all-NaN is inferred as float + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, + ) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + # same but with numbers + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = Series(["b"], dtype="category").cat.as_ordered() + df = DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) +def test_groupby_min_max_nullable(dtype): + if dtype == "Int64": + # GH#41743 avoid precision loss + ts = 1618556707013635762 + elif dtype == "boolean": + ts = 0 + else: + ts = 4.0 + + df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) + df["ts"] = df["ts"].astype(dtype) + + gb = df.groupby("id") + + result = gb.min() + expected = df.iloc[:1].set_index("id") + tm.assert_frame_equal(result, expected) + + res_max = gb.max() + expected_max = df.iloc[1:].set_index("id") + tm.assert_frame_equal(res_max, expected_max) + + result2 = gb.min(min_count=3) + expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) + tm.assert_frame_equal(result2, expected2) + + res_max2 = gb.max(min_count=3) + tm.assert_frame_equal(res_max2, expected2) + + # Case with NA values + df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) + df2["ts"] = df2["ts"].astype(dtype) + gb2 = df2.groupby("id") + + result3 = gb2.min() + tm.assert_frame_equal(result3, expected) + + res_max3 = gb2.max() + 
tm.assert_frame_equal(res_max3, expected_max) + + result4 = gb2.min(min_count=100) + tm.assert_frame_equal(result4, expected2) + + res_max4 = gb2.max(min_count=100) + tm.assert_frame_equal(res_max4, expected2) + + +def test_min_max_nullable_uint64_empty_group(): + # don't raise NotImplementedError from libgroupby + cat = pd.Categorical([0] * 10, categories=[0, 1]) + df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) + gb = df.groupby("A", observed=False) + + res = gb.min() + + idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") + expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) + tm.assert_frame_equal(res, expected) + + res = gb.max() + expected.iloc[0, 0] = 9 + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) +def test_groupby_min_max_categorical(func): + # GH: 52151 + df = DataFrame( + { + "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), + "col2": pd.Categorical([1], categories=[1, 2], ordered=True), + "value": 0.1, + } + ) + result = getattr(df.groupby("col1", observed=False), func)() + + idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) + expected = DataFrame( + { + "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), + "value": [0.1, None], + }, + index=idx, + ) + tm.assert_frame_equal(result, expected) + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r["File"].isna().any() + + +@pytest.mark.slow +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("with_nan", [True, False]) +@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) +def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): + n = 100 + m = 10 + days = date_range("2015-08-23", periods=10) + df = DataFrame( + { + "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), + "joe": np.random.default_rng(2).choice(days, n), + "julie": np.random.default_rng(2).integers(0, m, n), + } + ) + if with_nan: + df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below + df.loc[1::17, "jim"] = None + df.loc[3::37, "joe"] = None + df.loc[7::19, "julie"] = None + df.loc[8::19, "julie"] = None + df.loc[9::19, "julie"] = None + original_df = df.copy() + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr["julie"].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr["julie"].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + if as_index: + tm.assert_series_equal(left, right, check_names=False) + else: + tm.assert_frame_equal(left, right, check_names=False) + tm.assert_frame_equal(df, original_df) + + +def test_nunique(): + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list("abc") + 
expected.index.name = "A" + expected = expected.drop(columns="A") + result = df.groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = DataFrame( + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], + ) + + result = data.groupby(["id", "amount"])["name"].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = Series([1] * 5, name="name", index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = Series(name="name", dtype=object) + result = data.groupby(level=0).nunique() + expected = Series(name="name", dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), pd.NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), pd.NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = DataFrame([2], index=np.array([0]), columns=test.columns) + tm.assert_frame_equal(result, expected) + + +def test_nunique_transform_with_datetime(): + # GH 35109 - transform with nunique on datetimes results in integers + df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) + result = df.groupby([0, 0, 1])["date"].transform("nunique") + expected = Series([2, 2, 1], name="date") + tm.assert_series_equal(result, expected) + + +def test_empty_categorical(observed): + # GH#21334 + cat = Series([1]).astype("category") + ser = cat[:0] + gb = ser.groupby(ser, observed=observed) + result = gb.nunique() + if observed: + expected = Series([], index=cat[:0], dtype="int64") + else: + expected = 
Series([0], index=cat, dtype="int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("min_count", [0, 10]) +def test_groupby_sum_mincount_boolean(min_count): + b = True + a = False + na = np.nan + dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") + + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], name="a", dtype="Int64") + + result = grouped["b"].sum(min_count=2) + expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=pd.Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected) From fe07fd5a201c551df679053fbe068302a1337a4a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:32:58 -0400 Subject: [PATCH 125/131] BLD: Pin numpy to < 2 (#55488) * BLD: Pin numpy to < 2 * typo * Update action.yml * Update unit-tests.yml --- .github/actions/build_pandas/action.yml | 4 ++-- .github/workflows/unit-tests.yml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- environment.yml | 2 +- pyproject.toml | 8 ++++---- requirements-dev.txt | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 73d7723e2fb49..3ee10efaaf96f 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -25,8 +25,8 @@ runs: - name: Build Pandas run: | if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v + pip install -e . --no-build-isolation -v --no-deps else - pip install . --no-build-isolation -v + pip install . 
--no-build-isolation -v --no-deps fi shell: bash -el {0} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 96010a4a0227d..53a1f5b95374d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -348,7 +348,7 @@ jobs: python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip install -ve . --no-build-isolation --no-index + python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list - name: Run Tests diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ebd1556b8a5f5..180d425b07d82 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 4d0406814c873..c8a5d8cfb7640 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -21,7 +21,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index c259286a5359c..1a770d74043bf 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -19,7 +19,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz - pip diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index f6df5a6e894a7..3d1df9e3bcd59 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 586768765325e..691bdf25d4d96 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4 + - numpy=1.22.4, <2 - pytz=2020.1 # optional dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 3751651a2a2f2..199ce4dea1ac0 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index db0723cd3b8fa..a2f4d6395783a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -21,7 +21,7 @@ dependencies: - hypothesis>=6.46.1 # required - - numpy + - numpy<2 - python-dateutil - pytz - pip: diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 65f72278a0291..c9f46b98273b3 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -20,7 +20,7 @@ dependencies: # required dependencies - python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/environment.yml b/environment.yml index db6138c34f37c..a9648f3298198 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: # required dependencies - 
python-dateutil - - numpy + - numpy<2 - pytz # optional dependencies diff --git a/pyproject.toml b/pyproject.toml index a8388a9ff52de..a5aaa72289209 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ requires = [ # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.26.0,<2; python_version>='3.12'", "versioneer[toml]" ] @@ -29,9 +29,9 @@ authors = [ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version=='3.11'", - "numpy>=1.26.0; python_version>='3.12'", + "numpy>=1.22.4,<2; python_version<'3.11'", + "numpy>=1.23.2,<2; python_version=='3.11'", + "numpy>=1.26.0,<2; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" diff --git a/requirements-dev.txt b/requirements-dev.txt index 98339f45a5052..6e1a6058dce0e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,7 +12,7 @@ pytest-xdist>=2.2.0 pytest-asyncio>=0.17.0 coverage python-dateutil -numpy +numpy<2 pytz beautifulsoup4>=4.11.1 blosc From 579b8268474c5c89133ae7106a323dd94e1e33fa Mon Sep 17 00:00:00 2001 From: Paras Gupta Date: Fri, 13 Oct 2023 21:57:36 +0530 Subject: [PATCH 126/131] BUG: categorical dtype equality for level in different type (#55486) * BUG: categorical dtype equality for level in different type * BUG: categorical dtype equality for level in different type - Comments#1 --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/dtypes.py | 2 +- pandas/tests/dtypes/test_dtypes.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index eec82ae26afcc..ef1a34c4c27c6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -285,6 +285,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) +- Bug in :meth:`CategoricalDtype.__eq__` returning false for unordered categorical data with mixed types (:issue:`55468`) - Datetimelike diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 196c95c3673e9..9f85f82df666f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -456,7 +456,7 @@ def __eq__(self, other: object) -> bool: # With object-dtype we need a comparison that identifies # e.g. 
int(2) as distinct from float(2) - return hash(self) == hash(other) + return set(left) == set(right) def __repr__(self) -> str_type: if self.categories is None: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 1f9c371c50ad4..27994708d2bdb 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -917,6 +917,24 @@ def test_equal_but_different(self): assert c1 is not c2 assert c1 != c2 + def test_equal_but_different_mixed_dtypes(self): + c1 = CategoricalDtype([1, 2, "3"]) + c2 = CategoricalDtype(["3", 1, 2]) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_ordered(self): + c1 = CategoricalDtype([], ordered=True) + c2 = CategoricalDtype([], ordered=True) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_unordered(self): + c1 = CategoricalDtype([]) + c2 = CategoricalDtype([]) + assert c1 is not c2 + assert c1 == c2 + @pytest.mark.parametrize("v1, v2", [([1, 2, 3], [1, 2, 3]), ([1, 2, 3], [3, 2, 1])]) def test_order_hashes_different(self, v1, v2): c1 = CategoricalDtype(v1, ordered=False) From c1d36044bd80e987105518a8986b016f7b54d584 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 13 Oct 2023 12:32:41 -0400 Subject: [PATCH 127/131] Update get_utc declarations (#55507) * updated parameter definitions * revert memview -> ndarray declaration --- pandas/_libs/tslibs/tzconversion.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a96779aa33255..9c8865fbdf428 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -425,7 +425,11 @@ timedelta-like} return result.base # .base to get underlying ndarray -cdef Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): +cdef Py_ssize_t bisect_right_i8( + const int64_t *data, + int64_t val, + Py_ssize_t n +) noexcept: # Caller is responsible for checking n > 0 # This looks very similar to local_search_right in the ndarray.searchsorted # implementation. 
@@ -463,7 +467,7 @@ cdef str _render_tstamp(int64_t val, NPY_DATETIMEUNIT creso): cdef _get_utc_bounds( ndarray[int64_t] vals, - int64_t* tdata, + const int64_t* tdata, Py_ssize_t ntrans, const int64_t[::1] deltas, NPY_DATETIMEUNIT creso, From e1368cfd506a782f386a0c3afe02b32c31e2856e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:35:01 -0700 Subject: [PATCH 128/131] BUG: tslibs uncaught overflows (#55503) * BUG: tslibs uncaught overflows * GH refs * windows/32bit builds --- doc/source/whatsnew/v2.2.0.rst | 8 +++--- pandas/_libs/tslibs/offsets.pyx | 7 ++++- pandas/_libs/tslibs/period.pyx | 7 ++--- pandas/_libs/tslibs/timedeltas.pyx | 26 ++++++++++++++----- pandas/tests/scalar/period/test_period.py | 20 ++++++++++++++ .../scalar/timedelta/test_constructors.py | 10 +++++++ .../tests/scalar/timestamp/test_arithmetic.py | 9 ++----- pandas/tests/tseries/offsets/test_ticks.py | 9 +++++++ 8 files changed, 75 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ef1a34c4c27c6..e495c3c204de5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -291,12 +291,14 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) -- +- Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) + Timedelta ^^^^^^^^^ +- Bug in :class:`Timedelta` construction raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in the nanosecond cases (:issue:`55405`) -- Timezones ^^^^^^^^^ @@ -353,7 +355,7 @@ I/O Period ^^^^^^ -- +- Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) - Plotting diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 6a6f30de8dade..6c5cdde20da5f 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -961,7 +961,12 @@ cdef class Tick(SingleConstructorOffset): @property def delta(self): - return self.n * Timedelta(self._nanos_inc) + try: + return self.n * Timedelta(self._nanos_inc) + except OverflowError as err: + # GH#55503 as_unit will raise a more useful OutOfBoundsTimedelta + Timedelta(self).as_unit("ns") + raise AssertionError("This should not be reached.") @property def nanos(self) -> int64_t: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index cacfe43b236d8..d305f27dd1090 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1814,7 +1814,7 @@ cdef class _Period(PeriodMixin): def _add_timedeltalike_scalar(self, other) -> "Period": cdef: - int64_t inc + int64_t inc, ordinal if not self._dtype._is_tick_like(): raise IncompatibleFrequency("Input cannot be converted to " @@ -1832,8 +1832,8 @@ cdef class _Period(PeriodMixin): except ValueError as err: raise IncompatibleFrequency("Input cannot be converted to " f"Period(freq={self.freqstr})") from err - # TODO: overflow-check 
here - ordinal = self.ordinal + inc + with cython.overflowcheck(True): + ordinal = self.ordinal + inc return Period(ordinal=ordinal, freq=self.freq) def _add_offset(self, other) -> "Period": @@ -1846,6 +1846,7 @@ cdef class _Period(PeriodMixin): ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) + @cython.overflowcheck(True) def __add__(self, other): if not is_period_object(self): # cython semantics; this is analogous to a call to __radd__ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index e5d81bd5928b9..a573d9a8ed0c0 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1784,7 +1784,7 @@ class Timedelta(_Timedelta): ) # GH43764, convert any input to nanoseconds first and then - # create the timestamp. This ensures that any potential + # create the timedelta. This ensures that any potential # nanosecond contributions from kwargs parsed as floats # are taken into consideration. seconds = int(( @@ -1797,12 +1797,24 @@ class Timedelta(_Timedelta): ) * 1_000_000_000 ) - value = np.timedelta64( - int(kwargs.get("nanoseconds", 0)) - + int(kwargs.get("microseconds", 0) * 1_000) - + int(kwargs.get("milliseconds", 0) * 1_000_000) - + seconds - ) + ns = kwargs.get("nanoseconds", 0) + us = kwargs.get("microseconds", 0) + ms = kwargs.get("milliseconds", 0) + try: + value = np.timedelta64( + int(ns) + + int(us * 1_000) + + int(ms * 1_000_000) + + seconds + ) + except OverflowError as err: + # GH#55503 + msg = ( + f"seconds={seconds}, milliseconds={ms}, " + f"microseconds={us}, nanoseconds={ns}" + ) + raise OutOfBoundsTimedelta(msg) from err + if unit in {"Y", "y", "M"}: raise ValueError( "Units 'M', 'Y', and 'y' are no longer supported, as they do not " diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 6c27881e44b56..dc2938ec345f3 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1182,6 +1182,26 @@ def test_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): class TestArithmetic: + def test_add_overflow_raises(self): + # GH#55503 + per = Timestamp.max.to_period("ns") + + msg = "|".join( + [ + "Python int too large to convert to C long", + # windows, 32bit linux builds + "int too big to convert", + ] + ) + with pytest.raises(OverflowError, match=msg): + per + 1 + + msg = "value too large" + with pytest.raises(OverflowError, match=msg): + per + Timedelta(1) + with pytest.raises(OverflowError, match=msg): + per + offsets.Nano(1) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) def test_add_sub_td64_nat(self, unit): # GH#47196 diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 0d876fbb9bde8..858ab29d79c5e 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -15,6 +15,16 @@ ) +def test_construct_from_kwargs_overflow(): + # GH#55503 + msg = "seconds=86400000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(days=10**6) + msg = "seconds=60000000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(minutes=10**9) + + def test_construct_with_weeks_unit_overflow(): # GH#47268 don't silently wrap around with pytest.raises(OutOfBoundsTimedelta, match="without overflow"): diff --git 
a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index f5c9c576abc24..9c24d364841d1 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -40,17 +40,12 @@ def test_overflow_offset_raises(self): stamp = Timestamp("2017-01-13 00:00:00").as_unit("ns") offset_overflow = 20169940 * offsets.Day(1) - msg = ( - "the add operation between " - r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " - "will overflow" - ) lmsg2 = r"Cannot cast -?20169940 days \+?00:00:00 to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): @@ -68,7 +63,7 @@ def test_overflow_offset_raises(self): with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 69953955ebbce..abf187ace7cb3 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslibs.offsets import delta_to_tick +from pandas.errors import OutOfBoundsTimedelta from pandas import ( Timedelta, @@ -237,6 +238,14 @@ def test_tick_addition(kls, expected): assert result == expected +def test_tick_delta_overflow(): + # GH#55503 raise OutOfBoundsTimedelta, not OverflowError + tick = offsets.Day(10**9) + msg = "Cannot cast 1000000000 days 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + tick.delta + + @pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) From abba4e2df1ba1651f89ca4b02f8eeeffc17375c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:41:22 -0700 Subject: [PATCH 129/131] REF: share Index/Block get_values_for_csv (#55485) * REF: share Index/Block get_values_for_csv * mypy fixup --- pandas/core/indexes/base.py | 154 +++++++++++++++--- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 1 - pandas/core/indexes/multi.py | 6 +- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 1 - pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 93 +---------- pandas/io/formats/csvs.py | 15 +- pandas/io/formats/format.py | 2 +- .../tests/indexes/datetimes/test_formats.py | 16 +- pandas/tests/indexes/interval/test_formats.py | 2 +- pandas/tests/indexes/period/test_formats.py | 10 +- 13 files changed, 158 insertions(+), 148 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 515f750f11219..f53c5606db4d3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -30,6 +30,7 @@ algos as libalgos, index as libindex, lib, + writers, ) from pandas._libs.internals import BlockValuesRefs import pandas._libs.join as libjoin @@ -97,7 +98,6 @@ is_bool_dtype, is_ea_or_datetimelike_dtype, is_float, - is_float_dtype, is_hashable, is_integer, is_iterator, @@ -119,6 +119,7 @@ ExtensionDtype, IntervalDtype, PeriodDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ 
-151,7 +152,9 @@ ArrowExtensionArray, BaseMaskedArray, Categorical, + DatetimeArray, ExtensionArray, + TimedeltaArray, ) from pandas.core.arrays.string_ import StringArray from pandas.core.base import ( @@ -199,7 +202,10 @@ MultiIndex, Series, ) - from pandas.core.arrays import PeriodArray + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + ) __all__ = ["Index"] @@ -1403,7 +1409,7 @@ def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str result = trim_front(formatted) return header + result - def _format_native_types( + def _get_values_for_csv( self, *, na_rep: str_t = "", @@ -1412,30 +1418,14 @@ def _format_native_types( date_format=None, quoting=None, ) -> npt.NDArray[np.object_]: - """ - Actually format specific types of the index. - """ - from pandas.io.formats.format import FloatArrayFormatter - - if is_float_dtype(self.dtype) and not isinstance(self.dtype, ExtensionDtype): - formatter = FloatArrayFormatter( - self._values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - return formatter.get_result_as_array() - - mask = isna(self) - if self.dtype != object and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values + return get_values_for_csv( + self._values, + na_rep=na_rep, + decimal=decimal, + float_format=float_format, + date_format=date_format, + quoting=quoting, + ) def _summary(self, name=None) -> str_t: """ @@ -7629,3 +7619,113 @@ def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): stacklevel=find_stack_level(), ) return result + + +def get_values_for_csv( + values: ArrayLike, + *, + date_format, + na_rep: str = "nan", + quoting=None, + float_format=None, + decimal: str = ".", +) -> npt.NDArray[np.object_]: + """ + Convert to types which can be consumed by the standard library's + csv.writer.writerows. 
+ """ + if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": + # GH#40754 Convert categorical datetimes to datetime array + values = algos.take_nd( + values.categories._values, + ensure_platform_int(values._codes), + fill_value=na_rep, + ) + + values = ensure_wrapped_if_datetimelike(values) + + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.ndim == 1: + result = values._format_native_types(na_rep=na_rep, date_format=date_format) + result = result.astype(object, copy=False) + return result + + # GH#21734 Process every column separately, they might have different formats + results_converted = [] + for i in range(len(values)): + result = values[i, :]._format_native_types( + na_rep=na_rep, date_format=date_format + ) + results_converted.append(result.astype(object, copy=False)) + return np.vstack(results_converted) + + elif isinstance(values.dtype, PeriodDtype): + # TODO: tests that get here in column path + values = cast("PeriodArray", values) + res = values._format_native_types(na_rep=na_rep, date_format=date_format) + return res + + elif isinstance(values.dtype, IntervalDtype): + # TODO: tests that get here in column path + values = cast("IntervalArray", values) + mask = values.isna() + if not quoting: + result = np.asarray(values).astype(str) + else: + result = np.array(values, dtype=object, copy=True) + + result[mask] = na_rep + return result + + elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): + # see GH#13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == ".": + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype="object") + + values[mask] = na_rep + values = values.astype(object, copy=False) + return values + + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + res = formatter.get_result_as_array() + res = res.astype(object, copy=False) + return res + + elif isinstance(values, ExtensionArray): + mask = isna(values) + + new_values = np.asarray(values.astype(object)) + new_values[mask] = na_rep + return new_values + + else: + mask = isna(values) + itemsize = writers.word_len(na_rep) + + if values.dtype != _dtype_obj and not quoting and itemsize: + values = values.astype(str) + if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: + # enlarge for the na_rep + values = values.astype(f" npt.NDArray[np.object_]: new_levels = [] @@ -1392,7 +1392,7 @@ def _format_native_types( # go through the levels and format them for level, level_codes in zip(self.levels, self.codes): - level_strs = level._format_native_types(na_rep=na_rep, **kwargs) + level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 if mask.any(): @@ -1408,7 +1408,7 @@ def _format_native_types( if len(new_levels) == 1: # a single-level multi-index - return Index(new_levels[0].take(new_codes[0]))._format_native_types() + return Index(new_levels[0].take(new_codes[0]))._get_values_for_csv() else: # reconstruct the multi-index mi = MultiIndex( diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b1023febe813d..09b41d9c32ec2 100644 --- a/pandas/core/indexes/period.py +++ 
b/pandas/core/indexes/period.py @@ -80,7 +80,7 @@ def _new_PeriodIndex(cls, **d): PeriodArray, wrap=True, ) -@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) +@inherit_names(["is_leap_year"], PeriodArray) class PeriodIndex(DatetimeIndexOpsMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 498fe56a7ae7f..b1d8d0efb60e8 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -48,7 +48,6 @@ "sum", "std", "median", - "_format_native_types", ], TimedeltaArray, ) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 99af4f51661b1..9987908f407b3 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -68,6 +68,7 @@ Index, ensure_index, ) +from pandas.core.indexes.base import get_values_for_csv from pandas.core.internals.base import ( DataManager, SingleDataManager, @@ -79,7 +80,6 @@ ensure_block_shape, external_values, extract_pandas_array, - get_values_for_csv, maybe_coerce_values, new_block, ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0c14eec81c3c..330effe0f0a9f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -24,7 +24,6 @@ NaT, internals as libinternals, lib, - writers, ) from pandas._libs.internals import ( BlockPlacement, @@ -61,7 +60,6 @@ np_can_hold_element, ) from pandas.core.dtypes.common import ( - ensure_platform_int, is_1d_only_ea_dtype, is_float_dtype, is_integer_dtype, @@ -75,7 +73,6 @@ IntervalDtype, NumpyEADtype, PeriodDtype, - SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -122,6 +119,7 @@ extract_array, ) from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexes.base import get_values_for_csv if TYPE_CHECKING: from collections.abc import ( @@ -2602,95 +2600,6 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def get_values_for_csv( - values: ArrayLike, - *, - date_format, - na_rep: str = "nan", - quoting=None, - float_format=None, - decimal: str = ".", -) -> npt.NDArray[np.object_]: - """convert to our native types format""" - if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": - # GH#40754 Convert categorical datetimes to datetime array - values = algos.take_nd( - values.categories._values, - ensure_platform_int(values._codes), - fill_value=na_rep, - ) - - values = ensure_wrapped_if_datetimelike(values) - - if isinstance(values, (DatetimeArray, TimedeltaArray)): - if values.ndim == 1: - result = values._format_native_types(na_rep=na_rep, date_format=date_format) - result = result.astype(object, copy=False) - return result - - # GH#21734 Process every column separately, they might have different formats - results_converted = [] - for i in range(len(values)): - result = values[i, :]._format_native_types( - na_rep=na_rep, date_format=date_format - ) - results_converted.append(result.astype(object, copy=False)) - return np.vstack(results_converted) - - elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): - # see GH#13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == ".": - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - 
values = np.array(values, dtype="object") - - values[mask] = na_rep - values = values.astype(object, copy=False) - return values - - from pandas.io.formats.format import FloatArrayFormatter - - formatter = FloatArrayFormatter( - values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - res = formatter.get_result_as_array() - res = res.astype(object, copy=False) - return res - - elif isinstance(values, ExtensionArray): - mask = isna(values) - - new_values = np.asarray(values.astype(object)) - new_values[mask] = na_rep - return new_values - - else: - mask = isna(values) - itemsize = writers.word_len(na_rep) - - if values.dtype != _dtype_obj and not quoting and itemsize: - values = values.astype(str) - if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: - # enlarge for the na_rep - values = values.astype(f" ArrayLike: """ The array that Series.values returns (public attribute). diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 717dae6eea97c..50503e862ef43 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -44,6 +44,7 @@ IndexLabel, StorageOptions, WriteBuffer, + npt, ) from pandas.io.formats.format import DataFrameFormatter @@ -53,7 +54,7 @@ class CSVFormatter: - cols: np.ndarray + cols: npt.NDArray[np.object_] def __init__( self, @@ -149,7 +150,9 @@ def _initialize_quotechar(self, quotechar: str | None) -> str | None: def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray: + def _initialize_columns( + self, cols: Iterable[Hashable] | None + ) -> npt.NDArray[np.object_]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -158,7 +161,7 @@ def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray: if cols is not None: if isinstance(cols, ABCIndex): - cols = cols._format_native_types(**self._number_format) + cols = cols._get_values_for_csv(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -166,7 +169,7 @@ def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray: # update columns to include possible multiplicity of dupes # and make sure cols is just a list of labels new_cols = self.obj.columns - return new_cols._format_native_types(**self._number_format) + return new_cols._get_values_for_csv(**self._number_format) def _initialize_chunksize(self, chunksize: int | None) -> int: if chunksize is None: @@ -223,7 +226,7 @@ def write_cols(self) -> SequenceNotStr[Hashable]: ) return self.header else: - # self.cols is an ndarray derived from Index._format_native_types, + # self.cols is an ndarray derived from Index._get_values_for_csv, # so its entries are strings, i.e. hashable return cast(SequenceNotStr[Hashable], self.cols) @@ -317,7 +320,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: res = df._get_values_for_csv(**self._number_format) data = list(res._iter_column_arrays()) - ix = self.data_index[slicer]._format_native_types(**self._number_format) + ix = self.data_index[slicer]._get_values_for_csv(**self._number_format) libwriters.write_csv_rows( data, ix, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index bb976b3a0208e..c87b8261916f2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1105,7 +1105,7 @@ def format_array( the leading space to pad between columns. When formatting an Index subclass - (e.g. 
IntervalIndex._format_native_types), we don't want the + (e.g. IntervalIndex._get_values_for_csv), we don't want the leading space since it should be left-aligned. fallback_formatter diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 9fb5db9e034ee..caeb7fcb86f49 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -13,38 +13,38 @@ import pandas._testing as tm -def test_format_native_types(): +def test_get_values_for_csv(): index = pd.date_range(freq="1D", periods=3, start="2017-01-01") # First, with no arguments. expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index._format_native_types(date_format="%m-%Y-%d") + result = index._get_values_for_csv(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = DatetimeIndex(["2017-01-01", pd.NaT, "2017-01-03"]) expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaT") tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) - result = index._format_native_types(date_format="%Y-%m-%d %H:%M:%S.%f") + result = index._get_values_for_csv(na_rep="NaT", date_format="%Y-%m-%d %H:%M:%S.%f") expected = np.array( ["2017-01-01 00:00:00.000000", "NaT", "2017-01-03 00:00:00.000000"], dtype=object, @@ -52,7 +52,7 @@ def test_format_native_types(): tm.assert_numpy_array_equal(result, expected) # invalid format - result = index._format_native_types(date_format="foo") + result = index._get_values_for_csv(na_rep="NaT", date_format="foo") expected = np.array(["foo", "NaT", "foo"], dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index acb330c190d6f..5b509edc9ff88 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -104,7 +104,7 @@ def test_repr_floats(self): def test_to_native_types(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index._format_native_types(na_rep="NaN") + result = index._get_values_for_csv(na_rep="NaN") expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 9441f56a75f03..7245c6a7116fc 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -15,29 +15,29 @@ def test_to_native_types(): # First, with no arguments. 
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index._format_native_types(date_format="%m-%Y-%d") + result = index._get_values_for_csv(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaT") tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) From 2f3b0eda7231f33131c5fabf065483e61ac1ea59 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Oct 2023 09:57:40 -0700 Subject: [PATCH 130/131] DEPR: Index.format (#55439) * REF: implement Index._format_flat, _format_multi * de-duplicate, change keyword * DEPR: Index.format * add formatter kwd * Post-merge fixup --- doc/source/whatsnew/v2.2.0.rst | 2 + pandas/core/indexes/base.py | 33 +++++++++ pandas/core/indexes/datetimelike.py | 11 +++ pandas/core/indexes/multi.py | 72 +++++++++++++++++++ pandas/io/formats/excel.py | 8 +-- pandas/io/formats/format.py | 23 +++--- pandas/io/formats/html.py | 14 ++-- pandas/io/formats/style_render.py | 4 +- .../tests/indexes/base_class/test_formats.py | 9 ++- .../tests/indexes/categorical/test_formats.py | 5 +- .../indexes/datetimes/test_datetimelike.py | 5 +- .../tests/indexes/datetimes/test_formats.py | 8 ++- pandas/tests/indexes/multi/test_formats.py | 20 ++++-- pandas/tests/indexes/period/test_period.py | 7 +- pandas/tests/indexes/ranges/test_range.py | 11 ++- pandas/tests/indexes/test_base.py | 12 +++- pandas/tests/indexes/test_old_base.py | 15 ++-- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/formats/test_format.py | 57 ++++++++++----- pandas/tests/series/test_repr.py | 4 +- 20 files changed, 259 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e495c3c204de5..93ca2541d7ecd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -234,6 +234,7 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. 
(:issue:`54229`) @@ -261,6 +262,7 @@ Other Deprecations - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) +- .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f53c5606db4d3..252e88d7c7d51 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1375,6 +1375,14 @@ def format( """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( @@ -1388,6 +1396,31 @@ def format( return self._format_with_header(header=header, na_rep=na_rep) + _default_na_rep = "NaN" + + @final + def _format_flat( + self, + *, + include_name: bool, + formatter: Callable | None = None, + ) -> list[str_t]: + """ + Render a string representation of the Index. + """ + header = [] + if include_name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header=header, na_rep=self._default_na_rep) + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: from pandas.io.formats.format import format_array diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9ab3f26f88152..a3e6c50b21642 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -14,6 +14,7 @@ cast, final, ) +import warnings import numpy as np @@ -42,6 +43,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer, @@ -187,6 +189,7 @@ def _convert_tolerance(self, tolerance, target): # -------------------------------------------------------------------- # Rendering Methods + _default_na_rep = "NaT" def format( self, @@ -198,6 +201,14 @@ def format( """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d88eb3f18a3bd..86693f241ddb1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1430,6 +1430,15 @@ def format( sparsify=None, adjoin: bool = True, ) -> list: + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. 
Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name is not None: names = name @@ -1492,6 +1501,69 @@ def format( else: return result_levels + def _format_multi( + self, + *, + include_names: bool, + sparsify: bool | None | lib.NoDefault, + formatter: Callable | None = None, + ) -> list: + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = _get_na_rep(lev.dtype) + + if len(lev) > 0: + taken = formatted = lev.take(level_codes) + formatted = taken._format_flat(include_name=False, formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if include_names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel: Literal[""] | bool | lib.NoDefault = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify is lib.no_default: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(include_names), sentinel=sentinel + ) + + return result_levels + # -------------------------------------------------------------------- # Names Methods diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b344d9849f16c..684cd4340cd2b 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -623,8 +623,8 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: return columns = self.columns - level_strs = columns.format( - sparsify=self.merge_cells, adjoin=False, names=False + level_strs = columns._format_multi( + sparsify=self.merge_cells, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -813,8 +813,8 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: if self.merge_cells: # Format hierarchical rows as merged cells. 
-            level_strs = self.df.index.format(
-                sparsify=True, adjoin=False, names=False
+            level_strs = self.df.index._format_multi(
+                sparsify=True, include_names=False
             )
             level_lengths = get_level_lengths(level_strs)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index c87b8261916f2..1a7e4d7a80e13 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -313,8 +313,14 @@ def to_string(self) -> str:
         if len(series) == 0:
             return f"{type(self.series).__name__}([], {footer})"

-        have_header = _has_names(series.index)
-        fmt_index = self.tr_series.index.format(name=True)
+        index = series.index
+        have_header = _has_names(index)
+        if isinstance(index, MultiIndex):
+            fmt_index = index._format_multi(include_names=True, sparsify=None)
+            adj = printing.get_adjustment()
+            fmt_index = adj.adjoin(2, *fmt_index).split("\n")
+        else:
+            fmt_index = index._format_flat(include_name=True)
         fmt_values = self._get_formatted_values()

         if self.is_truncated_vertically:
@@ -777,7 +783,7 @@ def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
         columns = frame.columns

         if isinstance(columns, MultiIndex):
-            fmt_columns = columns.format(sparsify=False, adjoin=False)
+            fmt_columns = columns._format_multi(sparsify=False, include_names=False)
             fmt_columns = list(zip(*fmt_columns))
             dtypes = self.frame.dtypes._values
@@ -802,7 +808,7 @@ def space_format(x, y):

             str_columns = [list(x) for x in zip(*str_columns)]
         else:
-            fmt_columns = columns.format()
+            fmt_columns = columns._format_flat(include_name=False)
             dtypes = self.frame.dtypes
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
             str_columns = [
@@ -821,14 +827,15 @@ def _get_formatted_index(self, frame: DataFrame) -> list[str]:
         fmt = self._get_formatter("__index__")

         if isinstance(index, MultiIndex):
-            fmt_index = index.format(
+            fmt_index = index._format_multi(
                 sparsify=self.sparsify,
-                adjoin=False,
-                names=self.show_row_idx_names,
+                include_names=self.show_row_idx_names,
                 formatter=fmt,
             )
         else:
-            fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
+            fmt_index = [
+                index._format_flat(include_name=self.show_row_idx_names, formatter=fmt)
+            ]

         fmt_index = [
             tuple(
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py
index b1a3504d46b27..794ce77b3b45e 100644
--- a/pandas/io/formats/html.py
+++ b/pandas/io/formats/html.py
@@ -282,7 +282,7 @@ def _write_col_header(self, indent: int) -> None:
                 sentinel = lib.no_default
             else:
                 sentinel = False
-            levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False)
+            levels = self.columns._format_multi(sparsify=sentinel, include_names=False)
             level_lengths = get_level_lengths(levels, sentinel)
             inner_lvl = len(level_lengths) - 1
             for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
@@ -437,7 +437,8 @@ def _write_regular_rows(
         if fmt is not None:
             index_values = self.fmt.tr_frame.index.map(fmt)
         else:
-            index_values = self.fmt.tr_frame.index.format()
+            # only reached with non-Multi index
+            index_values = self.fmt.tr_frame.index._format_flat(include_name=False)

         row: list[str] = []
         for i in range(nrows):
@@ -480,13 +481,13 @@ def _write_hierarchical_rows(
         nrows = len(frame)

         assert isinstance(frame.index, MultiIndex)
-        idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
+        idx_values = frame.index._format_multi(sparsify=False, include_names=False)
         idx_values = list(zip(*idx_values))

         if self.fmt.sparsify:
             # GH3547
             sentinel = lib.no_default
-            levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
+            levels = frame.index._format_multi(sparsify=sentinel, include_names=False)
             level_lengths = get_level_lengths(levels, sentinel)

             inner_lvl = len(level_lengths) - 1
@@ -579,7 +580,7 @@ def _write_hierarchical_rows(
                 )

                 idx_values = list(
-                    zip(*frame.index.format(sparsify=False, adjoin=False, names=False))
+                    zip(*frame.index._format_multi(sparsify=False, include_names=False))
                 )
                 row = []
                 row.extend(idx_values[i])
@@ -606,7 +607,8 @@ def _get_formatted_values(self) -> dict[int, list[str]]:
         return {i: self.fmt.format_col(i) for i in range(self.ncols)}

     def _get_columns_formatted_values(self) -> list[str]:
-        return self.columns.format()
+        # only reached with non-Multi Index
+        return self.columns._format_flat(include_name=False)

     def write_style(self) -> None:
         # We use the "scoped" attribute here so that the desired
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 829ed4a33f6a4..416b263ba8497 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -1652,9 +1652,9 @@ def _get_level_lengths(
     Result is a dictionary of (level, initial_position): span
     """
     if isinstance(index, MultiIndex):
-        levels = index.format(sparsify=lib.no_default, adjoin=False)
+        levels = index._format_multi(sparsify=lib.no_default, include_names=False)
     else:
-        levels = index.format()
+        levels = index._format_flat(include_name=False)

     if hidden_elements is None:
         hidden_elements = []
diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py
index 9053d45dee623..20f94010f56f8 100644
--- a/pandas/tests/indexes/base_class/test_formats.py
+++ b/pandas/tests/indexes/base_class/test_formats.py
@@ -4,6 +4,7 @@
 import pandas._config.config as cf

 from pandas import Index
+import pandas._testing as tm


 class TestIndexRendering:
@@ -133,7 +134,9 @@ def test_summary_bug(self):
     def test_index_repr_bool_nan(self):
         # GH32146
         arr = Index([True, False, np.nan], dtype=object)
-        exp1 = arr.format()
+        msg = "Index.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            exp1 = arr.format()
         out1 = ["True", "False", "NaN"]
         assert out1 == exp1

@@ -145,4 +148,6 @@ def test_format_different_scalar_lengths(self):
         # GH#35439
         idx = Index(["aaaaaaaaa", "b"])
         expected = ["aaaaaaaaa", "b"]
-        assert idx.format() == expected
+        msg = r"Index\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert idx.format() == expected
diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py
index 7dbcaaa8d4ba6..ea3e4ce213e67 100644
--- a/pandas/tests/indexes/categorical/test_formats.py
+++ b/pandas/tests/indexes/categorical/test_formats.py
@@ -4,6 +4,7 @@
 import pandas._config.config as cf

 from pandas import CategoricalIndex
+import pandas._testing as tm


 class TestCategoricalIndexRepr:
@@ -11,7 +12,9 @@ def test_format_different_scalar_lengths(self):
         # GH#35439
         idx = CategoricalIndex(["aaaaaaaaa", "b"])
         expected = ["aaaaaaaaa", "b"]
-        assert idx.format() == expected
+        msg = r"CategoricalIndex\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert idx.format() == expected

     def test_string_categorical_index_repr(self):
         # short
diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py
index a6bee20d3d3ec..a012a2985b41c 100644
--- a/pandas/tests/indexes/datetimes/test_datetimelike.py
+++ b/pandas/tests/indexes/datetimes/test_datetimelike.py
@@ -1,5 +1,6 @@
 """ generic tests from the Datetimelike class """
 from pandas import date_range
+import pandas._testing as tm


 class TestDatetimeIndex:
@@ -7,4 +8,6 @@ def test_format(self):
         # GH35439
         idx = date_range("20130101", periods=5)
         expected = [f"{x:%Y-%m-%d}" for x in idx]
-        assert idx.format() == expected
+        msg = r"DatetimeIndex\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert idx.format() == expected
diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py
index caeb7fcb86f49..6f75ac1b569c0 100644
--- a/pandas/tests/indexes/datetimes/test_formats.py
+++ b/pandas/tests/indexes/datetimes/test_formats.py
@@ -285,13 +285,17 @@ def test_format_with_name_time_info(self):
         # bug I fixed 12/20/2011
         dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something")

-        formatted = dates.format(name=True)
+        msg = "DatetimeIndex.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = dates.format(name=True)
         assert formatted[0] == "something"

     def test_format_datetime_with_time(self):
         dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)])

-        result = dti.format()
+        msg = "DatetimeIndex.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = dti.format()
         expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"]
         assert len(result) == 2
         assert result == expected
diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py
index 011f61fac90e8..bbe94824eefa1 100644
--- a/pandas/tests/indexes/multi/test_formats.py
+++ b/pandas/tests/indexes/multi/test_formats.py
@@ -6,24 +6,31 @@
     Index,
     MultiIndex,
 )
+import pandas._testing as tm


 def test_format(idx):
-    idx.format()
-    idx[:0].format()
+    msg = "MultiIndex.format is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        idx.format()
+        idx[:0].format()


 def test_format_integer_names():
     index = MultiIndex(
         levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]
     )
-    index.format(names=True)
+    msg = "MultiIndex.format is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        index.format(names=True)


 def test_format_sparse_config(idx):
     # GH1538
+    msg = "MultiIndex.format is deprecated"
     with pd.option_context("display.multi_sparse", False):
-        result = idx.format()
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = idx.format()

     assert result[1] == "foo  two"

@@ -37,8 +44,9 @@ def test_format_sparse_display():
             [0, 0, 0, 0, 0, 0],
         ],
     )
-
-    result = index.format()
+    msg = "MultiIndex.format is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = index.format()
     assert result[3] == "1  0  0  0"
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
index 22bb63d67f57f..1bb8d66332cd0 100644
--- a/pandas/tests/indexes/period/test_period.py
+++ b/pandas/tests/indexes/period/test_period.py
@@ -275,8 +275,11 @@ def test_map(self):
     def test_format_empty(self):
         # GH35712
         empty_idx = PeriodIndex([], freq="Y")
-        assert empty_idx.format() == []
-        assert empty_idx.format(name=True) == [""]
+        msg = r"PeriodIndex\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format() == []
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format(name=True) == [""]

     def test_period_index_frequency_ME_error_message(self):
         msg = "Invalid frequency: 2ME"
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 132704434829e..95756b04bca69 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -240,7 +240,9 @@ def test_cache(self):
             pass
         assert idx._cache == {}

-        idx.format()
+        msg = "RangeIndex.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            idx.format()
         assert idx._cache == {}

         df = pd.DataFrame({"a": range(10)}, index=idx)
@@ -566,8 +568,11 @@ def test_engineless_lookup(self):
     def test_format_empty(self):
         # GH35712
         empty_idx = RangeIndex(0)
-        assert empty_idx.format() == []
-        assert empty_idx.format(name=True) == [""]
+        msg = r"RangeIndex\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format() == []
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format(name=True) == [""]

     @pytest.mark.parametrize(
         "ri",
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 6afab569797f2..04ab2020b4c7a 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -666,13 +666,17 @@ def test_format_bug(self):
         # include us since the default for Timestamp shows these but Index
         # formatting does not we are skipping)
         now = datetime.now()
+        msg = r"Index\.format is deprecated"
+
         if not str(now).endswith("000"):
             index = Index([now])
-            formatted = index.format()
+            with tm.assert_produces_warning(FutureWarning, match=msg):
+                formatted = index.format()
             expected = [str(index[0])]
             assert formatted == expected

-        Index([]).format()
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            Index([]).format()

     @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]])
     def test_format_missing(self, vals, nulls_fixture):
@@ -682,7 +686,9 @@ def test_format_missing(self, vals, nulls_fixture):

         index = Index(vals, dtype=object)
         # TODO: case with complex dtype?
-        formatted = index.format()
+        msg = r"Index\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = index.format()

         null_repr = "NaN" if isinstance(nulls_fixture, float) else str(nulls_fixture)
         expected = [str(index[0]), str(index[1]), str(index[2]), null_repr]
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 79dc423f12a85..32adbc693390b 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -559,15 +559,20 @@ def test_format(self, simple_index):
             pytest.skip("Tested elsewhere.")
         idx = simple_index
         expected = [str(x) for x in idx]
-        assert idx.format() == expected
+        msg = r"Index\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert idx.format() == expected

     def test_format_empty(self, simple_index):
         # GH35712
         if isinstance(simple_index, (PeriodIndex, RangeIndex)):
             pytest.skip("Tested elsewhere")
         empty_idx = type(simple_index)([])
-        assert empty_idx.format() == []
-        assert empty_idx.format(name=True) == [""]
+        msg = r"Index\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format() == []
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert empty_idx.format(name=True) == [""]

     def test_fillna(self, index):
         # GH 11343
@@ -955,7 +960,9 @@ def test_format(self, simple_index):
         idx = simple_index
         max_width = max(len(str(x)) for x in idx)
         expected = [str(x).ljust(max_width) for x in idx]
-        assert idx.format() == expected
+        msg = r"Index\.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert idx.format() == expected

     def test_insert_non_na(self, simple_index):
         # GH#43921 inserting an element that we know we can hold should
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index e4ce969daab53..18af18ade85f4 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -809,7 +809,7 @@ def test_to_excel_multiindex_cols(self, merge_cells, frame, path):
             reader, sheet_name="test1", header=header, index_col=[0, 1]
         )
         if not merge_cells:
-            fm = frame.columns.format(sparsify=False, adjoin=False, names=False)
+            fm = frame.columns._format_multi(sparsify=False, include_names=False)
             frame.columns = [".".join(map(str, q)) for q in zip(*fm)]
         tm.assert_frame_equal(frame, df)
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index 57f1f082708ae..d18c333d79244 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -3333,7 +3333,9 @@ def test_period_format_and_strftime_default(self):
         per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="h")

         # Default formatting
-        formatted = per.format()
+        msg = "PeriodIndex.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = per.format()
         assert formatted[0] == "2003-01-01 12:00"  # default: minutes not shown
         assert formatted[1] == "NaT"

         # format is equivalent to strftime(None)...
@@ -3342,35 +3344,40 @@ def test_period_format_and_strftime_default(self):

         # Same test with nanoseconds freq
         per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns")
-        formatted = per.format()
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = per.format()
         assert (formatted == per.strftime(None)).all()
         assert formatted[0] == "2003-01-01 12:01:01.123456789"
         assert formatted[1] == "2003-01-01 12:01:01.123456790"

     def test_period_custom(self):
         # GH#46252 custom formatting directives %l (ms) and %u (us)
+        msg = "PeriodIndex.format is deprecated"

         # 3 digits
         per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms")
-        formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
         assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)"
         assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)"

         # 6 digits
         per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us")
-        formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
         assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)"
         assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)"

         # 9 digits
         per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns")
-        formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)")
         assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)"
         assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)"

     def test_period_tz(self):
         # Formatting periods created from a datetime with timezone.
- + msg = r"PeriodIndex\.format is deprecated" # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) @@ -3378,13 +3385,15 @@ def test_period_tz(self): # Since tz is currently set as utc, we'll see 2012 with tm.assert_produces_warning(UserWarning, match="will drop timezone"): per = dt.to_period(freq="h") - assert per.format()[0] == "2012-12-31 23:00" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert per.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") with tm.assert_produces_warning(UserWarning, match="will drop timezone"): per = dt.to_period(freq="h") - assert per.format()[0] == "2013-01-01 00:00" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert per.format()[0] == "2013-01-01 00:00" @pytest.mark.parametrize( "locale_str", @@ -3411,7 +3420,9 @@ def test_period_non_ascii_fmt(self, locale_str): # Index per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y é") + msg = "PeriodIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y é") assert formatted[0] == "03 é" assert formatted[1] == "03 é" @@ -3443,33 +3454,45 @@ def test_period_custom_locale_directive(self, locale_str): # Index per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y %I:%M:%S%p") + msg = "PeriodIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = per.format(date_format="%y %I:%M:%S%p") assert formatted[0] == f"03 01:00:00{am_local}" assert formatted[1] == f"03 01:00:00{pm_local}" class TestDatetimeIndexFormat: def test_datetime(self): - formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" def test_date(self): - formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" assert formatted[1] == "NaT" def test_date_tz(self): - formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() + dti = pd.to_datetime([datetime(2013, 1, 1)], utc=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - formatted = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True).format() + dti = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() assert formatted[0] == "2013-01-01 00:00:00+00:00" def test_date_explicit_date_format(self): - formatted = pd.to_datetime([datetime(2003, 2, 1), NaT]).format( - date_format="%m-%d-%Y", na_rep="UT" - ) + dti = pd.to_datetime([datetime(2003, 2, 1), NaT]) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") assert formatted[0] == "02-01-2003" assert formatted[1] == "UT" 
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py
index 86474a38d29fb..86addb9dadfad 100644
--- a/pandas/tests/series/test_repr.py
+++ b/pandas/tests/series/test_repr.py
@@ -258,7 +258,9 @@ def test_index_repr_in_frame_with_nan(self):

     def test_format_pre_1900_dates(self):
         rng = date_range("1/1/1850", "1/1/1950", freq="Y-DEC")
-        rng.format()
+        msg = "DatetimeIndex.format is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            rng.format()
         ts = Series(1, index=rng)
         repr(ts)

From b5b8be04b2a63fc02e89650a740779e718d7a99b Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 13 Oct 2023 11:41:56 -0700
Subject: [PATCH 131/131] REF: de-duplicate some Timedelta helpers (#55501)

* REF: implement disallow_ambiguous_unit

* REF: implement get_unit_for_round
---
 pandas/_libs/tslibs/timedeltas.pxd |  1 +
 pandas/_libs/tslibs/timedeltas.pyi |  2 ++
 pandas/_libs/tslibs/timedeltas.pyx | 27 ++++++++++++++++-----------
 pandas/_libs/tslibs/timestamps.pyx |  7 ++-----
 pandas/core/arrays/datetimelike.py |  6 ++----
 pandas/core/indexes/timedeltas.py  |  7 ++-----
 pandas/core/tools/timedeltas.py    |  8 ++------
 7 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd
index fb6e29a8932a1..f3473e46b6699 100644
--- a/pandas/_libs/tslibs/timedeltas.pxd
+++ b/pandas/_libs/tslibs/timedeltas.pxd
@@ -4,6 +4,7 @@ from numpy cimport int64_t

 from .np_datetime cimport NPY_DATETIMEUNIT

+cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1

 # Exposed for tslib, not intended for outside use.
 cpdef int64_t delta_to_nanoseconds(
     delta, NPY_DATETIMEUNIT reso=*, bint round_ok=*
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 6d993722ce1d4..181703c5f55b2 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -68,6 +68,8 @@ UnitChoices: TypeAlias = Literal[
 ]
 _S = TypeVar("_S", bound=timedelta)

+def get_unit_for_round(freq, creso: int) -> int: ...
+def disallow_ambiguous_unit(unit: str | None) -> None: ...
 def ints_to_pytimedelta(
     arr: npt.NDArray[np.timedelta64],
     box: bool = ...,
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index a573d9a8ed0c0..5e124b89eab5e 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -827,6 +827,14 @@ def _binary_op_method_timedeltalike(op, name):
 # ----------------------------------------------------------------------
 # Timedelta Construction

+cpdef disallow_ambiguous_unit(unit):
+    if unit in {"Y", "y", "M"}:
+        raise ValueError(
+            "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
+            "represent unambiguous timedelta values durations."
+        )
+
+
 cdef int64_t parse_iso_format_string(str ts) except? -1:
     """
     Extracts and cleanses the appropriate values from a match object with
@@ -1815,11 +1823,7 @@ class Timedelta(_Timedelta):
             )
             raise OutOfBoundsTimedelta(msg) from err

-        if unit in {"Y", "y", "M"}:
-            raise ValueError(
-                "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
-                "represent unambiguous timedelta values durations."
-            )
+        disallow_ambiguous_unit(unit)

         # GH 30543 if pd.Timedelta already passed, return it
         # check that only value is passed
@@ -1932,10 +1936,7 @@ class Timedelta(_Timedelta):
             int64_t result, unit
             ndarray[int64_t] arr

-        from pandas._libs.tslibs.offsets import to_offset
-
-        to_offset(freq).nanos  # raises on non-fixed freq
-        unit = delta_to_nanoseconds(to_offset(freq), self._creso)
+        unit = get_unit_for_round(freq, self._creso)

         arr = np.array([self._value], dtype="i8")
         try:
@@ -2292,3 +2293,11 @@ cdef bint _should_cast_to_timedelta(object obj):
     return (
         is_any_td_scalar(obj) or obj is None or obj is NaT or isinstance(obj, str)
     )
+
+
+cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1:
+    from pandas._libs.tslibs.offsets import to_offset
+
+    freq = to_offset(freq)
+    freq.nanos  # raises on non-fixed freq
+    return delta_to_nanoseconds(freq, creso)
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index edd061fd8cdf1..66b1cec63e9e9 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -107,7 +107,7 @@ from pandas._libs.tslibs.np_datetime import (
 from pandas._libs.tslibs.offsets cimport to_offset
 from pandas._libs.tslibs.timedeltas cimport (
     _Timedelta,
-    delta_to_nanoseconds,
+    get_unit_for_round,
     is_any_td_scalar,
 )

@@ -1896,8 +1896,7 @@ class Timestamp(_Timestamp):
             int64_t nanos

         freq = to_offset(freq, is_period=False)
-        freq.nanos  # raises on non-fixed freq
-        nanos = delta_to_nanoseconds(freq, self._creso)
+        nanos = get_unit_for_round(freq, self._creso)
         if nanos == 0:
             if freq.nanos == 0:
                 raise ValueError("Division by zero in rounding")
@@ -1905,8 +1904,6 @@ class Timestamp(_Timestamp):
             # e.g. self.unit == "s" and sub-second freq
             return self

-        # TODO: problem if nanos==0
-
         if self.tz is not None:
             value = self.tz_localize(None)._value
         else:
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index fd51303ebd55f..22ffaceeff1bb 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -36,7 +36,6 @@
     Timedelta,
     Timestamp,
     astype_overflowsafe,
-    delta_to_nanoseconds,
     get_unit_from_dtype,
     iNaT,
     ints_to_pydatetime,
@@ -49,6 +48,7 @@
     round_nsint64,
 )
 from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions
+from pandas._libs.tslibs.timedeltas import get_unit_for_round
 from pandas._libs.tslibs.timestamps import integer_op_not_supported
 from pandas._typing import (
     ArrayLike,
@@ -2129,9 +2129,7 @@ def _round(self, freq, mode, ambiguous, nonexistent):

         values = self.view("i8")
         values = cast(np.ndarray, values)
-        offset = to_offset(freq)
-        offset.nanos  # raises on non-fixed frequencies
-        nanos = delta_to_nanoseconds(offset, self._creso)
+        nanos = get_unit_for_round(freq, self._creso)
         if nanos == 0:
             # GH 52761
             return self.copy()
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index b1d8d0efb60e8..9d8ef5b0a69cd 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -13,6 +13,7 @@
     Timedelta,
     to_offset,
 )
+from pandas._libs.tslibs.timedeltas import disallow_ambiguous_unit
 from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (
@@ -170,11 +171,7 @@ def __new__(
         if is_scalar(data):
             cls._raise_scalar_data_error(data)

-        if unit in {"Y", "y", "M"}:
-            raise ValueError(
-                "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
-                "represent unambiguous timedelta values durations."
-            )
+        disallow_ambiguous_unit(unit)

         if dtype is not None:
             dtype = pandas_dtype(dtype)
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
index 8db77725a1aa3..587946aba5041 100644
--- a/pandas/core/tools/timedeltas.py
+++ b/pandas/core/tools/timedeltas.py
@@ -17,6 +17,7 @@
 )
 from pandas._libs.tslibs.timedeltas import (
     Timedelta,
+    disallow_ambiguous_unit,
     parse_timedelta_unit,
 )

@@ -178,16 +179,11 @@ def to_timedelta(
     """
    if unit is not None:
         unit = parse_timedelta_unit(unit)
+        disallow_ambiguous_unit(unit)

     if errors not in ("ignore", "raise", "coerce"):
         raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.")

-    if unit in {"Y", "y", "M"}:
-        raise ValueError(
-            "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
-            "represent unambiguous timedelta values durations."
-        )
-
     if arg is None:
         return arg
     elif isinstance(arg, ABCSeries):
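[Taken together, patch 131 funnels two previously duplicated checks through single
helpers: disallow_ambiguous_unit rejects calendar-dependent units, and
get_unit_for_round validates a rounding frequency and converts it to an integer
nanosecond count. A rough sketch of the user-visible behavior these helpers back,
assuming a pandas build containing this patch; the error text is the one quoted in
the diff above:

    # Illustrative only: the consolidated unit and frequency validation.
    import pandas as pd

    pd.Timedelta(1, unit="D")            # unambiguous units are accepted
    pd.to_timedelta([1, 2], unit="h")    # same check, same code path now

    try:
        pd.to_timedelta(1, unit="M")     # months have no fixed duration
    except ValueError as err:
        print(err)

    # get_unit_for_round sits behind the rounding APIs; a fixed frequency
    # such as "h" succeeds, while a non-fixed one such as "M" raises.
    pd.Timestamp("2023-10-13 11:41:56").round("h")
]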