From e6ef75004358c25a6047c075c1884aaf2f7005b8 Mon Sep 17 00:00:00 2001 From: veljanin Date: Wed, 2 Oct 2024 09:56:12 +0200 Subject: [PATCH 1/6] Handling the case where converting empty categorical to 'pyarrow' dtype_backend results in error. Since conversion of non-empty categorical returns categorical of 'numpy_nullable' dtype_backend, now, instead of raising an error, we ensure empty categorical is returned as well. --- pandas/core/dtypes/cast.py | 10 +++++++--- .../tests/frame/methods/test_convert_dtypes.py | 16 ++++++++++++++++ .../tests/series/methods/test_convert_dtypes.py | 13 +++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6ba07b1761557..dee3452136438 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1112,7 +1112,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - + if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1145,12 +1145,16 @@ def convert_dtypes( and isna(input_array).all() ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + if isna(input_array).all() and hasattr(input_array, 'categories'): + inferred_dtype = input_array.dtype + else: + inferred_dtype = ArrowDtype(pa_type) + elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index e7f6e5d625d3e..29a05652b38a2 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,6 +34,22 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + + def test_convert_empty_categorical_to_pyarrow(self): + df = pd.DataFrame( + { + "A": pd.Series(pd.Categorical([None] * 5)), + "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), + } + ) + converted = df.convert_dtypes(dtype_backend="pyarrow") + expected = df + tm.assert_frame_equal(converted, expected) + + assert df.A.dtype == "category", "Dtype in column A is not 'category'" + assert df.B.dtype == "category", "Dtype in column B is not 'category'" + assert df.A.cat.categories.empty, "Categories in column A are not empty" + assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 90c4056a39e84..cc2635c35e798 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -297,3 +297,16 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) + + def test_convert_empty_categorical_to_pyarrow(self): + ser = pd.Series(pd.Series(pd.Categorical([None] * 5))) + + converted = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser + tm.assert_series_equal(converted, expected) + + assert ser.dtype == "category", "Series dtype is not 'category'" + assert ser.cat.categories.empty, "Series categories are not empty" + + ser2 = pd.Series(pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"]))) + assert (ser2.cat.categories == ["S1", "S2"]).all(), "Series categories are not empty" From 94769a1cc1e5fc089864337dc56324d705b18376 Mon Sep 17 00:00:00 2001 From: veljanin Date: Fri, 4 Oct 2024 11:48:40 +0200 Subject: [PATCH 2/6] additional revisions --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/dtypes/cast.py | 7 +++-- .../frame/methods/test_convert_dtypes.py | 23 +++++++++------ .../series/methods/test_convert_dtypes.py | 29 +++++++++++-------- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a5b4560a47bc4..d6dde366506c2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- +- Bug in :func:`convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) - Datetimelike diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index dee3452136438..c7272a517497c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1112,7 +1112,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - + if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1143,14 +1143,15 @@ def convert_dtypes( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 and isna(input_array).all() + and not isinstance(input_array.dtype, CategoricalDtype) ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, 'categories'): + if isna(input_array).all() and hasattr(input_array, "categories"): inferred_dtype = input_array.dtype else: inferred_dtype = ArrowDtype(pa_type) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 29a05652b38a2..5a0010d06a951 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,22 +34,27 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) - + def test_convert_empty_categorical_to_pyarrow(self): + # GH#59934 df = pd.DataFrame( { - "A": pd.Series(pd.Categorical([None] * 5)), - "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), - } + "A": pd.Categorical([None] * 5), + "B": pd.Categorical([None] * 5, categories=["B1", "B2"]), + } ) converted = df.convert_dtypes(dtype_backend="pyarrow") expected = df tm.assert_frame_equal(converted, expected) - - assert df.A.dtype == "category", "Dtype in column A is not 'category'" - assert df.B.dtype == "category", "Dtype in column B is not 'category'" - assert df.A.cat.categories.empty, "Categories in column A are not empty" - assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" + + assert converted.A.dtype == "category", "Dtype in column A is not 'category'" + assert converted.B.dtype == "category", "Dtype in column B is not 'category'" + assert converted.A.cat.categories.empty, "Categories in column A are not empty" + assert converted.B.cat.categories.__contains__( + "B1" + ) and converted.B.cat.categories.__contains__( + "B2" + ), "Categories in column B doesn't contain adequate categories" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index cc2635c35e798..22d4ec6d7906a 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -297,16 +297,21 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) - + def test_convert_empty_categorical_to_pyarrow(self): - ser = pd.Series(pd.Series(pd.Categorical([None] * 5))) - - converted = ser.convert_dtypes(dtype_backend="pyarrow") - expected = ser - tm.assert_series_equal(converted, expected) - - assert ser.dtype == "category", "Series dtype is not 'category'" - assert ser.cat.categories.empty, "Series categories are not empty" - - ser2 = pd.Series(pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"]))) - assert (ser2.cat.categories == ["S1", "S2"]).all(), "Series categories are not empty" + # GH#59934 + ser1 = pd.Series(pd.Categorical([None] * 5)) + converted1 = ser1.convert_dtypes(dtype_backend="pyarrow") + expected = ser1 + + tm.assert_series_equal(converted1, expected) + assert converted1.dtype == "category", "Series dtype is not 'category'" + assert converted1.cat.categories.empty, "Series categories are not empty" + + ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"])) + converted2 = ser2.convert_dtypes(dtype_backend="pyarrow") + assert converted2.cat.categories.__contains__( + "S1" + ) and converted2.cat.categories.__contains__( + "S2" + ), "Categories in ser2 doesn't contain adequate categories" From 8e879da17e8f97322e945755586c081fb8521c28 Mon Sep 17 00:00:00 2001 From: veljanin Date: Fri, 4 Oct 2024 11:53:11 +0200 Subject: [PATCH 3/6] removing the change for input_array... --- pandas/core/dtypes/cast.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c7272a517497c..597f76510618a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1151,10 +1151,7 @@ def convert_dtypes( else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, "categories"): - inferred_dtype = input_array.dtype - else: - inferred_dtype = ArrowDtype(pa_type) + inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 From a5b388263b4cc39fadd37c05c0e832d53a2857bc Mon Sep 17 00:00:00 2001 From: veljanin Date: Mon, 7 Oct 2024 08:20:04 +0200 Subject: [PATCH 4/6] reverting newline in Series.convert_dtypes and precising respective docs in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/dtypes/cast.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d6dde366506c2..3c707d7931c97 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Bug in :func:`convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) +- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) - Datetimelike diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 597f76510618a..1255c5a557d27 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1152,7 +1152,6 @@ def convert_dtypes( pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) - elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] From c1ed1f0e5a1cb9e6e01409f27adfd6566e754391 Mon Sep 17 00:00:00 2001 From: veljanin Date: Wed, 9 Oct 2024 12:10:29 +0200 Subject: [PATCH 5/6] resolved conflicts and updated main with latest changes --- .github/workflows/package-checks.yml | 2 +- .github/workflows/wheels.yml | 2 +- .pre-commit-config.yaml | 10 +- ci/code_checks.sh | 30 - doc/source/user_guide/style.ipynb | 689 ++++++++++-------- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/lib.pyx | 14 + pandas/_libs/tslibs/nattype.pyi | 45 +- pandas/_testing/asserters.py | 10 + pandas/core/accessor.py | 7 +- pandas/core/apply.py | 14 +- pandas/core/arrays/categorical.py | 14 +- pandas/core/arrays/datetimes.py | 8 + pandas/core/arrays/sparse/accessor.py | 20 +- pandas/core/arrays/string_.py | 2 +- pandas/core/frame.py | 10 +- pandas/core/groupby/generic.py | 28 +- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 39 +- pandas/core/indexes/period.py | 8 + pandas/core/reshape/encoding.py | 3 +- pandas/core/tools/numeric.py | 6 + pandas/errors/__init__.py | 5 + pandas/io/formats/html.py | 2 + pandas/io/formats/style.py | 3 +- pandas/io/formats/style_render.py | 24 +- pandas/io/stata.py | 20 + pandas/plotting/_misc.py | 17 +- pandas/tests/apply/test_frame_apply.py | 3 +- pandas/tests/apply/test_series_apply.py | 6 +- pandas/tests/arrays/sparse/test_accessor.py | 4 + .../tests/frame/methods/test_value_counts.py | 4 +- .../groupby/methods/test_value_counts.py | 86 ++- pandas/tests/indexes/test_old_base.py | 2 +- pandas/tests/io/formats/style/test_style.py | 13 +- pandas/tests/io/formats/test_format.py | 19 +- pandas/tests/tools/test_to_numeric.py | 15 + pandas/util/version/__init__.py | 8 + pyproject.toml | 9 +- 39 files changed, 746 insertions(+), 461 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6748832903e30..331af6e05b650 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: recipe-test create-args: >- diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 2aaec8c9b56b0..de59a454c827c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -165,7 +165,7 @@ jobs: CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: wheel-env # Use a fixed Python, since we might have an unreleased Python not diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6717dd503c9b..7c9ebf7d94173 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.0 + rev: v0.6.9 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.11' + rev: 'v2.13' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -52,7 +52,7 @@ repos: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-case-conflict - id: check-toml @@ -90,12 +90,12 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.9.1 + rev: v1.0.0 hooks: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.1 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4a1a0042405e3..6fb675069e81d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,30 +73,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.cat.add_categories PR01,PR02" \ - -i "pandas.Series.cat.as_ordered PR01" \ - -i "pandas.Series.cat.as_unordered PR01" \ - -i "pandas.Series.cat.remove_categories PR01,PR02" \ - -i "pandas.Series.cat.remove_unused_categories PR01" \ - -i "pandas.Series.cat.rename_categories PR01,PR02" \ - -i "pandas.Series.cat.reorder_categories PR01,PR02" \ - -i "pandas.Series.cat.set_categories PR01,PR02" \ - -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02" \ - -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.to_period PR01,PR02" \ - -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02" \ - -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ @@ -106,13 +85,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ @@ -163,7 +140,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ - -i "pandas.errors.InvalidVersion SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ @@ -172,24 +148,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.PossiblePrecisionLoss SA01" \ - -i "pandas.errors.SpecificationError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataReader.data_label SA01" \ - -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index daecfce6ecebc..abb7181fc8d72 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -38,19 +38,6 @@ "[concatfunc]: ../reference/api/pandas.io.formats.style.Styler.concat.rst" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot\n", - "# We have this here to trigger matplotlib's font cache stuff.\n", - "# This cell is hidden from the output" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -78,17 +65,13 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame({\n", - " \"strings\": [\"Adam\", \"Mike\"],\n", - " \"ints\": [1, 3],\n", - " \"floats\": [1.123, 1000.23]\n", - "})\n", - "df.style \\\n", - " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", - " .format_index(str.upper, axis=1) \\\n", - " .relabel_index([\"row 1\", \"row 2\"], axis=0)" + "df = pd.DataFrame(\n", + " {\"strings\": [\"Adam\", \"Mike\"], \"ints\": [1, 3], \"floats\": [1.123, 1000.23]}\n", + ")\n", + "df.style.format(precision=3, thousands=\".\", decimal=\",\").format_index(\n", + " str.upper, axis=1\n", + ").relabel_index([\"row 1\", \"row 2\"], axis=0)" ] }, { @@ -104,17 +87,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.default_rng().random((10, 2)) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", + "\n", "\n", - "def rain_condition(v): \n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -122,6 +109,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -157,10 +145,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(np.random.randn(5, 5))\n", - "df.style \\\n", - " .hide(subset=[0, 2, 4], axis=0) \\\n", - " .hide(subset=[0, 2, 4], axis=1)" + "df = pd.DataFrame(np.random.default_rng().standard_normal((5, 5)))\n", + "df.style.hide(subset=[0, 2, 4], axis=0).hide(subset=[0, 2, 4], axis=1)" ] }, { @@ -177,9 +163,9 @@ "outputs": [], "source": [ "show = [0, 2, 4]\n", - "df.style \\\n", - " .hide([row for row in df.index if row not in show], axis=0) \\\n", - " .hide([col for col in df.columns if col not in show], axis=1)" + "df.style.hide([row for row in df.index if row not in show], axis=0).hide(\n", + " [col for col in df.columns if col not in show], axis=1\n", + ")" ] }, { @@ -199,9 +185,9 @@ "metadata": {}, "outputs": [], "source": [ - "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", - " .format(precision=3) \\\n", - " .relabel_index([\"Sum\", \"Average\"])\n", + "summary_styler = (\n", + " df.agg([\"sum\", \"mean\"]).style.format(precision=3).relabel_index([\"Sum\", \"Average\"])\n", + ")\n", "df.style.format(precision=1).concat(summary_styler)" ] }, @@ -227,9 +213,16 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "idx = pd.Index([\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\")\n", + "cols = pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", \"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + ")\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=idx,\n", + " columns=cols,\n", + ")\n", "df.style" ] }, @@ -242,63 +235,68 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis='columns')\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\")\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", + " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; \"\n", + " \"position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -325,7 +323,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -337,8 +337,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -395,16 +395,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -417,8 +417,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -434,11 +434,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -449,8 +452,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -468,10 +471,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -482,8 +491,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -508,7 +517,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('