Skip to content

Commit

Permalink
BUG (string dtype): fix qualifier in memory usage info (pandas-dev#60221)
Browse files Browse the repository at this point in the history

(cherry picked from commit 0937c95)
  • Loading branch information
jorisvandenbossche committed Nov 7, 2024
1 parent 168e353 commit c0df110
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 25 deletions.
4 changes: 3 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5326,7 +5326,9 @@ def _is_memory_usage_qualified(self) -> bool:
"""
Return a boolean if we need a qualified .info display.
"""
return is_object_dtype(self.dtype)
return is_object_dtype(self.dtype) or (
is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr]
)

def __contains__(self, key: Any) -> bool:
"""
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
is_list_like,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
Expand Down Expand Up @@ -1344,10 +1345,12 @@ def dtype(self) -> np.dtype:
def _is_memory_usage_qualified(self) -> bool:
"""return a boolean if we need a qualified .info display"""

def f(level) -> bool:
return "mixed" in level or "string" in level or "unicode" in level
def f(dtype) -> bool:
return is_object_dtype(dtype) or (
is_string_dtype(dtype) and dtype.storage == "python"
)

return any(f(level) for level in self._inferred_type_levels)
return any(f(level.dtype) for level in self.levels)

# Cannot determine type of "memory_usage"
@doc(Index.memory_usage) # type: ignore[has-type]
Expand Down
34 changes: 24 additions & 10 deletions pandas/tests/frame/methods/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas._config import using_string_dtype

from pandas.compat import (
HAS_PYARROW,
IS64,
PYPY,
)
Expand Down Expand Up @@ -435,18 +436,25 @@ def test_usage_via_getsizeof():
assert abs(diff) < 100


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_info_memory_usage_qualified():
def test_info_memory_usage_qualified(using_infer_string):
buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
df.info(buf=buf)
assert "+" not in buf.getvalue()

buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=list("ABC"))
df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object))
df.info(buf=buf)
assert "+" in buf.getvalue()

buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str"))
df.info(buf=buf)
if using_infer_string and HAS_PYARROW:
assert "+" not in buf.getvalue()
else:
assert "+" in buf.getvalue()

buf = StringIO()
df = DataFrame(
1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
Expand All @@ -459,7 +467,10 @@ def test_info_memory_usage_qualified():
1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
)
df.info(buf=buf)
assert "+" in buf.getvalue()
if using_infer_string and HAS_PYARROW:
assert "+" not in buf.getvalue()
else:
assert "+" in buf.getvalue()


def test_info_memory_usage_bug_on_multiindex():
Expand Down Expand Up @@ -496,16 +507,15 @@ def test_info_categorical():
df.info(buf=buf)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
def test_info_int_columns(using_infer_string):
# GH#37245
df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
buf = StringIO()
df.info(show_counts=True, buf=buf)
result = buf.getvalue()
expected = textwrap.dedent(
"""\
f"""\
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, A to B
Data columns (total 2 columns):
Expand All @@ -514,19 +524,23 @@ def test_info_int_columns():
0 1 2 non-null int64
1 2 2 non-null int64
dtypes: int64(2)
memory usage: 48.0+ bytes
memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes
"""
)
assert result == expected


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_memory_usage_empty_no_warning():
def test_memory_usage_empty_no_warning(using_infer_string):
# GH#50066
df = DataFrame(index=["a", "b"])
with tm.assert_produces_warning(None):
result = df.memory_usage()
expected = Series(16 if IS64 else 8, index=["Index"])
if using_infer_string and HAS_PYARROW:
value = 18
else:
value = 16 if IS64 else 8
expected = Series(value, index=["Index"])
tm.assert_series_equal(result, expected)


Expand Down
30 changes: 19 additions & 11 deletions pandas/tests/series/methods/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@

from pandas._config import using_string_dtype

from pandas.compat import PYPY
from pandas.compat import (
HAS_PYARROW,
PYPY,
)

from pandas import (
CategoricalIndex,
Index,
MultiIndex,
Series,
date_range,
Expand Down Expand Up @@ -41,7 +45,9 @@ def test_info_categorical():


@pytest.mark.parametrize("verbose", [True, False])
def test_info_series(lexsorted_two_level_string_multiindex, verbose):
def test_info_series(
lexsorted_two_level_string_multiindex, verbose, using_infer_string
):
index = lexsorted_two_level_string_multiindex
ser = Series(range(len(index)), index=index, name="sth")
buf = StringIO()
Expand All @@ -63,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose):
10 non-null int64
"""
)
qualifier = "" if using_infer_string and HAS_PYARROW else "+"
expected += textwrap.dedent(
f"""\
dtypes: int64(1)
memory usage: {ser.memory_usage()}.0+ bytes
memory usage: {ser.memory_usage()}.0{qualifier} bytes
"""
)
assert result == expected
Expand Down Expand Up @@ -142,20 +149,21 @@ def test_info_memory_usage_deep_pypy():
assert s_object.memory_usage(deep=True) == s_object.memory_usage()


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"series, plus",
"index, plus",
[
(Series(1, index=[1, 2, 3]), False),
(Series(1, index=list("ABC")), True),
(Series(1, index=MultiIndex.from_product([range(3), range(3)])), False),
([1, 2, 3], False),
(Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)),
(Index(list("ABC"), dtype=object), True),
(MultiIndex.from_product([range(3), range(3)]), False),
(
Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])),
True,
MultiIndex.from_product([range(3), ["foo", "bar"]]),
not (using_string_dtype() and HAS_PYARROW),
),
],
)
def test_info_memory_usage_qualified(series, plus):
def test_info_memory_usage_qualified(index, plus):
series = Series(1, index=index)
buf = StringIO()
series.info(buf=buf)
if plus:
Expand Down

0 comments on commit c0df110

Please sign in to comment.