From 976280e29bac4b9862ea8702906271befe2a852b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Nov 2024 21:20:12 +0100 Subject: [PATCH 1/4] BUG (string dtype): fix qualifier in memory usage info --- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/multi.py | 9 ++++++--- pandas/tests/series/methods/test_info.py | 15 +++++++++++---- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cf3d1e6a2ee2d..25a3ce0e3ba83 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5139,7 +5139,9 @@ def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" + ) def __contains__(self, key: Any) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e6ce00cb714a4..d1c99cb864e57 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -66,6 +66,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -1425,10 +1426,12 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level.inferred_type) for level in self.levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 097976b0a7ac0..f8b7402d8b4e7 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -7,10 +7,14 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -142,14 +146,17 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index, plus", [ ([1, 2, 3], False), - (list("ABC"), True), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), (MultiIndex.from_product([range(3), range(3)]), False), - (MultiIndex.from_product([range(3), ["foo", "bar"]]), True), + ( + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), + ), ], ) def test_info_memory_usage_qualified(index, plus): From cf0880183ad7a226d1c7fbc4dbe4951c5281504f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 08:41:06 +0100 Subject: [PATCH 2/4] fix test --- pandas/tests/series/methods/test_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index f8b7402d8b4e7..8997d267f2695 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -45,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -70,7 +72,7 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{'' if using_infer_string else '+'} bytes """ ) assert result == expected From e54364406adc5c8f484f83e2b02131875a0be995 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 08:42:13 +0100 Subject: [PATCH 3/4] fix typing --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 25a3ce0e3ba83..d6035c82aaaf8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5140,7 +5140,7 @@ def _is_memory_usage_qualified(self) -> bool: Return a boolean if we need a qualified .info display. """ return is_object_dtype(self.dtype) or ( - is_string_dtype(self.dtype) and self.dtype.storage == "python" + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] ) def __contains__(self, key: Any) -> bool: From b5b2fe24fbb8acff064e9177514b8f63e1a17a36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 09:56:35 +0100 Subject: [PATCH 4/4] fix tests for object-dtype fallback --- pandas/tests/frame/methods/test_info.py | 36 +++++++++++++++--------- pandas/tests/series/methods/test_info.py | 3 +- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index aad43b7a77ac7..74e4383950174 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( HAS_PYARROW, IS64, @@ -436,18 +434,25 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_info_memory_usage_qualified(): +def test_info_memory_usage_qualified(using_infer_string): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object)) df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str")) + df.info(buf=buf) + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() + buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) @@ -460,7 +465,10 @@ def test_info_memory_usage_qualified(): 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) - assert "+" in buf.getvalue() + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(): @@ -497,16 +505,15 @@ def test_info_categorical(): df.info(buf=buf) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): +def test_info_int_columns(using_infer_string): # GH#37245 df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) buf = StringIO() df.info(show_counts=True, buf=buf) result = buf.getvalue() expected = textwrap.dedent( - """\ + f"""\ Index: 2 entries, A to B Data columns (total 2 columns): @@ -515,19 +522,22 @@ def test_info_int_columns(): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: 48.0+ bytes + memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes """ ) assert result == expected -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_memory_usage_empty_no_warning(): +def test_memory_usage_empty_no_warning(using_infer_string): # GH#50066 df = DataFrame(index=["a", "b"]) with tm.assert_produces_warning(None): result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) + if using_infer_string and HAS_PYARROW: + value = 18 + else: + value = 16 if IS64 else 8 + expected = Series(value, index=["Index"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 8997d267f2695..e2831fb80b7a0 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -69,10 +69,11 @@ def test_info_series( 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0{'' if using_infer_string else '+'} bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected