From ada3f71bc79ac941369da21bed208ac7a27cc0d1 Mon Sep 17 00:00:00 2001 From: iabhi4 Date: Fri, 30 May 2025 19:41:31 -0700 Subject: [PATCH 1/2] BUG: Fix pivot_table margins to include NaN groups when dropna=False --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/pivot.py | 23 ++++++++++++++++------- pandas/tests/reshape/test_pivot.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..eb381e1ce2241 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -844,6 +844,7 @@ Reshaping - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`) +- Bug in :meth:`DataFrame.pivot_table` where ``margins=True`` did not correctly include groups with ``NaN`` values in the index or columns when ``dropna=False`` was explicitly passed. (:issue:`61509`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ac89f19b80a0f..c80ee69047ea1 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -396,6 +396,7 @@ def __internal_pivot_table( observed=dropna, margins_name=margins_name, fill_value=fill_value, + dropna=dropna, ) # discard the top level @@ -422,6 +423,7 @@ def _add_margins( observed: bool, margins_name: Hashable = "All", fill_value=None, + dropna: bool = True, ): if not isinstance(margins_name, str): raise ValueError("margins_name argument must be a string") @@ -461,6 +463,7 @@ def _add_margins( kwargs, observed, margins_name, + dropna, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -469,7 +472,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, kwargs, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name, dropna ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -538,6 +541,7 @@ def _generate_marginal_results( kwargs, observed: bool, margins_name: Hashable = "All", + dropna: bool = True, ): margin_keys: list | Index if len(cols) > 0: @@ -551,7 +555,7 @@ def _all_key(key): if len(rows) > 0: margin = ( data[rows + values] - .groupby(rows, observed=observed) + .groupby(rows, observed=observed, dropna=dropna) .agg(aggfunc, **kwargs) ) cat_axis = 1 @@ -567,7 +571,7 @@ def _all_key(key): else: margin = ( data[cols[:1] + values] - .groupby(cols[:1], observed=observed) + .groupby(cols[:1], observed=observed, dropna=dropna) .agg(aggfunc, **kwargs) .T ) @@ -610,7 +614,9 @@ def _all_key(key): if len(cols) > 0: row_margin = ( - data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + data[cols + values] + .groupby(cols, observed=observed, dropna=dropna) + .agg(aggfunc, **kwargs) ) row_margin = row_margin.stack() @@ -633,6 +639,7 @@ def _generate_marginal_results_without_values( kwargs, observed: bool, margins_name: Hashable = "All", + dropna: bool = True, ): margin_keys: list | Index if len(cols) > 0: @@ -645,7 +652,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply( + margin = data.groupby(rows, observed=observed, dropna=dropna)[rows].apply( aggfunc, **kwargs ) all_key = _all_key() @@ -654,7 +661,9 @@ def _all_key(): margin_keys.append(all_key) else: - margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) + margin = data.groupby(level=0, observed=observed, dropna=dropna).apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table @@ -665,7 +674,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply( + row_margin = data.groupby(cols, observed=observed, dropna=dropna)[cols].apply( aggfunc, **kwargs ) else: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2a58815c1cece..2cfc673c4dbdf 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2594,6 +2594,36 @@ def test_pivot_table_values_as_two_params( expected = DataFrame(data=e_data, index=e_index, columns=e_cols) tm.assert_frame_equal(result, expected) + def test_pivot_table_margins_include_nan_groups(self): + # GH#61509 + df = DataFrame( + { + "i": [1, 2, 3], + "g1": ["a", "b", "b"], + "g2": ["x", None, None], + } + ) + + result = df.pivot_table( + index="g1", + columns="g2", + values="i", + aggfunc="count", + dropna=False, + margins=True, + ) + + expected = DataFrame( + { + "x": {"a": 1.0, "b": np.nan, "All": 1.0}, + np.nan: {"a": np.nan, "b": 2.0, "All": 2.0}, + "All": {"a": 1.0, "b": 2.0, "All": 3.0}, + } + ) + expected.index.name = "g1" + expected.columns.name = "g2" + tm.assert_frame_equal(result, expected, check_dtype=False) + class TestPivot: def test_pivot(self): From e426ac7f2168ac30b05e9bd1a2c3f874d7ccdbac Mon Sep 17 00:00:00 2001 From: iabhi4 Date: Sat, 31 May 2025 14:49:28 -0700 Subject: [PATCH 2/2] TST: Fix crosstab margin tests for correct NaN handling with dropna=False (GH#61509) --- pandas/tests/reshape/test_crosstab.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 070c756e8c928..6ea98cdbdfc6e 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -289,7 +289,7 @@ def test_margin_dropna4(self): # GH: 10772: Keep np.nan in result with dropna=False df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = crosstab(df.a, df.b, margins=True, dropna=False) - expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]]) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [0, 1, 1], [2, 4, 6]]) expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) @@ -301,11 +301,11 @@ def test_margin_dropna5(self): ) actual = crosstab(df.a, df.b, margins=True, dropna=False) expected = DataFrame( - [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]] + [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, 4.0], [1, 4, 1, 6.0]] ) expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b") - tm.assert_frame_equal(actual, expected) + tm.assert_frame_equal(actual, expected, check_dtype=False) def test_margin_dropna6(self): # GH: 10772: Keep np.nan in result with dropna=False @@ -326,7 +326,7 @@ def test_margin_dropna6(self): names=["b", "c"], ) expected = DataFrame( - [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]], + [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 1, 7]], columns=m, ) expected.index = Index(["bar", "foo", "All"], name="a") @@ -344,13 +344,13 @@ def test_margin_dropna6(self): ) expected = DataFrame( [ - [1, 0, 1.0], - [1, 0, 1.0], + [1, 0, 1], + [1, 0, 1], [0, 0, np.nan], - [2, 0, 2.0], - [1, 1, 2.0], - [0, 1, np.nan], - [5, 2, 7.0], + [2, 0, 2], + [1, 1, 2], + [0, 1, 1], + [5, 2, 7], ], index=m, )