diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 6c2fef3808566..6ddf2c43119d3 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -1140,6 +1140,8 @@ Groupby/resample/rolling
 - Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
 - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
+- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`)
+
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 9e787555f2b1f..49e06825617bd 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -348,6 +348,7 @@ def agg_list_like(self) -> FrameOrSeriesUnion:
 
         # multiples
         else:
+            indices = []
             for index, col in enumerate(selected_obj):
                 colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
                 try:
@@ -369,7 +370,9 @@ def agg_list_like(self) -> FrameOrSeriesUnion:
                         raise
                 else:
                     results.append(new_res)
-                    keys.append(col)
+                    indices.append(index)
+
+            keys = selected_obj.columns.take(indices)
 
         # if we are empty
         if not len(results):
@@ -407,6 +410,7 @@ def agg_dict_like(self) -> FrameOrSeriesUnion:
         -------
         Result of aggregation.
         """
+        from pandas import Index
         from pandas.core.reshape.concat import concat
 
         obj = self.obj
@@ -443,8 +447,18 @@ def agg_dict_like(self) -> FrameOrSeriesUnion:
             keys_to_use = [k for k in keys if not results[k].empty]
             # Have to check, if at least one DataFrame is not empty.
             keys_to_use = keys_to_use if keys_to_use != [] else keys
+            if selected_obj.ndim == 2:
+                # keys are columns, so we can preserve names
+                ktu = Index(keys_to_use)
+                ktu._set_names(selected_obj.columns.names)
+                # Incompatible types in assignment (expression has type "Index",
+                # variable has type "List[Hashable]")
+                keys_to_use = ktu  # type: ignore[assignment]
+
             axis = 0 if isinstance(obj, ABCSeries) else 1
-            result = concat({k: results[k] for k in keys_to_use}, axis=axis)
+            result = concat(
+                {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
+            )
         elif any(is_ndframe):
             # There is a mix of NDFrames and scalars
             raise ValueError(
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 69f992f840c7c..5cb0eac5d9074 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1020,13 +1020,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
                     if isinstance(sobj, Series):
                         # GH#35246 test_groupby_as_index_select_column_sum_empty_df
-                        result.columns = [sobj.name]
+                        result.columns = self._obj_with_exclusions.columns.copy()
                     else:
+                        # Retain our column names
+                        result.columns._set_names(
+                            sobj.columns.names, level=list(range(sobj.columns.nlevels))
+                        )
                         # select everything except for the last level, which is the one
                         # containing the name of the function(s), see GH#32040
-                        result.columns = result.columns.rename(
-                            [sobj.columns.name] * result.columns.nlevels
-                        ).droplevel(-1)
+                        result.columns = result.columns.droplevel(-1)
 
         if not self.as_index:
             self._insert_inaxis_grouper_inplace(result)
@@ -1665,7 +1667,7 @@ def _wrap_transformed_output(
             result.columns = self.obj.columns
         else:
             columns = Index(key.label for key in output)
-            columns.name = self.obj.columns.name
+            columns._set_names(self.obj._get_axis(1 - self.axis).names)
             result.columns = columns
 
         result.index = self.obj.index
@@ -1800,7 +1802,6 @@ def nunique(self, dropna: bool = True) -> DataFrame:
         results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )
-        results.columns.names = obj.columns.names  # TODO: do at higher level?
 
         if not self.as_index:
             results.index = Index(range(len(results)))
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index ea34bc75b4e31..b49622f4ac36a 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -362,8 +362,13 @@ def __init__(
                     clean_keys.append(k)
                     clean_objs.append(v)
             objs = clean_objs
-            name = getattr(keys, "name", None)
-            keys = Index(clean_keys, name=name)
+
+            if isinstance(keys, MultiIndex):
+                # TODO: retain levels?
+                keys = type(keys).from_tuples(clean_keys, names=keys.names)
+            else:
+                name = getattr(keys, "name", None)
+                keys = Index(clean_keys, name=name)
 
         if len(objs) == 0:
             raise ValueError("All objects passed were None")
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 1d4ff25c518ee..393dc0813661f 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -300,13 +300,13 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():
     # ohlc expands dimensions, so different test to the above is required.
     df = DataFrame(
         np.random.randn(1000, 3),
-        index=pd.date_range("1/1/2012", freq="S", periods=1000),
-        columns=["A", "B", "C"],
+        index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"),
+        columns=Index(["A", "B", "C"], name="alpha"),
     )
     result = df.resample("3T").agg(
         {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
     )
-    expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
+    expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti")
     expected_columns = MultiIndex.from_tuples(
         [
             ("A", "ohlc", "open"),
@@ -315,7 +315,8 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():
             ("A", "ohlc", "close"),
             ("A", "quantile", "A"),
             ("A", "quantile", "A"),
-        ]
+        ],
+        names=["alpha", None, None],
     )
     non_ohlc_expected_values = np.array(
         [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
@@ -901,7 +902,12 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
 def test_multiindex_custom_func(func):
     # GH 31777
     data = [[1, 4, 2], [5, 7, 1]]
-    df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]]))
+    df = DataFrame(
+        data,
+        columns=MultiIndex.from_arrays(
+            [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
+        ),
+    )
     result = df.groupby(np.array([0, 1])).agg(func)
     expected_dict = {
         (1, 3): {0: 1.0, 1: 5.0},
@@ -909,6 +915,7 @@ def test_multiindex_custom_func(func):
         (2, 3): {0: 2.0, 1: 1.0},
     }
     expected = DataFrame(expected_dict)
+    expected.columns = df.columns
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 382a940d2a92c..89944e2a745e4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -637,10 +637,11 @@ def test_as_index_select_column():
 
 def test_groupby_as_index_select_column_sum_empty_df():
     # GH 35246
-    df = DataFrame(columns=["A", "B", "C"])
+    df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
     left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
-    assert type(left) is DataFrame
-    assert left.to_dict() == {"A": {}, "B": {}}
+
+    expected = DataFrame(columns=df.columns[:2], index=range(0))
+    tm.assert_frame_equal(left, expected)
 
 
 def test_groupby_as_index_agg(df):
@@ -1944,8 +1945,8 @@ def test_groupby_agg_ohlc_non_first():
     # GH 21716
     df = DataFrame(
         [[1], [1]],
-        columns=["foo"],
-        index=date_range("2018-01-01", periods=2, freq="D"),
+        columns=Index(["foo"], name="mycols"),
+        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
     )
 
     expected = DataFrame(
@@ -1957,9 +1958,10 @@ def test_groupby_agg_ohlc_non_first():
                 ("foo", "ohlc", "high"),
                 ("foo", "ohlc", "low"),
                 ("foo", "ohlc", "close"),
-            )
+            ),
+            names=["mycols", None, None],
         ),
-        index=date_range("2018-01-01", periods=2, freq="D"),
+        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
     )
 
     result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])
@@ -2131,7 +2133,11 @@ def test_groupby_duplicate_index():
 
 
 @pytest.mark.parametrize(
-    "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))]
+    "idx",
+    [
+        Index(["a", "a"], name="foo"),
+        MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
+    ],
 )
 @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
 def test_dup_labels_output_shape(groupby_func, idx):