From 3901f956fa738e9edc6d4c58a7dc5210df80ec6e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 15 May 2021 16:39:11 -0700 Subject: [PATCH 1/3] BUG: columns name retention in groupby methods --- pandas/core/apply.py | 15 +++++++++++-- pandas/core/groupby/generic.py | 13 ++++++----- pandas/core/reshape/concat.py | 9 ++++++-- .../tests/groupby/aggregate/test_aggregate.py | 17 +++++++++----- pandas/tests/groupby/test_groupby.py | 22 ++++++++++++------- 5 files changed, 53 insertions(+), 23 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d0c6a1a841edb..da9738a31ad47 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -348,6 +348,7 @@ def agg_list_like(self) -> FrameOrSeriesUnion: # multiples else: + indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: @@ -369,7 +370,9 @@ def agg_list_like(self) -> FrameOrSeriesUnion: raise else: results.append(new_res) - keys.append(col) + indices.append(index) + + keys = selected_obj.columns.take(indices) # if we are empty if not len(results): @@ -399,6 +402,7 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: ------- Result of aggregation. """ + from pandas import Index from pandas.core.reshape.concat import concat obj = self.obj @@ -435,8 +439,15 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys + if selected_obj.ndim == 2: + # keys are columns, so we can preserve names + keys_to_use = Index(keys_to_use) + keys_to_use._set_names(selected_obj.columns.names) + axis = 0 if isinstance(obj, ABCSeries) else 1 - result = concat({k: results[k] for k in keys_to_use}, axis=axis) + result = concat( + {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use + ) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5c28a15532174..3d7f98d8bfc98 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1031,13 +1031,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if isinstance(sobj, Series): # GH#35246 test_groupby_as_index_select_column_sum_empty_df - result.columns = [sobj.name] + result.columns = self._obj_with_exclusions.columns.copy() else: + # Retain our column names + result.columns._set_names( + sobj.columns.names, level=list(range(sobj.columns.nlevels)) + ) # select everything except for the last level, which is the one # containing the name of the function(s), see GH#32040 - result.columns = result.columns.rename( - [sobj.columns.name] * result.columns.nlevels - ).droplevel(-1) + result.columns = result.columns.droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1655,7 +1657,7 @@ def _wrap_transformed_output( result.columns = self.obj.columns else: columns = Index(key.label for key in output) - columns.name = self.obj.columns.name + columns._set_names(self.obj._get_axis(1 - self.axis).names) result.columns = columns result.index = self.obj.index @@ -1790,7 +1792,6 @@ def nunique(self, dropna: bool = True) -> DataFrame: results = self._apply_to_column_groupbys( lambda sgb: sgb.nunique(dropna), obj=obj ) - results.columns.names = obj.columns.names # TODO: do at higher level? if not self.as_index: results.index = Index(range(len(results))) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index b3b453ea6355a..f9b652cdc48c2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -358,8 +358,13 @@ def __init__( clean_keys.append(k) clean_objs.append(v) objs = clean_objs - name = getattr(keys, "name", None) - keys = Index(clean_keys, name=name) + + if isinstance(keys, MultiIndex): + # TODO: retain levels? + keys = type(keys).from_tuples(clean_keys, names=keys.names) + else: + name = getattr(keys, "name", None) + keys = Index(clean_keys, name=name) if len(objs) == 0: raise ValueError("All objects passed were None") diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b601ba92886d9..95afe4fcf26f4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -298,13 +298,13 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): # ohlc expands dimensions, so different test to the above is required. df = DataFrame( np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="S", periods=1000), - columns=["A", "B", "C"], + index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"), + columns=Index(["A", "B", "C"], name="alpha"), ) result = df.resample("3T").agg( {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti") expected_columns = MultiIndex.from_tuples( [ ("A", "ohlc", "open"), @@ -313,7 +313,8 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): ("A", "ohlc", "close"), ("A", "quantile", "A"), ("A", "quantile", "A"), - ] + ], + names=["alpha", None, None], ) non_ohlc_expected_values = np.array( [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] @@ -897,7 +898,12 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): def test_multiindex_custom_func(func): # GH 31777 data = [[1, 4, 2], [5, 7, 1]] - df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) + df = DataFrame( + data, + columns=MultiIndex.from_arrays( + [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"] + ), + ) result = df.groupby(np.array([0, 1])).agg(func) expected_dict = { (1, 3): {0: 1.0, 1: 5.0}, @@ -905,6 +911,7 @@ def test_multiindex_custom_func(func): (2, 3): {0: 2.0, 1: 1.0}, } expected = DataFrame(expected_dict) + expected.columns = df.columns tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 83aeb29ec53df..4aaa127949711 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -637,10 +637,11 @@ def test_as_index_select_column(): def test_groupby_as_index_select_column_sum_empty_df(): # GH 35246 - df = DataFrame(columns=["A", "B", "C"]) + df = DataFrame(columns=Index(["A", "B", "C"], name="alpha")) left = df.groupby(by="A", as_index=False)["B"].sum() - assert type(left) is DataFrame - assert left.to_dict() == {"A": {}, "B": {}} + + expected = DataFrame(columns=df.columns[:2], index=range(0)) + tm.assert_frame_equal(left, expected) def test_groupby_as_index_agg(df): @@ -1865,8 +1866,8 @@ def test_groupby_agg_ohlc_non_first(): # GH 21716 df = DataFrame( [[1], [1]], - columns=["foo"], - index=date_range("2018-01-01", periods=2, freq="D"), + columns=Index(["foo"], name="mycols"), + index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) expected = DataFrame( @@ -1878,9 +1879,10 @@ def test_groupby_agg_ohlc_non_first(): ("foo", "ohlc", "high"), ("foo", "ohlc", "low"), ("foo", "ohlc", "close"), - ) + ), + names=["mycols", None, None], ), - index=date_range("2018-01-01", periods=2, freq="D"), + index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) @@ -2052,7 +2054,11 @@ def test_groupby_duplicate_index(): @pytest.mark.parametrize( - "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))] + "idx", + [ + Index(["a", "a"], name="foo"), + MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]), + ], ) @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_dup_labels_output_shape(groupby_func, idx): From 23de535615448205249fb905c44fa3fa3bb66e7c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 16 May 2021 17:14:52 -0700 Subject: [PATCH 2/3] mypy fixup --- pandas/core/apply.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index da9738a31ad47..6425d8f75b75f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -441,8 +441,11 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: keys_to_use = keys_to_use if keys_to_use != [] else keys if selected_obj.ndim == 2: # keys are columns, so we can preserve names - keys_to_use = Index(keys_to_use) - keys_to_use._set_names(selected_obj.columns.names) + ktu = Index(keys_to_use) + ktu._set_names(selected_obj.columns.names) + # Incompatible types in assignment (expression has type "Index", + # variable has type "List[Hashable]") + keys_to_use = ktu # type: ignore[assignment] axis = 0 if isinstance(obj, ABCSeries) else 1 result = concat( From bab04e138dd466402a98e5e165f2c55150d73603 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 May 2021 23:22:57 -0700 Subject: [PATCH 3/3] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index e06085c4c5c26..9d0538f9001ec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -233,7 +233,7 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) -- Improved error message in ``corr` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) +- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) .. --------------------------------------------------------------------------- @@ -1058,6 +1058,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) +- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`) Reshaping ^^^^^^^^^