Skip to content

BUG: columns name retention in groupby methods #41497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits on Jun 17, 2021 (source and target branch names were lost in extraction)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,8 @@ Groupby/resample/rolling
- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`)


Reshaping
^^^^^^^^^
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ def agg_list_like(self) -> FrameOrSeriesUnion:

# multiples
else:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
try:
Expand All @@ -369,7 +370,9 @@ def agg_list_like(self) -> FrameOrSeriesUnion:
raise
else:
results.append(new_res)
keys.append(col)
indices.append(index)

keys = selected_obj.columns.take(indices)

# if we are empty
if not len(results):
Expand Down Expand Up @@ -407,6 +410,7 @@ def agg_dict_like(self) -> FrameOrSeriesUnion:
-------
Result of aggregation.
"""
from pandas import Index
from pandas.core.reshape.concat import concat

obj = self.obj
Expand Down Expand Up @@ -443,8 +447,18 @@ def agg_dict_like(self) -> FrameOrSeriesUnion:
keys_to_use = [k for k in keys if not results[k].empty]
# Have to check, if at least one DataFrame is not empty.
keys_to_use = keys_to_use if keys_to_use != [] else keys
if selected_obj.ndim == 2:
# keys are columns, so we can preserve names
ktu = Index(keys_to_use)
ktu._set_names(selected_obj.columns.names)
# Incompatible types in assignment (expression has type "Index",
# variable has type "List[Hashable]")
keys_to_use = ktu # type: ignore[assignment]

axis = 0 if isinstance(obj, ABCSeries) else 1
result = concat({k: results[k] for k in keys_to_use}, axis=axis)
result = concat(
{k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use
)
elif any(is_ndframe):
# There is a mix of NDFrames and scalars
raise ValueError(
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,13 +1020,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

if isinstance(sobj, Series):
# GH#35246 test_groupby_as_index_select_column_sum_empty_df
result.columns = [sobj.name]
result.columns = self._obj_with_exclusions.columns.copy()
else:
# Retain our column names
result.columns._set_names(
sobj.columns.names, level=list(range(sobj.columns.nlevels))
)
# select everything except for the last level, which is the one
# containing the name of the function(s), see GH#32040
result.columns = result.columns.rename(
[sobj.columns.name] * result.columns.nlevels
).droplevel(-1)
result.columns = result.columns.droplevel(-1)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
Expand Down Expand Up @@ -1665,7 +1667,7 @@ def _wrap_transformed_output(
result.columns = self.obj.columns
else:
columns = Index(key.label for key in output)
columns.name = self.obj.columns.name
columns._set_names(self.obj._get_axis(1 - self.axis).names)
result.columns = columns

result.index = self.obj.index
Expand Down Expand Up @@ -1800,7 +1802,6 @@ def nunique(self, dropna: bool = True) -> DataFrame:
results = self._apply_to_column_groupbys(
lambda sgb: sgb.nunique(dropna), obj=obj
)
results.columns.names = obj.columns.names # TODO: do at higher level?

if not self.as_index:
results.index = Index(range(len(results)))
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,13 @@ def __init__(
clean_keys.append(k)
clean_objs.append(v)
objs = clean_objs
name = getattr(keys, "name", None)
keys = Index(clean_keys, name=name)

if isinstance(keys, MultiIndex):
# TODO: retain levels?
keys = type(keys).from_tuples(clean_keys, names=keys.names)
else:
name = getattr(keys, "name", None)
keys = Index(clean_keys, name=name)

if len(objs) == 0:
raise ValueError("All objects passed were None")
Expand Down
17 changes: 12 additions & 5 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,13 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():
# ohlc expands dimensions, so different test to the above is required.
df = DataFrame(
np.random.randn(1000, 3),
index=pd.date_range("1/1/2012", freq="S", periods=1000),
columns=["A", "B", "C"],
index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"),
columns=Index(["A", "B", "C"], name="alpha"),
)
result = df.resample("3T").agg(
{"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
)
expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti")
expected_columns = MultiIndex.from_tuples(
[
("A", "ohlc", "open"),
Expand All @@ -315,7 +315,8 @@ def test_agg_multiple_functions_same_name_with_ohlc_present():
("A", "ohlc", "close"),
("A", "quantile", "A"),
("A", "quantile", "A"),
]
],
names=["alpha", None, None],
)
non_ohlc_expected_values = np.array(
[df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
Expand Down Expand Up @@ -901,14 +902,20 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
def test_multiindex_custom_func(func):
# GH 31777
data = [[1, 4, 2], [5, 7, 1]]
df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]]))
df = DataFrame(
data,
columns=MultiIndex.from_arrays(
[[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
),
)
result = df.groupby(np.array([0, 1])).agg(func)
expected_dict = {
(1, 3): {0: 1.0, 1: 5.0},
(1, 4): {0: 4.0, 1: 7.0},
(2, 3): {0: 2.0, 1: 1.0},
}
expected = DataFrame(expected_dict)
expected.columns = df.columns
tm.assert_frame_equal(result, expected)


Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,10 +637,11 @@ def test_as_index_select_column():

def test_groupby_as_index_select_column_sum_empty_df():
# GH 35246
df = DataFrame(columns=["A", "B", "C"])
df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
assert type(left) is DataFrame
assert left.to_dict() == {"A": {}, "B": {}}

expected = DataFrame(columns=df.columns[:2], index=range(0))
tm.assert_frame_equal(left, expected)


def test_groupby_as_index_agg(df):
Expand Down Expand Up @@ -1944,8 +1945,8 @@ def test_groupby_agg_ohlc_non_first():
# GH 21716
df = DataFrame(
[[1], [1]],
columns=["foo"],
index=date_range("2018-01-01", periods=2, freq="D"),
columns=Index(["foo"], name="mycols"),
index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
)

expected = DataFrame(
Expand All @@ -1957,9 +1958,10 @@ def test_groupby_agg_ohlc_non_first():
("foo", "ohlc", "high"),
("foo", "ohlc", "low"),
("foo", "ohlc", "close"),
)
),
names=["mycols", None, None],
),
index=date_range("2018-01-01", periods=2, freq="D"),
index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
)

result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])
Expand Down Expand Up @@ -2131,7 +2133,11 @@ def test_groupby_duplicate_index():


@pytest.mark.parametrize(
"idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))]
"idx",
[
Index(["a", "a"], name="foo"),
MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
],
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_dup_labels_output_shape(groupby_func, idx):
Expand Down