Skip to content

Commit 3408a61

Browse files
authored
BUG: Groupby ops on empty objects loses index, columns, dtypes (#39940)
1 parent 212323f commit 3408a61

File tree

9 files changed

+94
-38
lines changed

9 files changed

+94
-38
lines changed

doc/source/whatsnew/v1.3.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ Groupby/resample/rolling
440440
- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
441441
- Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`)
442442
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
443+
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` where grouping an empty ``Series`` or ``DataFrame`` would lose the index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew``, or when using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
443444
-
444445

445446
Reshaping
@@ -455,6 +456,7 @@ Reshaping
455456
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
456457
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
457458
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
459+
- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)
458460

459461
Sparse
460462
^^^^^^

pandas/core/groupby/generic.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -450,13 +450,19 @@ def _wrap_transformed_output(
450450
return result
451451

452452
def _wrap_applied_output(
453-
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
453+
self,
454+
data: Series,
455+
keys: Index,
456+
values: Optional[List[Any]],
457+
not_indexed_same: bool = False,
454458
) -> FrameOrSeriesUnion:
455459
"""
456460
Wrap the output of SeriesGroupBy.apply into the expected result.
457461
458462
Parameters
459463
----------
464+
data : Series
465+
Input data for groupby operation.
460466
keys : Index
461467
Keys of groups that Series was grouped by.
462468
values : Optional[List[Any]]
@@ -471,7 +477,10 @@ def _wrap_applied_output(
471477
if len(keys) == 0:
472478
# GH #6265
473479
return self.obj._constructor(
474-
[], name=self._selection_name, index=keys, dtype=np.float64
480+
[],
481+
name=self._selection_name,
482+
index=self.grouper.result_index,
483+
dtype=data.dtype,
475484
)
476485
assert values is not None
477486

@@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
12291238

12301239
return self.obj._constructor(result, columns=result_columns)
12311240

1232-
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
1241+
def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
12331242
if len(keys) == 0:
1234-
return self.obj._constructor(index=keys)
1243+
result = self.obj._constructor(
1244+
index=self.grouper.result_index, columns=data.columns
1245+
)
1246+
result = result.astype(data.dtypes.to_dict(), copy=False)
1247+
return result
12351248

12361249
# GH12824
12371250
first_not_none = next(com.not_none(*values), None)

pandas/core/groupby/groupby.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -981,7 +981,7 @@ def _python_apply_general(
981981
keys, values, mutated = self.grouper.apply(f, data, self.axis)
982982

983983
return self._wrap_applied_output(
984-
keys, values, not_indexed_same=mutated or self.mutated
984+
data, keys, values, not_indexed_same=mutated or self.mutated
985985
)
986986

987987
def _iterate_slices(self) -> Iterable[Series]:
@@ -1058,7 +1058,7 @@ def _wrap_aggregated_output(
10581058
def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
10591059
raise AbstractMethodError(self)
10601060

1061-
def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
1061+
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
10621062
raise AbstractMethodError(self)
10631063

10641064
@final

pandas/core/reshape/pivot.py

+3-16
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,8 @@ def __internal_pivot_table(
236236
)
237237

238238
# discard the top level
239-
if (
240-
values_passed
241-
and not values_multi
242-
and not table.empty
243-
and (table.columns.nlevels > 1)
244-
):
245-
table = table[values[0]]
246-
239+
if values_passed and not values_multi and table.columns.nlevels > 1:
240+
table = table.droplevel(0, axis=1)
247241
if len(index) == 0 and len(columns) > 0:
248242
table = table.T
249243

@@ -650,7 +644,6 @@ def crosstab(
650644
**dict(zip(unique_colnames, columns)),
651645
}
652646
df = DataFrame(data, index=common_idx)
653-
original_df_cols = df.columns
654647

655648
if values is None:
656649
df["__dummy__"] = 0
@@ -660,7 +653,7 @@ def crosstab(
660653
kwargs = {"aggfunc": aggfunc}
661654

662655
table = df.pivot_table(
663-
["__dummy__"],
656+
"__dummy__",
664657
index=unique_rownames,
665658
columns=unique_colnames,
666659
margins=margins,
@@ -669,12 +662,6 @@ def crosstab(
669662
**kwargs,
670663
)
671664

672-
# GH18321, after pivoting, an extra top level of column index of `__dummy__` is
673-
# created, and this extra level should not be included in the further steps
674-
if not table.empty:
675-
cols_diff = df.columns.difference(original_df_cols)[0]
676-
table = table[cols_diff]
677-
678665
# Post-process
679666
if normalize is not False:
680667
table = _normalize(

pandas/tests/groupby/aggregate/test_aggregate.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe):
147147
# DataFrame
148148
grouped = tsframe.groupby(tsframe["A"] * np.nan)
149149
exp_df = DataFrame(
150-
columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64)
150+
columns=tsframe.columns,
151+
dtype=float,
152+
index=Index([], name="A", dtype=np.float64),
151153
)
152-
tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
153-
tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
154-
tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False)
154+
tm.assert_frame_equal(grouped.sum(), exp_df)
155+
tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
156+
tm.assert_frame_equal(grouped.apply(np.sum), exp_df)
155157

156158

157159
def test_agg_grouping_is_list_tuple(ts):

pandas/tests/groupby/test_groupby.py

+44-9
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import pandas as pd
1212
from pandas import (
13+
Categorical,
1314
DataFrame,
1415
Grouper,
1516
Index,
@@ -18,6 +19,7 @@
1819
Timestamp,
1920
date_range,
2021
read_csv,
22+
to_datetime,
2123
)
2224
import pandas._testing as tm
2325
from pandas.core.base import SpecificationError
@@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error():
17161718
)
17171719

17181720

1719-
def test_empty_dataframe_groupby():
1720-
# GH8093
1721-
df = DataFrame(columns=["A", "B", "C"])
1722-
1723-
result = df.groupby("A").sum()
1724-
expected = DataFrame(columns=["B", "C"], dtype=np.float64)
1725-
expected.index.name = "A"
1726-
1727-
tm.assert_frame_equal(result, expected)
1721+
@pytest.mark.parametrize("columns", ["C", ["C"]])
1722+
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
1723+
@pytest.mark.parametrize(
1724+
"values",
1725+
[
1726+
[True],
1727+
[0],
1728+
[0.0],
1729+
["a"],
1730+
[Categorical([0])],
1731+
[to_datetime(0)],
1732+
[date_range(0, 1, 1, tz="US/Eastern")],
1733+
[pd.array([0], dtype="Int64")],
1734+
],
1735+
)
1736+
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
1737+
@pytest.mark.parametrize(
1738+
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
1739+
)
1740+
def test_empty_groupby(columns, keys, values, method, op):
1741+
# GH8093 & GH26411
1742+
1743+
override_dtype = None
1744+
if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply":
1745+
# sum/product of bools is an integer
1746+
override_dtype = "int64"
1747+
1748+
df = DataFrame([3 * values], columns=list("ABC"))
1749+
df = df.iloc[:0]
1750+
1751+
gb = df.groupby(keys)[columns]
1752+
if method == "attr":
1753+
result = getattr(gb, op)()
1754+
else:
1755+
result = getattr(gb, method)(op)
1756+
1757+
expected = df.set_index(keys)[columns]
1758+
if override_dtype is not None:
1759+
expected = expected.astype(override_dtype)
1760+
if len(keys) == 1:
1761+
expected.index.name = keys[0]
1762+
tm.assert_equal(result, expected)
17281763

17291764

17301765
def test_tuple_as_grouping():

pandas/tests/resample/test_resampler_grouper.py

+13
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas import (
1111
DataFrame,
1212
Series,
13+
TimedeltaIndex,
1314
Timestamp,
1415
)
1516
import pandas._testing as tm
@@ -398,6 +399,18 @@ def test_resample_groupby_agg():
398399
tm.assert_frame_equal(result, expected)
399400

400401

402+
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
403+
def test_empty(keys):
404+
# GH 26411
405+
df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
406+
result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
407+
expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False)
408+
if len(keys) == 1:
409+
expected.index.name = keys[0]
410+
411+
tm.assert_frame_equal(result, expected)
412+
413+
401414
@pytest.mark.parametrize("consolidate", [True, False])
402415
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
403416
# https://github.com/pandas-dev/pandas/issues/39329

pandas/tests/reshape/test_crosstab.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self):
240240
s2 = Series([4, 5, 6], index=[4, 5, 6])
241241

242242
actual = crosstab(s1, s2)
243-
expected = DataFrame()
243+
expected = DataFrame(
244+
index=Index([], dtype="int64", name="row_0"),
245+
columns=Index([], dtype="int64", name="col_0"),
246+
)
244247

245248
tm.assert_frame_equal(actual, expected)
246249

pandas/tests/reshape/test_pivot.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
20402040
tm.assert_frame_equal(result, expected)
20412041

20422042
def test_pivot_table_empty_aggfunc(self):
2043-
# GH 9186
2043+
# GH 9186 & GH 13483
20442044
df = DataFrame(
20452045
{
20462046
"A": [2, 2, 3, 3, 2],
@@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self):
20502050
}
20512051
)
20522052
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
2053-
expected = DataFrame()
2053+
expected = DataFrame(index=Index([], dtype="int64", name="A"))
2054+
expected.columns.name = "D"
20542055
tm.assert_frame_equal(result, expected)
20552056

20562057
def test_pivot_table_no_column_raises(self):

0 commit comments

Comments
 (0)