Skip to content

Commit 6b94e24

Browse files
authored
BUG: DataFrameGroupBy with numeric_only and empty non-numeric data (#41706)
1 parent 8caf370 commit 6b94e24

File tree

9 files changed

+95
-44
lines changed

9 files changed

+95
-44
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -1061,6 +1061,7 @@ Groupby/resample/rolling
10611061
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
10621062
- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
10631063
- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
1064+
- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
10641065
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
10651066
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
10661067

pandas/_libs/groupby.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out,
516516
val = values[i, j]
517517

518518
# not nan
519-
if val == val:
519+
if not checknull(val):
520520
nobs[lab, j] += 1
521521

522522
if nobs[lab, j] == 1:

pandas/core/groupby/generic.py

+3-14
Original file line number | Diff line number | Diff line change
@@ -67,10 +67,7 @@
6767
validate_func_kwargs,
6868
)
6969
from pandas.core.apply import GroupByApply
70-
from pandas.core.base import (
71-
DataError,
72-
SpecificationError,
73-
)
70+
from pandas.core.base import SpecificationError
7471
import pandas.core.common as com
7572
from pandas.core.construction import create_series_with_explicit_dtype
7673
from pandas.core.frame import DataFrame
@@ -516,16 +513,12 @@ def _cython_transform(
516513

517514
obj = self._selected_obj
518515

519-
is_numeric = is_numeric_dtype(obj.dtype)
520-
if numeric_only and not is_numeric:
521-
raise DataError("No numeric types to aggregate")
522-
523516
try:
524517
result = self.grouper._cython_operation(
525518
"transform", obj._values, how, axis, **kwargs
526519
)
527-
except (NotImplementedError, TypeError):
528-
raise DataError("No numeric types to aggregate")
520+
except NotImplementedError as err:
521+
raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err
529522

530523
return obj._constructor(result, index=self.obj.index, name=obj.name)
531524

@@ -1064,7 +1057,6 @@ def _cython_agg_general(
10641057
# Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
10651058

10661059
data: Manager2D = self._get_data_to_aggregate()
1067-
orig = data
10681060

10691061
if numeric_only:
10701062
data = data.get_numeric_data(copy=False)
@@ -1087,9 +1079,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
10871079
# continue and exclude the block
10881080
new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
10891081

1090-
if not len(new_mgr) and len(orig):
1091-
# If the original Manager was already empty, no need to raise
1092-
raise DataError("No numeric types to aggregate")
10931082
if len(new_mgr) < len(data):
10941083
warnings.warn(
10951084
f"Dropping invalid columns in {type(self).__name__}.{how} "

pandas/core/groupby/groupby.py

+6-14
Original file line number | Diff line number | Diff line change
@@ -1339,20 +1339,12 @@ def _agg_general(
13391339

13401340
with group_selection_context(self):
13411341
# try a cython aggregation if we can
1342-
result = None
1343-
try:
1344-
result = self._cython_agg_general(
1345-
how=alias,
1346-
alt=npfunc,
1347-
numeric_only=numeric_only,
1348-
min_count=min_count,
1349-
)
1350-
except DataError:
1351-
pass
1352-
1353-
# apply a non-cython aggregation
1354-
if result is None:
1355-
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
1342+
result = self._cython_agg_general(
1343+
how=alias,
1344+
alt=npfunc,
1345+
numeric_only=numeric_only,
1346+
min_count=min_count,
1347+
)
13561348
return result.__finalize__(self.obj, method="groupby")
13571349

13581350
def _agg_py_fallback(

pandas/tests/groupby/aggregate/test_aggregate.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column():
128128
columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
129129
)
130130

131-
result = df.groupby(level=1, axis=1).sum()
132-
expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]})
131+
gb = df.groupby(level=1, axis=1)
132+
result = gb.sum(numeric_only=False)
133+
expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
133134

134135
tm.assert_frame_equal(result, expected)
135136

pandas/tests/groupby/aggregate/test_cython.py

+3-4
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818
bdate_range,
1919
)
2020
import pandas._testing as tm
21-
from pandas.core.groupby.groupby import DataError
2221

2322

2423
@pytest.mark.parametrize(
@@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg():
9897

9998
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
10099

101-
msg = "No numeric types to aggregate"
102-
with pytest.raises(DataError, match=msg):
103-
frame[["b"]].groupby(frame["a"]).mean()
100+
result = frame[["b"]].groupby(frame["a"]).mean()
101+
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
102+
tm.assert_frame_equal(result, expected)
104103

105104

106105
def test_cython_agg_nothing_to_agg_with_dates():

pandas/tests/groupby/aggregate/test_other.py

+14-2
Original file line number | Diff line number | Diff line change
@@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays():
433433
],
434434
columns=["category", "arraydata"],
435435
)
436-
result = df.groupby("category").agg(sum)
436+
gb = df.groupby("category")
437437

438438
expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
439439
expected_index = Index([1, 2], name="category")
440440
expected_column = ["arraydata"]
441441
expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
442442

443+
alt = gb.sum(numeric_only=False)
444+
tm.assert_frame_equal(alt, expected)
445+
446+
result = gb.agg("sum", numeric_only=False)
443447
tm.assert_frame_equal(result, expected)
444448

449+
# FIXME: the original version of this test called `gb.agg(sum)`
450+
# and that raises TypeError if `numeric_only=False` is passed
451+
445452

446453
@pytest.mark.parametrize("as_period", [True, False])
447454
def test_agg_tzaware_non_datetime_result(as_period):
@@ -524,9 +531,14 @@ def test_sum_uint64_overflow():
524531
)
525532

526533
expected.index.name = 0
527-
result = df.groupby(0).sum()
534+
result = df.groupby(0).sum(numeric_only=False)
528535
tm.assert_frame_equal(result, expected)
529536

537+
# out column is non-numeric, so with numeric_only=True it is dropped
538+
result2 = df.groupby(0).sum(numeric_only=True)
539+
expected2 = expected[[]]
540+
tm.assert_frame_equal(result2, expected2)
541+
530542

531543
@pytest.mark.parametrize(
532544
"structure, expected",

pandas/tests/groupby/test_groupby.py

+50-2
Original file line number | Diff line number | Diff line change
@@ -638,7 +638,7 @@ def test_as_index_select_column():
638638
def test_groupby_as_index_select_column_sum_empty_df():
639639
# GH 35246
640640
df = DataFrame(columns=["A", "B", "C"])
641-
left = df.groupby(by="A", as_index=False)["B"].sum()
641+
left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
642642
assert type(left) is DataFrame
643643
assert left.to_dict() == {"A": {}, "B": {}}
644644

@@ -1861,6 +1861,49 @@ def get_result():
18611861
get_result()
18621862

18631863
return
1864+
else:
1865+
# ie. DataFrameGroupBy
1866+
if op in ["prod", "sum"]:
1867+
# ops that require more than just ordered-ness
1868+
if method != "apply":
1869+
# FIXME: apply goes through different code path
1870+
if df.dtypes[0].kind == "M":
1871+
# GH#41291
1872+
# datetime64 -> prod and sum are invalid
1873+
result = get_result()
1874+
1875+
# with numeric_only=True, these are dropped, and we get
1876+
# an empty DataFrame back
1877+
expected = df.set_index(keys)[[]]
1878+
tm.assert_equal(result, expected)
1879+
return
1880+
1881+
elif isinstance(values, Categorical):
1882+
# GH#41291
1883+
# Categorical doesn't implement sum or prod
1884+
result = get_result()
1885+
1886+
# with numeric_only=True, these are dropped, and we get
1887+
# an empty DataFrame back
1888+
expected = df.set_index(keys)[[]]
1889+
if len(keys) != 1 and op == "prod":
1890+
# TODO: why just prod and not sum?
1891+
# Categorical is special without 'observed=True'
1892+
lev = Categorical([0], dtype=values.dtype)
1893+
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
1894+
expected = DataFrame([], columns=[], index=mi)
1895+
1896+
tm.assert_equal(result, expected)
1897+
return
1898+
1899+
elif df.dtypes[0] == object:
1900+
# FIXME: the test is actually wrong here, xref #41341
1901+
result = get_result()
1902+
# In this case we have list-of-list, will raise TypeError,
1903+
# and subsequently be dropped as nuisance columns
1904+
expected = df.set_index(keys)[[]]
1905+
tm.assert_equal(result, expected)
1906+
return
18641907

18651908
result = get_result()
18661909
expected = df.set_index(keys)[columns]
@@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop():
23132356

23142357
def test_groupby_empty_multi_column():
23152358
# GH 15106
2316-
result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum()
2359+
df = DataFrame(data=[], columns=["A", "B", "C"])
2360+
gb = df.groupby(["A", "B"])
2361+
result = gb.sum(numeric_only=False)
23172362
expected = DataFrame(
23182363
[], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"])
23192364
)
23202365
tm.assert_frame_equal(result, expected)
23212366

2367+
result = gb.sum(numeric_only=True)
2368+
tm.assert_frame_equal(result, expected[[]])
2369+
23222370

23232371
def test_groupby_filtered_df_std():
23242372
# GH 16174

pandas/tests/groupby/transform/test_transform.py

+14-5
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@
2424
DataFrameGroupBy,
2525
SeriesGroupBy,
2626
)
27-
from pandas.core.groupby.groupby import DataError
2827

2928

3029
def assert_fp_equal(a, b):
@@ -741,11 +740,21 @@ def test_cython_transform_frame(op, args, targop):
741740
tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))
742741
# individual columns
743742
for c in df:
744-
if c not in ["float", "int", "float_missing"] and op != "shift":
745-
msg = "No numeric types to aggregate"
746-
with pytest.raises(DataError, match=msg):
743+
if (
744+
c not in ["float", "int", "float_missing"]
745+
and op != "shift"
746+
and not (c == "timedelta" and op == "cumsum")
747+
):
748+
msg = "|".join(
749+
[
750+
"does not support .* operations",
751+
".* is not supported for object dtype",
752+
"is not implemented for this dtype",
753+
]
754+
)
755+
with pytest.raises(TypeError, match=msg):
747756
gb[c].transform(op)
748-
with pytest.raises(DataError, match=msg):
757+
with pytest.raises(TypeError, match=msg):
749758
getattr(gb[c], op)()
750759
else:
751760
expected = gb[c].apply(targop)

0 commit comments

Comments
 (0)