Skip to content

TST: fix groupby-empty xfails #44092

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Oct 20, 2021
2 changes: 2 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,6 +1536,7 @@ def func(df):
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmax"
return self._python_apply_general(func, self._obj_with_exclusions)

@Appender(DataFrame.idxmin.__doc__)
Expand All @@ -1557,6 +1558,7 @@ def func(df):
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmin"
return self._python_apply_general(func, self._obj_with_exclusions)

boxplot = boxplot_frame_groupby
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,18 @@ def apply(
mutated = True
result_values.append(res)

# getattr pattern for __name__ is needed for functools.partial objects
if len(group_keys) == 0 and getattr(f, "__name__", None) not in [
"idxmin",
"idxmax",
"nanargmin",
"nanargmax",
]:
# If group_keys is empty, then no function calls have been made,
# so we will not have raised even if this is an invalid dtype.
# So do one dummy call here to raise appropriate TypeError.
f(data.iloc[:0])

return result_values, mutated

@cache_readonly
Expand Down
94 changes: 81 additions & 13 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
)
from pandas.core.base import SpecificationError
import pandas.core.common as com

Expand Down Expand Up @@ -1822,17 +1827,23 @@ def test_pivot_table_values_key_error():
)
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_empty_groupby(columns, keys, values, method, op, request):
def test_empty_groupby(columns, keys, values, method, op, request, using_array_manager):
# GH8093 & GH26411
override_dtype = None

if (
isinstance(values, Categorical)
and not isinstance(columns, list)
and op in ["sum", "prod"]
and op in ["sum", "prod", "skew", "mad"]
):
# handled below GH#41291
pass

if using_array_manager and op == "mad":
right_msg = "Cannot interpret 'CategoricalDtype.* as a data type"
msg = "Regex pattern \"'Categorical' does not implement.*" + right_msg
mark = pytest.mark.xfail(raises=AssertionError, match=msg)
request.node.add_marker(mark)

elif (
isinstance(values, Categorical)
and len(keys) == 1
Expand All @@ -1851,11 +1862,7 @@ def test_empty_groupby(columns, keys, values, method, op, request):
raises=TypeError, match="'Categorical' does not implement"
)
request.node.add_marker(mark)
elif (
isinstance(values, Categorical)
and len(keys) == 1
and op in ["mad", "min", "max", "sum", "prod", "skew"]
):
elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
Expand All @@ -1869,7 +1876,30 @@ def test_empty_groupby(columns, keys, values, method, op, request):
raises=AssertionError, match="(DataFrame|Series) are different"
)
request.node.add_marker(mark)
elif isinstance(values, pd.core.arrays.BooleanArray) and op in ["sum", "prod"]:
elif (
isinstance(values, (IntegerArray, FloatingArray))
and op == "mad"
and isinstance(columns, list)
):
mark = pytest.mark.xfail(
raises=TypeError, match="can only perform ops with numeric values"
)
request.node.add_marker(mark)

elif (
op == "mad"
and not isinstance(columns, list)
and isinstance(values, pd.DatetimeIndex)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a giant test; it is probably worth splitting it up and fixing the special cases (in follow-ups, of course).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, once the xfails are fixed it will be a lot more reasonably sized.

and values.tz is not None
and using_array_manager
):
mark = pytest.mark.xfail(
raises=TypeError,
match=r"Cannot interpret 'datetime64\[ns, US/Eastern\]' as a data type",
)
request.node.add_marker(mark)

elif isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"

Expand All @@ -1895,19 +1925,29 @@ def get_result():

if columns == "C":
# i.e. SeriesGroupBy
if op in ["prod", "sum"]:
if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
msg = "datetime64 type does not support"
if op == "skew":
msg = "'DatetimeArray' does not implement reduction 'skew'"
else:
msg = "datetime64 type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
elif isinstance(values, Categorical):
if op in ["prod", "sum", "skew", "mad"]:
if isinstance(values, Categorical):
# GH#41291
msg = "category type does not support"
if op == "mad":
# mad calls mean, which Categorical doesn't implement
msg = "'Categorical' does not implement reduction 'mean'"
elif op == "skew":
msg = f"'Categorical' does not implement reduction '{op}'"
else:
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

Expand Down Expand Up @@ -1954,6 +1994,34 @@ def get_result():
tm.assert_equal(result, expected)
return

if (
op in ["mad", "min", "max", "skew"]
and isinstance(values, Categorical)
and len(keys) == 1
):
# Categorical doesn't implement, so with numeric_only=True
# these are dropped and we get an empty DataFrame back
result = get_result()
expected = df.set_index(keys)[[]]

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
if len(keys) != 1:
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=keys)
expected = DataFrame([], columns=[], index=mi)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
ci = Index(lev, name=keys[0])
expected = DataFrame([], columns=[], index=ci)
# expected = df.set_index(keys)[columns]

tm.assert_equal(result, expected)
return

result = get_result()
expected = df.set_index(keys)[columns]
if override_dtype is not None:
Expand Down