Skip to content

CLN: aggregation.transform #36478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ Reshaping
- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
- Bug in :meth:`DataFrame.agg` with ``func={'name':<FUNC>}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
-

Sparse
Expand All @@ -368,7 +369,6 @@ Other
^^^^^
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)

.. ---------------------------------------------------------------------------
Expand Down
123 changes: 79 additions & 44 deletions pandas/core/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,17 @@
Sequence,
Tuple,
Union,
cast,
)

from pandas._typing import AggFuncType, Axis, FrameOrSeries, Label
from pandas._typing import (
AggFuncType,
AggFuncTypeBase,
Axis,
FrameOrSeries,
FrameOrSeriesUnion,
Label,
)

from pandas.core.dtypes.common import is_dict_like, is_list_like
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
Expand Down Expand Up @@ -388,7 +396,7 @@ def validate_func_kwargs(

def transform(
obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs,
) -> FrameOrSeries:
) -> FrameOrSeriesUnion:
"""
Transform a DataFrame or Series

Expand All @@ -415,67 +423,94 @@ def transform(
ValueError
If the transform function fails or does not transform.
"""
from pandas.core.reshape.concat import concat

is_series = obj.ndim == 1

if obj._get_axis_number(axis) == 1:
assert not is_series
return transform(obj.T, func, 0, *args, **kwargs).T

if isinstance(func, list):
if is_list_like(func) and not is_dict_like(func):
func = cast(List[AggFuncTypeBase], func)
# Convert func equivalent dict
if is_series:
func = {com.get_callable_name(v) or v: v for v in func}
else:
func = {col: func for col in obj}

if isinstance(func, dict):
if not is_series:
cols = sorted(set(func.keys()) - set(obj.columns))
if len(cols) > 0:
raise SpecificationError(f"Column(s) {cols} do not exist")

if any(isinstance(v, dict) for v in func.values()):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")

results = {}
for name, how in func.items():
colg = obj._gotitem(name, ndim=1)
try:
results[name] = transform(colg, how, 0, *args, **kwargs)
except Exception as e:
if str(e) == "Function did not transform":
raise e

# combine results
if len(results) == 0:
if is_dict_like(func):
func = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], func)
result = transform_dict_like(obj, func, *args, **kwargs)
else:
func = cast(AggFuncTypeBase, func)
try:
result = transform_str_or_callable(obj, func, *args, **kwargs)
except Exception:
raise ValueError("Transform function failed")
return concat(results, axis=1)

# func is either str or callable
try:
if isinstance(func, str):
result = obj._try_aggregate_string_function(func, *args, **kwargs)
else:
f = obj._get_cython_func(func)
if f and not args and not kwargs:
result = getattr(obj, f)()
else:
try:
result = obj.apply(func, args=args, **kwargs)
except Exception:
result = func(obj, *args, **kwargs)
except Exception:
raise ValueError("Transform function failed")

# Functions that transform may return empty Series/DataFrame
# when the dtype is not appropriate
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
raise ValueError("Transform function failed")

if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
obj.index
):
raise ValueError("Function did not transform")

return result


def transform_dict_like(
obj: FrameOrSeries,
func: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]],
*args,
**kwargs,
):
"""
Compute transform in the case of a dict-like func
"""
from pandas.core.reshape.concat import concat

if obj.ndim != 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe extract method _is_series(obj). I noticed that this dimensions check is run in another function, but there is a local variable is_series.

# Check for missing columns on a frame
cols = sorted(set(func.keys()) - set(obj.columns))
if len(cols) > 0:
raise SpecificationError(f"Column(s) {cols} do not exist")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better rename cols to missing_cols. Even better if extract function _check_missing_columns(obj, func).


if any(isinstance(v, dict) for v in func.values()):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")

results: Dict[Label, FrameOrSeriesUnion] = {}
for name, how in func.items():
colg = obj._gotitem(name, ndim=1)
try:
results[name] = transform(colg, how, 0, *args, **kwargs)
except Exception as e:
if str(e) == "Function did not transform":
raise e

# combine results
if len(results) == 0:
raise ValueError("Transform function failed")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this tested? is this new?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think you should just let concat work and bubble up if needed

Copy link
Member Author

@rhshadrach rhshadrach Sep 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not new, this is hit in tests.frame.apply.test_frame_transform.test_transform_bad_dtype. Removing these two lines results in the ValueError "No objects to concatenate" rather than "Transform function failed". I'm okay with either, slight preference for the current behavior (which is the same as this PR). Let me know if you prefer "No objects to concatenate" and can change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm yeah I think we should change this, maybe to something a bit more userfriendly like
the input transform did not contain any transform functions

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you thinking of the case where an aggregator is used? If we detect the function not transforming, we'll raise here:

try:
    results[name] = transform(colg, how, 0, *args, **kwargs)
except Exception as e:
    if str(e) == "Function did not transform":
        raise e

before we get to this point. The raising of the code highlighted above should only occur if all the key/value pairs of the dictionary entirely failed - e.g. trying to take a product of strings or lambda x: raise ValueError.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, no this is the case of an empty list or dict right? (of functions that are supplied for aggregation), IOW its user visible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see! I did not realize this line would be the one that raised for an empty list or dict. I've added that check before getting to this section of the code with the message "no results" (this is the message from 1.1 - but can change), along with tests for this.

return concat(results, axis=1)


def transform_str_or_callable(
obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs
) -> FrameOrSeriesUnion:
"""
Compute transform in the case of a string or callable func
"""
if isinstance(func, str):
return obj._try_aggregate_string_function(func, *args, **kwargs)

if not args and not kwargs:
f = obj._get_cython_func(func)
if f:
return getattr(obj, f)()

# Two possible ways to use a UDF - apply or call directly
try:
return obj.apply(func, args=args, **kwargs)
except Exception:
return func(obj, *args, **kwargs)
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7255,7 +7255,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:

def _gotitem(
self,
key: Union[str, List[str]],
key: Union[Label, List[Label]],
ndim: int,
subset: Optional[FrameOrSeriesUnion] = None,
) -> FrameOrSeriesUnion:
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/shared_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,17 @@

Parameters
----------
func : function, str, list or dict
func : function, str, list-like or dict-like
Function to use for transforming the data. If a function, must either
work when passed a {klass} or when passed to {klass}.apply.
work when passed a {klass} or when passed to {klass}.apply. If func
is both list-like and dict-like, dict-like behavior takes precedence.

Accepted combinations are:

- function
- string function name
- list of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
- dict of axis labels -> functions, function names or list of such.
- list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
- dict-like of axis labels -> functions, function names or list-like of such.
{axis}
*args
Positional arguments to pass to `func`.
Expand Down