Skip to content

CLN: aggregation.transform #36478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ Reshaping
- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
- Bug in :meth:`DataFrame.agg` with ``func={'name':<FUNC>}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
-

Sparse
Expand All @@ -442,7 +443,6 @@ Other
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
- Fixed metadata propagation in the :class:`Series.dt` accessor (:issue:`28283`)
- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`)
- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`)

.. ---------------------------------------------------------------------------
Expand Down
49 changes: 38 additions & 11 deletions pandas/core/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,17 @@
Sequence,
Tuple,
Union,
cast,
)

from pandas._typing import AggFuncType, Axis, FrameOrSeries, Label
from pandas._typing import (
AggFuncType,
AggFuncTypeBase,
Axis,
FrameOrSeries,
FrameOrSeriesUnion,
Label,
)

from pandas.core.dtypes.common import is_dict_like, is_list_like
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
Expand Down Expand Up @@ -391,7 +399,7 @@ def validate_func_kwargs(

def transform(
obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs
) -> FrameOrSeries:
) -> FrameOrSeriesUnion:
"""
Transform a DataFrame or Series

Expand Down Expand Up @@ -424,16 +432,20 @@ def transform(
assert not is_series
return transform(obj.T, func, 0, *args, **kwargs).T

if isinstance(func, list):
if is_list_like(func) and not is_dict_like(func):
func = cast(List[AggFuncTypeBase], func)
# Convert func equivalent dict
if is_series:
func = {com.get_callable_name(v) or v: v for v in func}
else:
func = {col: func for col in obj}

if isinstance(func, dict):
if is_dict_like(func):
func = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], func)
return transform_dict_like(obj, func, *args, **kwargs)

# func is either str or callable
func = cast(AggFuncTypeBase, func)
try:
result = transform_str_or_callable(obj, func, *args, **kwargs)
except Exception:
Expand All @@ -451,37 +463,52 @@ def transform(
return result


def transform_dict_like(obj, func, *args, **kwargs):
def transform_dict_like(
obj: FrameOrSeries,
func: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]],
*args,
**kwargs,
):
"""
Compute transform in the case of a dict-like func
"""
from pandas.core.reshape.concat import concat

if len(func) == 0:
raise ValueError("No transform functions were provided")

if obj.ndim != 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe extract method _is_series(obj). I noticed that this dimensions check is run in another function, but there is a local variable is_series.

# Check for missing columns on a frame
cols = sorted(set(func.keys()) - set(obj.columns))
if len(cols) > 0:
raise SpecificationError(f"Column(s) {cols} do not exist")

if any(isinstance(v, dict) for v in func.values()):
# Can't use func.values(); wouldn't work for a Series
if any(is_dict_like(v) for _, v in func.items()):
# GH 15931 - deprecation of renaming keys
raise SpecificationError("nested renamer is not supported")

results = {}
results: Dict[Label, FrameOrSeriesUnion] = {}
for name, how in func.items():
colg = obj._gotitem(name, ndim=1)
try:
results[name] = transform(colg, how, 0, *args, **kwargs)
except Exception as e:
if str(e) == "Function did not transform":
raise e
except Exception as err:
if (
str(err) == "Function did not transform"
or str(err) == "No transform functions were provided"
):
raise err

# combine results
if len(results) == 0:
raise ValueError("Transform function failed")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this tested? is this new?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think you should just let concat work and bubble up if needed

Copy link
Member Author

@rhshadrach rhshadrach Sep 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not new, this is hit in tests.frame.apply.test_frame_transform.test_transform_bad_dtype. Removing these two lines results in the ValueError "No objects to concatenate" rather than "Transform function failed". I'm okay with either, slight preference for the current behavior (which is the same as this PR). Let me know if you prefer "No objects to concatenate" and can change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm yeah I think we should change this, maybe to something a bit more userfriendly like
the input transform did not contain any transform functions

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you thinking of the case where an aggregator is used? If we detect the function not transforming, we'll raise here:

try:
    results[name] = transform(colg, how, 0, *args, **kwargs)
except Exception as e:
    if str(e) == "Function did not transform":
        raise e

before we get to this point. The raising of the code highlighted above should only occur if all the key/value pairs of the dictionary entirely failed - e.g. trying to take a product of strings or lambda x: raise ValueError.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, no this is the case of an empty list or dict right? (of functions that are supplied for aggregation), IOW its user visible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see! I did not realize this line would be the one that raised for an empty list or dict. I've added that check before getting to this section of the code with the message "no results" (this is the message from 1.1 - but can change), along with tests for this.

return concat(results, axis=1)


def transform_str_or_callable(obj, func, *args, **kwargs):
def transform_str_or_callable(
obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs
) -> FrameOrSeriesUnion:
"""
Compute transform in the case of a string or callable func
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7270,7 +7270,7 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:

def _gotitem(
self,
key: Union[str, List[str]],
key: Union[Label, List[Label]],
ndim: int,
subset: Optional[FrameOrSeriesUnion] = None,
) -> FrameOrSeriesUnion:
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/shared_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,17 @@

Parameters
----------
func : function, str, list or dict
func : function, str, list-like or dict-like
Function to use for transforming the data. If a function, must either
work when passed a {klass} or when passed to {klass}.apply.
work when passed a {klass} or when passed to {klass}.apply. If func
is both list-like and dict-like, dict-like behavior takes precedence.

Accepted combinations are:

- function
- string function name
- list of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
- dict of axis labels -> functions, function names or list of such.
- list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
- dict-like of axis labels -> functions, function names or list-like of such.
{axis}
*args
Positional arguments to pass to `func`.
Expand Down
39 changes: 34 additions & 5 deletions pandas/tests/frame/apply/test_frame_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pytest

from pandas import DataFrame, MultiIndex
from pandas import DataFrame, MultiIndex, Series
import pandas._testing as tm
from pandas.core.base import SpecificationError
from pandas.core.groupby.base import transformation_kernels
Expand Down Expand Up @@ -41,9 +41,15 @@ def test_transform_groupby_kernel(axis, float_frame, op):


@pytest.mark.parametrize(
"ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])]
"ops, names",
[
([np.sqrt], ["sqrt"]),
([np.abs, np.sqrt], ["absolute", "sqrt"]),
(np.array([np.sqrt]), ["sqrt"]),
(np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
],
)
def test_transform_list(axis, float_frame, ops, names):
def test_transform_listlike(axis, float_frame, ops, names):
# GH 35964
other_axis = 1 if axis in {0, "index"} else 0
with np.errstate(all="ignore"):
Expand All @@ -56,18 +62,41 @@ def test_transform_list(axis, float_frame, ops, names):
tm.assert_frame_equal(result, expected)


def test_transform_dict(axis, float_frame):
@pytest.mark.parametrize("ops", [[], np.array([])])
def test_transform_empty_listlike(float_frame, ops):
with pytest.raises(ValueError, match="No transform functions were provided"):
float_frame.transform(ops)


@pytest.mark.parametrize("box", [dict, Series])
def test_transform_dictlike(axis, float_frame, box):
# GH 35964
if axis == 0 or axis == "index":
e = float_frame.columns[0]
expected = float_frame[[e]].transform(np.abs)
else:
e = float_frame.index[0]
expected = float_frame.iloc[[0]].transform(np.abs)
result = float_frame.transform({e: np.abs}, axis=axis)
result = float_frame.transform(box({e: np.abs}), axis=axis)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"ops",
[
{},
{"A": []},
{"A": [], "B": "cumsum"},
{"A": "cumsum", "B": []},
{"A": [], "B": ["cumsum"]},
{"A": ["cumsum"], "B": []},
],
)
def test_transform_empty_dictlike(float_frame, ops):
with pytest.raises(ValueError, match="No transform functions were provided"):
float_frame.transform(ops)


@pytest.mark.parametrize("use_apply", [True, False])
def test_transform_udf(axis, float_frame, use_apply):
# GH 35964
Expand Down
37 changes: 33 additions & 4 deletions pandas/tests/series/apply/test_series_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,15 @@ def test_transform_groupby_kernel(string_series, op):


@pytest.mark.parametrize(
"ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])]
"ops, names",
[
([np.sqrt], ["sqrt"]),
([np.abs, np.sqrt], ["absolute", "sqrt"]),
(np.array([np.sqrt]), ["sqrt"]),
(np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
],
)
def test_transform_list(string_series, ops, names):
def test_transform_listlike(string_series, ops, names):
# GH 35964
with np.errstate(all="ignore"):
expected = concat([op(string_series) for op in ops], axis=1)
Expand All @@ -45,15 +51,38 @@ def test_transform_list(string_series, ops, names):
tm.assert_frame_equal(result, expected)


def test_transform_dict(string_series):
@pytest.mark.parametrize("ops", [[], np.array([])])
def test_transform_empty_listlike(string_series, ops):
with pytest.raises(ValueError, match="No transform functions were provided"):
string_series.transform(ops)


@pytest.mark.parametrize("box", [dict, Series])
def test_transform_dictlike(string_series, box):
# GH 35964
with np.errstate(all="ignore"):
expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1)
expected.columns = ["foo", "bar"]
result = string_series.transform({"foo": np.sqrt, "bar": np.abs})
result = string_series.transform(box({"foo": np.sqrt, "bar": np.abs}))
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"ops",
[
{},
{"A": []},
{"A": [], "B": ["cumsum"]},
{"A": ["cumsum"], "B": []},
{"A": [], "B": "cumsum"},
{"A": "cumsum", "B": []},
],
)
def test_transform_empty_dictlike(string_series, ops):
with pytest.raises(ValueError, match="No transform functions were provided"):
string_series.transform(ops)


def test_transform_udf(axis, string_series):
# GH 35964
# via apply
Expand Down