diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 08b3175ad0ad6..b2bec79dc0669 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,6 +44,7 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Cythonized transformations now supports python fallback (:issue:`49758`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) - diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a048d11d0b4d..781fbe4b7ecf3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -75,6 +75,7 @@ all_indexes_same, default_index, ) +from pandas.core.internals.blocks import ensure_block_shape from pandas.core.series import Series from pandas.core.sorting import get_group_index from pandas.core.util.numba_ import maybe_use_numba @@ -535,17 +536,28 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - def _cython_transform(self, how: str, numeric_only: bool = False, **kwargs): + def _cython_transform( + self, + how: str, + alt: Callable | None = None, + numeric_only: bool = False, + **kwargs, + ): obj = self._obj_with_exclusions + values = obj._values try: result = self._grouper._cython_operation( - "transform", obj._values, how, 0, **kwargs + "transform", values, how, 0, **kwargs ) - except NotImplementedError as err: - # e.g. 
test_groupby_raises_string - raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err +except NotImplementedError: + if alt is None: + raise + result = self._transform_py_fallback(how, values, alt=alt) return obj._constructor(result, index=self.obj.index, name=obj.name) def _transform_general( @@ -583,6 +595,23 @@ def _transform_general( result.name = self.obj.name return result + def _transform_py_fallback( + self, how: str, values: ArrayLike, alt: Callable + ) -> ArrayLike: + assert alt is not None + + series = Series(values, copy=False) + try: + res_values = self._grouper.transform_series(series, alt) + except Exception as err: + msg = f"transform function failed [how->{how},dtype->{series.dtype}]" + # preserve the kind of exception that raised + raise type(err)(msg) from err + + if series.dtype == object: + res_values = res_values.astype(object, copy=False) + return res_values + def filter(self, func, dropna: bool = True, *args, **kwargs): """ Filter elements from groups that don't satisfy a criterion. 
@@ -1742,6 +1771,7 @@ def _wrap_applied_output_series( def _cython_transform( self, how: str, + alt: Callable | None = None, numeric_only: bool = False, **kwargs, ) -> DataFrame: @@ -1753,9 +1783,17 @@ def _cython_transform( ) def arr_func(bvalues: ArrayLike) -> ArrayLike: - return self._grouper._cython_operation( - "transform", bvalues, how, 1, **kwargs - ) + try: + return self._grouper._cython_operation( + "transform", bvalues, how, 1, **kwargs + ) + except NotImplementedError: + if alt is None: + raise + + assert alt is not None + result = self._transform_py_fallback(how, bvalues, alt=alt) + return result res_mgr = mgr.apply(arr_func) @@ -1866,6 +1904,27 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): """ ) + def _transform_py_fallback( + self, how: str, values: ArrayLike, alt: Callable + ) -> ArrayLike: + assert alt is not None + + df = DataFrame(values.T, dtype=values.dtype) + assert df.shape[1] == 1 + series = df.iloc[:, 0] + + try: + res_values = self._grouper.transform_series(series, alt) + except Exception as err: + msg = f"transform function failed [how->{how},dtype->{series.dtype}]" + # preserve the kind of exception that raised + raise type(err)(msg) from err + + if series.dtype == object: + res_values = res_values.astype(object, copy=False) + return ensure_block_shape(res_values, ndim=2) + @Substitution(klass="DataFrame", example=__examples_dataframe_doc) @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ebc149256336..c024a47fb7e3a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4656,6 +4656,13 @@ def rank( return self._cython_transform( "rank", numeric_only=False, + alt=lambda x: Series(x, copy=False).rank( + method=method, + numeric_only=False, + na_option=na_option, + ascending=ascending, + pct=pct, + ), 
**kwargs, ) @@ -4716,7 +4723,7 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: bull 6 9 """ nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) - return self._cython_transform("cumprod", **kwargs) + return self._cython_transform("cumprod", alt=np.cumprod, **kwargs) @final @Substitution(name="groupby") @@ -4775,7 +4782,7 @@ def cumsum(self, *args, **kwargs) -> NDFrameT: lion 6 9 """ nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) - return self._cython_transform("cumsum", **kwargs) + return self._cython_transform("cumsum", alt=np.cumsum, **kwargs) @final @Substitution(name="groupby") @@ -4845,7 +4852,10 @@ def cummin( """ skipna = kwargs.get("skipna", True) return self._cython_transform( - "cummin", numeric_only=numeric_only, skipna=skipna + "cummin", + numeric_only=numeric_only, + skipna=skipna, + alt=np.minimum.accumulate, ) @final @@ -4916,7 +4926,10 @@ def cummax( """ skipna = kwargs.get("skipna", True) return self._cython_transform( - "cummax", numeric_only=numeric_only, skipna=skipna + "cummax", + numeric_only=numeric_only, + skipna=skipna, + alt=np.maximum.accumulate, ) @final diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..45605656ba863 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,12 +45,14 @@ ensure_uint64, is_1d_only_ea_dtype, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, ) from pandas.core.arrays import Categorical +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -910,6 +912,72 @@ def _cython_operation( **kwargs, ) + @final + def transform_series( + self, obj: Series, func: Callable, preserve_dtype: bool = False + ) -> ArrayLike: + """ + Parameters + ---------- + obj : Series + func : function taking a Series and returning a 
Series + preserve_dtype : bool + Whether the aggregation is known to be dtype-preserving. + + Returns + ------- + np.ndarray or ExtensionArray + """ + # GH#58129 + result = self._transform_series_pure_python(obj, func) + npvalues = lib.maybe_convert_objects(result, try_float=False) + + if isinstance(obj._values, ArrowExtensionArray): + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype + ) + import pyarrow as pa + + if isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( + out.dtype.pyarrow_dtype + ): + out = npvalues + + elif not isinstance(obj._values, np.ndarray): + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + else: + out = npvalues + + return out + + @final + def _transform_series_pure_python( + self, obj: Series, func: Callable + ) -> npt.NDArray[np.object_]: + splitter = self._get_splitter(obj) + res_by_group = [] + + for group in splitter: + res = func(group) + if hasattr(res, "_values"): + res = res._values + + res_by_group.append(res) + + res_by_group_pointers = np.zeros(self.ngroups, dtype=np.int64) + series_len = len(obj._values) + result = np.empty(series_len, dtype="O") + + for i in range(series_len): + label = splitter.labels[i] + group_res = res_by_group[label] + pointer = res_by_group_pointers[label] + result[i] = group_res[pointer] + + res_by_group_pointers[label] = pointer + 1 + + return result + @final def agg_series( self, obj: Series, func: Callable, preserve_dtype: bool = False diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 33cdd1883e1b9..ab53f80267d79 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -159,25 +159,35 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if 
method.startswith("cum") else TypeError + exception = TypeError - if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): + if method in ("min", "max"): # The methods default to numeric_only=False and raise TypeError msg = "|".join( [ "Categorical is not ordered", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), - # cumsum/cummin/cummax/cumprod - "function is not implemented for this dtype", ] ) with pytest.raises(exception, match=msg): getattr(gb, method)() - elif method in ("sum", "mean", "median", "prod"): + elif method in ( + "sum", + "mean", + "median", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + ): msg = "|".join( [ - "category type does not support sum operations", + re.escape(f"category type does not support {method} operations"), + re.escape( + f"transform function failed [how->{method},dtype->object]" + ), re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), ] @@ -195,6 +205,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support", "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", + re.escape( + f"transform function failed [how->{method},dtype->object]" + ), re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), ] @@ -276,9 +289,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): assert numeric_only is not True # kernels that are successful on any dtype were above; this will fail - # object dtypes for transformations are not implemented in Cython and - # have no Python fallback - exception = NotImplementedError if kernel.startswith("cum") else TypeError + exception = TypeError msg = "|".join( [ @@ -289,6 +300,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): "unsupported 
operand type", "function is not implemented for this dtype", re.escape(f"agg function failed [how->{kernel},dtype->object]"), + re.escape(f"transform function failed [how->{kernel},dtype->object]"), ] ) if kernel == "idxmin": @@ -334,10 +346,6 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): fails_on_numeric_object = ( "corr", "cov", - "cummax", - "cummin", - "cumprod", - "cumsum", "quantile", ) # ops that give an object result on object input @@ -358,6 +366,11 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "cummax", + "cummin", + "cumsum", + # cumprod does not fail for object dtype, if element are numeric + "cumprod", ) # Test default behavior; kernels that fail may be enabled in the future but kernels @@ -376,6 +389,13 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): expected = expected.astype(object) tm.assert_series_equal(result, expected) + valid_func_has_numeric_only = ( + "cummin", + "cummax", + "cumsum", + # cumprod does not fail for object dtype, if element are numeric + "cumprod", + ) has_numeric_only = ( "first", "last", @@ -399,7 +419,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): msg = "got an unexpected keyword argument 'numeric_only'" with pytest.raises(TypeError, match=msg): method(*args, numeric_only=True) - elif dtype is object: + elif dtype is object and groupby_func not in valid_func_has_numeric_only: msg = "|".join( [ "SeriesGroupBy.sem called with numeric_only=True and dtype object", diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9301f8d56d9d2..706ce93e53949 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -126,22 +126,13 @@ def test_groupby_raises_string( "corrwith": (TypeError, "Could not convert"), "count": (None, ""), "cumcount": (None, ""), - "cummax": ( - (NotImplementedError, TypeError), - "(function|cummax) is not 
(implemented|supported) for (this|object) dtype", - ), - "cummin": ( - (NotImplementedError, TypeError), - "(function|cummin) is not (implemented|supported) for (this|object) dtype", - ), + "cummax": (None, ""), + "cummin": (None, ""), "cumprod": ( - (NotImplementedError, TypeError), - "(function|cumprod) is not (implemented|supported) for (this|object) dtype", - ), - "cumsum": ( - (NotImplementedError, TypeError), - "(function|cumsum) is not (implemented|supported) for (this|object) dtype", + TypeError, + re.escape("transform function failed [how->cumprod,dtype->object]"), ), + "cumsum": (None, ""), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), "fillna": (None, ""), diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index d6d545a8c4834..315693896618a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1,5 +1,7 @@ """test with the .transform""" +import re + import numpy as np import pytest @@ -26,6 +28,34 @@ def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() +def _test_apply_and_transform(gb, c, targop, op, args): + expected = gb[c].apply(targop) + expected.name = c + if c in ["string_missing", "string"]: + expected = expected.fillna(np.nan) + + res = gb[c].transform(op, *args) + tm.assert_series_equal(expected, res) + res2 = getattr(gb[c], op)(*args) + tm.assert_series_equal(expected, res2) + + +def _test_raises_typeerror(groupby, op): + msg = "|".join( + [ + "does not support .* operations", + "does not support operation", + ".* is not supported for object dtype", + "is not implemented for this dtype", + re.escape(f"transform function failed [how->{op},dtype->object]"), + ] + ) + with pytest.raises(TypeError, match=msg): + groupby.transform(op) + with pytest.raises(TypeError, match=msg): + getattr(groupby, op)() + + def test_transform(): data = Series(np.arange(9) // 3, index=np.arange(9)) @@ 
-715,12 +745,17 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): @pytest.mark.parametrize( "gb_target", [ - {"by": np.random.default_rng(2).integers(0, 50, size=10).astype(float)}, - {"level": 0}, - {"by": "string"}, + { + "id": 0, + "value": { + "by": np.random.default_rng(2).integers(0, 50, size=10).astype(float) + }, + }, + {"id": 1, "value": {"level": 0}}, + {"id": 2, "value": {"by": "string"}}, # TODO: create xfail condition given other params - # {"by": 'string_missing'}, - {"by": ["int", "string"]}, + # {"id": 3, "value": {"by": 'string_missing'}}, + {"id": 4, "value": {"by": ["int", "string"]}}, ], ) @pytest.mark.parametrize( @@ -739,35 +774,38 @@ def test_cython_transform_frame_column( request, op, args, targop, df_fix, gb_target, column ): df = request.getfixturevalue(df_fix) - gb = df.groupby(group_keys=False, **gb_target) + gb = df.groupby(group_keys=False, **gb_target["value"]) c = column - if ( - c not in ["float", "int", "float_missing"] - and op != "shift" - and not (c == "timedelta" and op == "cumsum") - ): - msg = "|".join( - [ - "does not support .* operations", - "does not support operation", - ".* is not supported for object dtype", - "is not implemented for this dtype", - ] - ) - with pytest.raises(TypeError, match=msg): - gb[c].transform(op) - with pytest.raises(TypeError, match=msg): - getattr(gb[c], op)() - else: - expected = gb[c].apply(targop) - expected.name = c - if c in ["string_missing", "string"]: - expected = expected.fillna(np.nan) - res = gb[c].transform(op, *args) - tm.assert_series_equal(expected, res) - res2 = getattr(gb[c], op)(*args) - tm.assert_series_equal(expected, res2) + # mapping df_fix -> gb_target that results in single element groups + single_element_groups = {"frame": [0, 1], "frame_mi": [0]} + nan_single_groups = single_element_groups + + if op == "cumprod" and c not in ["float", "int", "float_missing"]: + # np.cumprod does not raise error is each group only has 1 element + if ( + c 
in ["string", "string_missing"] + and gb_target["id"] in single_element_groups[df_fix] + ): + _test_apply_and_transform(gb, c, targop, op, args) + else: + _test_raises_typeerror(gb[c], op) + + elif op == "cumsum" and c not in [ + "float", + "int", + "float_missing", + "timedelta", + "string", + ]: + # np.cumsum does not raise error is nan is in its own group only + if c == "string_missing" and gb_target["id"] in nan_single_groups[df_fix]: + _test_apply_and_transform(gb, c, targop, op, args) + else: + _test_raises_typeerror(gb[c], op) + + else: + _test_apply_and_transform(gb, c, targop, op, args) @pytest.mark.parametrize( @@ -1537,6 +1575,139 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): tm.assert_frame_equal(result, expected) +def test_transform_string_dtype_cummin_fallback(): + # GH#49758 + df = DataFrame( + { + "str_col": ["xyz", "klm", "nop", "hij", "abc", "xxz", "efg"], + "num_col": [5, 3, 3, 3, 3, 5, 3], + } + ) + gb = df.groupby("num_col") + result = gb.transform("cummin") + + expected = DataFrame( + {"str_col": ["xyz", "klm", "klm", "hij", "abc", "xxz", "abc"]}, index=range(7) + ) + tm.assert_frame_equal(result, expected) + + +def test_transform_string_dtype_cummax_fallback(): + # GH#49758 + df = Series( + ["xyz", "klm", "nop", "hij", "abc", "xxz", "efg"], + name="str_col", + index=Index([5, 3, 3, 3, 3, 5, 3], name="num_idx"), + ) + + gb = df.groupby("num_idx") + result = gb.transform("cummax") + + expected = Series( + ["xyz", "klm", "nop", "nop", "nop", "xyz", "nop"], + name="str_col", + index=Index([5, 3, 3, 3, 3, 5, 3], name="num_idx"), + ) + tm.assert_series_equal(result, expected) + + +def test_transform_string_dtype_cumsum_fallback(): + # GH#49758 + df = Series( + ["xyz", "klm", "nop", "hij", "abc", "xxz", "efg"], + name="str_col", + index=Index([5, 3, 3, 3, 3, 5, 3], name="num_idx"), + ) + + gb = df.groupby("num_idx") + result = gb.transform("cumsum") + + expected = Series( + [ + "xyz", + "klm", + "klmnop", + 
"klmnophij", + "klmnophijabc", + "xyzxxz", + "klmnophijabcefg", + ], + name="str_col", + index=Index([5, 3, 3, 3, 3, 5, 3], name="num_idx"), + ) + tm.assert_series_equal(result, expected) + + +def test_transform_binary_pyarrow_dtype_rank_fallback(): + # GH#49758 + df = DataFrame( + { + "str_col": pd.array( + ["xyz", "klm", "nop", "hij", "abc", "xxz", "abc"], + dtype="binary[pyarrow]", + ), + "num_col": [5, 3, 3, 3, 3, 5, 3], + } + ) + gb = df.groupby("num_col") + result = gb.transform("rank", method="max") + + expected = DataFrame( + {"str_col": pd.array([2, 4, 5, 3, 2, 1, 2], dtype="int64[pyarrow]")}, + index=range(7), + ) + tm.assert_frame_equal(result, expected) + + +def test_transform_decimal_pyarrow_dtype_cumprod_fallback(): + # GH#49758 + from decimal import Decimal + + import pyarrow as pa + + decimal_type = pd.ArrowDtype(pa.decimal128(20, scale=10)) + + df = DataFrame( + { + "float_col": pd.array( + [ + Decimal("43"), + Decimal("45"), + Decimal("-5.657657"), + Decimal("55"), + Decimal("-8.454"), + Decimal("0"), + Decimal("-0.4544"), + ], + dtype=decimal_type, + ), + "num_col": [5, 3, 3, 3, 3, 5, 3], + } + ) + gb = df.groupby("num_col") + result = gb.transform("cumprod") + + expected_decimal_type = pd.ArrowDtype(pa.decimal128(29, scale=23)) + expected = DataFrame( + { + "float_col": pd.array( + [ + Decimal("43.0"), + Decimal("45.0"), + Decimal("-254.594565"), + Decimal("-14002.701075"), + Decimal("118378.83488805"), + Decimal("0.0"), + Decimal("-53791.34257312992"), + ], + dtype=expected_decimal_type, + ) + }, + index=range(7), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["int32", "float32"]) def test_min_one_unobserved_category_no_type_coercion(dtype): # GH#58084