From 515de45865526527808f40035e5e23e493d229c0 Mon Sep 17 00:00:00 2001 From: alonme Date: Thu, 14 May 2020 02:05:33 +0300 Subject: [PATCH 01/15] Fix apply to only call `func` once on the first column/row --- pandas/_libs/reduction.pyx | 21 +++++++++++--- pandas/core/apply.py | 49 ++++++++++++++++++++------------ pandas/tests/frame/test_apply.py | 4 ++- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 18422c2f86129..5b3d519c9c049 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -107,6 +107,7 @@ cdef class Reducer: result = np.empty(self.nresults, dtype='O') it = PyArray_IterNew(result) + partial_result = None try: for i in range(self.nresults): @@ -134,21 +135,33 @@ cdef class Reducer: res = self.f(chunk) # TODO: reason for not squeezing here? - res = _extract_result(res, squeeze=False) + extracted_res = _extract_result(res, squeeze=False) if i == 0: # On the first pass, we check the output shape to see # if this looks like a reduction. - _check_result_array(res, len(self.dummy)) + # if it does not, return the computed value to be used by the pure python implementation, + # so the function won't be called twice on the same object (and side effects would occur twice) + try: + _check_result_array(extracted_res, len(self.dummy)) + except ValueError as err: + if "Function does not reduce" not in str(err): + # catch only the specific exception + raise - PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) + partial_result = copy(res) + break + + + PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res) chunk.data = chunk.data + self.increment PyArray_ITER_NEXT(it) + finally: # so we don't free the wrong memory chunk.data = dummy_buf result = maybe_convert_objects(result) - return result + return result, partial_result cdef class _BaseGrouper: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a013434491589..408eaf14680f6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -18,6 +18,9 @@ from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.series import Series +from pandas import DataFrame + if TYPE_CHECKING: from pandas import DataFrame, Series, Index @@ -220,14 +223,13 @@ def apply_empty_result(self): def apply_raw(self): """ apply to the values as a numpy array """ - try: - result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) - except ValueError as err: - if "Function does not reduce" not in str(err): - # catch only ValueError raised intentionally in libreduction - raise - # We expect np.apply_along_axis to give a two-dimensional result, or - # also raise. + result, partial_result = libreduction.compute_reduction( + self.values, self.f, axis=self.axis + ) + + # A non None partial_result means that the reduction was unsuccessful + # We expect np.apply_along_axis to give a two-dimensional result, or raise. + if partial_result is not None: result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case @@ -265,6 +267,7 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": def apply_standard(self): + partial_result = None # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. if we want to apply to a SparseFrame, then can't directly reduce @@ -292,13 +295,9 @@ def apply_standard(self): ) try: - result = libreduction.compute_reduction( + result, partial_result = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) - except ValueError as err: - if "Function does not reduce" not in str(err): - # catch only ValueError raised intentionally in libreduction - raise except TypeError: # e.g. test_apply_ignore_failures we just ignore if not self.ignore_failures: @@ -307,23 +306,36 @@ def apply_standard(self): # reached via numexpr; fall back to python implementation pass else: - return self.obj._constructor_sliced(result, index=labels) + # this means that the reduction was successful + if partial_result is None: + return self.obj._constructor_sliced(result, index=labels) + else: + if isinstance(partial_result, Series): + partial_result = DataFrame.infer_objects(partial_result) # compute the result using the series generator - results, res_index = self.apply_series_generator() + results, res_index = self.apply_series_generator(partial_result) # wrap results return self.wrap_results(results, res_index) - def apply_series_generator(self) -> Tuple[ResType, "Index"]: + def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index keys = [] results = {} + + # If a partial result was already computed, use it instead of running on the first element again + series_gen_enumeration = enumerate(series_gen) + if partial_result is not None: + i, v = next(series_gen_enumeration) + results[i] = partial_result + keys.append(v.name) + if self.ignore_failures: successes = [] - for i, v in enumerate(series_gen): + for i, v in series_gen_enumeration: try: results[i] = self.f(v) except Exception: @@ -337,7 +349,8 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]: res_index = res_index.take(successes) else: - for i, v in enumerate(series_gen): + for i, v in series_gen_enumeration: + results[i] = self.f(v) keys.append(v.name) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e328523253144..d2803734f8d94 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -718,7 +718,9 @@ def apply_list(row): def test_apply_noreduction_tzaware_object(self): # https://github.com/pandas-dev/pandas/issues/31505 - df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object") + df = pd.DataFrame( + {"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]" + ) result = df.apply(lambda x: x) tm.assert_frame_equal(result, df) result = df.apply(lambda x: x.copy()) From b71921b5a03af085b25d2161476c2d824c073e02 Mon Sep 17 00:00:00 2001 From: alonme Date: Fri, 15 May 2020 19:37:08 +0300 Subject: [PATCH 02/15] Fix groupby reducer tests --- pandas/tests/groupby/test_bin_groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index e999b88fccb08..9df45f7a23f55 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -153,20 +153,20 @@ def test_int_index(self): ) dummy = Series(0.0, index=np.arange(100)) - result = libreduction.compute_reduction( + result, _ = libreduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) tm.assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = libreduction.compute_reduction( + result, _ = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) tm.assert_almost_equal(result, expected) - result = libreduction.compute_reduction( + result, _ = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) tm.assert_almost_equal(result, expected) From a8a6fd50d7349fd9202aa832408e52cc02c448f1 Mon Sep 17 00:00:00 2001 From: alonme Date: Fri, 15 May 2020 19:48:49 +0300 Subject: [PATCH 03/15] fix pep8 --- pandas/_libs/reduction.pyx | 7 ++++--- pandas/core/apply.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 5b3d519c9c049..06fdc40b9d42b 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -139,8 +139,10 @@ cdef class Reducer: if i == 0: # On the first pass, we check the output shape to see # if this looks like a reduction. - # if it does not, return the computed value to be used by the pure python implementation, - # so the function won't be called twice on the same object (and side effects would occur twice) + # If it does not, return the computed value to be used by the + # pure python implementation, + # so the function won't be called twice on the same object, + # and side effects would occur twice try: _check_result_array(extracted_res, len(self.dummy)) except ValueError as err: @@ -151,7 +153,6 @@ cdef class Reducer: partial_result = copy(res) break - PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res) chunk.data = chunk.data + self.increment PyArray_ITER_NEXT(it) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 408eaf14680f6..d94e7cde92514 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -18,8 +18,8 @@ from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.frame import DataFrame from pandas.core.series import Series -from pandas import DataFrame if TYPE_CHECKING: from pandas import DataFrame, Series, Index @@ -326,7 +326,8 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] keys = [] results = {} - # If a partial result was already computed, use it instead of running on the first element again + # If a partial result was already computed, + # use it instead of running on the first element again series_gen_enumeration = enumerate(series_gen) if partial_result is not None: i, v = next(series_gen_enumeration) From 3957956c2d60daf78e5575fcd580ce87b7bee9b3 Mon Sep 17 00:00:00 2001 From: alonme Date: Sun, 17 May 2020 22:31:37 +0300 Subject: [PATCH 04/15] Fix imports for linting --- pandas/core/apply.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d94e7cde92514..617a16dc91f7a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -17,12 +17,11 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype - from pandas.core.frame import DataFrame from pandas.core.series import Series if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import Index ResType = Dict[int, Any] From 4564e9f29f44711c8173cea6fbecb6209a753bba Mon Sep 17 00:00:00 2001 From: alonme Date: Mon, 18 May 2020 23:11:28 +0300 Subject: [PATCH 05/15] Add some tests, use ABCseries, remove wrong docs for applymap --- pandas/core/apply.py | 5 ++--- pandas/core/frame.py | 8 -------- pandas/tests/frame/test_apply.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 617a16dc91f7a..2519e2eae1d00 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -18,10 +18,9 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.core.series import Series if TYPE_CHECKING: - from pandas import Index + from pandas import Series, Index ResType = Dict[int, Any] @@ -309,7 +308,7 @@ def apply_standard(self): if partial_result is None: return self.obj._constructor_sliced(result, index=labels) else: - if isinstance(partial_result, Series): + if isinstance(partial_result, ABCSeries): partial_result = DataFrame.infer_objects(partial_result) # compute the result using the series generator diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9abc6e4245d81..3fba648551a93 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7421,14 +7421,6 @@ def applymap(self, func) -> "DataFrame": -------- DataFrame.apply : Apply a function along input axis of DataFrame. - Notes - ----- - In the current implementation applymap calls `func` twice on the - first column/row to decide whether it can take a fast or slow - code path. This can lead to unexpected behavior if `func` has - side-effects, as they will take effect twice for the first - column/row. - Examples -------- >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d2803734f8d94..92a810e78fbcf 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -726,6 +726,38 @@ def test_apply_noreduction_tzaware_object(self): result = df.apply(lambda x: x.copy()) tm.assert_frame_equal(result, df) + def test_apply_function_runs_once(self): + # https://github.com/pandas-dev/pandas/issues/30815 + def non_reducing_func_with_state(row): + non_reducing_func_with_state.call_count = getattr(non_reducing_func_with_state, 'call_count', 0) + 1 + return row * non_reducing_func_with_state.call_count + + def reducing_func_with_state(_): + reducing_func_with_state.call_count = getattr(reducing_func_with_state, 'call_count', 0) + 1 + return reducing_func_with_state.call_count + + df = pd.DataFrame({'a': [1, 2, 3]}) + + # no reduction + res0 = df.apply(non_reducing_func_with_state) + tm.assert_frame_equal(res0, df) + + # reduction + res1 = df.apply(reducing_func_with_state) + tm.assert_series_equal(res1, Series(data=[1], index=['a'])) + + def test_applymap_function_runs_once(self): + + # This function will create the same values as in the DataFrame + def func_with_state(_): + func_with_state.call_count = getattr(func_with_state, 'call_count', 0) + 1 + return func_with_state.call_count + + df = pd.DataFrame({'a': [1, 2, 3]}) + result = df.applymap(func_with_state) + tm.assert_frame_equal(result, df) + + class TestInferOutputShape: # the user has supplied an opaque UDF where From e5bdb207cea0a6d29dcaf7618e183ec0284fb548 Mon Sep 17 00:00:00 2001 From: alonme Date: Tue, 19 May 2020 21:21:19 +0300 Subject: [PATCH 06/15] Use mock to test call_count --- pandas/tests/frame/test_apply.py | 33 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 92a810e78fbcf..f83ebbe0e31f3 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -2,6 +2,7 @@ from datetime import datetime from itertools import chain import operator +from unittest.mock import Mock import warnings import numpy as np @@ -728,35 +729,25 @@ def test_apply_noreduction_tzaware_object(self): def test_apply_function_runs_once(self): # https://github.com/pandas-dev/pandas/issues/30815 - def non_reducing_func_with_state(row): - non_reducing_func_with_state.call_count = getattr(non_reducing_func_with_state, 'call_count', 0) + 1 - return row * non_reducing_func_with_state.call_count + non_reducing_mock = Mock(side_effect=lambda x: x) + reducing_mock = Mock(return_value=1) - def reducing_func_with_state(_): - reducing_func_with_state.call_count = getattr(reducing_func_with_state, 'call_count', 0) + 1 - return reducing_func_with_state.call_count - - df = pd.DataFrame({'a': [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) # no reduction - res0 = df.apply(non_reducing_func_with_state) - tm.assert_frame_equal(res0, df) + df.apply(non_reducing_mock, axis=1) + assert non_reducing_mock.call_count == 3 # reduction - res1 = df.apply(reducing_func_with_state) - tm.assert_series_equal(res1, Series(data=[1], index=['a'])) + df.apply(reducing_mock, axis=1) + assert reducing_mock.call_count == 3 def test_applymap_function_runs_once(self): + reducing_mock = Mock(return_value=1) - # This function will create the same values as in the DataFrame - def func_with_state(_): - func_with_state.call_count = getattr(func_with_state, 'call_count', 0) + 1 - return func_with_state.call_count - - df = pd.DataFrame({'a': [1, 2, 3]}) - result = df.applymap(func_with_state) - tm.assert_frame_equal(result, df) - + df = pd.DataFrame({"a": [1, 2, 3]}) + df.applymap(reducing_mock) + assert reducing_mock.call_count == 3 class TestInferOutputShape: From 1eb9442fe7641750ff8963a91495efe093426c19 Mon Sep 17 00:00:00 2001 From: alonme Date: Tue, 19 May 2020 22:24:02 +0300 Subject: [PATCH 07/15] Remove DataFrame import --- pandas/core/apply.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2519e2eae1d00..1dc8f94ec15a5 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -17,10 +17,9 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.frame import DataFrame if TYPE_CHECKING: - from pandas import Series, Index + from pandas import DataFrame, Series, Index ResType = Dict[int, Any] @@ -309,7 +308,7 @@ def apply_standard(self): return self.obj._constructor_sliced(result, index=labels) else: if isinstance(partial_result, ABCSeries): - partial_result = DataFrame.infer_objects(partial_result) + partial_result = partial_result.infer_objects() # compute the result using the series generator results, res_index = self.apply_series_generator(partial_result) From a59934d7ae893f7ec611c9d88fbed293f956e110 Mon Sep 17 00:00:00 2001 From: alonme Date: Wed, 20 May 2020 21:18:53 +0300 Subject: [PATCH 08/15] Reduction returns success value and partial result is returned in results --- pandas/_libs/reduction.pyx | 7 ++++--- pandas/core/apply.py | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 06fdc40b9d42b..99c6f8bde5dd8 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -107,7 +107,7 @@ cdef class Reducer: result = np.empty(self.nresults, dtype='O') it = PyArray_IterNew(result) - partial_result = None + reduction_success = True try: for i in range(self.nresults): @@ -150,7 +150,8 @@ cdef class Reducer: # catch only the specific exception raise - partial_result = copy(res) + reduction_success = False + PyArray_SETITEM(result, PyArray_ITER_DATA(it), copy(res)) break PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res) @@ -162,7 +163,7 @@ cdef class Reducer: chunk.data = dummy_buf result = maybe_convert_objects(result) - return result, partial_result + return result, reduction_success cdef class _BaseGrouper: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1dc8f94ec15a5..f0f966bc9c10b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -220,13 +220,12 @@ def apply_empty_result(self): def apply_raw(self): """ apply to the values as a numpy array """ - result, partial_result = libreduction.compute_reduction( + result, reduction_success = libreduction.compute_reduction( self.values, self.f, axis=self.axis ) - # A non None partial_result means that the reduction was unsuccessful # We expect np.apply_along_axis to give a two-dimensional result, or raise. - if partial_result is not None: + if not reduction_success: result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case @@ -264,7 +263,8 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": def apply_standard(self): - partial_result = None + partial_result = None # partial result that may be returned from reduction. + # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. if we want to apply to a SparseFrame, then can't directly reduce @@ -292,7 +292,7 @@ def apply_standard(self): ) try: - result, partial_result = libreduction.compute_reduction( + result, reduction_success = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) except TypeError: @@ -303,14 +303,17 @@ def apply_standard(self): # reached via numexpr; fall back to python implementation pass else: - # this means that the reduction was successful - if partial_result is None: + if reduction_success: return self.obj._constructor_sliced(result, index=labels) else: + # no exceptions - however reduction was unsuccessful, + # use the computed function result for first element + partial_result = result[0] if isinstance(partial_result, ABCSeries): partial_result = partial_result.infer_objects() - # compute the result using the series generator + # compute the result using the series generator, + # use the result computed while trying to reduce if available. results, res_index = self.apply_series_generator(partial_result) # wrap results From 7ee1faa617f42a7bea3c8b11ea04a8623ad34ff6 Mon Sep 17 00:00:00 2001 From: alonme Date: Thu, 28 May 2020 08:45:25 +0300 Subject: [PATCH 09/15] Remove unittest.mock --- pandas/tests/frame/test_apply.py | 38 ++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index f83ebbe0e31f3..1879f9866b38a 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -2,7 +2,6 @@ from datetime import datetime from itertools import chain import operator -from unittest.mock import Mock import warnings import numpy as np @@ -729,25 +728,40 @@ def test_apply_noreduction_tzaware_object(self): def test_apply_function_runs_once(self): # https://github.com/pandas-dev/pandas/issues/30815 - non_reducing_mock = Mock(side_effect=lambda x: x) - reducing_mock = Mock(return_value=1) df = pd.DataFrame({"a": [1, 2, 3]}) + names = [] # Save row names function is applied to - # no reduction - df.apply(non_reducing_mock, axis=1) - assert non_reducing_mock.call_count == 3 + def reducing_function(row): + names.append(row.name) + + def non_reducing_function(row): + names.append(row.name) + return row + + for func in [reducing_function, non_reducing_function]: + del names[:] - # reduction - df.apply(reducing_mock, axis=1) - assert reducing_mock.call_count == 3 + df.apply(func, axis=1) + assert names == list(df.index) def test_applymap_function_runs_once(self): - reducing_mock = Mock(return_value=1) df = pd.DataFrame({"a": [1, 2, 3]}) - df.applymap(reducing_mock) - assert reducing_mock.call_count == 3 + values = [] # Save values function is applied to + + def reducing_function(val): + values.append(val) + + def non_reducing_function(val): + values.append(val) + return val + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.applymap(func) + assert values == df.a.to_list() class TestInferOutputShape: From fa6299d5771352cbdaad38db197a8747bf56409a Mon Sep 17 00:00:00 2001 From: alonme Date: Fri, 29 May 2020 11:11:00 +0300 Subject: [PATCH 10/15] Add whatsnew, small CR refactor --- doc/source/whatsnew/v1.1.0.rst | 59 ++++++++++++++++++++++++++++++++++ pandas/core/apply.py | 19 +++++------ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5ef1f9dea5091..08d9f6c49090b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -630,6 +630,65 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() +.. _whatsnew_110.api_breaking.apply_applymap_first_once: + +apply and applymap on ``DataFrame`` evaluates first row/column only once +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. ipython:: python + + In [1]: import pandas as pd + ...: df = pd.DataFrame({'a': [1,2], 'b': [3,6]}) + ...: + + In [2]: df + Out[2]: + a b + 0 1 3 + 1 2 6 + + In [3]: def func(row): + ...: print(row) + ...: return row + ...: + + +*Previous behavior*: + +.. ipython:: python + + In [4]: df.apply(func, axis=1) + a 1 + b 3 + Name: 0, dtype: int64 + a 1 + b 3 + Name: 0, dtype: int64 + a 2 + b 6 + Name: 1, dtype: int64 + Out[4]: + a b + 0 1 3 + 1 2 6 + +*New behavior*: + +.. ipython:: python + + In [4]: df.apply(func, axis=1) + a 1 + b 3 + Name: 0, dtype: int64 + a 2 + b 6 + Name: 1, dtype: int64 + Out[4]: + a b + 0 1 3 + 1 2 6 + + .. _whatsnew_110.deprecations: Deprecations diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f0f966bc9c10b..0a274d8becd72 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -263,7 +263,8 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": def apply_standard(self): - partial_result = None # partial result that may be returned from reduction. + # partial result that may be returned from reduction + partial_result = None # try to reduce first (by default) # this only matters if the reduction in values is of different dtype @@ -305,12 +306,12 @@ def apply_standard(self): else: if reduction_success: return self.obj._constructor_sliced(result, index=labels) - else: - # no exceptions - however reduction was unsuccessful, - # use the computed function result for first element - partial_result = result[0] - if isinstance(partial_result, ABCSeries): - partial_result = partial_result.infer_objects() + + # no exceptions - however reduction was unsuccessful, + # use the computed function result for first element + partial_result = result[0] + if isinstance(partial_result, ABCSeries): + partial_result = partial_result.infer_objects() # compute the result using the series generator, # use the result computed while trying to reduce if available. @@ -323,7 +324,6 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] series_gen = self.series_generator res_index = self.result_index - keys = [] results = {} # If a partial result was already computed, @@ -332,7 +332,6 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] if partial_result is not None: i, v = next(series_gen_enumeration) results[i] = partial_result - keys.append(v.name) if self.ignore_failures: successes = [] @@ -342,7 +341,6 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] except Exception: pass else: - keys.append(v.name) successes.append(i) # so will work with MultiIndex @@ -353,7 +351,6 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] for i, v in series_gen_enumeration: results[i] = self.f(v) - keys.append(v.name) return results, res_index From 58cefd6b525c9bd83401d79e735f8d7efd103cc5 Mon Sep 17 00:00:00 2001 From: alonme Date: Fri, 29 May 2020 13:44:59 +0300 Subject: [PATCH 11/15] Fix doc linting --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 08d9f6c49090b..f881d2761081e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -638,7 +638,7 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once .. ipython:: python In [1]: import pandas as pd - ...: df = pd.DataFrame({'a': [1,2], 'b': [3,6]}) + ...: df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) ...: In [2]: df From 07edc871f5f220c4e4835447cdcf61d64d5a93ab Mon Sep 17 00:00:00 2001 From: alonme Date: Fri, 29 May 2020 14:17:59 +0300 Subject: [PATCH 12/15] Fix code blocks in doc --- doc/source/whatsnew/v1.1.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f881d2761081e..46eb412caa665 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -635,7 +635,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma apply and applymap on ``DataFrame`` evaluates first row/column only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. ipython:: python +.. code-block:: ipython In [1]: import pandas as pd ...: df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) @@ -655,7 +655,7 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once *Previous behavior*: -.. ipython:: python +.. code-block:: ipython In [4]: df.apply(func, axis=1) a 1 @@ -674,7 +674,7 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once *New behavior*: -.. ipython:: python +.. code-block:: ipython In [4]: df.apply(func, axis=1) a 1 From 2cd8ee6cc841f52c2f3d450ed8f49d38cae0ba05 Mon Sep 17 00:00:00 2001 From: alonme Date: Sun, 31 May 2020 22:55:05 +0300 Subject: [PATCH 13/15] Add test for apply_raw with xfail --- pandas/tests/frame/test_apply.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 1879f9866b38a..d12699397d1e4 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -745,6 +745,28 @@ def non_reducing_function(row): df.apply(func, axis=1) assert names == list(df.index) + @pytest.mark.xfail( + reason="The 'run once' enhancement for apply_raw not implemented yet." + ) + def test_apply_raw_function_runs_once(self): + # https://github.com/pandas-dev/pandas/issues/34506 + + df = pd.DataFrame({"a": [1, 2, 3]}) + values = [] # Save row values function is applied to + + def reducing_function(row): + values.extend(row) + + def non_reducing_function(row): + values.extend(row) + return row + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.apply(func, raw=True, axis=1) + assert values == list(df.a.to_list()) + def test_applymap_function_runs_once(self): df = pd.DataFrame({"a": [1, 2, 3]}) From ec22529753f061821ff1b7c8ed57174d251099fc Mon Sep 17 00:00:00 2001 From: alonme Date: Tue, 2 Jun 2020 16:37:32 +0300 Subject: [PATCH 14/15] Fix whatsnew python blocks --- doc/source/whatsnew/v1.1.0.rst | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 46eb412caa665..70fe9c0c90bd8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -635,23 +635,13 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma apply and applymap on ``DataFrame`` evaluates first row/column only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code-block:: ipython - - In [1]: import pandas as pd - ...: df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) - ...: - - In [2]: df - Out[2]: - a b - 0 1 3 - 1 2 6 +.. ipython:: python - In [3]: def func(row): - ...: print(row) - ...: return row - ...: + df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) + def func(row): + print(row) + return row *Previous behavior*: @@ -674,19 +664,9 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once *New behavior*: -.. code-block:: ipython +.. ipython:: python - In [4]: df.apply(func, axis=1) - a 1 - b 3 - Name: 0, dtype: int64 - a 2 - b 6 - Name: 1, dtype: int64 - Out[4]: - a b - 0 1 3 - 1 2 6 + df.apply(func, axis=1) .. _whatsnew_110.deprecations: From 55db8716727c0769effd4ff597c1344a76e4e6e2 Mon Sep 17 00:00:00 2001 From: alonme Date: Tue, 2 Jun 2020 16:59:46 +0300 Subject: [PATCH 15/15] fix indent --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 70fe9c0c90bd8..86b97dad8ad10 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -640,8 +640,8 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]}) def func(row): - print(row) - return row + print(row) + return row *Previous behavior*: