
Commit 9a57f45

BUG/ENH: Fix apply to only call func once on the first column/row (#34183)
1 parent b3c3126 commit 9a57f45

File tree

6 files changed: +157 -39 lines changed

doc/source/whatsnew/v1.1.0.rst  +39

@@ -665,6 +665,45 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``
 
     df.groupby("a", as_index=False).nunique()
 
+.. _whatsnew_110.api_breaking.apply_applymap_first_once:
+
+apply and applymap on ``DataFrame`` evaluates first row/column only once
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 2], 'b': [3, 6]})
+
+    def func(row):
+        print(row)
+        return row
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [4]: df.apply(func, axis=1)
+    a    1
+    b    3
+    Name: 0, dtype: int64
+    a    1
+    b    3
+    Name: 0, dtype: int64
+    a    2
+    b    6
+    Name: 1, dtype: int64
+    Out[4]:
+       a  b
+    0  1  3
+    1  2  6
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.apply(func, axis=1)
+
+
 .. _whatsnew_110.deprecations:
 
 Deprecations
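
A quick way to observe the change described above (a minimal sketch, assuming pandas >= 1.1.0 is installed) is to count how many times the applied function runs per row label; under the previous behavior the first label would be counted twice:

import pandas as pd
from collections import Counter

df = pd.DataFrame({"a": [1, 2], "b": [3, 6]})
calls = Counter()

def func(row):
    calls[row.name] += 1  # side effect: count invocations per row label
    return row

df.apply(func, axis=1)

# With this change every row is visited exactly once; previously the
# first row was evaluated twice while pandas inspected the result shape.
assert calls == Counter({0: 1, 1: 1})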

pandas/_libs/reduction.pyx  +20 -5

@@ -107,6 +107,7 @@ cdef class Reducer:
 
         result = np.empty(self.nresults, dtype='O')
         it = <flatiter>PyArray_IterNew(result)
+        reduction_success = True
 
         try:
             for i in range(self.nresults):
@@ -134,21 +135,35 @@
                 res = self.f(chunk)
 
                 # TODO: reason for not squeezing here?
-                res = _extract_result(res, squeeze=False)
+                extracted_res = _extract_result(res, squeeze=False)
                 if i == 0:
                     # On the first pass, we check the output shape to see
                     # if this looks like a reduction.
-                    _check_result_array(res, len(self.dummy))
-
-                PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
+                    # If it does not, return the computed value to be used by the
+                    # pure python implementation,
+                    # so the function won't be called twice on the same object,
+                    # and side effects would occur twice
+                    try:
+                        _check_result_array(extracted_res, len(self.dummy))
+                    except ValueError as err:
+                        if "Function does not reduce" not in str(err):
+                            # catch only the specific exception
+                            raise
+
+                        reduction_success = False
+                        PyArray_SETITEM(result, PyArray_ITER_DATA(it), copy(res))
+                        break
+
+                PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res)
                 chunk.data = chunk.data + self.increment
                 PyArray_ITER_NEXT(it)
+
         finally:
             # so we don't free the wrong memory
             chunk.data = dummy_buf
 
         result = maybe_convert_objects(result)
-        return result
+        return result, reduction_success
 
 
 cdef class _BaseGrouper:
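
For readers who do not follow Cython easily, the control flow above can be sketched in plain Python. This is only an illustration of the idea, not the pandas implementation: `func` stands for `self.f`, `chunks` for the per-row/column views, and `check_reduces` for `_check_result_array`.

def compute_reduction_sketch(chunks, func, check_reduces):
    # Illustrative stand-in for Reducer.get_result after this commit.
    results = []
    reduction_success = True

    for i, chunk in enumerate(chunks):
        res = func(chunk)
        if i == 0 and not check_reduces(res):
            # Not a reduction: stop early, but keep the value that was
            # already computed so the caller can reuse it instead of
            # calling `func` on the first chunk a second time.
            reduction_success = False
            results.append(res)
            break
        results.append(res)

    # Callers now receive a success flag instead of a raised ValueError.
    return results, reduction_success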

pandas/core/apply.py  +33 -22

@@ -220,14 +220,12 @@ def apply_empty_result(self):
 
     def apply_raw(self):
         """ apply to the values as a numpy array """
-        try:
-            result = libreduction.compute_reduction(self.values, self.f, axis=self.axis)
-        except ValueError as err:
-            if "Function does not reduce" not in str(err):
-                # catch only ValueError raised intentionally in libreduction
-                raise
-            # We expect np.apply_along_axis to give a two-dimensional result, or
-            # also raise.
+        result, reduction_success = libreduction.compute_reduction(
+            self.values, self.f, axis=self.axis
+        )
+
+        # We expect np.apply_along_axis to give a two-dimensional result, or raise.
+        if not reduction_success:
             result = np.apply_along_axis(self.f, self.axis, self.values)
 
         # TODO: mixed type case
@@ -265,6 +263,9 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
 
     def apply_standard(self):
 
+        # partial result that may be returned from reduction
+        partial_result = None
+
         # try to reduce first (by default)
         # this only matters if the reduction in values is of different dtype
         # e.g. if we want to apply to a SparseFrame, then can't directly reduce
@@ -292,13 +293,9 @@
             )
 
             try:
-                result = libreduction.compute_reduction(
+                result, reduction_success = libreduction.compute_reduction(
                     values, self.f, axis=self.axis, dummy=dummy, labels=labels
                 )
-            except ValueError as err:
-                if "Function does not reduce" not in str(err):
-                    # catch only ValueError raised intentionally in libreduction
-                    raise
             except TypeError:
                 # e.g. test_apply_ignore_failures we just ignore
                 if not self.ignore_failures:
@@ -307,39 +304,53 @@
                 # reached via numexpr; fall back to python implementation
                 pass
             else:
-                return self.obj._constructor_sliced(result, index=labels)
+                if reduction_success:
+                    return self.obj._constructor_sliced(result, index=labels)
 
-        # compute the result using the series generator
-        results, res_index = self.apply_series_generator()
+                # no exceptions - however reduction was unsuccessful,
+                # use the computed function result for first element
+                partial_result = result[0]
+                if isinstance(partial_result, ABCSeries):
+                    partial_result = partial_result.infer_objects()
+
+        # compute the result using the series generator,
+        # use the result computed while trying to reduce if available.
+        results, res_index = self.apply_series_generator(partial_result)
 
         # wrap results
         return self.wrap_results(results, res_index)
 
-    def apply_series_generator(self) -> Tuple[ResType, "Index"]:
+    def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]:
         series_gen = self.series_generator
         res_index = self.result_index
 
-        keys = []
         results = {}
+
+        # If a partial result was already computed,
+        # use it instead of running on the first element again
+        series_gen_enumeration = enumerate(series_gen)
+        if partial_result is not None:
+            i, v = next(series_gen_enumeration)
+            results[i] = partial_result
+
         if self.ignore_failures:
             successes = []
-            for i, v in enumerate(series_gen):
+            for i, v in series_gen_enumeration:
                 try:
                     results[i] = self.f(v)
                 except Exception:
                     pass
                 else:
-                    keys.append(v.name)
                     successes.append(i)
 
             # so will work with MultiIndex
             if len(successes) < len(res_index):
                 res_index = res_index.take(successes)
 
         else:
-            for i, v in enumerate(series_gen):
+            for i, v in series_gen_enumeration:
+
                 results[i] = self.f(v)
-                keys.append(v.name)
 
         return results, res_index
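
The generator handling in `apply_series_generator` is the core trick: wrapping the series generator in a single `enumerate()` means that consuming its first item up front (when a partial result already exists) lets the main loop continue from the second element. A self-contained sketch of the same pattern, using illustrative names rather than pandas internals:

def apply_with_partial(values, func, partial_result=None):
    results = {}
    gen = enumerate(values)  # one shared enumeration, as in apply_series_generator

    if partial_result is not None:
        i, _ = next(gen)             # consume element 0 ...
        results[i] = partial_result  # ... and reuse the value already computed

    for i, v in gen:                 # continues from element 1
        results[i] = func(v)
    return results

# func is called only for 20 and 30; the result for 10 is reused.
out = apply_with_partial([10, 20, 30], lambda x: x * 2, partial_result=20)
assert out == {0: 20, 1: 40, 2: 60}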

pandas/core/frame.py  -8

@@ -7526,14 +7526,6 @@ def applymap(self, func) -> "DataFrame":
         --------
         DataFrame.apply : Apply a function along input axis of DataFrame.
 
-        Notes
-        -----
-        In the current implementation applymap calls `func` twice on the
-        first column/row to decide whether it can take a fast or slow
-        code path. This can lead to unexpected behavior if `func` has
-        side-effects, as they will take effect twice for the first
-        column/row.
-
         Examples
         --------
         >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
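
The removed note described behavior that no longer exists after this commit. A small check of the new contract (a sketch, assuming pandas >= 1.1.0; on older versions the first element is recorded twice):

import pandas as pd

df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
seen = []

def square(x):
    seen.append(x)  # side effect: record every value the function sees
    return x ** 2

df.applymap(square)

# Each of the four elements is now visited exactly once.
assert len(seen) == df.size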

pandas/tests/frame/test_apply.py  +62 -1

@@ -718,12 +718,73 @@ def apply_list(row):
 
     def test_apply_noreduction_tzaware_object(self):
         # https://github.com/pandas-dev/pandas/issues/31505
-        df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object")
+        df = pd.DataFrame(
+            {"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
+        )
         result = df.apply(lambda x: x)
         tm.assert_frame_equal(result, df)
         result = df.apply(lambda x: x.copy())
         tm.assert_frame_equal(result, df)
 
+    def test_apply_function_runs_once(self):
+        # https://github.com/pandas-dev/pandas/issues/30815
+
+        df = pd.DataFrame({"a": [1, 2, 3]})
+        names = []  # Save row names function is applied to
+
+        def reducing_function(row):
+            names.append(row.name)
+
+        def non_reducing_function(row):
+            names.append(row.name)
+            return row
+
+        for func in [reducing_function, non_reducing_function]:
+            del names[:]
+
+            df.apply(func, axis=1)
+            assert names == list(df.index)
+
+    @pytest.mark.xfail(
+        reason="The 'run once' enhancement for apply_raw not implemented yet."
+    )
+    def test_apply_raw_function_runs_once(self):
+        # https://github.com/pandas-dev/pandas/issues/34506
+
+        df = pd.DataFrame({"a": [1, 2, 3]})
+        values = []  # Save row values function is applied to
+
+        def reducing_function(row):
+            values.extend(row)
+
+        def non_reducing_function(row):
+            values.extend(row)
+            return row
+
+        for func in [reducing_function, non_reducing_function]:
+            del values[:]
+
+            df.apply(func, raw=True, axis=1)
+            assert values == list(df.a.to_list())
+
+    def test_applymap_function_runs_once(self):
+
+        df = pd.DataFrame({"a": [1, 2, 3]})
+        values = []  # Save values function is applied to
+
+        def reducing_function(val):
+            values.append(val)
+
+        def non_reducing_function(val):
+            values.append(val)
+            return val
+
+        for func in [reducing_function, non_reducing_function]:
+            del values[:]
+
+            df.applymap(func)
+            assert values == df.a.to_list()
+
 
 class TestInferOutputShape:
     # the user has supplied an opaque UDF where
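
Outside the test suite, the behavior pinned down by `test_apply_function_runs_once` can be reproduced in a few lines (assuming pandas >= 1.1.0); the xfailed `test_apply_raw_function_runs_once` records that the `raw=True` path is not covered by this fix yet:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
names = []

def non_reducing_function(row):
    names.append(row.name)  # side effect now observed once per row
    return row

df.apply(non_reducing_function, axis=1)
print(names)  # pandas >= 1.1.0: [0, 1, 2]; on 1.0.x the first label appeared twice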

pandas/tests/groupby/test_bin_groupby.py  +3 -3

@@ -153,20 +153,20 @@ def test_int_index(self):
         )
 
         dummy = Series(0.0, index=np.arange(100))
-        result = libreduction.compute_reduction(
+        result, _ = libreduction.compute_reduction(
             arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
         )
         expected = arr.sum(0)
         tm.assert_almost_equal(result, expected)
 
         dummy = Series(0.0, index=np.arange(4))
-        result = libreduction.compute_reduction(
+        result, _ = libreduction.compute_reduction(
             arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
         )
         expected = arr.sum(1)
         tm.assert_almost_equal(result, expected)
 
-        result = libreduction.compute_reduction(
+        result, _ = libreduction.compute_reduction(
             arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
         )
         tm.assert_almost_equal(result, expected)
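
These test edits only track the new return signature: `compute_reduction` now returns a `(result, reduction_success)` pair instead of raising ValueError("Function does not reduce"), so callers that only want the reduced values unpack and discard the flag. A toy illustration of the same calling convention (not the pandas implementation):

import numpy as np

def compute(arr, func):
    # Toy stand-in for the new contract: always return (result, success).
    res = func(arr)
    success = np.ndim(res) < np.ndim(arr)  # a reduction drops a dimension
    return res, success

arr = np.random.randn(100, 4)
result, _ = compute(arr, lambda a: a.sum(axis=0))  # flag discarded, as in the tests above
np.testing.assert_allclose(result, arr.sum(axis=0))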
