Commit 6b46ee3

jbrockmendel authored and fangchenli committed
PERF: avoid creating many Series in apply_standard (pandas-dev#34909)
1 parent 3616377 commit 6b46ee3
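
The series_generator rewritten below yields row Series, so the change targets df.apply(func, axis=1), where apply_standard previously constructed a brand-new Series for every row before calling the user function. A minimal sketch of the kind of workload on that path, using only public API; the sizes and the function are illustrative, not taken from the PR:

# Hedged sketch (illustrative workload, not from the PR): row-wise apply is the
# path where per-row Series construction used to dominate.
import timeit

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10_000, 10))

def rowwise_apply(frame: pd.DataFrame) -> pd.Series:
    # each row is handed to the user function as a Series
    return frame.apply(lambda row: row.sum(), axis=1)

print(timeit.timeit(lambda: rowwise_apply(df), number=5))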

File tree

1 file changed: +48 -65 lines changed

pandas/core/apply.py

+48 -65
@@ -4,16 +4,13 @@
 
 import numpy as np
 
+from pandas._config import option_context
+
 from pandas._libs import reduction as libreduction
 from pandas._typing import Axis
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.common import (
-    is_dict_like,
-    is_extension_array_dtype,
-    is_list_like,
-    is_sequence,
-)
+from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
 from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
@@ -260,53 +257,6 @@ def apply_standard(self):
         # partial result that may be returned from reduction
         partial_result = None
 
-        # try to reduce first (by default)
-        # this only matters if the reduction in values is of different dtype
-        # e.g. if we want to apply to a SparseFrame, then can't directly reduce
-
-        # we cannot reduce using non-numpy dtypes,
-        # as demonstrated in gh-12244
-        if (
-            self.result_type in ["reduce", None]
-            and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow dtypes where setting _index_data will break
-            # ExtensionArray values, see GH#31182
-            and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any()
-            # Disallow complex_internals since libreduction shortcut raises a TypeError
-            and not self.agg_axis._has_complex_internals
-        ):
-
-            values = self.values
-            index = self.obj._get_axis(self.axis)
-            labels = self.agg_axis
-            empty_arr = np.empty(len(index), dtype=values.dtype)
-
-            # Preserve subclass for e.g. test_subclassed_apply
-            dummy = self.obj._constructor_sliced(
-                empty_arr, index=index, dtype=values.dtype
-            )
-
-            try:
-                result, reduction_success = libreduction.compute_reduction(
-                    values, self.f, axis=self.axis, dummy=dummy, labels=labels
-                )
-            except TypeError:
-                # e.g. test_apply_ignore_failures we just ignore
-                if not self.ignore_failures:
-                    raise
-            except ZeroDivisionError:
-                # reached via numexpr; fall back to python implementation
-                pass
-            else:
-                if reduction_success:
-                    return self.obj._constructor_sliced(result, index=labels)
-
-                # no exceptions - however reduction was unsuccessful,
-                # use the computed function result for first element
-                partial_result = result[0]
-                if isinstance(partial_result, ABCSeries):
-                    partial_result = partial_result.infer_objects()
-
         # compute the result using the series generator,
         # use the result computed while trying to reduce if available.
         results, res_index = self.apply_series_generator(partial_result)
@@ -344,7 +294,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]
         else:
             for i, v in series_gen_enumeration:
 
-                results[i] = self.f(v)
+                with option_context("mode.chained_assignment", None):
+                    # ignore SettingWithCopy here in case the user mutates
+                    results[i] = self.f(v)
+
+                if isinstance(results[i], ABCSeries):
+                    # If we have a view on v, we need to make a copy because
+                    #  series_generator will swap out the underlying data
+                    results[i] = results[i].copy(deep=False)
 
         return results, res_index
 
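Note on the copy(deep=False) added above: the new series_generator further down reuses a single Series object and rebinds the array it wraps between rows, so keeping a bare reference to the row Series would silently end up reflecting the last row. A minimal sketch of that aliasing, mimicking the rebinding through pandas internals (_mgr, Block.values); these attributes are not public API and may differ across pandas versions:

# Hedged sketch (not part of the commit): why Series results need a shallow
# copy when the yielded row Series is reused. series_generator rebinds the
# block's array rather than mutating it, so an earlier shallow copy keeps the
# old array while a bare reference follows the swap. The ._mgr / .blocks[0]
# access is pandas-internal and version-dependent; illustration only.
import numpy as np
import pandas as pd

ser = pd.Series(np.array([1, 2, 3]), index=list("abc"))

kept_reference = ser                  # no copy: aliases the reused object
kept_snapshot = ser.copy(deep=False)  # new Series wrapping the current array

# stand-in for series_generator swapping in the next row's data
ser._mgr.blocks[0].values = np.array([7, 8, 9])

print(kept_reference.tolist())  # [7, 8, 9] -- follows the reused Series
print(kept_snapshot.tolist())   # [1, 2, 3] -- still wraps the original array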

@@ -355,7 +312,6 @@ def wrap_results(
 
         # see if we can infer the results
         if len(results) > 0 and 0 in results and is_sequence(results[0]):
-
             return self.wrap_results_for_axis(results, res_index)
 
         # dict of scalars
@@ -395,9 +351,30 @@ def result_columns(self) -> "Index":
 
     def wrap_results_for_axis(
         self, results: ResType, res_index: "Index"
-    ) -> "DataFrame":
+    ) -> Union["Series", "DataFrame"]:
         """ return the results for the rows """
-        result = self.obj._constructor(data=results)
+
+        if self.result_type == "reduce":
+            # e.g. test_apply_dict GH#8735
+            return self.obj._constructor_sliced(results)
+        elif self.result_type is None and all(
+            isinstance(x, dict) for x in results.values()
+        ):
+            # Our operation was a to_dict op e.g.
+            #  test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196
+            return self.obj._constructor_sliced(results)
+
+        try:
+            result = self.obj._constructor(data=results)
+        except ValueError as err:
+            if "arrays must all be same length" in str(err):
+                # e.g. result = [[2, 3], [1.5], ['foo', 'bar']]
+                #  see test_agg_listlike_result GH#29587
+                res = self.obj._constructor_sliced(results)
+                res.index = res_index
+                return res
+            else:
+                raise
 
         if not isinstance(results[0], ABCSeries):
             if len(result.index) == len(self.res_columns):
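The new early returns cover row functions whose results cannot or should not be stacked into a DataFrame. A small public-API illustration of the cases named in the comments above; the data is made up, not taken from the referenced tests:

# Hedged sketch (made-up data): the result shapes handled by the new branches.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# All results are dicts -> wrapped as a Series of dicts (cf. test_apply_dict GH#8735)
as_dicts = df.apply(lambda row: row.to_dict(), axis=1)
print(as_dicts.iloc[0])  # {'a': 1, 'b': 3}

# List-likes of unequal length cannot form a DataFrame -> Series of lists
# (cf. test_agg_listlike_result GH#29587)
ragged = df.apply(lambda row: list(range(row["a"])), axis=1)
print(ragged.tolist())   # [[0], [0, 1]]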
@@ -418,11 +395,19 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
 
     @property
     def series_generator(self):
-        constructor = self.obj._constructor_sliced
-        return (
-            constructor(arr, index=self.columns, name=name)
-            for i, (arr, name) in enumerate(zip(self.values, self.index))
-        )
+        values = self.values
+        assert len(values) > 0
+
+        # We create one Series object, and will swap out the data inside
+        #  of it.  Kids: don't do this at home.
+        ser = self.obj._ixs(0, axis=0)
+        mgr = ser._mgr
+        blk = mgr.blocks[0]
+
+        for (arr, name) in zip(values, self.index):
+            blk.values = arr
+            ser.name = name
+            yield ser
 
     @property
     def result_index(self) -> "Index":
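For reference, the reuse trick above in a free-standing form: one row Series is built once and its block's array is swapped for every row, instead of constructing a fresh Series per row as the old generator expression did. It leans on the same internal attributes the diff touches (_ixs, _mgr, Block.values), which are not stable API, so treat it as an illustration tied to this era of pandas:

# Hedged sketch (not part of the commit): the single-Series reuse pattern from
# series_generator above, written as a standalone generator. The helper name
# reused_row_gen is made up; _ixs/_mgr/Block.values are pandas internals.
import numpy as np
import pandas as pd

def reused_row_gen(frame: pd.DataFrame):
    values = frame.values
    ser = frame._ixs(0, axis=0)   # one Series object, reused for every row
    blk = ser._mgr.blocks[0]
    for arr, name in zip(values, frame.index):
        blk.values = arr          # swap the underlying data instead of rebuilding
        ser.name = name
        yield ser

df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("abc"))
# Callers must not keep the yielded object across iterations, which is why
# apply_series_generator above stores a copy(deep=False) of Series results.
print([row.sum() for row in reused_row_gen(df)])  # [3, 12, 21, 30]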
@@ -444,9 +429,7 @@ def wrap_results_for_axis(
 
         # we have a non-series and don't want inference
         elif not isinstance(results[0], ABCSeries):
-            from pandas import Series
-
-            result = Series(results)
+            result = self.obj._constructor_sliced(results)
             result.index = res_index
 
         # we may want to infer results

0 commit comments