From f09fa5fc5c220949786ab4e0029864df23a39e0b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 09:12:24 -0700 Subject: [PATCH 1/6] PERF: avoid creating many Series in series_generator --- pandas/core/apply.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 0a274d8becd72..98e1f209ab42b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -424,11 +424,19 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": @property def series_generator(self): - constructor = self.obj._constructor_sliced - return ( - constructor(arr, index=self.columns, name=name) - for i, (arr, name) in enumerate(zip(self.values, self.index)) - ) + values = self.values + assert len(values) > 0 + + # We create one Series object, and will swap out the data inside + # of it. Kids: don't do this at home. + ser = self.obj._ixs(0, axis=0) + mgr = ser._mgr + blk = mgr.blocks[0] + + for (arr, name) in zip(values, self.index): + blk.values = arr + ser.name = name + yield ser @property def result_index(self) -> "Index": From 107094da7c2e0563cecde61b093eff0dc0d8b75a Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 09:19:11 -0700 Subject: [PATCH 2/6] CLN: dont bother with compute_reduction --- pandas/core/apply.py | 47 -------------------------------------------- 1 file changed, 47 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 98e1f209ab42b..80516408d1839 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -266,53 +266,6 @@ def apply_standard(self): # partial result that may be returned from reduction partial_result = None - # try to reduce first (by default) - # this only matters if the reduction in values is of different dtype - # e.g. if we want to apply to a SparseFrame, then can't directly reduce - - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if ( - self.result_type in ["reduce", None] - and not self.dtypes.apply(is_extension_array_dtype).any() - # Disallow dtypes where setting _index_data will break - # ExtensionArray values, see GH#31182 - and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any() - # Disallow complex_internals since libreduction shortcut raises a TypeError - and not self.agg_axis._has_complex_internals - ): - - values = self.values - index = self.obj._get_axis(self.axis) - labels = self.agg_axis - empty_arr = np.empty(len(index), dtype=values.dtype) - - # Preserve subclass for e.g. test_subclassed_apply - dummy = self.obj._constructor_sliced( - empty_arr, index=index, dtype=values.dtype - ) - - try: - result, reduction_success = libreduction.compute_reduction( - values, self.f, axis=self.axis, dummy=dummy, labels=labels - ) - except TypeError: - # e.g. test_apply_ignore_failures we just ignore - if not self.ignore_failures: - raise - except ZeroDivisionError: - # reached via numexpr; fall back to python implementation - pass - else: - if reduction_success: - return self.obj._constructor_sliced(result, index=labels) - - # no exceptions - however reduction was unsuccessful, - # use the computed function result for first element - partial_result = result[0] - if isinstance(partial_result, ABCSeries): - partial_result = partial_result.infer_objects() - # compute the result using the series generator, # use the result computed while trying to reduce if available. results, res_index = self.apply_series_generator(partial_result) From c17e82bd9a5c863d12cf7ebdfb9aad5d149879bd Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 09:44:54 -0700 Subject: [PATCH 3/6] CLN: remove unused import --- pandas/core/apply.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 80516408d1839..9b65cd046cc02 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -8,12 +8,7 @@ from pandas._typing import Axis from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import ( - is_dict_like, - is_extension_array_dtype, - is_list_like, - is_sequence, -) +from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype From 0534517823f253281aa0bd79b2fd03da50eecdd4 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 13:39:00 -0700 Subject: [PATCH 4/6] Troubleshoot test failures --- pandas/core/apply.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9b65cd046cc02..bedcf56a762a8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -4,6 +4,8 @@ import numpy as np +from pandas._config import option_context + from pandas._libs import reduction as libreduction from pandas._typing import Axis from pandas.util._decorators import cache_readonly @@ -298,7 +300,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] else: for i, v in series_gen_enumeration: - results[i] = self.f(v) + with option_context("mode.chained_assignment", None): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index @@ -309,7 +318,6 @@ def wrap_results( # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis(results, res_index) # dict of scalars @@ -351,7 +359,24 @@ def wrap_results_for_axis( self, results: ResType, res_index: "Index" ) -> "DataFrame": """ return the results for the rows """ - result = self.obj._constructor(data=results) + try: + result = self.obj._constructor(data=results) + except ValueError as err: + if "arrays must all be same length" in str(err): + # e.g. result = [[2, 3], [1.5], ['foo', 'bar']] + if isinstance(results, dict): + # e.g. test_agg_listlike_result GH#29587 + arr = results + else: + arr = np.empty(len(results), dtype=object) + arr[:] = results + + result = self.obj._constructor_sliced(data=arr) + if len(result) == len(res_index): + result.index = res_index + return result + else: + raise if not isinstance(results[0], ABCSeries): if len(result.index) == len(self.res_columns): @@ -406,9 +431,7 @@ def wrap_results_for_axis( # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): - from pandas import Series - - result = Series(results) + result = self.obj._constructor_sliced(results) result.index = res_index # we may want to infer results From fcead318ab98e8f712233af872b908f36e2b02cc Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 15:29:19 -0700 Subject: [PATCH 5/6] Fix tests --- pandas/core/apply.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index bedcf56a762a8..be5323ad5106c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -359,21 +359,25 @@ def wrap_results_for_axis( self, results: ResType, res_index: "Index" ) -> "DataFrame": """ return the results for the rows """ + + if self.result_type == "reduce": + # e.g. test_apply_dict GH#8735 + return self.obj._constructor_sliced(results) + elif self.result_type is None and all( + isinstance(x, dict) for x in results.values() + ): + # Our operation was a to_dict op e.g. + # test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196 + return self.obj._constructor_sliced(results) + try: result = self.obj._constructor(data=results) except ValueError as err: if "arrays must all be same length" in str(err): # e.g. result = [[2, 3], [1.5], ['foo', 'bar']] - if isinstance(results, dict): - # e.g. test_agg_listlike_result GH#29587 - arr = results - else: - arr = np.empty(len(results), dtype=object) - arr[:] = results - - result = self.obj._constructor_sliced(data=arr) - if len(result) == len(res_index): - result.index = res_index + # see test_agg_listlike_result GH#29587 + result = self.obj._constructor_sliced(results) + result.index = res_index return result else: raise From dbdc9152c3f08337aa1258003d093a77ce8b384b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Jun 2020 15:55:05 -0700 Subject: [PATCH 6/6] mypy fixup --- pandas/core/apply.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index be5323ad5106c..e39915cdce2e7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -357,7 +357,7 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> "DataFrame": + ) -> Union["Series", "DataFrame"]: """ return the results for the rows """ if self.result_type == "reduce": @@ -376,9 +376,9 @@ def wrap_results_for_axis( if "arrays must all be same length" in str(err): # e.g. result = [[2, 3], [1.5], ['foo', 'bar']] # see test_agg_listlike_result GH#29587 - result = self.obj._constructor_sliced(results) - result.index = res_index - return result + res = self.obj._constructor_sliced(results) + res.index = res_index + return res else: raise