Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit fd6b05b

Browse files
committed Dec 13, 2017
CLN: factor apply out of frame.py
1 parent d2fd22e commit fd6b05b

File tree

3 files changed

+372
-248
lines changed

3 files changed

+372
-248
lines changed
 

‎pandas/core/apply.py

Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
import numpy as np
2+
from pandas import compat
3+
from pandas._libs import lib
4+
from pandas.core.dtypes.common import (
5+
is_extension_type,
6+
is_sequence)
7+
8+
from pandas.io.formats.printing import pprint_thing
9+
10+
11+
def frame_apply(obj, func, axis=0, broadcast=False,
                raw=False, reduce=None, args=(), **kwds):
    """
    Construct and return a row or column based frame apply object.

    Parameters
    ----------
    obj : object providing ``_get_axis_number`` (the frame being applied to)
    func : object to apply; forwarded untouched to the apply class
    axis : axis identifier, normalised through ``obj._get_axis_number``
    broadcast, raw, reduce, args : forwarded untouched to the apply class
    **kwds : collected into a dict and forwarded as ``kwds``

    Returns
    -------
    FrameRowApply when the normalised axis is 0,
    FrameColumnApply when it is 1.

    Raises
    ------
    AssertionError
        If the normalised axis is neither 0 nor 1.
    """

    axis = obj._get_axis_number(axis)
    if axis == 0:
        klass = FrameRowApply
    elif axis == 1:
        klass = FrameColumnApply
    else:  # pragma: no cover
        # without this branch ``klass`` would be unbound and the call below
        # would fail with a confusing UnboundLocalError
        raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))

    return klass(obj, func, broadcast=broadcast,
                 raw=raw, reduce=reduce, args=args, kwds=kwds)
23+
24+
25+
class FrameApply(object):
    """
    Shared state and machinery for applying a function along one axis.

    Subclasses are expected to provide the class attribute ``axis`` and the
    members ``apply_broadcast``, ``series_generator``, ``result_index`` and
    ``result_columns``; this class only reads them.

    Parameters
    ----------
    obj : the object being applied over
    func : callable, numpy ufunc, string, list or dict
    broadcast : bool
        when truthy ``get_result`` returns ``apply_broadcast()``
    raw : bool
        when truthy (and obj is not mixed type) ``apply_raw`` is used
    reduce : bool or None
        None means "decide per code path" (see ``apply_empty_result`` and
        ``apply_standard``)
    args : tuple
        extra positional arguments for ``func``
    kwds : dict
        extra keyword arguments for ``func``; the key 'ignore_failures' is
        popped from it and stored on the instance
    """

    def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
        self.obj = obj
        self.broadcast = broadcast
        self.raw = raw
        self.reduce = reduce
        self.args = args

        self.ignore_failures = kwds.pop('ignore_failures', False)
        self.kwds = kwds

        # keep the un-curried function; needed wherever args/kwds are
        # passed explicitly (see apply_empty_result)
        self.func = func

        # curry if needed
        # Only callables are wrapped: a string / list / dict must stay as-is,
        # otherwise the isinstance based dispatch on ``self.f`` (string
        # dispatch, aggregate dispatch) can never match once kwds are given.
        # The ufunc rule keeps the historical precedence:
        # ``kwds or (args and not ufunc)``.
        if callable(func) and (
                kwds or (args and not isinstance(func, np.ufunc))):
            def f(x):
                return func(x, *args, **kwds)
        else:
            f = func

        self.f = f

    @property
    def columns(self):
        """ columns of the underlying object """
        return self.obj.columns

    @property
    def index(self):
        """ index of the underlying object """
        return self.obj.index

    @property
    def values(self):
        """ values of the underlying object """
        return self.obj.values

    @property
    def agg_axis(self):
        """ labels of the aggregated result, as given by the object """
        return self.obj._get_agg_axis(self.axis)

    def get_result(self):
        """
        Compute the results.

        The checks are made in this order: completely empty object, string
        dispatch, ufunc, broadcast, one empty axis, raw, standard.
        """

        # all empty
        if len(self.columns) == 0 and len(self.index) == 0:
            return self.apply_empty_result()

        # string dispatch
        if isinstance(self.f, compat.string_types):
            if self.axis:
                self.kwds['axis'] = self.axis
            return getattr(self.obj, self.f)(*self.args, **self.kwds)

        # ufunc
        elif isinstance(self.f, np.ufunc):
            with np.errstate(all='ignore'):
                results = self.f(self.values)
            return self.obj._constructor(data=results, index=self.index,
                                         columns=self.columns, copy=False)

        # broadcasting
        if self.broadcast:
            return self.apply_broadcast()

        # one axis empty
        if not all(self.obj.shape):
            return self.apply_empty_result()

        # raw
        if self.raw and not self.obj._is_mixed_type:
            return self.apply_raw()

        return self.apply_standard()

    def apply_empty_result(self):
        """
        Build the result for an object with at least one empty axis.

        When ``reduce`` is None the outcome is guessed by calling the
        function on an empty Series; any exception raised while guessing is
        ignored and the result falls back to a copy of the object.

        Returns
        -------
        Series of NaN indexed by ``agg_axis`` when reducing, otherwise a
        copy of the object.
        """
        from pandas import Series
        reduce = self.reduce

        if reduce is None:
            reduce = False

            EMPTY_SERIES = Series([])
            try:
                # call the raw function: ``self.f`` may already have
                # args/kwds bound, passing them a second time would raise a
                # TypeError that is swallowed below and defeat the inference
                r = self.func(EMPTY_SERIES, *self.args, **self.kwds)
                reduce = not isinstance(r, Series)
            except Exception:
                # deliberate best effort: a failing probe means "no reduce"
                pass

        if reduce:
            return Series(np.nan, index=self.agg_axis)
        else:
            return self.obj.copy()

    def apply_raw(self):
        """
        Apply the function to the raw values.

        ``lib.reduce`` is tried first; on any exception
        ``np.apply_along_axis`` is used instead.

        Returns
        -------
        DataFrame for a 2-dimensional result, otherwise a Series indexed by
        ``agg_axis``.
        """
        try:
            result = lib.reduce(self.values, self.f, axis=self.axis)
        except Exception:
            result = np.apply_along_axis(self.f, self.axis, self.values)

        # TODO: mixed type case
        from pandas import DataFrame, Series
        if result.ndim == 2:
            return DataFrame(result, index=self.index, columns=self.columns)
        else:
            return Series(result, index=self.agg_axis)

    def apply_standard(self):
        """
        Apply the function Series by Series.

        Unless ``reduce`` is falsy (None counts as True here) a direct
        reduction through ``lib.reduce`` is attempted first; if that raises,
        the series generator path is used and its output wrapped.
        """
        from pandas import Series

        reduce = self.reduce
        if reduce is None:
            reduce = True

        # try to reduce first (by default)
        # this only matters if the reduction in values is of different dtype
        # e.g. if we want to apply to a SparseFrame, then can't directly reduce
        if reduce:
            values = self.values

            # we cannot reduce using non-numpy dtypes,
            # as demonstrated in gh-12244
            if not is_extension_type(values):

                # Create a dummy Series from an empty array
                index = self.obj._get_axis(self.axis)
                empty_arr = np.empty(len(index), dtype=values.dtype)

                dummy = Series(empty_arr, index=index, dtype=values.dtype)

                try:
                    labels = self.agg_axis
                    result = lib.reduce(values, self.f,
                                        axis=self.axis,
                                        dummy=dummy,
                                        labels=labels)
                    return Series(result, index=labels)
                except Exception:
                    # fall through to the generic path below
                    pass

        # compute the result using the series generator
        results, res_index, res_columns = self._apply_series_generator()

        # wrap results
        return self.wrap_results(results, res_index, res_columns)

    def _apply_series_generator(self):
        """
        Call the function on every Series from ``series_generator``.

        Returns
        -------
        (results, res_index, res_columns) where ``results`` is a dict keyed
        by the position of each Series. With ``ignore_failures`` the
        positions that raised are left out and ``res_index`` is reduced to
        the successful positions. Otherwise the first exception is re-raised
        with the offending label appended to its ``args``.
        """
        series_gen = self.series_generator
        res_index = self.result_index
        res_columns = self.result_columns

        i = None
        results = {}
        if self.ignore_failures:
            successes = []
            for i, v in enumerate(series_gen):
                try:
                    results[i] = self.f(v)
                    successes.append(i)
                except Exception:
                    pass

            # so will work with MultiIndex
            if len(successes) < len(res_index):
                res_index = res_index.take(successes)

        else:
            try:
                for i, v in enumerate(series_gen):
                    results[i] = self.f(v)
            except Exception as e:
                if hasattr(e, 'args'):

                    # make sure i is defined
                    if i is not None:
                        k = res_index[i]
                        e.args = e.args + ('occurred at index %s' %
                                           pprint_thing(k), )
                raise

        return results, res_index, res_columns

    def wrap_results(self, results, res_index, res_columns):
        """
        Turn the dict produced by ``_apply_series_generator`` into a frame
        (when the individual results are sequences) or a Series.
        """
        from pandas import Series

        # Inspect the lowest available position rather than key 0: with
        # ignore_failures position 0 may be missing from ``results``.
        first = results[min(results)] if len(results) > 0 else None

        if len(results) > 0 and is_sequence(first):
            if not isinstance(first, Series):
                index = res_columns
            else:
                index = None

            result = self.obj._constructor(data=results, index=index)
            result.columns = res_index

            if self.axis == 1:
                result = result.T
            result = result._convert(
                datetime=True, timedelta=True, copy=False)

        else:

            result = Series(results)
            result.index = res_index

        return result

    def _apply_broadcast(self, target):
        """
        Fill an array shaped like ``target.values`` column by column with
        the function's output and rebuild an object with target's labels.
        """
        result_values = np.empty_like(target.values)
        columns = target.columns
        for i, col in enumerate(columns):
            result_values[:, i] = self.f(target[col])

        result = self.obj._constructor(result_values, index=target.index,
                                       columns=target.columns)
        return result
239+
240+
241+
class FrameRowApply(FrameApply):
    """ apply object for axis 0: the function sees one column at a time """
    axis = 0

    def get_result(self):
        """ send list / dict functions to aggregate, defer otherwise """
        func = self.f

        if not isinstance(func, (list, dict)):
            return super(FrameRowApply, self).get_result()

        # dispatch to agg
        return self.obj.aggregate(func, axis=self.axis,
                                  *self.args, **self.kwds)

    def apply_broadcast(self):
        """ broadcast over the object as it is """
        target = self.obj
        return self._apply_broadcast(target)

    @property
    def series_generator(self):
        """ lazily yield ``obj._ixs(i, axis=1)`` for each column position """
        positions = range(len(self.columns))
        return (self.obj._ixs(pos, axis=1) for pos in positions)

    @property
    def result_index(self):
        """ results are labelled by the columns """
        return self.columns

    @property
    def result_columns(self):
        """ sequence-like results are laid out along the index """
        return self.index
268+
269+
270+
class FrameColumnApply(FrameApply):
    """ apply object for axis 1: the function sees one row at a time """
    axis = 1

    def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
        super(FrameColumnApply, self).__init__(obj, func, broadcast,
                                               raw, reduce, args, kwds)

        # skip if we are mixed datelike and trying reduce across axes
        # GH6125
        # NOTE: the guard tests truthiness, so reduce=None (the default)
        # leaves self.reduce untouched here.
        frame = self.obj
        if (self.reduce and frame._is_mixed_type and
                frame._is_datelike_mixed_type):
            self.reduce = False

    def apply_broadcast(self):
        """ broadcast over the transposed object, then transpose back """
        transposed = self.obj.T
        return self._apply_broadcast(transposed).T

    @property
    def series_generator(self):
        """
        Lazily build one Series per (row of values, index label) pair,
        indexed by the columns; object dtype is forced for mixed frames.
        """
        from pandas import Series

        if self.obj._is_mixed_type:
            dtype = object
        else:
            dtype = None

        pairs = zip(self.values, self.index)
        return (Series._from_array(arr, index=self.columns, name=name,
                                   dtype=dtype)
                for arr, name in pairs)

    @property
    def result_index(self):
        """ results are labelled by the index """
        return self.index

    @property
    def result_columns(self):
        """ sequence-like results are laid out along the columns """
        return self.columns

‎pandas/core/frame.py

Lines changed: 67 additions & 246 deletions
Original file line numberDiff line numberDiff line change
@@ -4808,256 +4808,79 @@ def aggregate(self, func, axis=0, *args, **kwargs):
48084808

48094809
agg = aggregate
48104810

4811-
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
4812-
args=(), **kwds):
4813-
"""
4814-
Applies function along input axis of DataFrame.
4815-
4816-
Objects passed to functions are Series objects having index
4817-
either the DataFrame's index (axis=0) or the columns (axis=1).
4818-
Return type depends on whether passed function aggregates, or the
4819-
reduce argument if the DataFrame is empty.
4820-
4821-
Parameters
4822-
----------
4823-
func : function
4824-
Function to apply to each column/row
4825-
axis : {0 or 'index', 1 or 'columns'}, default 0
4826-
* 0 or 'index': apply function to each column
4827-
* 1 or 'columns': apply function to each row
4828-
broadcast : boolean, default False
4829-
For aggregation functions, return object of same size with values
4830-
propagated
4831-
raw : boolean, default False
4832-
If False, convert each row or column into a Series. If raw=True the
4833-
passed function will receive ndarray objects instead. If you are
4834-
just applying a NumPy reduction function this will achieve much
4835-
better performance
4836-
reduce : boolean or None, default None
4837-
Try to apply reduction procedures. If the DataFrame is empty,
4838-
apply will use reduce to determine whether the result should be a
4839-
Series or a DataFrame. If reduce is None (the default), apply's
4840-
return value will be guessed by calling func an empty Series (note:
4841-
while guessing, exceptions raised by func will be ignored). If
4842-
reduce is True a Series will always be returned, and if False a
4843-
DataFrame will always be returned.
4844-
args : tuple
4845-
Positional arguments to pass to function in addition to the
4846-
array/series
4847-
Additional keyword arguments will be passed as keywords to the function
4848-
4849-
Notes
4850-
-----
4851-
In the current implementation apply calls func twice on the
4852-
first column/row to decide whether it can take a fast or slow
4853-
code path. This can lead to unexpected behavior if func has
4854-
side-effects, as they will take effect twice for the first
4855-
column/row.
4856-
4857-
Examples
4858-
--------
4859-
>>> df.apply(numpy.sqrt) # returns DataFrame
4860-
>>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0)
4861-
>>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1)
4862-
4863-
See also
4864-
--------
4865-
DataFrame.applymap: For elementwise operations
4866-
DataFrame.aggregate: only perform aggregating type operations
4867-
DataFrame.transform: only perform transformating type operations
4868-
4869-
Returns
4870-
-------
4871-
applied : Series or DataFrame
4872-
"""
4873-
axis = self._get_axis_number(axis)
4874-
ignore_failures = kwds.pop('ignore_failures', False)
4875-
4876-
# dispatch to agg
4877-
if axis == 0 and isinstance(func, (list, dict)):
4878-
return self.aggregate(func, axis=axis, *args, **kwds)
4879-
4880-
if len(self.columns) == 0 and len(self.index) == 0:
4881-
return self._apply_empty_result(func, axis, reduce, *args, **kwds)
4882-
4883-
# if we are a string, try to dispatch
4884-
if isinstance(func, compat.string_types):
4885-
if axis:
4886-
kwds['axis'] = axis
4887-
return getattr(self, func)(*args, **kwds)
4888-
4889-
if kwds or args and not isinstance(func, np.ufunc):
4890-
def f(x):
4891-
return func(x, *args, **kwds)
4892-
else:
4893-
f = func
4894-
4895-
if isinstance(f, np.ufunc):
4896-
with np.errstate(all='ignore'):
4897-
results = f(self.values)
4898-
return self._constructor(data=results, index=self.index,
4899-
columns=self.columns, copy=False)
4900-
else:
4901-
if not broadcast:
4902-
if not all(self.shape):
4903-
return self._apply_empty_result(func, axis, reduce, *args,
4904-
**kwds)
4905-
4906-
if raw and not self._is_mixed_type:
4907-
return self._apply_raw(f, axis)
4908-
else:
4909-
if reduce is None:
4910-
reduce = True
4911-
return self._apply_standard(
4912-
f, axis,
4913-
reduce=reduce,
4914-
ignore_failures=ignore_failures)
4915-
else:
4916-
return self._apply_broadcast(f, axis)
4811+
_shared_docs['apply'] = ("""
4812+
Applies function along input axis of DataFrame.
49174813
4918-
def _apply_empty_result(self, func, axis, reduce, *args, **kwds):
4919-
if reduce is None:
4920-
reduce = False
4921-
try:
4922-
reduce = not isinstance(func(_EMPTY_SERIES, *args, **kwds),
4923-
Series)
4924-
except Exception:
4925-
pass
4814+
Objects passed to functions are Series objects having index
4815+
either the DataFrame's index (axis=0) or the columns (axis=1).
4816+
Return type depends on whether passed function aggregates, or the
4817+
reduce argument if the DataFrame is empty.
49264818
4927-
if reduce:
4928-
return Series(np.nan, index=self._get_agg_axis(axis))
4929-
else:
4930-
return self.copy()
4931-
4932-
def _apply_raw(self, func, axis):
4933-
try:
4934-
result = lib.reduce(self.values, func, axis=axis)
4935-
except Exception:
4936-
result = np.apply_along_axis(func, axis, self.values)
4937-
4938-
# TODO: mixed type case
4939-
if result.ndim == 2:
4940-
return DataFrame(result, index=self.index, columns=self.columns)
4941-
else:
4942-
return Series(result, index=self._get_agg_axis(axis))
4943-
4944-
def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
4945-
4946-
# skip if we are mixed datelike and trying reduce across axes
4947-
# GH6125
4948-
if (reduce and axis == 1 and self._is_mixed_type and
4949-
self._is_datelike_mixed_type):
4950-
reduce = False
4951-
4952-
# try to reduce first (by default)
4953-
# this only matters if the reduction in values is of different dtype
4954-
# e.g. if we want to apply to a SparseFrame, then can't directly reduce
4955-
if reduce:
4956-
values = self.values
4957-
4958-
# we cannot reduce using non-numpy dtypes,
4959-
# as demonstrated in gh-12244
4960-
if not is_extension_type(values):
4961-
# Create a dummy Series from an empty array
4962-
index = self._get_axis(axis)
4963-
empty_arr = np.empty(len(index), dtype=values.dtype)
4964-
dummy = Series(empty_arr, index=self._get_axis(axis),
4965-
dtype=values.dtype)
4966-
4967-
try:
4968-
labels = self._get_agg_axis(axis)
4969-
result = lib.reduce(values, func, axis=axis, dummy=dummy,
4970-
labels=labels)
4971-
return Series(result, index=labels)
4972-
except Exception:
4973-
pass
4974-
4975-
dtype = object if self._is_mixed_type else None
4976-
if axis == 0:
4977-
series_gen = (self._ixs(i, axis=1)
4978-
for i in range(len(self.columns)))
4979-
res_index = self.columns
4980-
res_columns = self.index
4981-
elif axis == 1:
4982-
res_index = self.index
4983-
res_columns = self.columns
4984-
values = self.values
4985-
series_gen = (Series._from_array(arr, index=res_columns, name=name,
4986-
dtype=dtype)
4987-
for i, (arr, name) in enumerate(zip(values,
4988-
res_index)))
4989-
else: # pragma : no cover
4990-
raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
4991-
4992-
i = None
4993-
keys = []
4994-
results = {}
4995-
if ignore_failures:
4996-
successes = []
4997-
for i, v in enumerate(series_gen):
4998-
try:
4999-
results[i] = func(v)
5000-
keys.append(v.name)
5001-
successes.append(i)
5002-
except Exception:
5003-
pass
5004-
# so will work with MultiIndex
5005-
if len(successes) < len(res_index):
5006-
res_index = res_index.take(successes)
5007-
else:
5008-
try:
5009-
for i, v in enumerate(series_gen):
5010-
results[i] = func(v)
5011-
keys.append(v.name)
5012-
except Exception as e:
5013-
if hasattr(e, 'args'):
5014-
# make sure i is defined
5015-
if i is not None:
5016-
k = res_index[i]
5017-
e.args = e.args + ('occurred at index %s' %
5018-
pprint_thing(k), )
5019-
raise
5020-
5021-
if len(results) > 0 and is_sequence(results[0]):
5022-
if not isinstance(results[0], Series):
5023-
index = res_columns
5024-
else:
5025-
index = None
5026-
5027-
result = self._constructor(data=results, index=index)
5028-
result.columns = res_index
5029-
5030-
if axis == 1:
5031-
result = result.T
5032-
result = result._convert(datetime=True, timedelta=True, copy=False)
5033-
5034-
else:
5035-
5036-
result = Series(results)
5037-
result.index = res_index
5038-
5039-
return result
5040-
5041-
def _apply_broadcast(self, func, axis):
5042-
if axis == 0:
5043-
target = self
5044-
elif axis == 1:
5045-
target = self.T
5046-
else: # pragma: no cover
5047-
raise AssertionError('Axis must be 0 or 1, got %s' % axis)
4819+
Parameters
4820+
----------
4821+
func : function
4822+
Function to apply to each column/row
4823+
axis : {0 or 'index', 1 or 'columns'}, default 0
4824+
* 0 or 'index': apply function to each column
4825+
* 1 or 'columns': apply function to each row
4826+
broadcast : boolean, default False
4827+
For aggregation functions, return object of same size with values
4828+
propagated
4829+
raw : boolean, default False
4830+
If False, convert each row or column into a Series. If raw=True the
4831+
passed function will receive ndarray objects instead. If you are
4832+
just applying a NumPy reduction function this will achieve much
4833+
better performance
4834+
reduce : boolean or None, default None
4835+
Try to apply reduction procedures. If the DataFrame is empty,
4836+
apply will use reduce to determine whether the result should be a
4837+
Series or a DataFrame. If reduce is None (the default), apply's
4838+
return value will be guessed by calling func an empty Series (note:
4839+
while guessing, exceptions raised by func will be ignored). If
4840+
reduce is True a Series will always be returned, and if False a
4841+
DataFrame will always be returned.
4842+
args : tuple
4843+
Positional arguments to pass to function in addition to the
4844+
array/series
4845+
Additional keyword arguments will be passed as keywords to the function
4846+
4847+
Notes
4848+
-----
4849+
In the current implementation apply calls func twice on the
4850+
first column/row to decide whether it can take a fast or slow
4851+
code path. This can lead to unexpected behavior if func has
4852+
side-effects, as they will take effect twice for the first
4853+
column/row.
50484854
5049-
result_values = np.empty_like(target.values)
5050-
columns = target.columns
5051-
for i, col in enumerate(columns):
5052-
result_values[:, i] = func(target[col])
4855+
Examples
4856+
--------
4857+
>>> df.apply(numpy.sqrt) # returns DataFrame
4858+
>>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0)
4859+
>>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1)
50534860
5054-
result = self._constructor(result_values, index=target.index,
5055-
columns=target.columns)
4861+
See also
4862+
--------
4863+
DataFrame.applymap: For elementwise operations
4864+
DataFrame.aggregate: only perform aggregating type operations
4865+
DataFrame.transform: only perform transformating type operations
50564866
5057-
if axis == 1:
5058-
result = result.T
4867+
Returns
4868+
-------
4869+
applied : Series or DataFrame
4870+
""")
50594871

5060-
return result
4872+
@Appender(_shared_docs['apply'])
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
          args=(), **kwds):
    # No docstring on purpose: the decorator above supplies the text from
    # _shared_docs['apply'].

    # Function-scope import; presumably avoids a circular import between
    # frame.py and pandas.core.apply -- TODO confirm.
    from pandas.core.apply import frame_apply

    # Build the axis-specific apply object and run it straight away; any
    # extra keywords travel along unchanged.
    return frame_apply(self,
                       func=func,
                       axis=axis,
                       broadcast=broadcast,
                       raw=raw,
                       reduce=reduce,
                       args=args, **kwds).get_result()
50614884

50624885
def applymap(self, func):
50634886
"""
@@ -6189,8 +6012,6 @@ def isin(self, values):
61896012
ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs)
61906013
ops.add_special_arithmetic_methods(DataFrame, **ops.frame_special_funcs)
61916014

6192-
_EMPTY_SERIES = Series([])
6193-
61946015

61956016
def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
61966017
"""

‎pandas/tests/frame/test_apply.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
Timestamp, compat)
1414
import pandas as pd
1515
from pandas.core.dtypes.dtypes import CategoricalDtype
16+
from pandas.core.apply import frame_apply
1617
from pandas.util.testing import (assert_series_equal,
1718
assert_frame_equal)
1819
import pandas.util.testing as tm
@@ -153,8 +154,9 @@ def test_apply_axis1(self):
153154
assert tapplied[d] == np.mean(self.frame.xs(d))
154155

155156
def test_apply_ignore_failures(self):
156-
result = self.mixed_frame._apply_standard(np.mean, 0,
157-
ignore_failures=True)
157+
result = frame_apply(self.mixed_frame,
158+
np.mean, 0,
159+
ignore_failures=True).apply_standard()
158160
expected = self.mixed_frame._get_numeric_data().apply(np.mean)
159161
assert_series_equal(result, expected)
160162

0 commit comments

Comments
 (0)
Please sign in to comment.