Skip to content

Commit 50d82bb

Browse files
committed
API/BUG: .apply will correctly infer output shape when axis=1
closes pandas-dev#16353 closes pandas-dev#17348 closes pandas-dev#17437 closes pandas-dev#18573 closes pandas-dev#17970 closes pandas-dev#17892 closes pandas-dev#17602 closes pandas-dev#18775 closes pandas-dev#18901 closes pandas-dev#18919
1 parent bcaa5da commit 50d82bb

File tree

3 files changed

+253
-34
lines changed

3 files changed

+253
-34
lines changed

doc/source/whatsnew/v0.23.0.txt

+48-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ Previous Behavior:
114114
4 NaN
115115
dtype: float64
116116

117-
Current Behavior
117+
Current Behavior:
118118

119119
.. ipython:: python
120120

@@ -139,7 +139,7 @@ Previous Behavior:
139139
3 2.5
140140
dtype: float64
141141

142-
Current Behavior
142+
Current Behavior:
143143

144144
.. ipython:: python
145145

@@ -260,6 +260,52 @@ Convert to an xarray DataArray
260260

261261
p.to_xarray()
262262

263+
.. _whatsnew_0230.api_breaking.apply:
264+
265+
Apply Changes
266+
~~~~~~~~~~~~~
267+
268+
:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
269+
are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned. This includes the case
270+
where a list-like (e.g. ``tuple`` or ``list``) is returned (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`,
271+
:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`)
272+
273+
.. ipython:: python
274+
275+
df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
276+
df
277+
278+
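Previous Behavior: if the returned shape happened to match the length of the original columns, this would return a ``DataFrame``. If the returned shape did not match, a ``Series`` of lists was returned.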
279+
280+
.. code-block:: python
281+
282+
In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
283+
Out[3]:
284+
A B C
285+
0 1 2 3
286+
1 1 2 3
287+
2 1 2 3
288+
3 1 2 3
289+
4 1 2 3
290+
5 1 2 3
291+
292+
In [4]: df.apply(lambda x: [1, 2], axis=1)
293+
Out[4]:
294+
0 [1, 2]
295+
1 [1, 2]
296+
2 [1, 2]
297+
3 [1, 2]
298+
4 [1, 2]
299+
5 [1, 2]
300+
dtype: object
301+
302+
303+
New Behavior: the behavior is now consistent. Both of these calls will *always* return a ``Series``.
304+
305+
.. ipython:: python
306+
307+
df.apply(lambda x: [1, 2, 3], axis=1)
308+
df.apply(lambda x: [1, 2], axis=1)
263309

264310
Build Changes
265311
^^^^^^^^^^^^^

pandas/core/apply.py

+61-22
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@ def frame_apply(obj, func, axis=0, broadcast=False,
1919
klass = FrameColumnApply
2020

2121
return klass(obj, func, broadcast=broadcast,
22-
raw=raw, reduce=reduce, args=args, kwds=kwds)
22+
raw=raw, reduce=reduce,
23+
args=args, kwds=kwds)
2324

2425

2526
class FrameApply(object):
2627

27-
def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
28+
def __init__(self, obj, func, broadcast, raw, reduce,
29+
args, kwds):
2830
self.obj = obj
2931
self.broadcast = broadcast
3032
self.raw = raw
3133
self.reduce = reduce
32-
self.args = args
3334

35+
self.args = args
3436
self.ignore_failures = kwds.pop('ignore_failures', False)
3537
self.kwds = kwds
3638

@@ -94,6 +96,13 @@ def get_result(self):
9496
return self.apply_standard()
9597

9698
def apply_empty_result(self):
99+
"""
100+
we have an empty result; at least 1 axis is 0
101+
102+
we will try to apply the function to an empty
103+
series in order to see if this is a reduction function
104+
"""
105+
97106
from pandas import Series
98107
reduce = self.reduce
99108

@@ -113,6 +122,8 @@ def apply_empty_result(self):
113122
return self.obj.copy()
114123

115124
def apply_raw(self):
125+
""" apply to the values as a numpy array """
126+
116127
try:
117128
result = reduction.reduce(self.values, self.f, axis=self.axis)
118129
except Exception:
@@ -207,19 +218,57 @@ def wrap_results(self, results, res_index, res_columns):
207218
from pandas import Series
208219

209220
if len(results) > 0 and is_sequence(results[0]):
210-
if not isinstance(results[0], Series):
211-
index = res_columns
221+
222+
# map to rows
223+
if self.axis == 0:
224+
result = self.obj._constructor(data=results)
225+
226+
if not isinstance(results[0], Series):
227+
try:
228+
result.index = res_columns
229+
except ValueError:
230+
pass
231+
232+
try:
233+
result.columns = res_index
234+
except ValueError:
235+
pass
236+
237+
# map to columns
212238
else:
213-
index = None
214239

215-
result = self.obj._constructor(data=results, index=index)
216-
result.columns = res_index
240+
def infer_to_same_shape():
241+
result = self.obj._constructor(data=results)
242+
result = result.T
243+
244+
# try to assign the result indices;
245+
# this may fail, if so we have
246+
# received an invalid return shape
247+
try:
248+
result.index = res_index
249+
except ValueError:
250+
pass
251+
252+
try:
253+
result.columns = res_columns
254+
except ValueError:
255+
pass
256+
257+
# infer dtypes
258+
result = result.infer_objects()
217259

218-
if self.axis == 1:
219-
result = result.T
220-
result = result._convert(
221-
datetime=True, timedelta=True, copy=False)
260+
return result
222261

262+
# we have a non-series and don't want inference
263+
if not isinstance(results[0], Series):
264+
result = Series(results)
265+
result.index = res_index
266+
267+
# we may want to infer results
268+
else:
269+
result = infer_to_same_shape()
270+
271+
# dict of scalars
223272
else:
224273

225274
result = Series(results)
@@ -270,16 +319,6 @@ def result_columns(self):
270319
class FrameColumnApply(FrameApply):
271320
axis = 1
272321

273-
def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
274-
super(FrameColumnApply, self).__init__(obj, func, broadcast,
275-
raw, reduce, args, kwds)
276-
277-
# skip if we are mixed datelike and trying reduce across axes
278-
# GH6125
279-
if self.reduce:
280-
if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type:
281-
self.reduce = False
282-
283322
def apply_broadcast(self):
284323
return self._apply_broadcast(self.obj.T).T
285324

pandas/tests/frame/test_apply.py

+144-10
Original file line numberDiff line numberDiff line change
@@ -350,11 +350,10 @@ def test_apply_attach_name(self):
350350

351351
result = self.frame.apply(lambda x: np.repeat(x.name, len(x)),
352352
axis=1)
353-
expected = DataFrame(np.tile(self.frame.index,
354-
(len(self.frame.columns), 1)).T,
355-
index=self.frame.index,
356-
columns=self.frame.columns)
357-
assert_frame_equal(result, expected)
353+
expected = Series(np.repeat(t[0], len(self.frame.columns))
354+
for t in self.frame.itertuples())
355+
expected.index = self.frame.index
356+
assert_series_equal(result, expected)
358357

359358
def test_apply_multi_index(self):
360359
s = DataFrame([[1, 2], [3, 4], [5, 6]])
@@ -367,10 +366,10 @@ def test_apply_dict(self):
367366

368367
# GH 8735
369368
A = DataFrame([['foo', 'bar'], ['spam', 'eggs']])
370-
A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]),
371-
dict([(0, 'bar'), (1, 'eggs')])])
369+
A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]),
370+
dict([(0, 'bar'), (1, 'eggs')])])
372371
B = DataFrame([[0, 1], [2, 3]])
373-
B_dicts = pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
372+
B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
374373
fn = lambda x: x.to_dict()
375374

376375
for df, dicts in [(A, A_dicts), (B, B_dicts)]:
@@ -472,6 +471,141 @@ def test_apply_non_numpy_dtype(self):
472471
assert_frame_equal(result, df)
473472

474473

474+
class TestInferOutputShape(object):
475+
# the user has supplied an opaque UDF where
476+
# they are transforming the input that requires
477+
# us to infer the output
478+
479+
def test_infer_row_shape(self):
480+
# gh-17437
481+
# if row shape is changing, infer it
482+
df = pd.DataFrame(np.random.rand(10, 2))
483+
result = df.apply(np.fft.fft, axis=0)
484+
assert result.shape == (10, 2)
485+
486+
result = df.apply(np.fft.rfft, axis=0)
487+
assert result.shape == (6, 2)
488+
489+
def test_with_dictlike_columns(self):
490+
# gh 17602
491+
492+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
493+
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
494+
expected = Series([{'s': 3} for t in df.itertuples()])
495+
assert_series_equal(result, expected)
496+
497+
df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
498+
pd.Timestamp('2017-05-02 00:00:00')]
499+
assert_series_equal(result, expected)
500+
501+
# compose a series
502+
result = (df['a'] + df['b']).apply(lambda x: {'s': x})
503+
expected = Series([{'s': 3}, {'s': 3}])
504+
assert_series_equal(result, expected)
505+
506+
# gh-18775
507+
df = DataFrame()
508+
df["author"] = ["X", "Y", "Z"]
509+
df["publisher"] = ["BBC", "NBC", "N24"]
510+
df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
511+
'13-05-2011 08:20:35',
512+
'15-01-2013 09:09:09'])
513+
result = df.apply(lambda x: {}, axis=1)
514+
expected = Series([{}, {}, {}])
515+
assert_series_equal(result, expected)
516+
517+
def test_with_listlike_columns(self):
518+
# gh-17348
519+
df = DataFrame({'a': Series(np.random.randn(4)),
520+
'b': ['a', 'list', 'of', 'words'],
521+
'ts': date_range('2016-10-01', periods=4, freq='H')})
522+
523+
result = df[['a', 'b']].apply(tuple, axis=1)
524+
expected = Series([t[1:] for t in df[['a', 'b']].itertuples()])
525+
assert_series_equal(result, expected)
526+
527+
result = df[['a', 'ts']].apply(tuple, axis=1)
528+
expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()])
529+
assert_series_equal(result, expected)
530+
531+
# gh-18919
532+
df = DataFrame({'x': Series([['a', 'b'], ['q']]),
533+
'y': Series([['z'], ['q', 't']])})
534+
df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')])
535+
536+
result = df.apply(
537+
lambda row: [el for el in row['x'] if el in row['y']],
538+
axis=1)
539+
expected = Series([[], ['q']], index=df.index)
540+
assert_series_equal(result, expected)
541+
542+
def test_infer_output_shape_columns(self):
543+
# gh-18573
544+
545+
df = DataFrame({'number': [1., 2.],
546+
'string': ['foo', 'bar'],
547+
'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
548+
pd.Timestamp('2017-11-29 03:45:00')]})
549+
result = df.apply(lambda row: (row.number, row.string), axis=1)
550+
expected = Series([t[2:] for t in df.itertuples()])
551+
assert_series_equal(result, expected)
552+
553+
def test_infer_output_shape_listlike_columns(self):
554+
# gh-16353
555+
556+
df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
557+
558+
result = df.apply(lambda x: [1, 2, 3], axis=1)
559+
expected = Series([[1, 2, 3] for t in df.itertuples()])
560+
assert_series_equal(result, expected)
561+
562+
result = df.apply(lambda x: [1, 2], axis=1)
563+
expected = Series([[1, 2] for t in df.itertuples()])
564+
assert_series_equal(result, expected)
565+
566+
# gh-17970
567+
df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))
568+
569+
result = df.apply(lambda row: np.ones(1), axis=1)
570+
expected = Series([np.ones(1) for t in df.itertuples()],
571+
index=df.index)
572+
assert_series_equal(result, expected)
573+
574+
result = df.apply(lambda row: np.ones(2), axis=1)
575+
expected = Series([np.ones(2) for t in df.itertuples()],
576+
index=df.index)
577+
assert_series_equal(result, expected)
578+
579+
# gh-17892
580+
df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
581+
pd.Timestamp('2010-02-04'),
582+
pd.Timestamp('2010-02-05'),
583+
pd.Timestamp('2010-02-06')],
584+
'b': [9, 5, 4, 3],
585+
'c': [5, 3, 4, 2],
586+
'd': [1, 2, 3, 4]})
587+
588+
def fun(x):
589+
return (1, 2)
590+
591+
result = df.apply(fun, axis=1)
592+
expected = Series([(1, 2) for t in df.itertuples()])
593+
assert_series_equal(result, expected)
594+
595+
def test_consistent_coerce_for_shapes(self):
596+
# we want column names to NOT be propagated
597+
# just because the shape matches the input shape
598+
df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])
599+
600+
result = df.apply(lambda x: [1, 2, 3], axis=1)
601+
expected = Series([[1, 2, 3] for t in df.itertuples()])
602+
assert_series_equal(result, expected)
603+
604+
result = df.apply(lambda x: [1, 2], axis=1)
605+
expected = Series([[1, 2] for t in df.itertuples()])
606+
assert_series_equal(result, expected)
607+
608+
475609
def zip_frames(*frames):
476610
"""
477611
take a list of frames, zip the columns together for each
@@ -649,13 +783,13 @@ def test_non_callable_aggregates(self):
649783

650784
# Function aggregate
651785
result = df.agg({'A': 'count'})
652-
expected = pd.Series({'A': 2})
786+
expected = Series({'A': 2})
653787

654788
assert_series_equal(result, expected)
655789

656790
# Non-function aggregate
657791
result = df.agg({'A': 'size'})
658-
expected = pd.Series({'A': 3})
792+
expected = Series({'A': 3})
659793

660794
assert_series_equal(result, expected)
661795

0 commit comments

Comments
 (0)