Skip to content

Commit c4a2fa3

Browse files
committed
API/BUG: .apply will correctly infer output shape when axis=1
closes pandas-dev#16353 closes pandas-dev#17348 closes pandas-dev#17437 closes pandas-dev#18573 closes pandas-dev#17970 closes pandas-dev#17892 closes pandas-dev#17602 closes pandas-dev#18775 closes pandas-dev#18901 closes pandas-dev#18919
1 parent 3597de0 commit c4a2fa3

File tree

3 files changed

+253
-34
lines changed

3 files changed

+253
-34
lines changed

doc/source/whatsnew/v0.23.0.txt

+48-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ Previous Behavior:
114114
4 NaN
115115
dtype: float64
116116

117-
Current Behavior
117+
Current Behavior:
118118

119119
.. ipython:: python
120120

@@ -139,7 +139,7 @@ Previous Behavior:
139139
3 2.5
140140
dtype: float64
141141

142-
Current Behavior
142+
Current Behavior:
143143

144144
.. ipython:: python
145145

@@ -259,6 +259,52 @@ Convert to an xarray DataArray
259259

260260
p.to_xarray()
261261

262+
.. _whatsnew_0230.api_breaking.apply:
263+
264+
Apply Changes
265+
~~~~~~~~~~~~~
266+
267+
:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
268+
are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned. This includes the case
269+
where a list-like (e.g. ``tuple`` or ``list``) is returned (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`,
270+
:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`)
271+
272+
.. ipython:: python
273+
274+
df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
275+
df
276+
277+
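Previous Behavior: if the returned shape happened to match the length of the original columns, this would return a ``DataFrame``. If the returned shape did not match, a ``Series`` of lists was returned.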
278+
279+
.. code-block:: python
280+
281+
In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
282+
Out[3]:
283+
A B C
284+
0 1 2 3
285+
1 1 2 3
286+
2 1 2 3
287+
3 1 2 3
288+
4 1 2 3
289+
5 1 2 3
290+
291+
In [4]: df.apply(lambda x: [1, 2], axis=1)
292+
Out[4]:
293+
0 [1, 2]
294+
1 [1, 2]
295+
2 [1, 2]
296+
3 [1, 2]
297+
4 [1, 2]
298+
5 [1, 2]
299+
dtype: object
300+
301+
302+
New Behavior. The behavior is consistent. These will *always* return a ``Series``.
303+
304+
.. ipython:: python
305+
306+
df.apply(lambda x: [1, 2, 3], axis=1)
307+
df.apply(lambda x: [1, 2], axis=1)
262308

263309
Build Changes
264310
^^^^^^^^^^^^^

pandas/core/apply.py

+61-22
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,20 @@ def frame_apply(obj, func, axis=0, broadcast=False,
1919
klass = FrameColumnApply
2020

2121
return klass(obj, func, broadcast=broadcast,
22-
raw=raw, reduce=reduce, args=args, kwds=kwds)
22+
raw=raw, reduce=reduce,
23+
args=args, kwds=kwds)
2324

2425

2526
class FrameApply(object):
2627

27-
def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
28+
def __init__(self, obj, func, broadcast, raw, reduce,
29+
args, kwds):
2830
self.obj = obj
2931
self.broadcast = broadcast
3032
self.raw = raw
3133
self.reduce = reduce
32-
self.args = args
3334

35+
self.args = args
3436
self.ignore_failures = kwds.pop('ignore_failures', False)
3537
self.kwds = kwds
3638

@@ -94,6 +96,13 @@ def get_result(self):
9496
return self.apply_standard()
9597

9698
def apply_empty_result(self):
99+
"""
100+
we have an empty result; at least 1 axis is 0
101+
102+
we will try to apply the function to an empty
103+
series in order to see if this is a reduction function
104+
"""
105+
97106
from pandas import Series
98107
reduce = self.reduce
99108

@@ -113,6 +122,8 @@ def apply_empty_result(self):
113122
return self.obj.copy()
114123

115124
def apply_raw(self):
125+
""" apply to the values as a numpy array """
126+
116127
try:
117128
result = reduction.reduce(self.values, self.f, axis=self.axis)
118129
except Exception:
@@ -207,19 +218,57 @@ def wrap_results(self, results, res_index, res_columns):
207218
from pandas import Series
208219

209220
if len(results) > 0 and is_sequence(results[0]):
210-
if not isinstance(results[0], Series):
211-
index = res_columns
221+
222+
# map to rows
223+
if self.axis == 0:
224+
result = self.obj._constructor(data=results)
225+
226+
if not isinstance(results[0], Series):
227+
try:
228+
result.index = res_columns
229+
except ValueError:
230+
pass
231+
232+
try:
233+
result.columns = res_index
234+
except ValueError:
235+
pass
236+
237+
# map to columns
212238
else:
213-
index = None
214239

215-
result = self.obj._constructor(data=results, index=index)
216-
result.columns = res_index
240+
def infer_to_same_shape():
241+
result = self.obj._constructor(data=results)
242+
result = result.T
243+
244+
# try to assign the result indices;
245+
# this may fail, if so we have
246+
# received an invalid return shape
247+
try:
248+
result.index = res_index
249+
except ValueError:
250+
pass
251+
252+
try:
253+
result.columns = res_columns
254+
except ValueError:
255+
pass
256+
257+
# infer dtypes
258+
result = result.infer_objects()
217259

218-
if self.axis == 1:
219-
result = result.T
220-
result = result._convert(
221-
datetime=True, timedelta=True, copy=False)
260+
return result
222261

262+
# we have a non-series and don't want inference
263+
if not isinstance(results[0], Series):
264+
result = Series(results)
265+
result.index = res_index
266+
267+
# we may want to infer results
268+
else:
269+
result = infer_to_same_shape()
270+
271+
# dict of scalars
223272
else:
224273

225274
result = Series(results)
@@ -270,16 +319,6 @@ def result_columns(self):
270319
class FrameColumnApply(FrameApply):
271320
axis = 1
272321

273-
def __init__(self, obj, func, broadcast, raw, reduce, args, kwds):
274-
super(FrameColumnApply, self).__init__(obj, func, broadcast,
275-
raw, reduce, args, kwds)
276-
277-
# skip if we are mixed datelike and trying reduce across axes
278-
# GH6125
279-
if self.reduce:
280-
if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type:
281-
self.reduce = False
282-
283322
def apply_broadcast(self):
284323
return self._apply_broadcast(self.obj.T).T
285324

pandas/tests/frame/test_apply.py

+144-10
Original file line numberDiff line numberDiff line change
@@ -350,11 +350,10 @@ def test_apply_attach_name(self):
350350

351351
result = self.frame.apply(lambda x: np.repeat(x.name, len(x)),
352352
axis=1)
353-
expected = DataFrame(np.tile(self.frame.index,
354-
(len(self.frame.columns), 1)).T,
355-
index=self.frame.index,
356-
columns=self.frame.columns)
357-
assert_frame_equal(result, expected)
353+
expected = Series(np.repeat(t[0], len(self.frame.columns))
354+
for t in self.frame.itertuples())
355+
expected.index = self.frame.index
356+
assert_series_equal(result, expected)
358357

359358
def test_apply_multi_index(self):
360359
s = DataFrame([[1, 2], [3, 4], [5, 6]])
@@ -367,10 +366,10 @@ def test_apply_dict(self):
367366

368367
# GH 8735
369368
A = DataFrame([['foo', 'bar'], ['spam', 'eggs']])
370-
A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]),
371-
dict([(0, 'bar'), (1, 'eggs')])])
369+
A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]),
370+
dict([(0, 'bar'), (1, 'eggs')])])
372371
B = DataFrame([[0, 1], [2, 3]])
373-
B_dicts = pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
372+
B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])])
374373
fn = lambda x: x.to_dict()
375374

376375
for df, dicts in [(A, A_dicts), (B, B_dicts)]:
@@ -482,6 +481,141 @@ def test_apply_non_numpy_dtype(self):
482481
assert_frame_equal(result, df)
483482

484483

484+
class TestInferOutputShape(object):
485+
# the user has supplied an opaque UDF where
486+
# they are transforming the input that requires
487+
# us to infer the output
488+
489+
def test_infer_row_shape(self):
490+
# gh-17437
491+
# if row shape is changing, infer it
492+
df = pd.DataFrame(np.random.rand(10, 2))
493+
result = df.apply(np.fft.fft, axis=0)
494+
assert result.shape == (10, 2)
495+
496+
result = df.apply(np.fft.rfft, axis=0)
497+
assert result.shape == (6, 2)
498+
499+
def test_with_dictlike_columns(self):
500+
# gh 17602
501+
502+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
503+
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
504+
expected = Series([{'s': 3} for t in df.itertuples()])
505+
assert_series_equal(result, expected)
506+
507+
df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
508+
pd.Timestamp('2017-05-02 00:00:00')]
509+
assert_series_equal(result, expected)
510+
511+
# compose a series
512+
result = (df['a'] + df['b']).apply(lambda x: {'s': x})
513+
expected = Series([{'s': 3}, {'s': 3}])
514+
assert_series_equal(result, expected)
515+
516+
# gh-18775
517+
df = DataFrame()
518+
df["author"] = ["X", "Y", "Z"]
519+
df["publisher"] = ["BBC", "NBC", "N24"]
520+
df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
521+
'13-05-2011 08:20:35',
522+
'15-01-2013 09:09:09'])
523+
result = df.apply(lambda x: {}, axis=1)
524+
expected = Series([{}, {}, {}])
525+
assert_series_equal(result, expected)
526+
527+
def test_with_listlike_columns(self):
528+
# gh-17348
529+
df = DataFrame({'a': Series(np.random.randn(4)),
530+
'b': ['a', 'list', 'of', 'words'],
531+
'ts': date_range('2016-10-01', periods=4, freq='H')})
532+
533+
result = df[['a', 'b']].apply(tuple, axis=1)
534+
expected = Series([t[1:] for t in df[['a', 'b']].itertuples()])
535+
assert_series_equal(result, expected)
536+
537+
result = df[['a', 'ts']].apply(tuple, axis=1)
538+
expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()])
539+
assert_series_equal(result, expected)
540+
541+
# gh-18919
542+
df = DataFrame({'x': Series([['a', 'b'], ['q']]),
543+
'y': Series([['z'], ['q', 't']])})
544+
df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')])
545+
546+
result = df.apply(
547+
lambda row: [el for el in row['x'] if el in row['y']],
548+
axis=1)
549+
expected = Series([[], ['q']], index=df.index)
550+
assert_series_equal(result, expected)
551+
552+
def test_infer_output_shape_columns(self):
553+
# gh-18573
554+
555+
df = DataFrame({'number': [1., 2.],
556+
'string': ['foo', 'bar'],
557+
'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
558+
pd.Timestamp('2017-11-29 03:45:00')]})
559+
result = df.apply(lambda row: (row.number, row.string), axis=1)
560+
expected = Series([t[2:] for t in df.itertuples()])
561+
assert_series_equal(result, expected)
562+
563+
def test_infer_output_shape_listlike_columns(self):
564+
# gh-16353
565+
566+
df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
567+
568+
result = df.apply(lambda x: [1, 2, 3], axis=1)
569+
expected = Series([[1, 2, 3] for t in df.itertuples()])
570+
assert_series_equal(result, expected)
571+
572+
result = df.apply(lambda x: [1, 2], axis=1)
573+
expected = Series([[1, 2] for t in df.itertuples()])
574+
assert_series_equal(result, expected)
575+
576+
# gh-17970
577+
df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))
578+
579+
result = df.apply(lambda row: np.ones(1), axis=1)
580+
expected = Series([np.ones(1) for t in df.itertuples()],
581+
index=df.index)
582+
assert_series_equal(result, expected)
583+
584+
result = df.apply(lambda row: np.ones(2), axis=1)
585+
expected = Series([np.ones(2) for t in df.itertuples()],
586+
index=df.index)
587+
assert_series_equal(result, expected)
588+
589+
# gh-17892
590+
df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
591+
pd.Timestamp('2010-02-04'),
592+
pd.Timestamp('2010-02-05'),
593+
pd.Timestamp('2010-02-06')],
594+
'b': [9, 5, 4, 3],
595+
'c': [5, 3, 4, 2],
596+
'd': [1, 2, 3, 4]})
597+
598+
def fun(x):
599+
return (1, 2)
600+
601+
result = df.apply(fun, axis=1)
602+
expected = Series([(1, 2) for t in df.itertuples()])
603+
assert_series_equal(result, expected)
604+
605+
def test_consistent_coerce_for_shapes(self):
606+
# we want column names to NOT be propagated
607+
# just because the shape matches the input shape
608+
df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])
609+
610+
result = df.apply(lambda x: [1, 2, 3], axis=1)
611+
expected = Series([[1, 2, 3] for t in df.itertuples()])
612+
assert_series_equal(result, expected)
613+
614+
result = df.apply(lambda x: [1, 2], axis=1)
615+
expected = Series([[1, 2] for t in df.itertuples()])
616+
assert_series_equal(result, expected)
617+
618+
485619
def zip_frames(*frames):
486620
"""
487621
take a list of frames, zip the columns together for each
@@ -659,13 +793,13 @@ def test_non_callable_aggregates(self):
659793

660794
# Function aggregate
661795
result = df.agg({'A': 'count'})
662-
expected = pd.Series({'A': 2})
796+
expected = Series({'A': 2})
663797

664798
assert_series_equal(result, expected)
665799

666800
# Non-function aggregate
667801
result = df.agg({'A': 'size'})
668-
expected = pd.Series({'A': 3})
802+
expected = Series({'A': 3})
669803

670804
assert_series_equal(result, expected)
671805

0 commit comments

Comments
 (0)