Skip to content

Commit f6f0371

Browse files
committed
API/BUG: .apply will correctly infer output shape when axis=1
closes pandas-dev#16353 closes pandas-dev#17348 closes pandas-dev#17437 closes pandas-dev#18573 closes pandas-dev#17970 closes pandas-dev#17892 closes pandas-dev#17602
1 parent 3e506a3 commit f6f0371

File tree

3 files changed

+215
-13
lines changed

3 files changed

+215
-13
lines changed

doc/source/whatsnew/v0.22.0.txt

+74
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,80 @@ If installed, we now require:
157157
| openpyxl | 2.4.0 | |
158158
+-----------------+-----------------+----------+
159159

160+
.. _whatsnew_0220.api_breaking.apply:
161+
162+
Apply Changes
163+
~~~~~~~~~~~~~
164+
165+
:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
166+
are resolved (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, :issue:`17602`)
167+
168+
.. ipython:: python
169+
170+
df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
171+
df
172+
173+
Previous Behavior. If the length of the returned list-like happened to match the number of columns, this would return a ``DataFrame``; otherwise it would return a ``Series`` of lists.
174+
175+
.. code-block:: python
176+
177+
In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
178+
Out[3]:
179+
A B C
180+
0 1 2 3
181+
1 1 2 3
182+
2 1 2 3
183+
3 1 2 3
184+
4 1 2 3
185+
5 1 2 3
186+
187+
In [4]: df.apply(lambda x: [1, 2], axis=1)
188+
Out[4]:
189+
0 [1, 2]
190+
1 [1, 2]
191+
2 [1, 2]
192+
3 [1, 2]
193+
4 [1, 2]
194+
5 [1, 2]
195+
dtype: object
196+
197+
198+
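New Behavior. The result is consistent: a list-like return value is always expanded into the columns of a ``DataFrame``, whether or not its length matches the number of original columns.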
199+
200+
.. ipython:: python
201+
202+
df.apply(lambda x: [1, 2, 3], axis=1)
203+
df.apply(lambda x: [1, 2], axis=1)
204+
205+
A dict-like return value will also *not* be wrapped in an object-dtype ``Series`` as it was previously; its keys are expanded into columns instead.
206+
207+
.. ipython:: python
208+
209+
df = pd.DataFrame([[1,2], [1,2]], columns=['a','b'])
210+
211+
212+
Previous Behavior
213+
214+
.. code-block:: python
215+
216+
In [3]: df.apply(lambda x: {'s':x['a'] + x['b']}, 1)
217+
Out[3]:
218+
0 {'s': 3}
219+
1 {'s': 3}
220+
dtype: object
221+
222+
223+
New Behavior
224+
225+
.. ipython:: python
226+
227+
df.apply(lambda x: {'s':x['a'] + x['b']}, 1)
228+
229+
To achieve the original effect, you can operate on a ``Series`` instead:
230+
231+
.. ipython:: python
232+
233+
(df['a'] + df['b']).apply(lambda x: {'s': x})
160234

161235
- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`)
162236
- Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`)

pandas/core/frame.py

+36-13
Original file line numberDiff line numberDiff line change
@@ -2139,7 +2139,7 @@ def __getitem__(self, key):
21392139
try:
21402140
if key in self.columns and not is_mi_columns:
21412141
return self._getitem_column(key)
2142-
except:
2142+
except Exception:
21432143
pass
21442144

21452145
# see if we can slice the rows
@@ -2582,7 +2582,7 @@ def _ensure_valid_index(self, value):
25822582
if not len(self.index) and is_list_like(value):
25832583
try:
25842584
value = Series(value)
2585-
except:
2585+
except Exception:
25862586
raise ValueError('Cannot set a frame with no defined index '
25872587
'and a value that cannot be converted to a '
25882588
'Series')
@@ -4922,8 +4922,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
49224922

49234923
# skip if we are mixed datelike and trying reduce across axes
49244924
# GH6125
4925-
if (reduce and axis == 1 and self._is_mixed_type and
4926-
self._is_datelike_mixed_type):
4925+
if reduce and axis == 1:
49274926
reduce = False
49284927

49294928
# try to reduce first (by default)
@@ -4996,16 +4995,40 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
49964995
raise
49974996

49984997
if len(results) > 0 and is_sequence(results[0]):
4999-
if not isinstance(results[0], Series):
5000-
index = res_columns
5001-
else:
5002-
index = None
4998+
# map to rows
4999+
if axis == 0:
5000+
result = self._constructor(data=results)
5001+
5002+
if not isinstance(results[0], Series):
5003+
try:
5004+
result.index = res_columns
5005+
except ValueError:
5006+
pass
50035007

5004-
result = self._constructor(data=results, index=index)
5005-
result.columns = res_index
5008+
try:
5009+
result.columns = res_index
5010+
except ValueError:
5011+
pass
50065012

5007-
if axis == 1:
5013+
# map to columns
5014+
else:
5015+
5016+
result = self._constructor(data=results)
50085017
result = result.T
5018+
5019+
# try to assign the result indices;
5020+
# this may fail, if so we have
5021+
# received an invalid return shape
5022+
try:
5023+
result.index = res_index
5024+
except ValueError:
5025+
pass
5026+
5027+
try:
5028+
result.columns = res_columns
5029+
except ValueError:
5030+
pass
5031+
50095032
result = result._convert(datetime=True, timedelta=True, copy=False)
50105033

50115034
else:
@@ -5742,7 +5765,7 @@ def f(x):
57425765
if result.ndim == self.ndim:
57435766
result = result.iloc[0]
57445767
return result
5745-
except:
5768+
except Exception:
57465769
pass
57475770

57485771
if filter_type is None or filter_type == 'numeric':
@@ -6257,7 +6280,7 @@ def convert(v):
62576280
values = np.array([convert(v) for v in values])
62586281
else:
62596282
values = convert(values)
6260-
except:
6283+
except Exception:
62616284
values = convert(values)
62626285

62636286
else:

pandas/tests/frame/test_apply.py

+105
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,111 @@ def test_apply_non_numpy_dtype(self):
470470
assert_frame_equal(result, df)
471471

472472

473+
class TestInferOutputShape(object):
474+
# the user has supplied an opaque UDF where
475+
# they are transforming the input that requires
476+
# us to infer the output
477+
478+
def test_infer_row_shape(self):
479+
# gh-17437
480+
# if row shape is changing, infer it
481+
df = pd.DataFrame(np.random.rand(10, 2))
482+
result = df.apply(np.fft.fft, axis=0)
483+
assert result.shape == (10, 2)
484+
485+
result = df.apply(np.fft.rfft, axis=0)
486+
assert result.shape == (6, 2)
487+
488+
def test_with_dictlike_columns(self):
489+
# gh 17602
490+
491+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
492+
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
493+
expected = DataFrame({'s': df['a'].values + df['b'].values})
494+
assert_frame_equal(result, expected)
495+
496+
df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
497+
pd.Timestamp('2017-05-02 00:00:00')]
498+
result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
499+
assert_frame_equal(result, expected)
500+
501+
# compose a series
502+
result = (df['a'] + df['b']).apply(lambda x: {'s': x})
503+
expected = Series([{'s': 3}, {'s': 3}])
504+
assert_series_equal(result, expected)
505+
506+
def test_with_listlike_columns(self):
507+
# gh-17348
508+
df = DataFrame({'a': Series(np.random.randn(4)),
509+
'b': ['a', 'list', 'of', 'words'],
510+
'ts': date_range('2016-10-01', periods=4, freq='H')})
511+
512+
result = df[['a', 'b']].apply(tuple, axis=1)
513+
expected = df[['a', 'b']]
514+
assert_frame_equal(result, expected)
515+
516+
result = df[['a', 'ts']].apply(tuple, axis=1)
517+
expected = df[['a', 'ts']]
518+
assert_frame_equal(result, expected)
519+
520+
def test_infer_output_shape_columns(self):
521+
# gh-18573
522+
523+
df = DataFrame({'number': [1., 2.],
524+
'string': ['foo', 'bar'],
525+
'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
526+
pd.Timestamp('2017-11-29 03:45:00')]})
527+
result = df.apply(lambda row: (row.number, row.string), axis=1)
528+
expected = df[['number', 'string']].copy()
529+
expected.columns = [0, 1]
530+
assert_frame_equal(result, expected)
531+
532+
def test_infer_output_shape_listlike_columns(self):
533+
# gh-16353
534+
535+
df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
536+
537+
result = df.apply(lambda x: [1, 2, 3], axis=1)
538+
expected = DataFrame({'A': 1, 'B': 2, 'C': 3},
539+
index=range(6)).reindex(columns=df.columns)
540+
assert_frame_equal(result, expected)
541+
542+
result = df.apply(lambda x: [1, 2], axis=1)
543+
expected = DataFrame({0: 1, 1: 2},
544+
index=range(6)).reindex(columns=[0, 1])
545+
assert_frame_equal(result, expected)
546+
547+
# gh-17970
548+
df = DataFrame({"a": [1, 2, 3]})
549+
550+
result = df.apply(lambda row: np.ones(1), axis=1)
551+
expected = DataFrame({'a': 1.0},
552+
index=range(3))
553+
assert_frame_equal(result, expected)
554+
555+
result = df.apply(lambda row: np.ones(2), axis=1)
556+
expected = DataFrame({0: 1., 1: 1.},
557+
index=range(3)).reindex(columns=[0, 1])
558+
assert_frame_equal(result, expected)
559+
560+
# gh-17892
561+
df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
562+
pd.Timestamp('2010-02-04'),
563+
pd.Timestamp('2010-02-05'),
564+
pd.Timestamp('2010-02-06')],
565+
'b': [9, 5, 4, 3],
566+
'c': [5, 3, 4, 2],
567+
'd': [1, 2, 3, 4]})
568+
569+
def fun(x):
570+
return (1, 2)
571+
572+
result = df.apply(fun, axis=1)
573+
expected = DataFrame({0: 1, 1: 2},
574+
index=range(4)).reindex(columns=[0, 1])
575+
assert_frame_equal(result, expected)
576+
577+
473578
def zip_frames(*frames):
474579
"""
475580
take a list of frames, zip the columns together for each

0 commit comments

Comments
 (0)