API/BUG: .apply will correctly infer output shape when axis=1

jreback · jreback · commit f6f0371ac28a · 2017-12-07T06:17:44.000-05:00
closes pandas-dev#16353 closes pandas-dev#17348 closes pandas-dev#17437 closes pandas-dev#18573 closes pandas-dev#17970 closes pandas-dev#17892 closes pandas-dev#17602
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -157,6 +157,80 @@ If installed, we now require:
    | openpyxl        | 2.4.0           |          |
    +-----------------+-----------------+----------+
 
+.. _whatsnew_0220.api_breaking.apply:
+
+Apply Changes
+~~~~~~~~~~~~~
+
+:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined function that returned a list-like with ``axis=1``. Several bugs and inconsistencies
+are resolved (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, :issue:`17602`).
+
+.. ipython:: python
+
+    df = pd.DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
+    df
+
+Previous Behavior. If the length of the returned list-like happened to match the number of columns, this would return a ``DataFrame``; otherwise it would return a ``Series`` of lists.
+
+.. code-block:: python
+
+   In [3]: df.apply(lambda x: [1, 2, 3], axis=1)
+   Out[3]:
+      A  B  C
+   0  1  2  3
+   1  1  2  3
+   2  1  2  3
+   3  1  2  3
+   4  1  2  3
+   5  1  2  3
+
+   In [4]: df.apply(lambda x: [1, 2], axis=1)
+   Out[4]:
+   0    [1, 2]
+   1    [1, 2]
+   2    [1, 2]
+   3    [1, 2]
+   4    [1, 2]
+   5    [1, 2]
+   dtype: object
+
+
+New Behavior. The result is consistently a ``DataFrame``, regardless of the length of the returned list-like.
+
+.. ipython:: python
+
+    df.apply(lambda x: [1, 2, 3], axis=1)
+    df.apply(lambda x: [1, 2], axis=1)
+
+A returned dict-like will also *not* be wrapped in a ``Series`` of dicts as previously; its keys are expanded into columns.
+
+.. ipython:: python
+
+   df = pd.DataFrame([[1,2], [1,2]], columns=['a','b'])
+
+
+Previous Behavior
+
+.. code-block:: python
+
+   In [3]: df.apply(lambda x: {'s':x['a'] + x['b']}, 1)
+   Out[3]:
+   0    {'s': 3}
+   1    {'s': 3}
+   dtype: object
+
+
+New Behavior
+
+.. ipython:: python
+
+   df.apply(lambda x: {'s':x['a'] + x['b']}, 1)
+
+To achieve the original effect, you can operate on a ``Series``:
+
+.. ipython:: python
+
+   (df['a'] + df['b']).apply(lambda x: {'s': x})
 
 - Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`)
 - Building from source now explicity requires ``setuptools`` in ``setup.py`` (:issue:`18113`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2139,7 +2139,7 @@ def __getitem__(self, key):
         try:
             if key in self.columns and not is_mi_columns:
                 return self._getitem_column(key)
-        except:
+        except Exception:
             pass
 
         # see if we can slice the rows
@@ -2582,7 +2582,7 @@ def _ensure_valid_index(self, value):
         if not len(self.index) and is_list_like(value):
             try:
                 value = Series(value)
-            except:
+            except Exception:
                 raise ValueError('Cannot set a frame with no defined index '
                                  'and a value that cannot be converted to a '
                                  'Series')
@@ -4922,8 +4922,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
 
         # skip if we are mixed datelike and trying reduce across axes
         # GH6125
-        if (reduce and axis == 1 and self._is_mixed_type and
-                self._is_datelike_mixed_type):
+        if reduce and axis == 1:
             reduce = False
 
         # try to reduce first (by default)
@@ -4996,16 +4995,40 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):
                 raise
 
         if len(results) > 0 and is_sequence(results[0]):
-            if not isinstance(results[0], Series):
-                index = res_columns
-            else:
-                index = None
+            # map to rows
+            if axis == 0:
+                result = self._constructor(data=results)
+
+                if not isinstance(results[0], Series):
+                    try:
+                        result.index = res_columns
+                    except ValueError:
+                        pass
 
-            result = self._constructor(data=results, index=index)
-            result.columns = res_index
+                try:
+                    result.columns = res_index
+                except ValueError:
+                    pass
 
-            if axis == 1:
+            # map to columns
+            else:
+
+                result = self._constructor(data=results)
                 result = result.T
+
+                # try to assign the result indices;
+                # this may fail, if so we have
+                # received an invalid return shape
+                try:
+                    result.index = res_index
+                except ValueError:
+                    pass
+
+                try:
+                    result.columns = res_columns
+                except ValueError:
+                    pass
+
             result = result._convert(datetime=True, timedelta=True, copy=False)
 
         else:
@@ -5742,7 +5765,7 @@ def f(x):
                         if result.ndim == self.ndim:
                             result = result.iloc[0]
                         return result
-                    except:
+                    except Exception:
                         pass
 
                 if filter_type is None or filter_type == 'numeric':
@@ -6257,7 +6280,7 @@ def convert(v):
                 values = np.array([convert(v) for v in values])
             else:
                 values = convert(values)
-        except:
+        except Exception:
             values = convert(values)
 
     else:
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -470,6 +470,111 @@ def test_apply_non_numpy_dtype(self):
         assert_frame_equal(result, df)
 
 
+class TestInferOutputShape(object):
+    # the user has supplied an opaque UDF that
+    # transforms the input, so we must
+    # infer the shape of the output
+
+    def test_infer_row_shape(self):
+        # gh-17437
+        # if row shape is changing, infer it
+        df = pd.DataFrame(np.random.rand(10, 2))
+        result = df.apply(np.fft.fft, axis=0)
+        assert result.shape == (10, 2)
+
+        result = df.apply(np.fft.rfft, axis=0)
+        assert result.shape == (6, 2)
+
+    def test_with_dictlike_columns(self):
+        # gh 17602
+
+        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+        result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
+        expected = DataFrame({'s': df['a'].values + df['b'].values})
+        assert_frame_equal(result, expected)
+
+        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
+                    pd.Timestamp('2017-05-02 00:00:00')]
+        result = df.apply(lambda x: {'s': x['a'] + x['b']}, 1)
+        assert_frame_equal(result, expected)
+
+        # compose a series
+        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
+        expected = Series([{'s': 3}, {'s': 3}])
+        assert_series_equal(result, expected)
+
+    def test_with_listlike_columns(self):
+        # gh-17348
+        df = DataFrame({'a': Series(np.random.randn(4)),
+                        'b': ['a', 'list', 'of', 'words'],
+                        'ts': date_range('2016-10-01', periods=4, freq='H')})
+
+        result = df[['a', 'b']].apply(tuple, axis=1)
+        expected = df[['a', 'b']]
+        assert_frame_equal(result, expected)
+
+        result = df[['a', 'ts']].apply(tuple, axis=1)
+        expected = df[['a', 'ts']]
+        assert_frame_equal(result, expected)
+
+    def test_infer_output_shape_columns(self):
+        # gh-18573
+
+        df = DataFrame({'number': [1., 2.],
+                        'string': ['foo', 'bar'],
+                        'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
+                                     pd.Timestamp('2017-11-29 03:45:00')]})
+        result = df.apply(lambda row: (row.number, row.string), axis=1)
+        expected = df[['number', 'string']].copy()
+        expected.columns = [0, 1]
+        assert_frame_equal(result, expected)
+
+    def test_infer_output_shape_listlike_columns(self):
+        # gh-16353
+
+        df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])
+
+        result = df.apply(lambda x: [1, 2, 3], axis=1)
+        expected = DataFrame({'A': 1, 'B': 2, 'C': 3},
+                             index=range(6)).reindex(columns=df.columns)
+        assert_frame_equal(result, expected)
+
+        result = df.apply(lambda x: [1, 2], axis=1)
+        expected = DataFrame({0: 1, 1: 2},
+                             index=range(6)).reindex(columns=[0, 1])
+        assert_frame_equal(result, expected)
+
+        # gh-17970
+        df = DataFrame({"a": [1, 2, 3]})
+
+        result = df.apply(lambda row: np.ones(1), axis=1)
+        expected = DataFrame({'a': 1.0},
+                             index=range(3))
+        assert_frame_equal(result, expected)
+
+        result = df.apply(lambda row: np.ones(2), axis=1)
+        expected = DataFrame({0: 1., 1: 1.},
+                             index=range(3)).reindex(columns=[0, 1])
+        assert_frame_equal(result, expected)
+
+        # gh-17892
+        df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
+                                 pd.Timestamp('2010-02-04'),
+                                 pd.Timestamp('2010-02-05'),
+                                 pd.Timestamp('2010-02-06')],
+                           'b': [9, 5, 4, 3],
+                           'c': [5, 3, 4, 2],
+                           'd': [1, 2, 3, 4]})
+
+        def fun(x):
+            return (1, 2)
+
+        result = df.apply(fun, axis=1)
+        expected = DataFrame({0: 1, 1: 2},
+                             index=range(4)).reindex(columns=[0, 1])
+        assert_frame_equal(result, expected)
+
+
 def zip_frames(*frames):
     """
     take a list of frames, zip the columns together for each