Merge pull request #6438 from jreback/concat

jreback · jreback · commit af63b99d2d56 · 2014-02-21T17:44:05.000-05:00
PERF/API: concat improvements
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``:
 
    df1.append(df2, ignore_index=True)
 
+.. _merging.mixed_ndims:
+
+Concatenating with mixed ndims
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can concatenate a mix of Series and DataFrames. Each Series is
+converted to a single-column DataFrame whose column label is the
+name of the Series.
+
+.. ipython:: python
+
+   df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
+   s1 = Series(randn(6), name='foo')
+   concat([df1, s1],axis=1)
+
+If unnamed Series are passed they will be numbered consecutively.
+
+.. ipython:: python
+
+   s2 = Series(randn(6))
+   concat([df1, s2, s2, s2],axis=1)
+
+Passing ``ignore_index=True`` will drop all name references.
+
+.. ipython:: python
+
+   concat([df1, s1],axis=1,ignore_index=True)
 
 More concatenating with group keys
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -98,6 +98,8 @@ API Changes
 - The top-level :func:`pandas.eval` function does not allow you use the
   ``'@'`` prefix and provides you with an error message telling you so.
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
+- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
+  or numbering columns as needed (:issue:`2385`)
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
@@ -166,6 +168,7 @@ Bug Fixes
 - Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`)
 - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously
   depending on the order of dictionary keys and values (:issue:`5338`).
+- Perf issue in concatting with empty objects (:issue:`3259`)
 
 pandas 0.13.1
 -------------
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -66,6 +66,8 @@ API changes
 - The top-level :func:`pandas.eval` function does not allow you use the
   ``'@'`` prefix and provides you with an error message telling you so.
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
+- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
+  or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
 
 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2209,10 +2209,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
             # make Nones an empty object
             if com._count_not_none(*values) != len(values):
-                v = None
-                for v in values:
-                    if v is not None:
-                        break
+                # first non-None value; the None default keeps the all-None
+                # case from raising StopIteration so the check below is reachable
+                v = next((v for v in values if v is not None), None)
                 if v is None:
                     return DataFrame()
                 elif isinstance(v, NDFrame):
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
             objs = [objs[k] for k in keys]
 
         if keys is None:
-            objs = [obj for obj in objs if obj is not None]
+            objs = [obj for obj in objs if obj is not None ]
         else:
             # #1649
             clean_keys = []
@@ -973,28 +973,83 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
         if len(objs) == 0:
             raise Exception('All objects passed were None')
 
-        # consolidate data
+        # consolidate data & figure out what our result ndim is going to be
+        ndims = set()
         for obj in objs:
-            if isinstance(obj, NDFrame):
-                obj.consolidate(inplace=True)
-        self.objs = objs
+            if not isinstance(obj, NDFrame):
+                raise TypeError("cannot concatenate a non-NDFrame object")
+
+            # consolidate
+            obj.consolidate(inplace=True)
+            ndims.add(obj.ndim)
+
+        # get the sample
+        # want the highest ndim that we have, and must be non-empty
+        # unless all objs are empty
+        sample = None
+        if len(ndims) > 1:
+            max_ndim = max(ndims)
+            for obj in objs:
+                if obj.ndim == max_ndim and np.sum(obj.shape):
+                    sample = obj
+                    break
 
-        sample = objs[0]
+        else:
+            # filter out the empties
+            # if we have no multi-index possibilities
+            df = DataFrame([ obj.shape for obj in objs ]).sum(1)
+            non_empties = df[df!=0]
+            if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
+                objs = [ objs[i] for i in non_empties.index ]
+                sample = objs[0]
+
+        if sample is None:
+            sample = objs[0]
+        self.objs = objs
 
         # Need to flip BlockManager axis in the DataFrame special case
-        if isinstance(sample, DataFrame):
+        self._is_frame = isinstance(sample, DataFrame)
+        if self._is_frame:
             axis = 1 if axis == 0 else 0
 
         self._is_series = isinstance(sample, ABCSeries)
         if not 0 <= axis <= sample.ndim:
             raise AssertionError("axis must be between 0 and {0}, "
                                  "input was {1}".format(sample.ndim, axis))
 
+        # if we have mixed ndims, then convert to highest ndim
+        # creating column numbers as needed
+        if len(ndims) > 1:
+            current_column = 0
+            max_ndim = sample.ndim
+            self.objs, objs = [], self.objs
+            for obj in objs:
+
+                ndim = obj.ndim
+                if ndim == max_ndim:
+                    pass
+
+                elif ndim != max_ndim-1:
+                    raise ValueError("cannot concatenate unaligned mixed "
+                                     "dimensional NDFrame objects")
+
+                else:
+                    name = getattr(obj,'name',None)
+                    if ignore_index or name is None:
+                        name = current_column
+                        current_column += 1
+
+                    # doing a row-wise concatenation so need everything
+                    # to line up
+                    if self._is_frame and axis == 1:
+                        name = 0
+                    obj = sample._constructor({ name : obj })
+
+                self.objs.append(obj)
+
         # note: this is the BlockManager axis (since DataFrame is transposed)
         self.axis = axis
-
         self.join_axes = join_axes
-
         self.keys = keys
         self.names = names
         self.levels = levels
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -1653,6 +1653,77 @@ def test_handle_empty_objects(self):
 
         tm.assert_frame_equal(concatted, expected)
 
+        # empty as first element with time series
+        # GH3259
+        df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
+        empty = DataFrame()
+        result = concat([df,empty],axis=1)
+        assert_frame_equal(result, df)
+        result = concat([empty,df],axis=1)
+        assert_frame_equal(result, df)
+
+        result = concat([df,empty])
+        assert_frame_equal(result, df)
+        result = concat([empty,df])
+        assert_frame_equal(result, df)
+
+    def test_concat_mixed_objs(self):
+
+        # concat mixed series/frames
+        # GH2385
+
+        # axis 1
+        index=date_range('01-Jan-2013', periods=10, freq='H')
+        arr = np.arange(10, dtype='int64')
+        s1 = Series(arr, index=index)
+        s2 = Series(arr, index=index)
+        df = DataFrame(arr.reshape(-1,1), index=index)
+
+        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
+        result = concat([df,df], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
+        result = concat([s1,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
+        result = concat([s1,s2,s1], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
+        result = concat([s1,df,s2,s2,s1], axis=1)
+        assert_frame_equal(result, expected)
+
+        # with names
+        s1.name = 'foo'
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
+        result = concat([s1,df,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        s2.name = 'bar'
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
+        result = concat([s1,df,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        # ignore index
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
+        result = concat([s1,df,s2], axis=1, ignore_index=True)
+        assert_frame_equal(result, expected)
+
+        # axis 0
+        expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
+        result = concat([s1,df,s2])
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
+        result = concat([s1,df,s2], ignore_index=True)
+        assert_frame_equal(result, expected)
+
+        # invalid concatenation of mixed dims
+        panel = tm.makePanel()
+        self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))
+
     def test_panel_join(self):
         panel = tm.makePanel()
         tm.add_nans(panel)
@@ -1967,6 +2038,13 @@ def test_concat_series_axis1_same_names_ignore_index(self):
         result = concat([s1, s2], axis=1, ignore_index=True)
         self.assertTrue(np.array_equal(result.columns, [0, 1]))
 
+    def test_concat_invalid(self):
+
+        # concatenating an NDFrame with a non-NDFrame object must raise
+        df1 = mkdf(10, 2)
+        for obj in [1, dict(), [1, 2], (1, 2)]:
+            self.assertRaises(TypeError, lambda: concat([df1, obj]))
+
     def test_concat_invalid_first_argument(self):
         df1 = mkdf(10, 2)
         df2 = mkdf(10, 2)
@@ -1975,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
         # generator ok though
         concat(DataFrame(np.random.rand(5,5)) for _ in range(3))
 
-    def test_concat_mixed_types_fails(self):
-        df = DataFrame(randn(10, 1))
-
-        with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
-            concat([df[0], df], axis=1)
-
-        with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
-            concat([df, df[0]], axis=1)
-
 class TestOrderedMerge(tm.TestCase):
 
     def setUp(self):
diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py
@@ -186,6 +186,21 @@ def sample(values, k):
 concat_small_frames = Benchmark('concat([df] * 1000)', setup,
                                 start_date=datetime(2012, 1, 1))
 
+
+#----------------------------------------------------------------------
+# Concat empty
+
+setup = common_setup + """
+df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
+empty = DataFrame()
+"""
+
+concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
+                                start_date=datetime(2012, 1, 1))
+concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
+                                start_date=datetime(2012, 1, 1))
+
+
 #----------------------------------------------------------------------
 # Ordered merge