ENH: add multiple-join to DataFrame with new concat function, test append multiple, #115, #479, #273

wesm · wesm · commit 35f3322ac559 · 2012-01-04T21:36:37.000-05:00
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -30,4 +30,4 @@
 from pandas.util.testing import debug
 
 from pandas.tools.pivot import pivot_table
-from pandas.tools.merge import merge
+from pandas.tools.merge import merge, concat
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -301,9 +301,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
 
     def _init_mgr(self, mgr, index, columns, dtype=None, copy=False):
         if columns is not None:
-            mgr = mgr.reindex_axis(columns, axis=0)
+            mgr = mgr.reindex_axis(columns, axis=0, copy=False)
         if index is not None:
-            mgr = mgr.reindex_axis(index, axis=1)
+            mgr = mgr.reindex_axis(index, axis=1, copy=False)
         # do not copy BlockManager unless explicitly done
         if copy and dtype is None:
             mgr = mgr.copy()
@@ -2715,7 +2715,7 @@ def applymap(self, func):
     #----------------------------------------------------------------------
     # Merging / joining methods
 
-    def append(self, other, ignore_index=False):
+    def append(self, other, ignore_index=False, verify_integrity=True):
         """
         Append columns of other to end of this frame's columns and index.
         Columns not in this frame are added as new columns.
@@ -2749,19 +2749,20 @@ def append(self, other, ignore_index=False):
         else:
             to_concat = [self, other]
         return concat(to_concat, ignore_index=ignore_index,
-                      verify_integrity=True)
+                      verify_integrity=verify_integrity)
 
     def _get_raw_column(self, col):
         return self._data.get(col)
 
     def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
         """
         Join columns with other DataFrame either on index or on a key
-        column.
+        column. Efficiently join multiple DataFrame objects by index at once by
+        passing a list.
 
         Parameters
         ----------
-        other : DataFrame, or Series with name field set
+        other : DataFrame, Series with name field set, or list of DataFrame
             Index should be similar to one of the columns in this one. If a
             Series is passed, its name attribute must be set, and that will be
             used as the column name in the resulting joined DataFrame
@@ -2782,6 +2783,11 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
         rsuffix : string
             Suffix to use from right frame's overlapping columns
 
+        Notes
+        -----
+        on, lsuffix, and rsuffix options are not supported when passing a list
+        of DataFrame objects.
+
         Returns
         -------
         joined : DataFrame
@@ -2791,15 +2797,30 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix=''):
                                  rsuffix=rsuffix)
 
     def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix=''):
-        from pandas.tools.merge import merge
+        from pandas.tools.merge import merge, concat
 
         if isinstance(other, Series):
             assert(other.name is not None)
             other = DataFrame({other.name : other})
 
-        return merge(self, other, left_on=on, how=how,
-                     left_index=on is None, right_index=True,
-                     suffixes=(lsuffix, rsuffix), sort=False)
+        if isinstance(other, DataFrame):
+            return merge(self, other, left_on=on, how=how,
+                         left_index=on is None, right_index=True,
+                         suffixes=(lsuffix, rsuffix), sort=False)
+        else:
+            if on is not None:
+                raise ValueError('Joining multiple DataFrames only supported'
+                                 ' for joining on index')
+
+            # join indexes only using concat
+            if how == 'left':
+                how = 'outer'
+                join_index = self.index
+            else:
+                join_index = None
+
+            return concat([self] + list(other), axis=1, join=how,
+                          join_index=join_index, verify_integrity=True)
 
     def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
               left_index=False, right_index=False, sort=True,
@@ -3623,20 +3644,6 @@ def extract_index(data):
     return _ensure_index(index)
 
 
-
-def _check_data_types(data):
-    have_raw_arrays = False
-    have_series = False
-    for v in data.values():
-        if not isinstance(v, (dict, Series)):
-            have_raw_arrays = True
-        else:
-            have_series = True
-
-    is_mixed = have_series and have_raw_arrays
-    return have_raw_arrays, is_mixed
-
-
 def _prep_ndarray(values, copy=True):
     if not isinstance(values, np.ndarray):
         arr = np.asarray(values)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -453,7 +453,8 @@ def _wrap_frames(self, keys, values, not_indexed_same=False):
                                                  self.groupings,
                                                  axis=self.axis)
         else:
-            result = concat(values, axis=0).reindex(self.obj.index)
+            result = concat(values, axis=0, verify_integrity=False)
+            result = result.reindex(self.obj.index)
 
         return result
 
@@ -1117,7 +1118,7 @@ def transform(self, func, *args, **kwargs):
         >>> grouped = df.groupby(lambda x: mapping[x])
         >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
         """
-        import pandas.tools.merge as merge
+        from pandas.tools.merge import concat
 
         applied = []
 
@@ -1143,8 +1144,8 @@ def transform(self, func, *args, **kwargs):
                 applied.append(res)
 
         concat_index = obj.columns if self.axis == 0 else obj.index
-        concatenated = merge.concat(applied, join_index=concat_index,
-                                    axis=self.axis)
+        concatenated = concat(applied, join_index=concat_index,
+                              axis=self.axis, verify_integrity=False)
         return concatenated.reindex_like(obj)
 
 class PanelGroupBy(GroupBy):
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -385,7 +385,7 @@ class _BlockJoinOperation(object):
     BlockManager data structures
     """
     def __init__(self, data_list, join_index, indexers, axis=1, copy=True):
-        if axis <= 0:
+        if axis <= 0:  # pragma: no cover
             raise Exception('Only axis >= 1 supported for this operation')
 
         assert(len(data_list) == len(indexers))
@@ -587,49 +587,55 @@ def concat(frames, axis=0, join='outer', join_index=None,
         How to handle indexes on other axis
     join_index : index-like
     verify_integrity : boolean, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation.
 
     Returns
     -------
     concatenated : DataFrame
     """
     op = Concatenator(frames, axis=axis, join_index=join_index,
-                      ignore_index=ignore_index,
+                      ignore_index=ignore_index, join=join,
                       verify_integrity=verify_integrity)
     return op.get_result()
 
 
 class Concatenator(object):
     """
-
+    Orchestrates a concatenation operation with a list of DataFrame objects
     """
 
     def __init__(self, frames, axis=0, join='outer', join_index=None,
                  ignore_index=False, verify_integrity=False):
+        if join == 'outer':
+            self.intersect = False
+        elif join == 'inner':
+            self.intersect = True
+        else:  # pragma: no cover
+            raise ValueError('Only can inner (intersect) or outer (union) join '
+                             'the other axis')
 
         # consolidate data
         for frame in frames:
             frame.consolidate(inplace=True)
 
         self.frames = frames
         self.axis = axis
-        self.join = join
         self.join_index = join_index
 
         self.ignore_index = ignore_index
-
         self.verify_integrity = verify_integrity
-
         self.new_index, self.new_columns = self._get_new_axes()
 
     def get_result(self):
         if len(self.frames) == 1:
             return self.frames[0]
 
         new_data = self._get_concatenated_data()
-        new_index, new_columns = self._get_new_axes()
         constructor = self._get_frame_constructor()
 
-        return constructor(new_data, index=new_index, columns=new_columns)
+        return constructor(new_data, index=self.new_index,
+                           columns=self.new_columns)
 
     def _get_concatenated_data(self):
         try:
@@ -717,9 +723,13 @@ def _get_new_axes(self):
 
             if self.join_index is None:
                 all_cols = [df.columns for df in self.frames]
-                new_columns = _get_combined_index(all_cols, intersect=False)
+                new_columns = _get_combined_index(all_cols,
+                                                  intersect=self.intersect)
             else:
                 new_columns = self.join_index
+
+            self.frames = [df.reindex(columns=new_columns, copy=False)
+                           for df in self.frames]
         else:
             new_columns = _concat_indexes([df.columns for df in self.frames])
             self._maybe_check_integrity(new_columns)
@@ -730,10 +740,14 @@ def _get_new_axes(self):
 
             if self.join_index is None:
                 all_indexes = [df.index for df in self.frames]
-                new_index = _get_combined_index(all_indexes, intersect=False)
+                new_index = _get_combined_index(all_indexes,
+                                                intersect=self.intersect)
             else:
                 new_index = self.join_index
 
+            self.frames = [df.reindex(new_index, copy=False)
+                           for df in self.frames]
+
         return new_index, new_columns
 
     def _get_frame_constructor(self):
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -678,6 +678,47 @@ def test_append_different_columns(self):
         self.assert_(isnull(appended['strings'][:5]).all())
         self.assert_(isnull(appended['bools'][5:]).all())
 
+    def test_append_many(self):
+        chunks = [self.frame[:5], self.frame[5:10],
+                  self.frame[10:15], self.frame[15:]]
+
+        result = chunks[0].append(chunks[1:])
+        tm.assert_frame_equal(result, self.frame)
+
+        chunks[-1]['foo'] = 'bar'
+        result = chunks[0].append(chunks[1:])
+        tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
+        self.assert_((result['foo'][15:] == 'bar').all())
+        self.assert_(result['foo'][:15].isnull().all())
+
+    def test_join_many(self):
+        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
+        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
+
+        joined = df_list[0].join(df_list[1:])
+        tm.assert_frame_equal(joined, df)
+
+        df_list = [df[['a', 'b']][:-2],
+                   df[['c', 'd']][2:], df[['e', 'f']][1:9]]
+
+        def _check_diff_index(df_list, result, exp_index):
+            reindexed = [x.reindex(exp_index) for x in df_list]
+            expected = reindexed[0].join(reindexed[1:])
+            tm.assert_frame_equal(result, expected)
+
+
+        # different join types
+        joined = df_list[0].join(df_list[1:], how='outer')
+        _check_diff_index(df_list, joined, df.index)
+
+        joined = df_list[0].join(df_list[1:])
+        _check_diff_index(df_list, joined, df_list[0].index)
+
+        joined = df_list[0].join(df_list[1:], how='inner')
+        _check_diff_index(df_list, joined, df.index[2:8])
+
+        self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a')
+
     def test_append_missing_column_proper_upcast(self):
         pass