Merge pull request #6363 from jreback/mi_merge_single

jreback · jreback · commit ab27073072b0 · 2014-02-15T17:08:44.000-05:00
ENH/BUG: allow single versus multi-index joining on inferred level (GH3662)
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -307,7 +307,7 @@ the data in DataFrame.
 
 See the :ref:`cookbook<cookbook.merge>` for some advanced strategies.
 
-Users who are familiar with SQL but new to pandas might be interested in a 
+Users who are familiar with SQL but new to pandas might be interested in a
 :ref:`comparison with SQL<compare_with_sql.join>`.
 
 pandas provides a single function, ``merge``, as the entry point for all
@@ -610,3 +610,77 @@ values inplace:
 
    df1.update(df2)
    df1
+
+.. _merging.on_mi:
+
+Merging with Multi-indexes
+--------------------------
+
+.. _merging.join_on_mi:
+
+Joining a single Index to a Multi-index
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.14.0
+
+You can join a singly-indexed DataFrame with a level of a multi-indexed DataFrame.
+The level will match on the name of the index of the singly-indexed frame against
+a level name of the multi-indexed frame.
+
+..  ipython:: python
+
+    household = DataFrame(dict(household_id = [1,2,3],
+                               male = [0,1,0],
+                               wealth = [196087.3,316478.7,294750]),
+                          columns = ['household_id','male','wealth']
+                         ).set_index('household_id')
+    household
+    portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
+                               asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29",
+                                           "gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
+                               name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell",
+                                       "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
+                               share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
+                          columns = ['household_id','asset_id','name','share']
+                         ).set_index(['household_id','asset_id'])
+    portfolio
+
+    household.join(portfolio, how='inner')
+
+This is equivalent to, but less verbose and more memory efficient / faster than, the following.
+
+.. code-block:: python
+
+    merge(household.reset_index(),
+          portfolio.reset_index(),
+          on=['household_id'],
+          how='inner'
+         ).set_index(['household_id','asset_id'])
+
+Joining with two multi-indexes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is not implemented via ``join`` at the moment; however, it can be done using the following.
+
+.. ipython:: python
+
+   household = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
+                              asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29",
+                                          "gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
+                              share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
+                         columns = ['household_id','asset_id','share']
+                        ).set_index(['household_id','asset_id'])
+   household
+
+   log_return = DataFrame(dict(asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
+                                           "lu0197800237", "lu0197800237"],
+                               t = [233, 234, 235, 180, 181],
+                               log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]),
+                         ).set_index(["asset_id","t"])
+   log_return
+
+   merge(household.reset_index(),
+         log_return.reset_index(),
+         on=['asset_id'],
+         how='inner'
+        ).set_index(['household_id','asset_id','t'])
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -78,6 +78,7 @@ Improvements to existing features
   (:issue:`6014`)
 - Allow multi-index slicers (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`)
+- implement joining a single-level indexed DataFrame on a matching level of a multi-indexed DataFrame (:issue:`3662`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -9,8 +9,8 @@ users upgrade to this version.
 
 Highlights include:
 
--
-
+- MultiIndexing Using Slicers
+- Joining a singly-indexed DataFrame with a multi-indexed DataFrame
 
 API changes
 ~~~~~~~~~~~
@@ -155,6 +155,29 @@ Enhancements
   most plot kinds. (:issue:`6014`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`)
 - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
+- Joining a singly-indexed DataFrame with a multi-indexed DataFrame (:issue:`3662`)
+
+  See :ref:`the docs<merging.join_on_mi>`. Joining multi-index DataFrames on both the left and right is not yet supported.
+
+  .. ipython:: python
+
+     household = DataFrame(dict(household_id = [1,2,3],
+                                male = [0,1,0],
+                                wealth = [196087.3,316478.7,294750]),
+                           columns = ['household_id','male','wealth']
+                          ).set_index('household_id')
+     household
+     portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
+                                asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29",
+                                            "gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
+                                name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell",
+                                        "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
+                                share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
+                           columns = ['household_id','asset_id','name','share']
+                          ).set_index(['household_id','asset_id'])
+     portfolio
+
+     household.join(portfolio, how='inner')
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -1265,8 +1265,21 @@ def join(self, other, how='left', level=None, return_indexers=False):
         -------
         join_index, (left_indexer, right_indexer)
         """
-        if (level is not None and (isinstance(self, MultiIndex) or
-                                   isinstance(other, MultiIndex))):
+        self_is_mi = isinstance(self, MultiIndex)
+        other_is_mi = isinstance(other, MultiIndex)
+
+        # try to figure out the join level
+        # GH3662
+        if (level is None and (self_is_mi or other_is_mi)):
+
+            # have the same levels/names so a simple join
+            if self.names == other.names:
+                pass
+            else:
+                return self._join_multi(other, how=how, return_indexers=return_indexers)
+
+        # join on the level
+        if (level is not None and (self_is_mi or other_is_mi)):
             return self._join_level(other, level, how=how,
                                     return_indexers=return_indexers)
 
@@ -1344,6 +1357,43 @@ def join(self, other, how='left', level=None, return_indexers=False):
         else:
             return join_index
 
+    def _join_multi(self, other, how, return_indexers=True):
+        """
+        Join a single Index with a MultiIndex on the one level name they share.
+
+        Raises ValueError when no names overlap, NotImplementedError when more
+        than one name overlaps or when both indexes are MultiIndexes. Returns
+        the result of ``_join_level``; its two indexers are swapped back into
+        (self, other) order if the operands had to be exchanged.
+        """
+        self_is_mi = isinstance(self, MultiIndex)
+        other_is_mi = isinstance(other, MultiIndex)
+
+        # figure out join names
+        self_names = [ n for n in self.names if n is not None ]
+        other_names = [ n for n in other.names if n is not None ]
+        overlap = list(set(self_names) & set(other_names))
+
+        # need at least 1 in common, but not more than 1
+        if not len(overlap):
+            raise ValueError("cannot join with no level specified and no overlapping names")
+        if len(overlap) > 1:
+            raise NotImplementedError("merging with more than one level overlap on a multi-index is not implemented")
+        if self_is_mi and other_is_mi:
+            raise NotImplementedError("merging with both multi-indexes is not implemented")
+
+        # make ``other`` the MultiIndex; exchanging the operands also exchanges
+        # the meaning of a one-sided join, so 'left' and 'right' must trade places
+        flip_order = self_is_mi
+        if flip_order:
+            self, other = other, self
+            how = {'left': 'right', 'right': 'left'}.get(how, how)
+        result = self._join_level(other, other.names.index(overlap[0]), how=how,
+                                  return_indexers=return_indexers)
+        if flip_order and isinstance(result, tuple):
+            return result[0], result[2], result[1]
+        return result
+
     def _join_non_unique(self, other, how='left', return_indexers=False):
         from pandas.tools.merge import _get_join_indexers
 
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -8,15 +8,15 @@
 import numpy as np
 import random
 
-from pandas.compat import range, lrange, lzip, zip
+from pandas.compat import range, lrange, lzip, zip, StringIO
 from pandas import compat, _np_version_under1p7
 from pandas.tseries.index import DatetimeIndex
 from pandas.tools.merge import merge, concat, ordered_merge, MergeError
 from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                  assert_almost_equal, rands,
                                  makeCustomDataframe as mkdf,
                                  assertRaisesRegexp)
-from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range
+from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table
 import pandas.algos as algos
 import pandas.util.testing as tm
 
@@ -1025,6 +1025,98 @@ def test_int64_overflow_issues(self):
         result = merge(df1, df2, how='outer')
         self.assertTrue(len(result) == 2000)
 
+    def test_join_multi_levels(self):
+        """Join a singly-indexed frame to a multi-indexed frame on the shared index name."""
+        # GH 3662
+        # merge multi-levels
+
+        household = DataFrame(dict(household_id = [1,2,3],
+                                   male = [0,1,0],
+                                   wealth = [196087.3,316478.7,294750]),
+                              columns = ['household_id','male','wealth']).set_index('household_id')
+        portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
+                                   asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
+                                   name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell","AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
+                                   share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
+                              columns = ['household_id','asset_id','name','share']).set_index(['household_id','asset_id'])
+        result = household.join(portfolio, how='inner')
+        expected = DataFrame(dict(male = [0,1,1,0,0,0],
+                                  wealth = [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ],
+                                  name = ['ABN Amro','Robeco','Royal Dutch Shell','Royal Dutch Shell','AAB Eastern Europe Equity Fund','Postbank BioTech Fonds'],
+                                  share = [1.00,0.40,0.60,0.15,0.60,0.25],
+                                  household_id = [1,2,2,3,3,3],
+                                  asset_id = ['nl0000301109','nl0000289783','gb00b03mlx29','gb00b03mlx29','lu0197800237','nl0000289965']),
+                             ).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share'])
+        assert_frame_equal(result,expected)
+
+        # equivalency
+        result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id'])
+        assert_frame_equal(result2,expected)
+
+        result = household.join(portfolio, how='outer')
+        expected = concat([expected,DataFrame(dict(share = [1.00]),
+                                              index=MultiIndex.from_tuples([(4,np.nan)],
+                                                                           names=['household_id','asset_id']))],
+                          axis=0).reindex(columns=expected.columns)
+        assert_frame_equal(result,expected)
+
+        # invalid cases
+        # no index name in common with any level of the multi-index
+        household.index.name = 'foo'
+        def f():
+            household.join(portfolio, how='inner')
+        self.assertRaises(ValueError, f)
+
+        # two multi-indexes sharing one level name; set_names returns a new index
+        portfolio2 = portfolio.copy()
+        portfolio2.index = portfolio2.index.set_names(['household_id','foo'])
+        def f():
+            portfolio2.join(portfolio, how='inner')
+        self.assertRaises(NotImplementedError, f)
+
+    def test_join_multi_levels2(self):
+        """Joining two multi-indexed frames raises; ``merge`` gives the equivalent (GH6360)."""
+        # some more advanced merges
+        shares = dict(household_id = [1,2,2,3,3,3,4],
+                      asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
+                      share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0])
+        household = DataFrame(shares, columns = ['household_id','asset_id','share'])
+        household = household.set_index(['household_id','asset_id'])
+
+        returns = dict(
+            asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
+            t = [233, 234, 235, 180, 181],
+            log_return = [.09604978, -.06524096, .03532373, .03025441, .036997])
+        log_return = DataFrame(returns).set_index(["asset_id","t"])
+
+        inner = dict(
+            household_id = [2, 2, 2, 3, 3, 3, 3, 3],
+            asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
+            t = [233, 234, 235, 233, 234, 235, 180, 181],
+            share = [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
+            log_return = [.09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997])
+        expected = DataFrame(inner).set_index(["household_id", "asset_id", "t"]).reindex(columns=['share','log_return'])
+
+        # the join itself is rejected for two multi-indexes
+        self.assertRaises(NotImplementedError, household.join, log_return, how='inner')
+
+        # this is the equivalency
+        left, right = household.reset_index(), log_return.reset_index()
+        result = merge(left, right, on=['asset_id'], how='inner')
+        result = result.set_index(['household_id','asset_id','t'])
+        assert_frame_equal(result,expected)
+
+        outer = dict(
+            household_id = [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
+            asset_id = ["nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", "nl0000289965", None],
+            t = [None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
+            share = [1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
+            log_return = [None, None, .09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997, None, None])
+        expected = DataFrame(outer).set_index(["household_id", "asset_id", "t"])
+
+        # the outer frame is built but not compared: this join raises as well
+        self.assertRaises(NotImplementedError, household.join, log_return, how='outer')
+
 def _check_join(left, right, result, join_col, how='left',
                 lsuffix='_x', rsuffix='_y'):