diff --git a/doc/source/merging.rst b/doc/source/merging.rst index d16b998f31ec1..72344ee003547 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -307,7 +307,7 @@ the data in DataFrame. See the :ref:`cookbook` for some advanced strategies. -Users who are familiar with SQL but new to pandas might be interested in a +Users who are familiar with SQL but new to pandas might be interested in a :ref:`comparison with SQL`. pandas provides a single function, ``merge``, as the entry point for all @@ -610,3 +610,77 @@ values inplace: df1.update(df2) df1 + +.. _merging.on_mi: + +Merging with Multi-indexes +-------------------------- + +.. _merging.join_on_mi: + +Joining a single Index to a Multi-index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +You can join a singly-indexed DataFrame with a level of a multi-indexed DataFrame. +The level will match on the name of the index of the singly-indexed frame against +a level name of the multi-indexed frame. + +.. ipython:: python + + household = DataFrame(dict(household_id = [1,2,3], + male = [0,1,0], + wealth = [196087.3,316478.7,294750]), + columns = ['household_id','male','wealth'] + ).set_index('household_id') + household + portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell", + "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','name','share'] + ).set_index(['household_id','asset_id']) + portfolio + + household.join(portfolio, how='inner') + +This is equivalent but less verbose and more memory efficient / faster than this. + +.. 
code-block:: python + + merge(household.reset_index(), + portfolio.reset_index(), + on=['household_id'], + how='inner' + ).set_index(['household_id','asset_id']) + +Joining with two multi-indexes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is not implemented via ``join`` at the moment; however, it can be done using the following. + +.. ipython:: python + + household = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','share'] + ).set_index(['household_id','asset_id']) + household + + log_return = DataFrame(dict(asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", + "lu0197800237", "lu0197800237"], + t = [233, 234, 235, 180, 181], + log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]), + ).set_index(["asset_id","t"]) + log_return + + merge(household.reset_index(), + log_return.reset_index(), + on=['asset_id'], + how='inner' + ).set_index(['household_id','asset_id','t']) diff --git a/doc/source/release.rst b/doc/source/release.rst index 35ce6c9359d56..5e363e0f3c00b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -78,6 +78,7 @@ Improvements to existing features (:issue:`6014`) - Allow multi-index slicers (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`) - improve performance of slice indexing on Series with string keys (:issue:`6341`) +- implement joining a single-level indexed DataFrame on a matching index level of a multi-indexed DataFrame (:issue:`3662`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ea9fbadeeaf4e..e9e78c832028c 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -9,8 +9,8 @@ users upgrade to this version.
Highlights include: -- - +- MultiIndexing Using Slicers +- Joining a singly-indexed DataFrame with a multi-indexed DataFrame API changes ~~~~~~~~~~~ @@ -155,6 +155,29 @@ Enhancements most plot kinds. (:issue:`6014`) - improve performance of slice indexing on Series with string keys (:issue:`6341`) - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) +- Joining a singly-indexed DataFrame with a multi-indexed DataFrame (:issue:`3662`) + + See :ref:`the docs <merging.join_on_mi>`. Joining multi-index DataFrames on both the left and right is not yet supported. + + .. ipython:: python + + household = DataFrame(dict(household_id = [1,2,3], + male = [0,1,0], + wealth = [196087.3,316478.7,294750]), + columns = ['household_id','male','wealth'] + ).set_index('household_id') + household + portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell", + "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','name','share'] + ).set_index(['household_id','asset_id']) + portfolio + + household.join(portfolio, how='inner') Performance ~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index 316e82c05ef30..a4eca1216ea84 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1265,8 +1265,21 @@ def join(self, other, how='left', level=None, return_indexers=False): ------- join_index, (left_indexer, right_indexer) """ - if (level is not None and (isinstance(self, MultiIndex) or - isinstance(other, MultiIndex))): + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # try to figure out the join level + # GH3662 + if (level is None and (self_is_mi or other_is_mi)): + + # have the same levels/names so a simple join + if self.names ==
other.names: + pass + else: + return self._join_multi(other, how=how, return_indexers=return_indexers) + + # join on the level + if (level is not None and (self_is_mi or other_is_mi)): return self._join_level(other, level, how=how, return_indexers=return_indexers) @@ -1344,6 +1357,43 @@ def join(self, other, how='left', level=None, return_indexers=False): else: return join_index + def _join_multi(self, other, how, return_indexers=True): + + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # figure out join names + self_names = [ n for n in self.names if n is not None ] + other_names = [ n for n in other.names if n is not None ] + overlap = list(set(self_names) & set(other_names)) + + # need at least 1 in common, but not more than 1 + if not len(overlap): + raise ValueError("cannot join with no level specified and no overlapping names") + if len(overlap) > 1: + raise NotImplementedError("merging with more than one level overlap on a multi-index is not implemented") + jl = overlap[0] + + # make the indices into mi's that match + if not (self_is_mi and other_is_mi): + + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) + + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result + + # 2 multi-indexes + raise NotImplementedError("merging with both multi-indexes is not implemented") + def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.tools.merge import _get_join_indexers diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index bfa6fd77ba733..6645391aeda64 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -8,7 +8,7 @@ import numpy as np import random -from pandas.compat import range, lrange, lzip, zip +from pandas.compat import 
def test_join_multi_levels(self):

    # GH 3662
    # join a singly-indexed frame with a multi-indexed frame; the join
    # level is found by matching the single index name against the level
    # names of the MultiIndex

    household = DataFrame(
        dict(household_id=[1, 2, 3],
             male=[0, 1, 0],
             wealth=[196087.3, 316478.7, 294750]),
        columns=['household_id', 'male', 'wealth']
    ).set_index('household_id')
    portfolio = DataFrame(
        dict(household_id=[1, 2, 2, 3, 3, 3, 4],
             asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                       "gb00b03mlx29", "lu0197800237", "nl0000289965",
                       np.nan],
             name=["ABN Amro", "Robeco", "Royal Dutch Shell",
                   "Royal Dutch Shell", "AAB Eastern Europe Equity Fund",
                   "Postbank BioTech Fonds", np.nan],
             share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
        columns=['household_id', 'asset_id', 'name', 'share']
    ).set_index(['household_id', 'asset_id'])

    result = household.join(portfolio, how='inner')
    expected = DataFrame(
        dict(male=[0, 1, 1, 0, 0, 0],
             wealth=[196087.3, 316478.7, 316478.7,
                     294750.0, 294750.0, 294750.0],
             name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                   'Royal Dutch Shell', 'AAB Eastern Europe Equity Fund',
                   'Postbank BioTech Fonds'],
             share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
             household_id=[1, 2, 2, 3, 3, 3],
             asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
                       'gb00b03mlx29', 'lu0197800237', 'nl0000289965'])
    ).set_index(['household_id', 'asset_id']).reindex(
        columns=['male', 'wealth', 'name', 'share'])
    assert_frame_equal(result, expected)

    # equivalency: the join must match the more verbose merge/set_index
    result2 = merge(household.reset_index(), portfolio.reset_index(),
                    on=['household_id'],
                    how='inner').set_index(['household_id', 'asset_id'])
    assert_frame_equal(result2, expected)

    # outer join additionally keeps household 4, which only has a share
    result = household.join(portfolio, how='outer')
    extra = DataFrame(dict(share=[1.00]),
                      index=MultiIndex.from_tuples(
                          [(4, np.nan)],
                          names=['household_id', 'asset_id']))
    expected = concat([expected, extra],
                      axis=0).reindex(columns=expected.columns)
    assert_frame_equal(result, expected)

    # invalid cases
    # no name in common between the two indexes
    household.index.name = 'foo'
    self.assertRaises(ValueError, household.join, portfolio, how='inner')

    portfolio2 = portfolio.copy()
    # NOTE(review): the return value of set_names is discarded here, so
    # unless set_names mutates in place portfolio2 keeps the same level
    # names as portfolio and the check below does not exercise a
    # name-mismatch path -- confirm the intent (and the exception type
    # expected once the rename is effective) before changing it.
    portfolio2.index.set_names(['household_id', 'foo'])
    self.assertRaises(ValueError, portfolio2.join, portfolio, how='inner')


def test_join_multi_levels2(self):

    # some more advanced merges
    # GH6360
    household = DataFrame(
        dict(household_id=[1, 2, 2, 3, 3, 3, 4],
             asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                       "gb00b03mlx29", "lu0197800237", "nl0000289965",
                       np.nan],
             share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
        columns=['household_id', 'asset_id', 'share']
    ).set_index(['household_id', 'asset_id'])

    log_return = DataFrame(dict(
        asset_id=["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "lu0197800237", "lu0197800237"],
        t=[233, 234, 235, 180, 181],
        log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
    )).set_index(["asset_id", "t"])

    expected = DataFrame(dict(
        household_id=[2, 2, 2, 3, 3, 3, 3, 3],
        asset_id=["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "lu0197800237", "lu0197800237"],
        t=[233, 234, 235, 233, 234, 235, 180, 181],
        share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
        log_return=[.09604978, -.06524096, .03532373, .09604978,
                    -.06524096, .03532373, .03025441, .036997]
    )).set_index(["household_id", "asset_id", "t"]).reindex(
        columns=['share', 'log_return'])

    # joining two multi-indexed frames is expected to be rejected
    self.assertRaises(NotImplementedError,
                      household.join, log_return, how='inner')

    # this is the equivalency
    result = merge(household.reset_index(), log_return.reset_index(),
                   on=['asset_id'],
                   how='inner').set_index(['household_id', 'asset_id', 't'])
    assert_frame_equal(result, expected)

    # NOTE(review): this frame is built but never compared against
    # anything -- presumably the intended outer-join result, kept for when
    # that join is supported; confirm.
    expected = DataFrame(dict(
        household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
        asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                  "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "gb00b03mlx29", "gb00b03mlx29", "lu0197800237",
                  "lu0197800237", "nl0000289965", None],
        t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
        share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6,
               0.25, 1.0],
        log_return=[None, None, .09604978, -.06524096, .03532373,
                    .09604978, -.06524096, .03532373, .03025441, .036997,
                    None, None]
    )).set_index(["household_id", "asset_id", "t"])

    self.assertRaises(NotImplementedError,
                      household.join, log_return, how='outer')