Skip to content

ENH: Allow for join between two multi-index dataframe instances #20356

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b581789
Allow for join between two multi-index dataframe instances
Sep 19, 2018
2d61a12
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Sep 19, 2018
4d4acc5
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Oct 7, 2018
66d82fb
Review
harisbal Oct 8, 2018
c091bb4
Merge remote-tracking branch 'upstream/master' into multi-index-join
Oct 8, 2018
d56ebcd
Second review
harisbal Oct 9, 2018
0cdad73
Merge remote-tracking branch 'upstream/master' into multi-index-join
Oct 9, 2018
c2a65aa
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Oct 10, 2018
571fdf7
Merge remote-tracking branch 'upstream/master' into multi-index-join
Nov 1, 2018
ae2d8ad
Review
harisbal Nov 1, 2018
405c1a4
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 1, 2018
1d2d9f3
Fix ci
harisbal Nov 3, 2018
f0ac24d
Merge branch 'master' into multi-index-join
Nov 3, 2018
5ac40ff
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 3, 2018
be862c7
Update v0.24.0.txt
harisbal Nov 4, 2018
e10cbde
Update docstring _restore_dropped_levels_multijoin
harisbal Nov 4, 2018
06d48d0
Update docstring _restore_dropped_levels_multijoin
harisbal Nov 4, 2018
f54c151
Merge remote-tracking branch 'upstream/master' into multi-index-join
Nov 4, 2018
c75108d
Merge remote-tracking branch 'origin/multi-index-join' into multi-ind…
harisbal Nov 5, 2018
c690260
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 6, 2018
4092b34
updated comments
harisbal Nov 6, 2018
cfd5fcc
Refactoring
harisbal Nov 6, 2018
6c8131d
Review
harisbal Nov 10, 2018
ecaf515
Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal Nov 10, 2018
8b5d0aa
Merge remote-tracking branch 'upstream/master' into harisbal-multi-in…
TomAugspurger Nov 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,47 @@ array, but rather an ``ExtensionArray``:
This is the same behavior as ``Series.values`` for categorical data. See
:ref:`whatsnew_0240.api_breaking.interval_values` for more.

.. _whatsnew_0240.enhancements.join_with_two_multiindexes:

Joining with two multi-indexes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``DataFrame`` instances on the overlapping index levels (:issue:`6360`)

See the :ref:`Merge, join, and concatenate
<merging.Join_with_two_multi_indexes>` documentation section.

.. ipython:: python

index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'),
('K1', 'X2')],
names=['key', 'X'])


left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=index_left)


index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
('K2', 'Y2'), ('K2', 'Y3')],
names=['key', 'Y'])


right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=index_right)


left.join(right)

For earlier versions this can be done using the following.

.. ipython:: python

pd.merge(left.reset_index(), right.reset_index(),
on=['key'], how='inner').set_index(['key', 'X', 'Y'])

.. _whatsnew_0240.enhancements.rename_axis:

Renaming names in a MultiIndex
Expand Down
98 changes: 62 additions & 36 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3134,8 +3134,8 @@ def get_value(self, series, key):
iloc = self.get_loc(key)
return s[iloc]
except KeyError:
if (len(self) > 0
and (self.holds_integer() or self.is_boolean())):
if (len(self) > 0 and
(self.holds_integer() or self.is_boolean())):
raise
elif is_integer(key):
return s[key]
Expand Down Expand Up @@ -3923,46 +3923,72 @@ def join(self, other, how='left', level=None, return_indexers=False,

def _join_multi(self, other, how, return_indexers=True):
from .multi import MultiIndex
from pandas.core.reshape.merge import _restore_dropped_levels_multijoin

# figure out join names
self_names = set(com._not_none(*self.names))
other_names = set(com._not_none(*other.names))
overlap = self_names & other_names

# need at least 1 in common
if not overlap:
raise ValueError("cannot join with no overlapping index names")

self_is_mi = isinstance(self, MultiIndex)
other_is_mi = isinstance(other, MultiIndex)

# figure out join names
self_names = com._not_none(*self.names)
other_names = com._not_none(*other.names)
overlap = list(set(self_names) & set(other_names))

# need at least 1 in common, but not more than 1
if not len(overlap):
raise ValueError("cannot join with no level specified and no "
"overlapping names")
if len(overlap) > 1:
raise NotImplementedError("merging with more than one level "
"overlap on a multi-index is not "
"implemented")
jl = overlap[0]
if self_is_mi and other_is_mi:

# Drop the non-matching levels from left and right respectively
ldrop_names = list(self_names - overlap)
rdrop_names = list(other_names - overlap)

self_jnlevels = self.droplevel(ldrop_names)
other_jnlevels = other.droplevel(rdrop_names)

# Join left and right
# Join on same leveled multi-index frames is supported
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
return_indexers=True)

# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names

levels, labels, names = (
_restore_dropped_levels_multijoin(self, other,
dropped_names,
join_idx,
lidx, ridx))

# Re-create the multi-index
multi_join_idx = MultiIndex(levels=levels, labels=labels,
names=names, verify_integrity=False)

multi_join_idx = multi_join_idx.remove_unused_levels()

return multi_join_idx, lidx, ridx

jl = list(overlap)[0]

# Case where only one index is multi
# make the indices into mi's that match
if not (self_is_mi and other_is_mi):

flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result
flip_order = False
if self_is_mi:
self, other = other, self
flip_order = True
# flip if join method is right or left
how = {'right': 'left', 'left': 'right'}.get(how, how)

level = other.names.index(jl)
result = self._join_level(other, level, how=how,
return_indexers=return_indexers)

# 2 multi-indexes
raise NotImplementedError("merging with both multi-indexes is not "
"implemented")
if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result

def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers
Expand Down
89 changes: 89 additions & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,95 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
return join_func(lkey, rkey, count, **kwargs)


def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
                                      join_index, lindexer, rindexer):
    """
    *this is an internal non-public method*

    Build the levels, labels and names of a multi-index to multi-index join.

    ``join_index`` only carries the levels the two inputs were joined on.
    For every name in ``dropped_level_names`` the matching level is looked
    up in ``left`` (or in ``right`` when ``left`` has no level of that name)
    and appended after the levels of ``join_index``. Its labels are
    re-aligned through ``lindexer`` / ``rindexer``.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index or MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array or None
        left indexer; None is treated as "every position of left"
    rindexer : intp array or None
        right indexer; None is treated as "every position of right"

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : intp array
        labels of combined multiindexes
    names : str array
        names of combined multiindexes

    """
    # A multi-multi join with a single overlapping level hands back a flat
    # Index. Wrap it in a one-level MultiIndex so that levels/labels/names
    # are available and the dropped levels can be appended to them.
    if not isinstance(join_index, MultiIndex):
        join_index = MultiIndex.from_arrays([join_index.values],
                                            names=[join_index.name])

    out_levels = join_index.levels
    out_labels = join_index.labels
    out_names = join_index.names

    # A missing indexer stands for "all positions, in order" on that side.
    if lindexer is None:
        lindexer = range(left.size)
    if rindexer is None:
        rindexer = range(right.size)

    for name in dropped_level_names:
        # Pick the side that owns this level, together with its indexer.
        if name in left.names:
            source, positions = left, lindexer
        else:
            source, positions = right, rindexer

        level_pos = source.names.index(name)
        level_values = source.levels[level_pos]

        # Positions equal to -1 mark rows with no match on this side;
        # take_nd fills those slots with -1 in the resulting labels.
        taken = algos.take_nd(source.labels[level_pos], positions,
                              fill_value=-1)

        out_levels = out_levels + [level_values]
        out_labels = out_labels + [taken]
        out_names = out_names + [name]

    return out_levels, out_labels, out_names


class _OrderedMerge(_MergeOperation):
_merge_type = 'ordered_merge'

Expand Down
Loading