Skip to content

Commit c7a1833

Browse files
author
harisbal
committed
Rebase
1 parent 7495e9a commit c7a1833

File tree

5 files changed

+318
-95
lines changed

5 files changed

+318
-95
lines changed

pandas/core/indexes/base.py

+59-52
Original file line numberDiff line numberDiff line change
@@ -2490,6 +2490,7 @@ def _get_unique_index(self, dropna=False):
24902490
includes list, tuple, array, Series, and must be the same size as
24912491
the index and its dtype must exactly match the index's type.
24922492
2493+
.. versionadded:: 0.17.0
24932494
.. versionadded:: 0.21.0 (list-like tolerance)
24942495
24952496
Returns
@@ -2639,6 +2640,7 @@ def _get_level_values(self, level):
26392640
the same size as the index and its dtype must exactly match the
26402641
index's type.
26412642
2643+
.. versionadded:: 0.17.0
26422644
.. versionadded:: 0.21.0 (list-like tolerance)
26432645
26442646
Examples
@@ -3180,46 +3182,68 @@ def join(self, other, how='left', level=None, return_indexers=False,
31803182

31813183
def _join_multi(self, other, how, return_indexers=True):
31823184
from .multi import MultiIndex
3183-
self_is_mi = isinstance(self, MultiIndex)
3184-
other_is_mi = isinstance(other, MultiIndex)
3185+
from pandas.core.reshape.merge import _complete_multilevel_join
31853186

31863187
# figure out join names
3187-
self_names = _not_none(*self.names)
3188-
other_names = _not_none(*other.names)
3188+
self_names = list(_not_none(*self.names))
3189+
other_names = list(_not_none(*other.names))
31893190
overlap = list(set(self_names) & set(other_names))
31903191

3191-
# need at least 1 in common, but not more than 1
3192+
# need at least 1 in common
31923193
if not len(overlap):
3193-
raise ValueError("cannot join with no level specified and no "
3194-
"overlapping names")
3195-
if len(overlap) > 1:
3196-
raise NotImplementedError("merging with more than one level "
3197-
"overlap on a multi-index is not "
3198-
"implemented")
3199-
jl = overlap[0]
3194+
raise ValueError("cannot join with no overlapping index names")
3195+
3196+
self_is_mi = isinstance(self, MultiIndex)
3197+
other_is_mi = isinstance(other, MultiIndex)
3198+
3199+
# Drop the non matching levels
3200+
ldrop_levels = list(set(self_names) - set(overlap))
3201+
rdrop_levels = list(set(other_names) - set(overlap))
3202+
3203+
if self_is_mi and other_is_mi:
3204+
self_jnlevels = self.droplevel(ldrop_levels)
3205+
other_jnlevels = other.droplevel(rdrop_levels)
3206+
3207+
if not (self_jnlevels.is_unique and other_jnlevels.is_unique):
3208+
raise ValueError("Join on level between two MultiIndex objects "
3209+
"is ambiguous")
3210+
3211+
dropped_levels = ldrop_levels + rdrop_levels
3212+
3213+
join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
3214+
return_indexers=True)
3215+
3216+
levels, labels, names = _complete_multilevel_join(self, other, how,
3217+
dropped_levels,
3218+
join_idx,
3219+
lidx, ridx)
3220+
3221+
multi_join_idx = MultiIndex(levels=levels, labels=labels,
3222+
names=names, verify_integrity=False)
3223+
3224+
# Check for unused levels
3225+
multi_join_idx = multi_join_idx.remove_unused_levels()
3226+
3227+
return multi_join_idx, lidx, ridx
3228+
3229+
jl = list(overlap)[0]
32003230

32013231
# make the indices into mi's that match
3202-
if not (self_is_mi and other_is_mi):
3203-
3204-
flip_order = False
3205-
if self_is_mi:
3206-
self, other = other, self
3207-
flip_order = True
3208-
# flip if join method is right or left
3209-
how = {'right': 'left', 'left': 'right'}.get(how, how)
3210-
3211-
level = other.names.index(jl)
3212-
result = self._join_level(other, level, how=how,
3213-
return_indexers=return_indexers)
3214-
3215-
if flip_order:
3216-
if isinstance(result, tuple):
3217-
return result[0], result[2], result[1]
3218-
return result
3232+
flip_order = False
3233+
if self_is_mi:
3234+
self, other = other, self
3235+
flip_order = True
3236+
# flip if join method is right or left
3237+
how = {'right': 'left', 'left': 'right'}.get(how, how)
32193238

3220-
# 2 multi-indexes
3221-
raise NotImplementedError("merging with both multi-indexes is not "
3222-
"implemented")
3239+
level = other.names.index(jl)
3240+
result = self._join_level(other, level, how=how,
3241+
return_indexers=return_indexers)
3242+
3243+
if flip_order:
3244+
if isinstance(result, tuple):
3245+
return result[0], result[2], result[1]
3246+
return result
32233247

32243248
def _join_non_unique(self, other, how='left', return_indexers=False):
32253249
from pandas.core.reshape.merge import _get_join_indexers
@@ -3428,8 +3452,8 @@ def _get_string_slice(self, key, use_lhs=True, use_rhs=True):
34283452

34293453
def slice_indexer(self, start=None, end=None, step=None, kind=None):
34303454
"""
3431-
For an ordered or unique index, compute the slice indexer for input
3432-
labels and step.
3455+
For an ordered Index, compute the slice indexer for input labels and
3456+
step.
34333457
34343458
Parameters
34353459
----------
@@ -3442,28 +3466,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
34423466
34433467
Returns
34443468
-------
3445-
indexer : slice
3446-
3447-
Raises
3448-
------
3449-
KeyError : If key does not exist, or key is not unique and index is
3450-
not ordered.
3469+
indexer : ndarray or slice
34513470
34523471
Notes
34533472
-----
34543473
This function assumes that the data is sorted, so use at your own peril
3455-
3456-
Examples
3457-
---------
3458-
This is a method on all index types. For example you can do:
3459-
3460-
>>> idx = pd.Index(list('abcd'))
3461-
>>> idx.slice_indexer(start='b', end='c')
3462-
slice(1, 3)
3463-
3464-
>>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
3465-
>>> idx.slice_indexer(start='b', end=('c', 'g'))
3466-
slice(1, 3)
34673474
"""
34683475
start_slice, end_slice = self.slice_locs(start, end, step=step,
34693476
kind=kind)

pandas/core/indexes/multi.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -1345,9 +1345,10 @@ def remove_unused_levels(self):
13451345
for lev, lab in zip(self.levels, self.labels):
13461346

13471347
uniques = algos.unique(lab)
1348-
1348+
# remove if NaN in index
1349+
uniques_no_nans = uniques[uniques != -1]
13491350
# nothing unused
1350-
if len(uniques) == len(lev):
1351+
if len(uniques_no_nans) == len(lev):
13511352
new_levels.append(lev)
13521353
new_labels.append(lab)
13531354
continue
@@ -1356,11 +1357,12 @@ def remove_unused_levels(self):
13561357

13571358
# labels get mapped from uniques to 0:len(uniques)
13581359
label_mapping = np.zeros(len(lev))
1359-
label_mapping[uniques] = np.arange(len(uniques))
1360-
lab = label_mapping[lab]
1360+
label_mapping[uniques_no_nans] = np.arange(len(uniques_no_nans))
1361+
# apply the mapping where lab != -1
1362+
lab = np.where(lab != -1, label_mapping[lab], -1)
13611363

13621364
# new levels are simple
1363-
lev = lev.take(uniques)
1365+
lev = lev.take(uniques_no_nans)
13641366

13651367
new_levels.append(lev)
13661368
new_labels.append(lab)

pandas/core/reshape/merge.py

+84-12
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
126126
try:
127127
if k in merged:
128128
merged[k] = key
129-
except KeyError:
129+
except:
130130
pass
131131

132132
pieces.append(merged)
@@ -1066,6 +1066,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
10661066
return join_func(lkey, rkey, count, **kwargs)
10671067

10681068

1069+
def _complete_multilevel_join(left, right, how, dropped_levels,
1070+
join_idx, lidx, ridx):
1071+
"""
1072+
*this is an internal non-public method*
1073+
1074+
Returns the levels, labels and names of a multilevel to multilevel join.
1075+
Depending on the type of join, this method restores the appropriate
1076+
dropped levels of the joined multi-index. The method relies on lidx, ridx
1077+
which hold the index positions of left and right, where a join was feasible.
1078+
1079+
Parameters
1080+
----------
1081+
left : Index
1082+
left index
1083+
right : Index
1084+
right index
1085+
join_idx : Index
1086+
the index of the join between the common levels of left and right
1087+
how : {'left', 'right', 'outer', 'inner'}
1088+
lidx : intp array
1089+
left indexer
1090+
ridx : intp array
1091+
right indexer
1092+
dropped_levels : str array
1093+
list of non-common levels
1094+
1095+
Returns
1096+
-------
1097+
levels : intp array
1098+
levels of combined multiindexes
1099+
labels : str array
1100+
labels of combined multiindexes
1101+
names : str array
1102+
names of combined multiindexes
1103+
1104+
"""
1105+
1106+
join_levels = join_idx.levels
1107+
join_labels = join_idx.labels
1108+
join_names = join_idx.names
1109+
1110+
# lidx and ridx hold the indexes where the join occurred
1111+
# for left and right respectively. If lidx (ridx) is None it means that
1112+
# the join occurred on all indices of left (right)
1113+
if lidx is None:
1114+
lidx = range(0, len(left))
1115+
1116+
if ridx is None:
1117+
ridx = range(0, len(right))
1118+
1119+
# Iterate through the levels that must be restored
1120+
for dl in dropped_levels:
1121+
if dl in left.names:
1122+
idx = left
1123+
indexer = lidx
1124+
else:
1125+
idx = right
1126+
indexer = ridx
1127+
1128+
# The index of the level name to be restored
1129+
name_idx = idx.names.index(dl)
1130+
1131+
restore_levels = idx.levels[name_idx].values
1132+
restore_labels = idx.labels[name_idx]
1133+
1134+
join_levels = join_levels.__add__([restore_levels])
1135+
join_names = join_names.__add__([dl])
1136+
1137+
# Inject -1 in the labels list where a join was not possible
1138+
# IOW indexer[i]=-1
1139+
labels = [restore_labels[i] if i != -1 else -1 for i in indexer]
1140+
join_labels = join_labels.__add__([labels])
1141+
1142+
return join_levels, join_labels, join_names
1143+
1144+
10691145
class _OrderedMerge(_MergeOperation):
10701146
_merge_type = 'ordered_merge'
10711147

@@ -1253,12 +1329,10 @@ def _get_merge_keys(self):
12531329
join_names) = super(_AsOfMerge, self)._get_merge_keys()
12541330

12551331
# validate index types are the same
1256-
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
1332+
for lk, rk in zip(left_join_keys, right_join_keys):
12571333
if not is_dtype_equal(lk.dtype, rk.dtype):
1258-
raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
1259-
"{rkdtype}, must be the same type"
1260-
.format(i=i, lkdtype=lk.dtype,
1261-
rkdtype=rk.dtype))
1334+
raise MergeError("incompatible merge keys, "
1335+
"must be the same type")
12621336

12631337
# validate tolerance; must be a Timedelta if we have a DTI
12641338
if self.tolerance is not None:
@@ -1268,10 +1342,8 @@ def _get_merge_keys(self):
12681342
else:
12691343
lt = left_join_keys[-1]
12701344

1271-
msg = ("incompatible tolerance {tolerance}, must be compat "
1272-
"with type {lkdtype}".format(
1273-
tolerance=type(self.tolerance),
1274-
lkdtype=lt.dtype))
1345+
msg = "incompatible tolerance, must be compat " \
1346+
"with type {lt}".format(lt=type(lt))
12751347

12761348
if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
12771349
if not isinstance(self.tolerance, Timedelta):
@@ -1507,12 +1579,12 @@ def _sort_labels(uniques, left, right):
15071579
# tuplesafe
15081580
uniques = Index(uniques).values
15091581

1510-
llength = len(left)
1582+
l = len(left)
15111583
labels = np.concatenate([left, right])
15121584

15131585
_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
15141586
new_labels = _ensure_int64(new_labels)
1515-
new_left, new_right = new_labels[:llength], new_labels[llength:]
1587+
new_left, new_right = new_labels[:l], new_labels[l:]
15161588

15171589
return new_left, new_right
15181590

0 commit comments

Comments
 (0)