Skip to content

Commit 59b8977

Browse files
committed
BUG: Fix merging on non-index keys causing Index dtype promotion when keys are
missing from the left or right side (GH28220). Also closes GH24897, GH24212, and GH17257
1 parent f8a924b commit 59b8977

File tree

3 files changed

+481
-135
lines changed

3 files changed

+481
-135
lines changed

doc/source/whatsnew/v0.25.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ Groupby/resample/rolling
8585
Reshaping
8686
^^^^^^^^^
8787

88-
-
88+
- Bug in joins with missing keys, when ``left_index`` is not used with ``how='right'`` or ``right_index`` is not used with ``how='left'``, causing NA values in the index and dtype promotion (:issue:`28220`).
8989
-
9090
-
9191
-

pandas/core/reshape/merge.py

+164-40
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@
3030
is_int64_dtype,
3131
is_integer,
3232
is_integer_dtype,
33+
is_interval_dtype,
3334
is_list_like,
3435
is_number,
3536
is_numeric_dtype,
3637
is_object_dtype,
38+
is_period_dtype,
3739
needs_i8_conversion,
3840
)
3941
from pandas.core.dtypes.missing import isnull, na_value_for_dtype
@@ -842,65 +844,187 @@ def _get_join_info(self):
842844
else:
843845
(left_indexer, right_indexer) = self._get_join_indexers()
844846

847+
if self.left_index:
848+
left = self.left.index
849+
else:
850+
left = self.left_join_keys
851+
845852
if self.right_index:
846-
if len(self.left) > 0:
847-
join_index = self._create_join_index(
848-
self.left.index,
849-
self.right.index,
850-
left_indexer,
851-
right_indexer,
852-
how="right",
853-
)
854-
else:
855-
join_index = self.right.index.take(right_indexer)
856-
left_indexer = np.array([-1] * len(join_index))
857-
elif self.left_index:
858-
if len(self.right) > 0:
859-
join_index = self._create_join_index(
860-
self.right.index,
861-
self.left.index,
862-
right_indexer,
863-
left_indexer,
864-
how="left",
865-
)
866-
else:
867-
join_index = self.left.index.take(left_indexer)
868-
right_indexer = np.array([-1] * len(join_index))
853+
right = self.right.index
854+
else:
855+
right = self.right_join_keys
856+
857+
if self.how != "right":
858+
join_index = self._create_join_index(
859+
left,
860+
right,
861+
left_indexer,
862+
right_indexer,
863+
self.join_names,
864+
self.join_names,
865+
)
869866
else:
870-
join_index = Index(np.arange(len(left_indexer)))
867+
join_index = self._create_join_index(
868+
right,
869+
left,
870+
right_indexer,
871+
left_indexer,
872+
self.join_names,
873+
self.join_names,
874+
)
871875

872876
if len(join_index) == 0:
873877
join_index = join_index.astype(object)
874878
return join_index, left_indexer, right_indexer
875879

876880
def _create_join_index(
877-
self, index, other_index, indexer, other_indexer, how="left"
881+
self, index, other_index, indexer, other_indexer, names, other_names
878882
):
879883
"""
880884
Create a join index by rearranging one index to match another
881885
882886
Parameters
883887
----------
884-
index: Index being rearranged
885-
other_index: Index used to supply values not found in index
886-
indexer: how to rearrange index
887-
how: replacement is only necessary if indexer based on other_index
888+
index: [Multi]Index being rearranged
889+
other_index: [Multi]Index used to supply values not found in index
890+
indexer: indices for fancy indexing in index
891+
other_indexer: indices for fancy indexing in other_index
892+
names: index name(s)
893+
other_names: other_index name(s)
888894
889895
Returns
890896
-------
891897
join_index
892898
"""
893-
if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
894-
# if final index requires values in other_index but not target
895-
# index, indexer may hold missing (-1) values, causing Index.take
896-
# to take the final value in target index. So, we set the last
897-
# element to be the desired fill value. We do not use allow_fill
898-
# and fill_value because it throws a ValueError on integer indices
899-
mask = indexer == -1
900-
if np.any(mask):
901-
fill_value = na_value_for_dtype(index.dtype, compat=False)
902-
index = index.append(Index([fill_value]))
903-
return index.take(indexer)
899+
900+
if not issubclass(type(index), Index) and is_list_like(index):
901+
if len(index) > 1:
902+
index = MultiIndex.from_arrays(index, names=names)
903+
else:
904+
index = Index(index[0], name=names[0])
905+
906+
infer_index_type = False
907+
if not issubclass(type(other_index), Index) and is_list_like(other_index):
908+
# must infer dtype from index if possible
909+
infer_index_type = True
910+
911+
if len(other_index) > 1:
912+
other_index = MultiIndex.from_arrays(other_index, names=other_names)
913+
else:
914+
other_index = Index(other_index[0], name=other_names[0])
915+
916+
# if final index requires values in other_index but not target
917+
# index, indexer may hold missing (-1) values, causing Index.take
918+
# to take the final value in target index
919+
mask = indexer == -1
920+
921+
idx = None
922+
convert_type = None
923+
if self.how == "outer":
924+
925+
# we may need to reshape the index to include missing keys
926+
outer_mask = mask & (other_indexer != -1)
927+
928+
if outer_mask.any():
929+
930+
if is_categorical_dtype(other_index.dtype):
931+
np_type = other_index.categories.dtype
932+
convert_type = other_index.categories.dtype
933+
elif (
934+
is_period_dtype(other_index.dtype)
935+
or is_datetime64tz_dtype(other_index.dtype)
936+
or is_interval_dtype(other_index.dtype)
937+
):
938+
# TODO: better way to get numpy dtype without try/except?
939+
np_type = other_index.to_numpy().dtype
940+
convert_type = other_index.dtype
941+
else:
942+
np_type = other_index.dtype
943+
944+
outer_index = np.empty(len(outer_mask), dtype=np_type)
945+
outer_index[mask] = other_index.take(other_indexer[outer_mask])
946+
947+
if not index.empty:
948+
if is_datetime64tz_dtype(other_index.dtype):
949+
idx = np.where(
950+
outer_mask, outer_index, index.take(indexer).to_numpy()
951+
)
952+
else:
953+
idx = np.where(outer_mask, outer_index, index.take(indexer))
954+
else:
955+
idx = outer_index
956+
957+
elif self.how == "inner":
958+
if not self.left_index and not self.right_index:
959+
# TODO: this behaviour below is assumed in a lot of test cases
960+
return np.arange(np.sum(~mask))
961+
else:
962+
idx = index.take(indexer[~mask])
963+
964+
# left/right/outer (if outer is a subset of left/right) join
965+
reset_dtype = index.empty
966+
if idx is None:
967+
968+
# try to preserve index; worst case we reset it
969+
if index.shape == mask.shape:
970+
idx = np.where(mask, index, index.take(indexer))
971+
else:
972+
if mask.any():
973+
return np.arange(len(mask))
974+
else:
975+
idx = index.take(indexer)
976+
977+
if isinstance(index, MultiIndex):
978+
if len(idx) == 0:
979+
return MultiIndex.from_arrays(np.empty((index.nlevels, 0)), names=names)
980+
else:
981+
return MultiIndex.from_tuples(idx, names=names)
982+
else:
983+
984+
# a bit of spaghetti soup
985+
create_categorial = False
986+
if infer_index_type and issubclass(type(index), Index):
987+
988+
# TODO: Int64Index doesn't like Int64Index(np.ndarray) nor Categorical
989+
if is_integer_dtype(index):
990+
index_type = Index
991+
elif is_categorical_dtype(index):
992+
create_categorial = True
993+
index_type = type(index)
994+
if is_categorical_dtype(other_index):
995+
union_cat = index.categories.union(other_index.categories)
996+
else:
997+
union_cat = index.categories.union(other_index)
998+
else:
999+
index_type = type(index)
1000+
1001+
else:
1002+
if is_categorical_dtype(other_index):
1003+
create_categorial = True
1004+
index_type = type(other_index)
1005+
if is_categorical_dtype(index):
1006+
union_cat = other_index.categories.union(index.categories)
1007+
else:
1008+
union_cat = other_index.categories.union(index)
1009+
else:
1010+
index_type = Index
1011+
1012+
if create_categorial:
1013+
return index_type(idx, categories=union_cat, name=names[0])
1014+
elif reset_dtype:
1015+
return index_type(idx, name=names[0])
1016+
elif convert_type is None:
1017+
if is_period_dtype(index.dtype) or is_datetime64tz_dtype(index.dtype):
1018+
return index_type(idx, name=names[0])
1019+
else:
1020+
return index_type(idx, dtype=index.dtype, name=names[0])
1021+
else:
1022+
if is_period_dtype(other_index.dtype) or is_datetime64tz_dtype(
1023+
other_index.dtype
1024+
):
1025+
return index_type(idx, name=names[0])
1026+
else:
1027+
return index_type(idx, dtype=convert_type, name=names[0])
9041028

9051029
def _get_merge_keys(self):
9061030
"""

0 commit comments

Comments
 (0)