Skip to content

Commit 23b37c3

Browse files
committed
BUG: Fix merging on non-index keys causing Index dtype promotion when keys are missing from the left or right side (GH28220)
Also closes GH24897, GH24212, and GH17257
1 parent c4489cb commit 23b37c3

File tree

4 files changed

+204
-2
lines changed

4 files changed

+204
-2
lines changed

doc/source/whatsnew/v0.25.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ Groupby/resample/rolling
8585
Reshaping
8686
^^^^^^^^^
8787

88+
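- Added a new option, ``index_na_value``, to :func:`merge` that lets the user specify the value placed in the result index for missing keys in joins that do not use ``left_index`` when ``how='right'``, or ``right_index`` when ``how='left'``; the default NA value otherwise causes dtype promotion of the index (:issue:`28220`).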
8889
-
8990
-
9091
-

pandas/core/frame.py

+6
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@
217217
218218
.. versionadded:: 0.21.0
219219
220+
index_na_value : value, optional
221+
If a join requires NA values to be placed in the index, use this value;
222+
otherwise accept the default NA for the dtype, which may involve a type promotion.
223+
224+
.. versionadded:: 0.25.2
225+
220226
Returns
221227
-------
222228
DataFrame

pandas/core/reshape/merge.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
from pandas.core.sorting import is_int64_overflow_possible
5050

5151

52+
class DefaultNA:
53+
pass
54+
55+
5256
@Substitution("\nleft : DataFrame")
5357
@Appender(_merge_doc, indents=0)
5458
def merge(
@@ -65,6 +69,7 @@ def merge(
6569
copy=True,
6670
indicator=False,
6771
validate=None,
72+
index_na_value=DefaultNA(),
6873
):
6974
op = _MergeOperation(
7075
left,
@@ -80,6 +85,7 @@ def merge(
8085
copy=copy,
8186
indicator=indicator,
8287
validate=validate,
88+
index_na_value=index_na_value,
8389
)
8490
return op.get_result()
8591

@@ -552,6 +558,7 @@ def __init__(
552558
copy=True,
553559
indicator=False,
554560
validate=None,
561+
index_na_value=DefaultNA(),
555562
):
556563
left = validate_operand(left)
557564
right = validate_operand(right)
@@ -620,6 +627,10 @@ def __init__(
620627
if validate is not None:
621628
self._validate(validate)
622629

630+
# If a join requires NA values to be placed in the index, use this value;
631+
# otherwise fall back to the default NA, which may involve a type promotion.
632+
self.index_na_value = index_na_value
633+
623634
def get_result(self):
624635
if self.indicator:
625636
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
@@ -899,7 +910,11 @@ def _create_join_index(
899910
# and fill_value because it throws a ValueError on integer indices
900911
mask = indexer == -1
901912
if np.any(mask):
902-
fill_value = na_value_for_dtype(index.dtype, compat=False)
913+
if isinstance(self.index_na_value, DefaultNA):
914+
fill_value = na_value_for_dtype(index.dtype, compat=False)
915+
else:
916+
fill_value = self.index_na_value
917+
903918
index = index.append(Index([fill_value]))
904919
return index.take(indexer)
905920

pandas/tests/reshape/merge/test_merge.py

+181-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@
3030
from pandas.core.reshape.concat import concat
3131
from pandas.core.reshape.merge import MergeError, merge
3232
import pandas.util.testing as tm
33-
from pandas.util.testing import assert_frame_equal, assert_series_equal
33+
from pandas.util.testing import (
34+
assert_frame_equal,
35+
assert_index_equal,
36+
assert_series_equal,
37+
)
3438

3539
N = 50
3640
NGROUPS = 8
@@ -2094,3 +2098,179 @@ def test_merge_equal_cat_dtypes2():
20942098

20952099
# Categorical is unordered, so don't check ordering.
20962100
tm.assert_frame_equal(result, expected, check_categorical=False)
2101+
2102+
2103+
@pytest.fixture(
2104+
params=[
2105+
dict(domain=pd.Index(["A", "B", "C"])),
2106+
dict(domain=CategoricalIndex(["A", "B", "C"])),
2107+
dict(domain=DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])),
2108+
dict(domain=Float64Index([1, 2, 3])),
2109+
dict(domain=Int64Index([1, 2, 3])),
2110+
dict(domain=IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])),
2111+
dict(domain=TimedeltaIndex(["1d", "2d", "3d"])),
2112+
dict(domain=PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D")),
2113+
]
2114+
)
2115+
def fix_GH_28220_(request):
2116+
class Data:
2117+
def __init__(self):
2118+
self.domain = request.param["domain"]
2119+
self.X = pd.DataFrame({"count": [1, 2]}, index=self.domain.take([0, 1]))
2120+
self.Y = pd.DataFrame(
2121+
{"name": self.domain.take([0, 2]), "value": [100, 200]}
2122+
)
2123+
self.Z = pd.DataFrame(
2124+
{"name": self.domain.take([0, 0, 2]), "value": [100, 200, 300]}
2125+
)
2126+
self.E = pd.DataFrame(columns=["name", "value"])
2127+
2128+
assert isinstance(self.X.index, type(self.domain))
2129+
2130+
return Data()
2131+
2132+
2133+
@pytest.mark.parametrize(
2134+
"how,expected",
2135+
[
2136+
("left", ([0, -255], [0, 1, -255], [0, 1])),
2137+
("inner", ([0], [0, 1], [])),
2138+
("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
2139+
],
2140+
)
2141+
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
2142+
2143+
# GH 28220
2144+
(e1, e2, e3) = map(lambda x: pd.Index(x), expected)
2145+
e3 = fix_GH_28220_.domain.take(e3)
2146+
2147+
r1 = pd.merge(
2148+
fix_GH_28220_.X,
2149+
fix_GH_28220_.Y,
2150+
left_index=True,
2151+
right_on=["name"],
2152+
how=how,
2153+
index_na_value=-255,
2154+
)
2155+
assert_index_equal(r1.index, e1)
2156+
2157+
r2 = pd.merge(
2158+
fix_GH_28220_.X,
2159+
fix_GH_28220_.Z,
2160+
left_index=True,
2161+
right_on=["name"],
2162+
how=how,
2163+
index_na_value=-255,
2164+
)
2165+
assert_index_equal(r2.index, e2)
2166+
2167+
r3 = pd.merge(
2168+
fix_GH_28220_.X,
2169+
fix_GH_28220_.E,
2170+
left_index=True,
2171+
right_on=["name"],
2172+
how=how,
2173+
index_na_value=-255,
2174+
)
2175+
2176+
# special case when result is empty, dtype is object
2177+
if r3.empty:
2178+
e3 = pd.Index([], dtype=object, name=e3.name)
2179+
2180+
assert_index_equal(r3.index, e3)
2181+
2182+
2183+
@pytest.mark.parametrize(
2184+
"how,expected",
2185+
[
2186+
("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
2187+
("inner", ([0], [0, 0], [])),
2188+
("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
2189+
],
2190+
)
2191+
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
2192+
2193+
# GH 28220
2194+
(e1, e2, e3) = map(lambda x: pd.Index(x), expected)
2195+
2196+
r1 = pd.merge(
2197+
fix_GH_28220_.X.reset_index(),
2198+
fix_GH_28220_.Y.set_index("name"),
2199+
left_on=["index"],
2200+
right_index=True,
2201+
how=how,
2202+
index_na_value=-255,
2203+
)
2204+
assert_index_equal(r1.index, e1)
2205+
2206+
r2 = pd.merge(
2207+
fix_GH_28220_.X.reset_index(),
2208+
fix_GH_28220_.Z.set_index("name"),
2209+
left_on=["index"],
2210+
right_index=True,
2211+
how=how,
2212+
index_na_value=-255,
2213+
)
2214+
assert_index_equal(r2.index, e2)
2215+
2216+
r3 = pd.merge(
2217+
fix_GH_28220_.X.reset_index(),
2218+
fix_GH_28220_.E.set_index("name"),
2219+
left_on=["index"],
2220+
right_index=True,
2221+
how=how,
2222+
index_na_value=-255,
2223+
)
2224+
2225+
# special case when result is empty, dtype is object
2226+
if r3.empty:
2227+
e3 = pd.Index([], dtype=object, name=e3.name)
2228+
2229+
assert_index_equal(r3.index, e3)
2230+
2231+
2232+
@pytest.mark.parametrize(
2233+
"how,expected",
2234+
[
2235+
("left", ([0, 1], [0, 1, 2], [0, 1])),
2236+
("right", ([0, 1], [0, 1, 2], [0, 2])),
2237+
("inner", ([0], [0, 1], [])),
2238+
("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
2239+
],
2240+
)
2241+
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
2242+
2243+
# GH 28220
2244+
(e1, e2, e3) = map(lambda x: pd.Index(x), expected)
2245+
2246+
r1 = pd.merge(
2247+
fix_GH_28220_.X.reset_index(),
2248+
fix_GH_28220_.Y,
2249+
left_on=["index"],
2250+
right_on=["name"],
2251+
how=how,
2252+
)
2253+
assert_index_equal(r1.index, e1)
2254+
2255+
r2 = pd.merge(
2256+
fix_GH_28220_.X.reset_index(),
2257+
fix_GH_28220_.Z,
2258+
left_on=["index"],
2259+
right_on=["name"],
2260+
how=how,
2261+
)
2262+
assert_index_equal(r2.index, e2)
2263+
2264+
r3 = pd.merge(
2265+
fix_GH_28220_.X.reset_index(),
2266+
fix_GH_28220_.E,
2267+
left_on=["index"],
2268+
right_on=["name"],
2269+
how=how,
2270+
)
2271+
2272+
# special case when result is empty, dtype is object
2273+
if r3.empty:
2274+
e3 = pd.Index([], dtype=object, name=e3.name)
2275+
2276+
assert_index_equal(r3.index, e3)

0 commit comments

Comments
 (0)