Skip to content

Commit ec70eee

Browse files
committed
BUG: Fix Index dtype promotion when merging on non-index keys that are missing from the left or right side (GH28220)
Also closes GH24897, GH24212, and GH17257
1 parent 6498bc1 commit ec70eee

File tree

4 files changed

+203
-1
lines changed

4 files changed

+203
-1
lines changed

doc/source/whatsnew/v0.25.4.rst

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
Reshaping
3+
^^^^^^^^^
4+
5+
- Added a new option, ``index_na_value``, that lets the user specify the NA value for joins with missing keys when not using ``left_index`` with ``how='right'``, or ``right_index`` with ``how='left'``, which would otherwise cause a dtype promotion (:issue:`28220`).
6+

pandas/core/frame.py

+6
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,12 @@
230230
231231
.. versionadded:: 0.21.0
232232
233+
index_na_value : value, optional
234+
If a join requires NA values to be placed in the index, use this value or
235+
accept the default NA for the dtype, which may involve a type promotion.
236+
237+
.. versionadded:: 0.25.2
238+
233239
Returns
234240
-------
235241
DataFrame

pandas/core/reshape/merge.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
from pandas.core.sorting import is_int64_overflow_possible
4848

4949

50+
class DefaultNA:
    """
    Sentinel type meaning "no ``index_na_value`` was supplied".

    An instance of this class is the default for the ``index_na_value``
    argument of ``merge`` and ``_MergeOperation``. ``_create_join_index``
    detects it with ``isinstance`` and then falls back to
    ``na_value_for_dtype``; any other object, including ``None``, is used
    verbatim as the fill value placed in the index.
    """

    def __repr__(self):
        # Stable repr so signatures and rendered docs do not show a
        # per-process memory address for the default argument.
        return "DefaultNA()"
52+
53+
5054
@Substitution("\nleft : DataFrame")
5155
@Appender(_merge_doc, indents=0)
5256
def merge(
@@ -63,6 +67,7 @@ def merge(
6367
copy=True,
6468
indicator=False,
6569
validate=None,
70+
index_na_value=DefaultNA(),
6671
):
6772
op = _MergeOperation(
6873
left,
@@ -78,6 +83,7 @@ def merge(
7883
copy=copy,
7984
indicator=indicator,
8085
validate=validate,
86+
index_na_value=index_na_value,
8187
)
8288
return op.get_result()
8389

@@ -555,6 +561,7 @@ def __init__(
555561
copy=True,
556562
indicator=False,
557563
validate=None,
564+
index_na_value=DefaultNA(),
558565
):
559566
left = validate_operand(left)
560567
right = validate_operand(right)
@@ -623,6 +630,10 @@ def __init__(
623630
if validate is not None:
624631
self._validate(validate)
625632

633+
# if a join requires NA values to be placed in the index
634+
# use this value or default NA which may involve a type promotion
635+
self.index_na_value = index_na_value
636+
626637
def get_result(self):
627638
if self.indicator:
628639
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
@@ -902,7 +913,11 @@ def _create_join_index(
902913
# and fill_value because it throws a ValueError on integer indices
903914
mask = indexer == -1
904915
if np.any(mask):
905-
fill_value = na_value_for_dtype(index.dtype, compat=False)
916+
if isinstance(self.index_na_value, DefaultNA):
917+
fill_value = na_value_for_dtype(index.dtype, compat=False)
918+
else:
919+
fill_value = self.index_na_value
920+
906921
index = index.append(Index([fill_value]))
907922
return index.take(indexer)
908923

pandas/tests/reshape/merge/test_merge.py

+175
Original file line numberDiff line numberDiff line change
@@ -2153,3 +2153,178 @@ def test_merge_multiindex_columns():
21532153
expected["id"] = ""
21542154

21552155
tm.assert_frame_equal(result, expected)
2156+
2157+
2158+
@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
    ]
)
def fix_GH_28220_(request):
    """
    Build the frames used by the GH 28220 merge tests for one index type.

    The returned object exposes:

    * ``domain`` - the three-element index supplied by the fixture param
    * ``X`` - a ``count`` frame indexed by domain positions 0 and 1
    * ``Y`` - ``name``/``value`` frame whose names are domain positions 0 and 2
    * ``Z`` - like ``Y`` but with domain position 0 repeated
    * ``E`` - an empty frame with ``name`` and ``value`` columns
    """

    class Data:
        def __init__(self):
            domain = request.param["domain"]
            self.domain = domain

            self.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))
            self.Y = pd.DataFrame(
                {"name": domain.take([0, 2]), "value": [100, 200]}
            )
            self.Z = pd.DataFrame(
                {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
            )
            self.E = pd.DataFrame(columns=["name", "value"])

            # building X must not change the type of the index
            assert isinstance(self.X.index, type(domain))

    return Data()
2185+
2186+
2187+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge ``X`` by its index against the ``name`` column of ``Y``, ``Z`` and ``E``.

    Every merge passes ``index_na_value=-255``; the resulting index is
    compared with the parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)
    e3 = data.domain.take(e3)

    def merge_with(right):
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    r1 = merge_with(data.Y)
    tm.assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    tm.assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    tm.assert_index_equal(r3.index, e3)
2235+
2236+
2237+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """
    Merge ``X`` on its reset ``index`` column against ``Y``, ``Z`` and ``E``
    indexed by ``name``.

    Every merge passes ``index_na_value=-255``; the resulting index is
    compared with the parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    r1 = merge_with(data.Y)
    tm.assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    tm.assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    tm.assert_index_equal(r3.index, e3)
2284+
2285+
2286+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge ``X`` on its reset ``index`` column against the ``name`` column of
    ``Y``, ``Z`` and ``E``.

    No ``index_na_value`` is passed here; the resulting index is compared
    with the parametrized expectation.
    """
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    r1 = merge_with(data.Y)
    tm.assert_index_equal(r1.index, e1)

    r2 = merge_with(data.Z)
    tm.assert_index_equal(r2.index, e2)

    r3 = merge_with(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    tm.assert_index_equal(r3.index, e3)

0 commit comments

Comments
 (0)