Skip to content

Commit a0a95c2

Browse files
committed
BUG: Fix Index dtype promotion when merging on non-index keys that are missing from the left or right side (GH28220)
Also closes GH24897, GH24212, and GH17257
1 parent f8a924b commit a0a95c2

File tree

4 files changed

+185
-2
lines changed

4 files changed

+185
-2
lines changed

doc/source/whatsnew/v0.25.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ Groupby/resample/rolling
8585
Reshaping
8686
^^^^^^^^^
8787

88+
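- Added a new ``index_na_value`` option to :func:`merge` that lets the user specify the value placed in the result index for missing keys, in joins with ``how='right'`` that do not use ``left_index`` or with ``how='left'`` that do not use ``right_index``; this avoids the index dtype promotion caused by the default NA value (:issue:`28220`).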
8889
-
8990
-
9091
-

pandas/core/frame.py

+6
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@
217217
218218
.. versionadded:: 0.21.0
219219
220+
index_na_value : value, optional
221+
If a join requires NA values to be placed in the index, use this value, or
222+
accept the default NA for the dtype, which may involve a type promotion.
223+
224+
.. versionadded:: 0.25.2
225+
220226
Returns
221227
-------
222228
DataFrame

pandas/core/reshape/merge.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
import pandas.core.sorting as sorting
4848
from pandas.core.sorting import is_int64_overflow_possible
4949

50+
class DefaultNA:
    """
    Sentinel type meaning "no explicit ``index_na_value`` was supplied".

    An instance is used as the default for the ``index_na_value`` argument.
    The join-index code tests ``isinstance(value, DefaultNA)`` to decide
    whether to fall back to the NA value of the index dtype; any other
    object passed by the caller (including ``None``) is used as the fill
    value as-is.
    """

    def __repr__(self):
        # Keep the default readable and stable in signatures and help();
        # the inherited repr would embed a memory address.
        return "DefaultNA()"
5052

5153
@Substitution("\nleft : DataFrame")
5254
@Appender(_merge_doc, indents=0)
@@ -64,6 +66,7 @@ def merge(
6466
copy=True,
6567
indicator=False,
6668
validate=None,
69+
index_na_value=DefaultNA(),
6770
):
6871
op = _MergeOperation(
6972
left,
@@ -79,6 +82,7 @@ def merge(
7982
copy=copy,
8083
indicator=indicator,
8184
validate=validate,
85+
index_na_value=index_na_value,
8286
)
8387
return op.get_result()
8488

@@ -551,6 +555,7 @@ def __init__(
551555
copy=True,
552556
indicator=False,
553557
validate=None,
558+
index_na_value=DefaultNA(),
554559
):
555560
left = validate_operand(left)
556561
right = validate_operand(right)
@@ -619,6 +624,10 @@ def __init__(
619624
if validate is not None:
620625
self._validate(validate)
621626

627+
# if a join requires NA values to be placed in the index
628+
# use this value or default NA which may involve a type promotion
629+
self.index_na_value = index_na_value
630+
622631
def get_result(self):
623632
if self.indicator:
624633
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
@@ -898,7 +907,11 @@ def _create_join_index(
898907
# and fill_value because it throws a ValueError on integer indices
899908
mask = indexer == -1
900909
if np.any(mask):
901-
fill_value = na_value_for_dtype(index.dtype, compat=False)
910+
if isinstance(self.index_na_value, DefaultNA):
911+
fill_value = na_value_for_dtype(index.dtype, compat=False)
912+
else:
913+
fill_value = self.index_na_value
914+
902915
index = index.append(Index([fill_value]))
903916
return index.take(indexer)
904917

pandas/tests/reshape/merge/test_merge.py

+164-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@
3030
from pandas.core.reshape.concat import concat
3131
from pandas.core.reshape.merge import MergeError, merge
3232
import pandas.util.testing as tm
33-
from pandas.util.testing import assert_frame_equal, assert_series_equal
33+
from pandas.util.testing import (
34+
assert_frame_equal,
35+
assert_series_equal,
36+
assert_index_equal,
37+
)
3438

3539
N = 50
3640
NGROUPS = 8
@@ -2094,3 +2098,162 @@ def test_merge_equal_cat_dtypes2():
20942098

20952099
# Categorical is unordered, so don't check ordering.
20962100
tm.assert_frame_equal(result, expected, check_categorical=False)
2101+
2102+
@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
        {"domain": PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D")},
    ]
)
def fix_GH_28220_(request):
    """
    Provide merge inputs built from a three-element index ``domain``.

    The fixture is parametrized over several index types; each parameter
    carries the ``domain`` from which all frames are derived.
    """

    class Data:
        """Container for the domain and the frames derived from it."""

        def __init__(self):
            domain = request.param["domain"]
            self.domain = domain

            # X is indexed by the first two domain elements.
            self.X = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))

            # Y and Z carry domain elements in a "name" column; element 2
            # does not appear in X's index, and Z repeats element 0.
            self.Y = pd.DataFrame({"name": domain.take([0, 2]), "value": [100, 200]})
            self.Z = pd.DataFrame(
                {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
            )

            # E has the same columns but no rows.
            self.E = pd.DataFrame(columns=["name", "value"])

            # The index type of X must be the same as the domain's.
            assert isinstance(self.X.index, type(domain))

    return Data()
2130+
2131+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """Merge X (by its index) with Y, Z and E (by "name"), filling with -255."""
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)
    e3 = data.domain.take(e3)

    def merged_index(right):
        # Left frame joins on its index, right frame on its "name" column.
        result = pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )
        return result

    for right, wanted in ((data.Y, e1), (data.Z, e2)):
        assert_index_equal(merged_index(right).index, wanted)

    r3 = merged_index(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)
2164+
2165+
2166+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """Merge X (by its "index" column) with Y, Z and E indexed by "name"."""
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)

    def merged(right):
        # Left frame joins on its reset "index" column, right on its index.
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    for right, wanted in ((data.Y, e1), (data.Z, e2)):
        assert_index_equal(merged(right).index, wanted)

    r3 = merged(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)
2213+
2214+
2215+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """Merge X with Y, Z and E on columns only; no ``index_na_value`` is passed."""
    # GH 28220
    data = fix_GH_28220_
    e1, e2, e3 = (pd.Index(values) for values in expected)

    def merged(right):
        # Both sides join on columns: "index" on the left, "name" on the right.
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    for right, wanted in ((data.Y, e1), (data.Z, e2)):
        assert_index_equal(merged(right).index, wanted)

    r3 = merged(data.E)

    # special case when result is empty, dtype is object
    if r3.empty:
        e3 = pd.Index([], dtype=object, name=e3.name)

    assert_index_equal(r3.index, e3)

0 commit comments

Comments
 (0)