Skip to content

Commit 3e1b7f0

Browse files
committed
BUG: Fix Index dtype promotion when merging on non-index keys that are missing from the left or right side (GH28220)
Also closes GH24897, GH24212, and GH17257
1 parent e623f0f commit 3e1b7f0

File tree

4 files changed

+207
-3
lines changed

4 files changed

+207
-3
lines changed

doc/source/whatsnew/v0.25.2.rst

+5
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ Groupby/resample/rolling
3434
- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
3535
- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
3636

37+
Reshaping
38+
^^^^^^^^^
39+
40+
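- Added a new ``index_na_value`` option to :func:`merge` that lets the user specify the NA value placed in the result index when join keys are missing, which would otherwise cause dtype promotion. This applies when not using ``left_index`` with ``how='right'``, or ``right_index`` with ``how='left'`` (:issue:`28220`).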
41+
3742
Other
3843
^^^^^
3944

pandas/core/frame.py

+6
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,12 @@
231231
232232
.. versionadded:: 0.21.0
233233
234+
index_na_value : value, optional
235+
If a join requires NA values to be placed in the index, use this value or
236+
accept the default NA for the dtype, which may involve a type promotion.
237+
238+
.. versionadded:: 0.25.2
239+
234240
Returns
235241
-------
236242
DataFrame

pandas/core/reshape/merge.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848
from pandas.core.sorting import is_int64_overflow_possible
4949

5050

51+
class DefaultNA:
    """
    Sentinel type used as the default for the ``index_na_value`` argument.

    The merge code checks ``isinstance(index_na_value, DefaultNA)`` to decide
    whether to fall back to the default NA value for the index dtype, so any
    instance of this class means "no explicit fill value was supplied".

    NOTE(review): presumably a dedicated type is used instead of ``None`` so
    that ``None`` stays available as an explicit fill value -- confirm.
    """

    def __repr__(self):
        # Give a stable, readable repr so the default shown in signatures and
        # rendered documentation does not contain a memory address.
        return "DefaultNA()"
53+
54+
5155
@Substitution("\nleft : DataFrame")
5256
@Appender(_merge_doc, indents=0)
5357
def merge(
@@ -64,6 +68,7 @@ def merge(
6468
copy=True,
6569
indicator=False,
6670
validate=None,
71+
index_na_value=DefaultNA(),
6772
):
6873
op = _MergeOperation(
6974
left,
@@ -79,6 +84,7 @@ def merge(
7984
copy=copy,
8085
indicator=indicator,
8186
validate=validate,
87+
index_na_value=index_na_value,
8288
)
8389
return op.get_result()
8490

@@ -551,6 +557,7 @@ def __init__(
551557
copy=True,
552558
indicator=False,
553559
validate=None,
560+
index_na_value=DefaultNA(),
554561
):
555562
left = validate_operand(left)
556563
right = validate_operand(right)
@@ -619,6 +626,10 @@ def __init__(
619626
if validate is not None:
620627
self._validate(validate)
621628

629+
# if a join requires NA values to be placed in the index
630+
# use this value or default NA which may involve a type promotion
631+
self.index_na_value = index_na_value
632+
622633
def get_result(self):
623634
if self.indicator:
624635
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
@@ -898,7 +909,11 @@ def _create_join_index(
898909
# and fill_value because it throws a ValueError on integer indices
899910
mask = indexer == -1
900911
if np.any(mask):
901-
fill_value = na_value_for_dtype(index.dtype, compat=False)
912+
if isinstance(self.index_na_value, DefaultNA):
913+
fill_value = na_value_for_dtype(index.dtype, compat=False)
914+
else:
915+
fill_value = self.index_na_value
916+
902917
index = index.append(Index([fill_value]))
903918
return index.take(indexer)
904919

pandas/tests/reshape/merge/test_merge.py

+180-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,11 @@
2929
from pandas.core.reshape.concat import concat
3030
from pandas.core.reshape.merge import MergeError, merge
3131
import pandas.util.testing as tm
32-
from pandas.util.testing import assert_frame_equal, assert_series_equal
32+
from pandas.util.testing import (
33+
assert_frame_equal,
34+
assert_index_equal,
35+
assert_series_equal,
36+
)
3337

3438
N = 50
3539
NGROUPS = 8
@@ -2088,7 +2092,6 @@ def test_merge_equal_cat_dtypes2():
20882092
# Categorical is unordered, so don't check ordering.
20892093
tm.assert_frame_equal(result, expected, check_categorical=False)
20902094

2091-
20922095
def test_merge_on_cat_and_ext_array():
20932096
# GH 28668
20942097
right = DataFrame(
@@ -2131,3 +2134,178 @@ def test_merge_multiindex_columns():
21312134
expected["id"] = ""
21322135

21332136
tm.assert_frame_equal(result, expected)
2137+
2138+
@pytest.fixture(
    params=[
        {"domain": pd.Index(["A", "B", "C"])},
        {"domain": CategoricalIndex(["A", "B", "C"])},
        {"domain": DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])},
        {"domain": Float64Index([1, 2, 3])},
        {"domain": Int64Index([1, 2, 3])},
        {"domain": IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])},
        {"domain": TimedeltaIndex(["1d", "2d", "3d"])},
        {
            "domain": PeriodIndex(
                ["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"
            )
        },
    ]
)
def fix_GH_28220_(request):
    """
    Build the frames used by the GH 28220 merge tests for one index type.

    The returned object exposes:

    * ``domain`` -- the three-element index for the current parametrization
    * ``X`` -- a ``count`` column indexed by domain positions 0 and 1
    * ``Y`` -- ``name`` taken from domain positions 0 and 2, plus ``value``
    * ``Z`` -- ``name`` taken from domain positions 0, 0 and 2, plus ``value``
    * ``E`` -- an empty frame with ``name`` and ``value`` columns
    """

    class Data:
        def __init__(self):
            domain = request.param["domain"]

            left = pd.DataFrame({"count": [1, 2]}, index=domain.take([0, 1]))
            right = pd.DataFrame(
                {"name": domain.take([0, 2]), "value": [100, 200]}
            )
            right_with_dupes = pd.DataFrame(
                {"name": domain.take([0, 0, 2]), "value": [100, 200, 300]}
            )
            right_empty = pd.DataFrame(columns=["name", "value"])

            self.domain = domain
            self.X = left
            self.Y = right
            self.Z = right_with_dupes
            self.E = right_empty

            # Sanity check: building X must keep the index type of the domain.
            assert isinstance(self.X.index, type(self.domain))

    return Data()
2166+
2167+
2168+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, -255], [0, 1, -255], [0, 1])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
    ],
)
def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge X on its index against the ``name`` column of Y, Z and E.

    ``expected`` holds the expected result index for each of the three right
    frames, in that order; ``-255`` is the ``index_na_value`` passed to merge.
    """
    # GH 28220
    data = fix_GH_28220_
    want_y, want_z, want_e = (pd.Index(labels) for labels in expected)
    # The third expectation is given as positions into the fixture's domain.
    want_e = data.domain.take(want_e)

    def merge_with(right):
        return pd.merge(
            data.X,
            right,
            left_index=True,
            right_on=["name"],
            how=how,
            index_na_value=-255,
        )

    assert_index_equal(merge_with(data.Y).index, want_y)
    assert_index_equal(merge_with(data.Z).index, want_z)

    from_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if from_empty.empty:
        want_e = pd.Index([], dtype=object, name=want_e.name)

    assert_index_equal(from_empty.index, want_e)
2216+
2217+
2218+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
        ("inner", ([0], [0, 0], [])),
        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
    """
    Merge X's ``index`` column against Y, Z and E indexed by ``name``.

    ``expected`` holds the expected result index for each of the three right
    frames, in that order; ``-255`` is the ``index_na_value`` passed to merge.
    """
    # GH 28220
    data = fix_GH_28220_
    want_y, want_z, want_e = (pd.Index(labels) for labels in expected)

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right.set_index("name"),
            left_on=["index"],
            right_index=True,
            how=how,
            index_na_value=-255,
        )

    assert_index_equal(merge_with(data.Y).index, want_y)
    assert_index_equal(merge_with(data.Z).index, want_z)

    from_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if from_empty.empty:
        want_e = pd.Index([], dtype=object, name=want_e.name)

    assert_index_equal(from_empty.index, want_e)
2265+
2266+
2267+
@pytest.mark.parametrize(
    "how,expected",
    [
        ("left", ([0, 1], [0, 1, 2], [0, 1])),
        ("right", ([0, 1], [0, 1, 2], [0, 2])),
        ("inner", ([0], [0, 1], [])),
        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
    ],
)
def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
    """
    Merge X's ``index`` column against the ``name`` column of Y, Z and E.

    Neither side joins on its index and no ``index_na_value`` is passed;
    ``expected`` holds the expected result index for each of the three right
    frames, in that order.
    """
    # GH 28220
    data = fix_GH_28220_
    want_y, want_z, want_e = (pd.Index(labels) for labels in expected)

    def merge_with(right):
        return pd.merge(
            data.X.reset_index(),
            right,
            left_on=["index"],
            right_on=["name"],
            how=how,
        )

    assert_index_equal(merge_with(data.Y).index, want_y)
    assert_index_equal(merge_with(data.Z).index, want_z)

    from_empty = merge_with(data.E)

    # special case when result is empty, dtype is object
    if from_empty.empty:
        want_e = pd.Index([], dtype=object, name=want_e.name)

    assert_index_equal(from_empty.index, want_e)

0 commit comments

Comments
 (0)