BUG: Fix Index dtype promotion when merging on non-index keys that are missing from the left or right side (GH28220)

dworvos · dworvos · commit 3e1b7f040426 · 2019-10-20T16:51:43.000-04:00
Also closes GH24897, GH24212, and GH17257
diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst
@@ -34,6 +34,11 @@ Groupby/resample/rolling
 - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
 - Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
 
+Reshaping
+^^^^^^^^^
+
+- Added new ``index_na_value`` option to :func:`merge` that lets the user specify the NA value placed in the index when keys are missing in certain joins (``how='right'`` without ``left_index``, or ``how='left'`` without ``right_index``), which otherwise causes a dtype promotion (:issue:`28220`).
+
 Other
 ^^^^^
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -231,6 +231,12 @@
 
     .. versionadded:: 0.21.0
 
+index_na_value : value, optional
+    Value to place in the index when a join requires NA values there. If not
+    given, the default NA for the dtype is used, which may promote the dtype.
+
+    .. versionadded:: 0.25.2
+
 Returns
 -------
 DataFrame
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -48,6 +48,10 @@
 from pandas.core.sorting import is_int64_overflow_possible
 
 
+class DefaultNA:  # sentinel type: an instance means "index_na_value not supplied"
+    pass
+
+
 @Substitution("\nleft : DataFrame")
 @Appender(_merge_doc, indents=0)
 def merge(
@@ -64,6 +68,7 @@ def merge(
     copy=True,
     indicator=False,
     validate=None,
+    index_na_value=DefaultNA(),
 ):
     op = _MergeOperation(
         left,
@@ -79,6 +84,7 @@ def merge(
         copy=copy,
         indicator=indicator,
         validate=validate,
+        index_na_value=index_na_value,
     )
     return op.get_result()
 
@@ -551,6 +557,7 @@ def __init__(
         copy=True,
         indicator=False,
         validate=None,
+        index_na_value=DefaultNA(),
     ):
         left = validate_operand(left)
         right = validate_operand(right)
@@ -619,6 +626,10 @@ def __init__(
         if validate is not None:
             self._validate(validate)
 
+        # value placed in the index when a join needs NA entries there; a
+        # DefaultNA instance means "use the dtype's default NA" (may promote dtype)
+        self.index_na_value = index_na_value
+
     def get_result(self):
         if self.indicator:
             self.left, self.right = self._indicator_pre_merge(self.left, self.right)
@@ -898,7 +909,11 @@ def _create_join_index(
             # and fill_value because it throws a ValueError on integer indices
             mask = indexer == -1
             if np.any(mask):
-                fill_value = na_value_for_dtype(index.dtype, compat=False)
+                if isinstance(self.index_na_value, DefaultNA):
+                    fill_value = na_value_for_dtype(index.dtype, compat=False)
+                else:
+                    fill_value = self.index_na_value
+
                 index = index.append(Index([fill_value]))
         return index.take(indexer)
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -29,7 +29,11 @@
 from pandas.core.reshape.concat import concat
 from pandas.core.reshape.merge import MergeError, merge
 import pandas.util.testing as tm
-from pandas.util.testing import assert_frame_equal, assert_series_equal
+from pandas.util.testing import (
+    assert_frame_equal,
+    assert_index_equal,
+    assert_series_equal,
+)
 
 N = 50
 NGROUPS = 8
@@ -2088,7 +2092,6 @@ def test_merge_equal_cat_dtypes2():
     # Categorical is unordered, so don't check ordering.
     tm.assert_frame_equal(result, expected, check_categorical=False)
 
-
 def test_merge_on_cat_and_ext_array():
     # GH 28668
     right = DataFrame(
@@ -2131,3 +2134,178 @@ def test_merge_multiindex_columns():
     expected["id"] = ""
 
     tm.assert_frame_equal(result, expected)
+
+@pytest.fixture(
+    params=[  # one three-element index per index type under test
+        dict(domain=pd.Index(["A", "B", "C"])),
+        dict(domain=CategoricalIndex(["A", "B", "C"])),
+        dict(domain=DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"])),
+        dict(domain=Float64Index([1, 2, 3])),
+        dict(domain=Int64Index([1, 2, 3])),
+        dict(domain=IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])),
+        dict(domain=TimedeltaIndex(["1d", "2d", "3d"])),
+        dict(domain=PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D")),
+    ]
+)
+def fix_GH_28220_(request):  # GH 28220: frames shared by the merge tests
+    class Data:  # X is keyed by its index; Y, Z and E carry keys in "name"
+        def __init__(self):
+            self.domain = request.param["domain"]
+            self.X = pd.DataFrame({"count": [1, 2]}, index=self.domain.take([0, 1]))
+            self.Y = pd.DataFrame(
+                {"name": self.domain.take([0, 2]), "value": [100, 200]}
+            )
+            self.Z = pd.DataFrame(  # like Y, but with the first key repeated
+                {"name": self.domain.take([0, 0, 2]), "value": [100, 200, 300]}
+            )
+            self.E = pd.DataFrame(columns=["name", "value"])  # no rows
+            # guard: X's index must still be of the parametrized index type
+            assert isinstance(self.X.index, type(self.domain))
+
+    return Data()
+
+
+@pytest.mark.parametrize(
+    "how,expected",  # expected = (index for Y merge, for Z merge, for E merge)
+    [
+        ("left", ([0, -255], [0, 1, -255], [0, 1])),
+        ("inner", ([0], [0, 1], [])),
+        ("outer", ([0, -255, 1], [0, 1, -255, 2], [0, 1])),
+    ],
+)
+def test_left_index_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
+    # GH 28220: left_index/right_on merges; result rows without a matching right
+    # row are expected to carry index_na_value (-255) as their index label.
+    (e1, e2, e3) = map(lambda x: pd.Index(x), expected)
+    e3 = fix_GH_28220_.domain.take(e3)  # e3 was given as positions into domain
+
+    r1 = pd.merge(
+        fix_GH_28220_.X,
+        fix_GH_28220_.Y,
+        left_index=True,
+        right_on=["name"],
+        how=how,
+        index_na_value=-255,
+    )
+    assert_index_equal(r1.index, e1)
+
+    r2 = pd.merge(
+        fix_GH_28220_.X,
+        fix_GH_28220_.Z,
+        left_index=True,
+        right_on=["name"],
+        how=how,
+        index_na_value=-255,
+    )
+    assert_index_equal(r2.index, e2)
+
+    r3 = pd.merge(
+        fix_GH_28220_.X,
+        fix_GH_28220_.E,
+        left_index=True,
+        right_on=["name"],
+        how=how,
+        index_na_value=-255,
+    )
+
+    # special case when result is empty, dtype is object
+    if r3.empty:
+        e3 = pd.Index([], dtype=object, name=e3.name)
+
+    assert_index_equal(r3.index, e3)
+
+
+@pytest.mark.parametrize(
+    "how,expected",  # expected = (index for Y merge, for Z merge, for E merge)
+    [
+        ("right", ([0, -255], [0, 0, -255], [0, 1, 2])),
+        ("inner", ([0], [0, 0], [])),
+        ("outer", ([0, 1, -255], [0, 0, 1, -255], [0, 1])),
+    ],
+)
+def test_left_on_merge_with_missing_by_right_index(fix_GH_28220_, how, expected):
+    # GH 28220: left_on/right_index merges; result rows without a matching left
+    # row are expected to carry index_na_value (-255) as their index label.
+    (e1, e2, e3) = map(lambda x: pd.Index(x), expected)
+
+    r1 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.Y.set_index("name"),
+        left_on=["index"],
+        right_index=True,
+        how=how,
+        index_na_value=-255,
+    )
+    assert_index_equal(r1.index, e1)
+
+    r2 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.Z.set_index("name"),
+        left_on=["index"],
+        right_index=True,
+        how=how,
+        index_na_value=-255,
+    )
+    assert_index_equal(r2.index, e2)
+
+    r3 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.E.set_index("name"),
+        left_on=["index"],
+        right_index=True,
+        how=how,
+        index_na_value=-255,
+    )
+
+    # special case when result is empty, dtype is object
+    if r3.empty:
+        e3 = pd.Index([], dtype=object, name=e3.name)
+
+    assert_index_equal(r3.index, e3)
+
+
+@pytest.mark.parametrize(
+    "how,expected",  # expected = (index for Y merge, for Z merge, for E merge)
+    [
+        ("left", ([0, 1], [0, 1, 2], [0, 1])),
+        ("right", ([0, 1], [0, 1, 2], [0, 2])),
+        ("inner", ([0], [0, 1], [])),
+        ("outer", ([0, 1, 2], [0, 1, 2, 3], [0, 1])),
+    ],
+)
+def test_left_on_merge_with_missing_by_right_on(fix_GH_28220_, how, expected):
+    # GH 28220: plain left_on/right_on merges; no index_na_value is passed and
+    # the expected result indexes contain no fill value.
+    (e1, e2, e3) = map(lambda x: pd.Index(x), expected)
+
+    r1 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.Y,
+        left_on=["index"],
+        right_on=["name"],
+        how=how,
+    )
+    assert_index_equal(r1.index, e1)
+
+    r2 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.Z,
+        left_on=["index"],
+        right_on=["name"],
+        how=how,
+    )
+    assert_index_equal(r2.index, e2)
+
+    r3 = pd.merge(
+        fix_GH_28220_.X.reset_index(),
+        fix_GH_28220_.E,
+        left_on=["index"],
+        right_on=["name"],
+        how=how,
+    )
+
+    # special case when result is empty, dtype is object
+    if r3.empty:
+        e3 = pd.Index([], dtype=object, name=e3.name)
+
+    assert_index_equal(r3.index, e3)