BUG: merging an Integer EA rasises (#23262)

makbigc · jreback · commit 14c33b0e531b · 2018-12-19T09:28:57.000-05:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1009,6 +1009,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
 - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
 - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
+- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
 - A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).
 
 .. _whatsnew_0240.api.incompatibilities:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -17,9 +17,10 @@
 from pandas.core.dtypes.common import (
     ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
     is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
-    is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype,
-    is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number,
-    is_numeric_dtype, needs_i8_conversion)
+    is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
+    is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
+    is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
+    needs_i8_conversion)
 from pandas.core.dtypes.missing import isnull, na_value_for_dtype
 
 from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
@@ -1589,25 +1590,31 @@ def _right_outer_join(x, y, max_groups):
 
 
 def _factorize_keys(lk, rk, sort=True):
+    # Some pre-processing for non-ndarray lk / rk
     if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
         lk = lk.values
         rk = rk.values
 
-    # if we exactly match in categories, allow us to factorize on codes
-    if (is_categorical_dtype(lk) and
+    elif (is_categorical_dtype(lk) and
             is_categorical_dtype(rk) and
             lk.is_dtype_equal(rk)):
-        klass = libhashtable.Int64Factorizer
-
         if lk.categories.equals(rk.categories):
+            # if we exactly match in categories, allow us to factorize on codes
             rk = rk.codes
         else:
             # Same categories in different orders -> recode
             rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
 
         lk = ensure_int64(lk.codes)
         rk = ensure_int64(rk)
-    elif is_integer_dtype(lk) and is_integer_dtype(rk):
+
+    elif (is_extension_array_dtype(lk.dtype) and
+          is_extension_array_dtype(rk.dtype) and
+          lk.dtype == rk.dtype):
+        lk, _ = lk._values_for_factorize()
+        rk, _ = rk._values_for_factorize()
+
+    if is_integer_dtype(lk) and is_integer_dtype(rk):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype
         klass = libhashtable.Int64Factorizer
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -173,6 +173,38 @@ def test_merge(self, data, na_value):
                  dtype=data.dtype)})
         self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
 
+    def test_merge_on_extension_array(self, data):
+        # GH 23020
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b], dtype=data.dtype)
+
+        df = pd.DataFrame({"key": key, "val": [1, 2]})
+        result = pd.merge(df, df, on='key')
+        expected = pd.DataFrame({"key": key,
+                                 "val_x": [1, 2],
+                                 "val_y": [1, 2]})
+        self.assert_frame_equal(result, expected)
+
+        # order
+        result = pd.merge(df.iloc[[1, 0]], df, on='key')
+        expected = expected.iloc[[1, 0]].reset_index(drop=True)
+        self.assert_frame_equal(result, expected)
+
+    def test_merge_on_extension_array_duplicates(self, data):
+        # GH 23020
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
+        df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+        df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+
+        result = pd.merge(df1, df2, on='key')
+        expected = pd.DataFrame({
+            "key": key.take([0, 0, 0, 0, 1]),
+            "val_x": [1, 1, 3, 3, 2],
+            "val_y": [1, 3, 1, 3, 2],
+        })
+        self.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize("columns", [
         ["A", "B"],
         pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column,
             CDT(categories, ordered=ordered))
         assert_frame_equal(expected, result)
 
+    def test_merge_on_int_array(self):
+        # GH 23020
+        df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+                           'B': 1})
+        result = pd.merge(df, df, on='A')
+        expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
+                                 'B_x': 1,
+                                 'B_y': 1})
+        assert_frame_equal(result, expected)
+
 
 @pytest.fixture
 def left_df():