Skip to content

Commit 14c33b0

Browse files
makbigc authored and jreback committed
BUG: merging an Integer EA raises (#23262)
1 parent c230f29 commit 14c33b0

File tree

4 files changed

+58
-8
lines changed

4 files changed

+58
-8
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
10091009
- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
10101010
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
10111011
- Bug in :meth:`DataFrame.groupby` where aggregating on an ``ExtensionArray`` column did not return the actual ``ExtensionArray`` dtype (:issue:`23227`).
1012+
- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
10121013
- A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).
10131014

10141015
.. _whatsnew_0240.api.incompatibilities:

pandas/core/reshape/merge.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@
1717
from pandas.core.dtypes.common import (
1818
ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
1919
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
20-
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype,
21-
is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number,
22-
is_numeric_dtype, needs_i8_conversion)
20+
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
21+
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
22+
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
23+
needs_i8_conversion)
2324
from pandas.core.dtypes.missing import isnull, na_value_for_dtype
2425

2526
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
@@ -1589,25 +1590,31 @@ def _right_outer_join(x, y, max_groups):
15891590

15901591

15911592
def _factorize_keys(lk, rk, sort=True):
1593+
# Some pre-processing for non-ndarray lk / rk
15921594
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
15931595
lk = lk.values
15941596
rk = rk.values
15951597

1596-
# if we exactly match in categories, allow us to factorize on codes
1597-
if (is_categorical_dtype(lk) and
1598+
elif (is_categorical_dtype(lk) and
15981599
is_categorical_dtype(rk) and
15991600
lk.is_dtype_equal(rk)):
1600-
klass = libhashtable.Int64Factorizer
1601-
16021601
if lk.categories.equals(rk.categories):
1602+
# if we exactly match in categories, allow us to factorize on codes
16031603
rk = rk.codes
16041604
else:
16051605
# Same categories in different orders -> recode
16061606
rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
16071607

16081608
lk = ensure_int64(lk.codes)
16091609
rk = ensure_int64(rk)
1610-
elif is_integer_dtype(lk) and is_integer_dtype(rk):
1610+
1611+
elif (is_extension_array_dtype(lk.dtype) and
1612+
is_extension_array_dtype(rk.dtype) and
1613+
lk.dtype == rk.dtype):
1614+
lk, _ = lk._values_for_factorize()
1615+
rk, _ = rk._values_for_factorize()
1616+
1617+
if is_integer_dtype(lk) and is_integer_dtype(rk):
16111618
# GH#23917 TODO: needs tests for case where lk is integer-dtype
16121619
# and rk is datetime-dtype
16131620
klass = libhashtable.Int64Factorizer

pandas/tests/extension/base/reshaping.py

+32
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,38 @@ def test_merge(self, data, na_value):
173173
dtype=data.dtype)})
174174
self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
175175

176+
def test_merge_on_extension_array(self, data):
177+
# GH 23020
178+
a, b = data[:2]
179+
key = type(data)._from_sequence([a, b], dtype=data.dtype)
180+
181+
df = pd.DataFrame({"key": key, "val": [1, 2]})
182+
result = pd.merge(df, df, on='key')
183+
expected = pd.DataFrame({"key": key,
184+
"val_x": [1, 2],
185+
"val_y": [1, 2]})
186+
self.assert_frame_equal(result, expected)
187+
188+
# order
189+
result = pd.merge(df.iloc[[1, 0]], df, on='key')
190+
expected = expected.iloc[[1, 0]].reset_index(drop=True)
191+
self.assert_frame_equal(result, expected)
192+
193+
def test_merge_on_extension_array_duplicates(self, data):
194+
# GH 23020
195+
a, b = data[:2]
196+
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
197+
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
198+
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
199+
200+
result = pd.merge(df1, df2, on='key')
201+
expected = pd.DataFrame({
202+
"key": key.take([0, 0, 0, 0, 1]),
203+
"val_x": [1, 1, 3, 3, 2],
204+
"val_y": [1, 3, 1, 3, 2],
205+
})
206+
self.assert_frame_equal(result, expected)
207+
176208
@pytest.mark.parametrize("columns", [
177209
["A", "B"],
178210
pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],

pandas/tests/reshape/merge/test_merge.py

+10
Original file line numberDiff line numberDiff line change
@@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column,
13261326
CDT(categories, ordered=ordered))
13271327
assert_frame_equal(expected, result)
13281328

1329+
def test_merge_on_int_array(self):
1330+
# GH 23020
1331+
df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
1332+
'B': 1})
1333+
result = pd.merge(df, df, on='A')
1334+
expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
1335+
'B_x': 1,
1336+
'B_y': 1})
1337+
assert_frame_equal(result, expected)
1338+
13291339

13301340
@pytest.fixture
13311341
def left_df():

0 commit comments

Comments
 (0)