Skip to content

Commit f8e4705

Browse files
committed
Bug in pd.merge() when merging/joining on multiple categorical columns
closes pandas-dev#16767
1 parent 65a0e64 commit f8e4705

File tree

3 files changed

+29
-4
lines changed

3 files changed

+29
-4
lines changed

doc/source/whatsnew/v0.20.3.txt

+1
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,7 @@ Sparse
7878
Reshaping
7979
^^^^^^^^^
8080

81+
- Bug in ``pd.merge()`` when merging/joining on multiple categorical columns (:issue:`16767`)
8182

8283

8384
Numeric

pandas/core/reshape/merge.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -1440,13 +1440,14 @@ def _factorize_keys(lk, rk, sort=True):
14401440
lk = lk.values
14411441
rk = rk.values
14421442

1443-
# if we exactly match in categories, allow us to use codes
1443+
# if we exactly match in categories, allow us to factorize on codes
14441444
if (is_categorical_dtype(lk) and
14451445
is_categorical_dtype(rk) and
14461446
lk.is_dtype_equal(rk)):
1447-
return lk.codes, rk.codes, len(lk.categories)
1448-
1449-
if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
1447+
klass = libhashtable.Int64Factorizer
1448+
lk = _ensure_int64(lk.codes)
1449+
rk = _ensure_int64(rk.codes)
1450+
elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
14501451
klass = libhashtable.Int64Factorizer
14511452
lk = _ensure_int64(com._values_from_object(lk))
14521453
rk = _ensure_int64(com._values_from_object(rk))

pandas/tests/reshape/test_merge.py

+23
Original file line number | Diff line number | Diff line change
@@ -1480,6 +1480,29 @@ def test_dtype_on_merged_different(self, change, how, left, right):
14801480
index=['X', 'Y', 'Z'])
14811481
assert_series_equal(result, expected)
14821482

1483+
def test_self_join_multiple_categories(self):
1484+
# GH 16767
1485+
# non-duplicates should work with multiple categories
1486+
m = 5
1487+
df = pd.DataFrame({
1488+
'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m,
1489+
'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m,
1490+
'c': [letter
1491+
for each in ['m', 'n', 'u', 'p', 'o']
1492+
for letter in [each] * 2 * m],
1493+
'd': [letter
1494+
for each in ['aa', 'bb', 'cc', 'dd', 'ee',
1495+
'ff', 'gg', 'hh', 'ii', 'jj']
1496+
for letter in [each] * m]})
1497+
1498+
# change them all to categorical variables
1499+
df = df.apply(lambda x: x.astype('category'))
1500+
1501+
# self-join should equal ourselves
1502+
result = pd.merge(df, df, on=list(df.columns))
1503+
1504+
assert_frame_equal(result, df)
1505+
14831506

14841507
@pytest.fixture
14851508
def left_df():

0 commit comments

Comments
 (0)