Skip to content

Commit c7008a0

Browse files
committed
BUG/API: .merge() and .join() on category dtype columns will now
preserve the category dtype when possible; closes pandas-dev#10409
1 parent 6d2293f commit c7008a0

File tree

7 files changed

+158
-1
lines changed

7 files changed

+158
-1
lines changed

asv_bench/benchmarks/join_merge.py

+24
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,30 @@ def time_i8merge(self):
257257
merge(self.left, self.right, how='outer')
258258

259259

260+
class MergeCategoricals(object):
261+
goal_time = 0.2
262+
263+
def setup(self):
264+
self.left_object = pd.DataFrame(
265+
{'X': np.random.choice(range(0, 10), size=(10000,)),
266+
'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
267+
268+
self.right_object = pd.DataFrame(
269+
{'X': np.random.choice(range(0, 10), size=(10000,)),
270+
'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
271+
272+
self.left_cat = self.left_object.assign(
273+
Y=self.left_object['Y'].astype('category'))
274+
self.right_cat = self.right_object.assign(
275+
Z=self.right_object['Z'].astype('category'))
276+
277+
def time_merge_object(self):
278+
merge(self.left_object, self.right_object, on='X')
279+
280+
def time_merge_cat(self):
281+
merge(self.left_cat, self.right_cat, on='X')
282+
283+
260284
#----------------------------------------------------------------------
261285
# Ordered merge
262286

doc/source/whatsnew/v0.20.0.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,10 @@ Other API Changes
368368
- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
369369
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
370370
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
371-
- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
371+
- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
372+
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
373+
374+
372375
.. _whatsnew_0200.deprecations:
373376

374377
Deprecations
@@ -409,6 +412,7 @@ Performance Improvements
409412
- Improved performance of timeseries plotting with an irregular DatetimeIndex
410413
(or with ``compat_x=True``) (:issue:`15073`).
411414
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
415+
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
412416

413417
- When reading a buffer object in the ``read_sas()`` method without a specified format, the filepath string is inferred rather than the buffer object.
414418

pandas/core/internals.py

+2
Original file line numberDiff line numberDiff line change
@@ -5223,6 +5223,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
52235223
# External code requested filling/upcasting, bool values must
52245224
# be upcasted to object to avoid being upcasted to numeric.
52255225
values = self.block.astype(np.object_).values
5226+
elif self.block.is_categorical:
5227+
values = self.block.values
52265228
else:
52275229
# No dtype upcasting is done here, it will be performed during
52285230
# concatenation itself.

pandas/tests/test_categorical.py

+2
Original file line numberDiff line numberDiff line change
@@ -4098,12 +4098,14 @@ def test_merge(self):
40984098
cright = right.copy()
40994099
cright['d'] = cright['d'].astype('category')
41004100
result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
4101+
expected['d'] = expected['d'].astype('category', categories=['null'])
41014102
tm.assert_frame_equal(result, expected)
41024103

41034104
# cat-object
41044105
cleft = left.copy()
41054106
cleft['b'] = cleft['b'].astype('category')
41064107
result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
4108+
expected['b'] = expected['b'].astype('category')
41074109
tm.assert_frame_equal(result, expected)
41084110

41094111
# cat-cat

pandas/tools/merge.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
is_datetime64_dtype,
2222
needs_i8_conversion,
2323
is_int64_dtype,
24+
is_categorical_dtype,
2425
is_integer_dtype,
2526
is_float_dtype,
2627
is_integer,
@@ -1339,6 +1340,13 @@ def _factorize_keys(lk, rk, sort=True):
13391340
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
13401341
lk = lk.values
13411342
rk = rk.values
1343+
1344+
# if we exactly match in categories, allow us to use codes
1345+
if (is_categorical_dtype(lk) and
1346+
is_categorical_dtype(rk) and
1347+
lk.is_dtype_equal(rk)):
1348+
return lk.codes, rk.codes, len(lk.categories)
1349+
13421350
if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
13431351
klass = _hash.Int64Factorizer
13441352
lk = _ensure_int64(com._values_from_object(lk))

pandas/tools/tests/test_merge.py

+116
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pandas.util.testing import (assert_frame_equal,
1515
assert_series_equal,
1616
slow)
17+
from pandas.types.dtypes import CategoricalDtype
1718
from pandas import DataFrame, Index, MultiIndex, Series, Categorical
1819
import pandas.util.testing as tm
1920

@@ -1372,6 +1373,121 @@ def f():
13721373
self.assertRaises(NotImplementedError, f)
13731374

13741375

1376+
class TestMergeCategorical(tm.TestCase):
1377+
_multiprocess_can_split_ = True
1378+
1379+
def setUp(self):
1380+
np.random.seed(1234)
1381+
self.left = DataFrame(
1382+
{'X': np.random.choice(['foo', 'bar'], size=(10,)),
1383+
'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
1384+
1385+
self.right = pd.DataFrame(
1386+
{'X': np.random.choice(['foo', 'bar'], size=(10,)),
1387+
'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))})
1388+
1389+
def test_identical(self):
1390+
# GH 10409
1391+
left = self.left.assign(X=self.left.X.astype('category'))
1392+
1393+
merged = pd.merge(left, left, on='X')
1394+
result = merged.dtypes.sort_index()
1395+
expected = Series([CategoricalDtype(),
1396+
np.dtype('O'),
1397+
np.dtype('O')],
1398+
index=['X', 'Y_x', 'Y_y'])
1399+
assert_series_equal(result, expected)
1400+
1401+
def test_other_columns(self):
1402+
# non-merge columns should preserve their dtype if possible
1403+
x = self.left.X.astype('category')
1404+
left = DataFrame({'X': x, 'Y': x})
1405+
1406+
merged = pd.merge(left, left, on='X')
1407+
result = merged.dtypes.sort_index()
1408+
expected = Series([CategoricalDtype(),
1409+
CategoricalDtype(),
1410+
CategoricalDtype()],
1411+
index=['X', 'Y_x', 'Y_y'])
1412+
assert_series_equal(result, expected)
1413+
1414+
# different categories
1415+
x = self.left.X.astype('category')
1416+
left = DataFrame(
1417+
{'X': x,
1418+
'Y': x.cat.set_categories(['bar', 'foo', 'bah'])})
1419+
right = self.right.drop_duplicates(['X'])
1420+
right = right.assign(
1421+
Y=pd.Series(['foo', 'foo']).astype(
1422+
'category', categories=['foo', 'bar', 'baz']))
1423+
1424+
merged = pd.merge(left, right, on='X')
1425+
result = merged.dtypes.sort_index()
1426+
expected = Series([CategoricalDtype(),
1427+
CategoricalDtype(),
1428+
CategoricalDtype(),
1429+
np.dtype('O')],
1430+
index=['X', 'Y_x', 'Y_y', 'Z'])
1431+
assert_series_equal(result, expected)
1432+
1433+
def test_categories_same(self):
1434+
# GH 10409
1435+
left = self.left.assign(X=self.left.X.astype('category'))
1436+
right = self.right.assign(X=self.right.X.astype('category'))
1437+
1438+
merged = pd.merge(left, right, on='X')
1439+
result = merged.dtypes.sort_index()
1440+
expected = Series([CategoricalDtype(),
1441+
np.dtype('O'),
1442+
np.dtype('O')],
1443+
index=['X', 'Y', 'Z'])
1444+
assert_series_equal(result, expected)
1445+
1446+
def test_categories_different(self):
1447+
1448+
r = self.right.drop_duplicates(['X'])
1449+
1450+
# from above with original categories
1451+
left = self.left.assign(X=self.left.X.astype('category'))
1452+
1453+
right = r.assign(X=r.X.astype('category'))
1454+
merged = pd.merge(left, right, on='X')
1455+
1456+
# swap the categories
1457+
# but should still work (and return categorical)
1458+
left = self.left.assign(X=self.left.X.astype('category'))
1459+
right = r.assign(X=r.X.astype('category', categories=['foo', 'bar']))
1460+
result = pd.merge(left, right, on='X')
1461+
tm.assert_index_equal(result.X.cat.categories,
1462+
pd.Index(['bar', 'foo']))
1463+
1464+
assert_frame_equal(result, merged)
1465+
1466+
result = result.dtypes.sort_index()
1467+
expected = Series([CategoricalDtype(),
1468+
np.dtype('O'),
1469+
np.dtype('O')],
1470+
index=['X', 'Y', 'Z'])
1471+
assert_series_equal(result, expected)
1472+
1473+
# swap the categories and ordered on one
1474+
# but should still work (and return categorical)
1475+
right = r.assign(X=r.X.astype('category', categories=['foo', 'bar'],
1476+
ordered=True))
1477+
result = pd.merge(left, right, on='X')
1478+
tm.assert_index_equal(result.X.cat.categories,
1479+
pd.Index(['bar', 'foo']))
1480+
1481+
assert_frame_equal(result, merged)
1482+
1483+
result = result.dtypes.sort_index()
1484+
expected = Series([CategoricalDtype(),
1485+
np.dtype('O'),
1486+
np.dtype('O')],
1487+
index=['X', 'Y', 'Z'])
1488+
assert_series_equal(result, expected)
1489+
1490+
13751491
if __name__ == '__main__':
13761492
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
13771493
exit=False)

pandas/tools/tests/test_merge_asof.py

+1
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def test_basic_categorical(self):
149149
trades.ticker = trades.ticker.astype('category')
150150
quotes = self.quotes.copy()
151151
quotes.ticker = quotes.ticker.astype('category')
152+
expected.ticker = expected.ticker.astype('category')
152153

153154
result = merge_asof(trades, quotes,
154155
on='time',

0 commit comments

Comments
 (0)