BUG/API: .merge() and .join() on category dtype columns will now preserve the category dtype when possible

jreback · jreback · commit 63dd41fd7716 · 2017-02-12T12:39:16.000-05:00
closes pandas-dev#10409
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -257,6 +257,30 @@ def time_i8merge(self):
         merge(self.left, self.right, how='outer')
 
 
+class MergeCategoricals(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.left_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+
+        self.right_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+
+        self.left_cat = self.left_object.assign(
+            Y=self.left_object['Y'].astype('category'))
+        self.right_cat = self.right_object.assign(
+            Z=self.right_object['Z'].astype('category'))
+
+    def time_merge_object(self):
+        merge(self.left_object, self.right_object, on='X')
+
+    def time_merge_cat(self):
+        merge(self.left_cat, self.right_cat, on='X')
+
+
 #----------------------------------------------------------------------
 # Ordered merge
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -428,6 +428,7 @@ Other API Changes
 - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
 - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`).
 - Reorganization of timeseries development tests (:issue:`14854`)
+- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -469,6 +470,7 @@ Performance Improvements
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
+- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5224,6 +5224,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
                 # External code requested filling/upcasting, bool values must
                 # be upcasted to object to avoid being upcasted to numeric.
                 values = self.block.astype(np.object_).values
+            elif self.block.is_categorical:
+                values = self.block.values
             else:
                 # No dtype upcasting is done here, it will be performed during
                 # concatenation itself.
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -4097,9 +4097,12 @@ def test_merge(self):
         expected = df.copy()
 
         # object-cat
+        # note that we propagate the category
+        # because we don't have any matching rows
         cright = right.copy()
         cright['d'] = cright['d'].astype('category')
         result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+        expected['d'] = expected['d'].astype('category', categories=['null'])
         tm.assert_frame_equal(result, expected)
 
         # cat-object
diff --git a/pandas/tests/tools/test_merge.py b/pandas/tests/tools/test_merge.py
@@ -13,6 +13,8 @@
 from pandas.util.testing import (assert_frame_equal,
                                  assert_series_equal,
                                  slow)
+from pandas.types.dtypes import CategoricalDtype
+from pandas.types.common import is_categorical_dtype, is_object_dtype
 from pandas import DataFrame, Index, MultiIndex, Series, Categorical
 import pandas.util.testing as tm
 
@@ -1018,38 +1020,6 @@ def test_left_join_index_multi_match(self):
         expected.index = np.arange(len(expected))
         tm.assert_frame_equal(result, expected)
 
-    def test_join_multi_dtypes(self):
-
-        # test with multi dtypes in the join index
-        def _test(dtype1, dtype2):
-            left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
-                              'k2': ['foo', 'bar'] * 12,
-                              'v': np.array(np.arange(24), dtype=np.int64)})
-
-            index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
-            right = DataFrame(
-                {'v2': np.array([5, 7], dtype=dtype2)}, index=index)
-
-            result = left.join(right, on=['k1', 'k2'])
-
-            expected = left.copy()
-
-            if dtype2.kind == 'i':
-                dtype2 = np.dtype('float64')
-            expected['v2'] = np.array(np.nan, dtype=dtype2)
-            expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
-            expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
-
-            tm.assert_frame_equal(result, expected)
-
-            result = left.join(right, on=['k1', 'k2'], sort=True)
-            expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
-            tm.assert_frame_equal(result, expected)
-
-        for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]:
-            for d2 in [np.int64, np.float64, np.float32, np.float16]:
-                _test(np.dtype(d1), np.dtype(d2))
-
     def test_left_merge_na_buglet(self):
         left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                           'v2': randn(5), 'dummy': list('abcde'),
@@ -1367,3 +1337,140 @@ def f():
         def f():
             household.join(log_return, how='outer')
         self.assertRaises(NotImplementedError, f)
+
+
+class TestMergeDtypes(tm.TestCase):
+
+    def setUp(self):
+
+        self.df = DataFrame(
+            {'A': ['foo', 'bar'],
+             'B': Series(['foo', 'bar']).astype('category'),
+             'C': [1, 2],
+             'D': [1.0, 2.0],
+             'E': Series([1, 2], dtype='uint64'),
+             'F': Series([1, 2], dtype='int32')})
+
+    def test_different(self):
+
+        # left 'A' is object; whatever dtype the right-hand key 'A' has,
+        # the merged 'A' column is expected to come back as object
+
+        left = self.df
+        for col in self.df.columns:
+            right = DataFrame({'A': self.df[col]})
+            result = pd.merge(left, right, on='A')
+            self.assertTrue(is_object_dtype(result.A.dtype))
+
+    def test_join_multi_dtypes(self):
+
+        # test with multi dtypes in the join index
+        def _test(dtype1, dtype2):
+            left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
+                              'k2': ['foo', 'bar'] * 12,
+                              'v': np.array(np.arange(24), dtype=np.int64)})
+
+            index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+            right = DataFrame(
+                {'v2': np.array([5, 7], dtype=dtype2)}, index=index)
+
+            result = left.join(right, on=['k1', 'k2'])
+
+            expected = left.copy()
+
+            if dtype2.kind == 'i':
+                dtype2 = np.dtype('float64')
+            expected['v2'] = np.array(np.nan, dtype=dtype2)
+            expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
+            expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7
+
+            tm.assert_frame_equal(result, expected)
+
+            result = left.join(right, on=['k1', 'k2'], sort=True)
+            expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True)
+            tm.assert_frame_equal(result, expected)
+
+        for d1 in [np.int64, np.int32, np.int16, np.int8, np.uint8]:
+            for d2 in [np.int64, np.float64, np.float32, np.float16]:
+                _test(np.dtype(d1), np.dtype(d2))
+
+
+class TestMergeCategorical(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        np.random.seed(1234)
+        self.left = DataFrame(
+            {'X': Series(np.random.choice(
+                ['foo', 'bar'],
+                size=(10,))).astype('category', categories=['foo', 'bar']),
+             'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
+        self.right = pd.DataFrame(
+            {'X': Series(['foo', 'bar']).astype('category',
+                                                categories=['foo', 'bar']),
+             'Z': [1, 2]})
+
+    def test_identical(self):
+        # merging a frame with itself on 'X' should preserve dtypes
+        merged = pd.merge(self.left, self.left, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('O')],
+                          index=['X', 'Y_x', 'Y_y'])
+        assert_series_equal(result, expected)
+
+    def test_basic(self):
+        # we have matching Categorical dtypes in X
+        # so should preserve the merged column
+        merged = pd.merge(self.left, self.right, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('int64')],
+                          index=['X', 'Y', 'Z'])
+        assert_series_equal(result, expected)
+
+    def test_other_columns(self):
+        # non-merge columns should preserve if possible
+        left = self.left
+        right = self.right.assign(Z=self.right.Z.astype('category'))
+
+        merged = pd.merge(left, right, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           CategoricalDtype()],
+                          index=['X', 'Y', 'Z'])
+        assert_series_equal(result, expected)
+
+        # categories are preserved
+        self.assertTrue(left.X.values.is_dtype_equal(merged.X.values))
+        self.assertTrue(right.Z.values.is_dtype_equal(merged.Z.values))
+
+    def test_dtype_on_merged_different(self):
+        # the merge column X has a different dtype on each side,
+        # so the merged X must fall back to object
+        left = self.left
+
+        for change in [lambda x: x,
+                       lambda x: x.astype('category',
+                                          categories=['bar', 'foo']),
+                       lambda x: x.astype('category',
+                                          categories=['foo', 'bar', 'bah']),
+                       lambda x: x.astype('category', ordered=True)]:
+            for how in ['inner', 'outer', 'left', 'right']:
+
+                X = change(self.right.X.astype('object'))
+                right = self.right.assign(X=X)
+                self.assertTrue(is_categorical_dtype(left.X.values))
+                self.assertFalse(left.X.values.is_dtype_equal(right.X.values))
+
+                merged = pd.merge(left, right, on='X', how=how)
+
+                result = merged.dtypes.sort_index()
+                expected = Series([np.dtype('O'),
+                                   np.dtype('O'),
+                                   np.dtype('int64')],
+                                  index=['X', 'Y', 'Z'])
+                assert_series_equal(result, expected)
diff --git a/pandas/tests/tools/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py
@@ -147,6 +147,7 @@ def test_basic_categorical(self):
         trades.ticker = trades.ticker.astype('category')
         quotes = self.quotes.copy()
         quotes.ticker = quotes.ticker.astype('category')
+        expected.ticker = expected.ticker.astype('category')
 
         result = merge_asof(trades, quotes,
                             on='time',
diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py
@@ -39,16 +39,33 @@ def test_period_dtype(self):
 
 
 def test_dtype_equal():
-    assert is_dtype_equal(np.int64, np.int64)
-    assert not is_dtype_equal(np.int64, np.float64)
 
-    p1 = PeriodDtype('D')
-    p2 = PeriodDtype('D')
-    assert is_dtype_equal(p1, p2)
-    assert not is_dtype_equal(np.int64, p1)
+    dtypes = dict(dt_tz=pandas_dtype('datetime64[ns, US/Eastern]'),
+                  dt=pandas_dtype('datetime64[ns]'),
+                  td=pandas_dtype('timedelta64[ns]'),
+                  p=PeriodDtype('D'),
+                  i=np.int64,
+                  f=np.float64,
+                  o=np.object)
 
-    p3 = PeriodDtype('2D')
-    assert not is_dtype_equal(p1, p3)
+    # each dtype equals itself, and no other dtype in the mapping
+    for name, dtype in dtypes.items():
+        assert is_dtype_equal(dtype, dtype)
 
-    assert not DatetimeTZDtype.is_dtype(np.int64)
-    assert not PeriodDtype.is_dtype(np.int64)
+        for name2, dtype2 in dtypes.items():
+            if name != name2:
+                assert not is_dtype_equal(dtype, dtype2)
+
+    # same kind is not enough: a different itemsize is a different dtype
+    for dtype in [np.int8, np.int16, np.int32]:
+        assert not is_dtype_equal(dtypes['i'], dtype)
+
+    for dtype in [np.float32]:
+        assert not is_dtype_equal(dtypes['f'], dtype)
+
+    # strict w.r.t. PeriodDtype
+    assert not is_dtype_equal(dtypes['p'], PeriodDtype('2D'))
+
+    # strict w.r.t. datetime64
+    assert not is_dtype_equal(dtypes['dt_tz'],
+                              pandas_dtype('datetime64[ns, CET]'))
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py