BUG/API: .merge() and .join() on category dtype columns will now

jreback · jreback · commit b64776148884 · 2017-02-06T16:37:06.000-05:00
preserve the category dtype when possible closes pandas-dev#10409
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -365,7 +365,10 @@ Other API Changes
 - ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`)
 - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
- - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- ``DataFrame.asof()`` will return a null filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
+- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
+
+
 .. _whatsnew_0200.deprecations:
 
 Deprecations
@@ -406,6 +409,7 @@ Performance Improvements
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
+- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5223,6 +5223,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
                 # External code requested filling/upcasting, bool values must
                 # be upcasted to object to avoid being upcasted to numeric.
                 values = self.block.astype(np.object_).values
+            elif self.block.is_categorical:
+                values = self.block.values
             else:
                 # No dtype upcasting is done here, it will be performed during
                 # concatenation itself.
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -21,6 +21,7 @@
                                  is_datetime64_dtype,
                                  needs_i8_conversion,
                                  is_int64_dtype,
+                                 is_categorical_dtype,
                                  is_integer_dtype,
                                  is_float_dtype,
                                  is_integer,
@@ -1339,6 +1340,13 @@ def _factorize_keys(lk, rk, sort=True):
     if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
         lk = lk.values
         rk = rk.values
+
+    # if we exactly match in categories, allow us to use codes
+    if (is_categorical_dtype(lk) and
+            is_categorical_dtype(rk) and
+            lk.is_dtype_equal(rk)):
+        return lk.codes, rk.codes, len(lk.categories)
+
     if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
         klass = _hash.Int64Factorizer
         lk = _ensure_int64(com._values_from_object(lk))
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -14,6 +14,7 @@
 from pandas.util.testing import (assert_frame_equal,
                                  assert_series_equal,
                                  slow)
+from pandas.types.dtypes import CategoricalDtype
 from pandas import DataFrame, Index, MultiIndex, Series, Categorical
 import pandas.util.testing as tm
 
@@ -1372,6 +1373,121 @@ def f():
         self.assertRaises(NotImplementedError, f)
 
 
+class TestMergeCategorical(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        np.random.seed(1234)
+        self.left = DataFrame(
+            {'X': np.random.choice(['foo', 'bar'], size=(10,)),
+             'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
+
+        self.right = pd.DataFrame(
+            {'X': np.random.choice(['foo', 'bar'], size=(10,)),
+             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))})
+
+    def test_identical(self):
+        # GH 10409
+        left = self.left.assign(X=self.left.X.astype('category'))
+
+        merged = pd.merge(left, left, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('O')],
+                          index=['X', 'Y_x', 'Y_y'])
+        assert_series_equal(result, expected)
+
+    def test_other_columns(self):
+        # non-merge columns should preserve their dtype if possible
+        x = self.left.X.astype('category')
+        left = DataFrame({'X': x, 'Y': x})
+
+        merged = pd.merge(left, left, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           CategoricalDtype(),
+                           CategoricalDtype()],
+                          index=['X', 'Y_x', 'Y_y'])
+        assert_series_equal(result, expected)
+
+        # different categories
+        x = self.left.X.astype('category')
+        left = DataFrame(
+            {'X': x,
+             'Y': x.cat.set_categories(['bar', 'foo', 'bah'])})
+        right = self.right.drop_duplicates(['X'])
+        right = right.assign(
+            Y=pd.Series(['foo', 'foo']).astype(
+                'category', categories=['foo', 'bar', 'baz']))
+
+        merged = pd.merge(left, right, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           CategoricalDtype(),
+                           CategoricalDtype(),
+                           np.dtype('O')],
+                          index=['X', 'Y_x', 'Y_y', 'Z'])
+        assert_series_equal(result, expected)
+
+    def test_categories_same(self):
+        # GH 10409
+        left = self.left.assign(X=self.left.X.astype('category'))
+        right = self.right.assign(X=self.right.X.astype('category'))
+
+        merged = pd.merge(left, right, on='X')
+        result = merged.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('O')],
+                          index=['X', 'Y', 'Z'])
+        assert_series_equal(result, expected)
+
+    def test_categories_different(self):
+
+        r = self.right.drop_duplicates(['X'])
+
+        # from above with original categories
+        left = self.left.assign(X=self.left.X.astype('category'))
+
+        right = r.assign(X=r.X.astype('category'))
+        merged = pd.merge(left, right, on='X')
+
+        # swap the categories
+        # but should still work (and return categorical)
+        left = self.left.assign(X=self.left.X.astype('category'))
+        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar']))
+        result = pd.merge(left, right, on='X')
+        tm.assert_index_equal(result.X.cat.categories,
+                              pd.Index(['bar', 'foo']))
+
+        assert_frame_equal(result, merged)
+
+        result = result.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('O')],
+                          index=['X', 'Y', 'Z'])
+        assert_series_equal(result, expected)
+
+        # swap the categories and ordered on one
+        # but should still work (and return categorical)
+        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar'],
+                                      ordered=True))
+        result = pd.merge(left, right, on='X')
+        tm.assert_index_equal(result.X.cat.categories,
+                              pd.Index(['bar', 'foo']))
+
+        assert_frame_equal(result, merged)
+
+        result = result.dtypes.sort_index()
+        expected = Series([CategoricalDtype(),
+                           np.dtype('O'),
+                           np.dtype('O')],
+                          index=['X', 'Y', 'Z'])
+        assert_series_equal(result, expected)
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)