BUG/API: .merge() and .join() on category dtype columns will now preserve the category dtype when possible

jreback · jreback · commit 86a171960b36 · 2017-02-09T18:24:19.000-05:00
closes #10409
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -257,6 +257,30 @@ def time_i8merge(self):
         merge(self.left, self.right, how='outer')
 
 
+class MergeCategoricals(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.left_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+
+        self.right_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+
+        self.left_cat = self.left_object.assign(
+            Y=self.left_object['Y'].astype('category'))
+        self.right_cat = self.right_object.assign(
+            Z=self.right_object['Z'].astype('category'))
+
+    def time_merge_object(self):
+        merge(self.left_object, self.right_object, on='X')
+
+    def time_merge_cat(self):
+        merge(self.left_cat, self.right_cat, on='X')
+
+
 #----------------------------------------------------------------------
 # Ordered merge
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -371,6 +371,7 @@ Other API Changes
 - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
 - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`)
 - The :func:`pd.read_gbq` method now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no more casted to ``int64`` which also caused precision lost (:issue: `14064`, :issue:`14305`).
+- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -412,6 +413,7 @@ Performance Improvements
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
+- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5224,6 +5224,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
                 # External code requested filling/upcasting, bool values must
                 # be upcasted to object to avoid being upcasted to numeric.
                 values = self.block.astype(np.object_).values
+            elif self.block.is_categorical:
+                values = self.block.values
             else:
                 # No dtype upcasting is done here, it will be performed during
                 # concatenation itself.
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -4097,9 +4097,12 @@ def test_merge(self):
         expected = df.copy()
 
         # object-cat
+        # note that we propagate the category
+        # because we don't have any matching rows
         cright = right.copy()
         cright['d'] = cright['d'].astype('category')
         result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+        expected['d'] = expected['d'].astype('category', categories=['null'])
         tm.assert_frame_equal(result, expected)
 
         # cat-object
diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py
@@ -39,16 +39,33 @@ def test_period_dtype(self):
 
 
 def test_dtype_equal():
-    assert is_dtype_equal(np.int64, np.int64)
-    assert not is_dtype_equal(np.int64, np.float64)
 
-    p1 = PeriodDtype('D')
-    p2 = PeriodDtype('D')
-    assert is_dtype_equal(p1, p2)
-    assert not is_dtype_equal(np.int64, p1)
+    dtypes = dict(dt_tz=pandas_dtype('datetime64[ns, US/Eastern]'),
+                  dt=pandas_dtype('datetime64[ns]'),
+                  td=pandas_dtype('timedelta64[ns]'),
+                  p=PeriodDtype('D'),
+                  i=np.int64,
+                  f=np.float64,
+                  o=np.object)
 
-    p3 = PeriodDtype('2D')
-    assert not is_dtype_equal(p1, p3)
+    # match equal to self, but not equal to other
+    for name, dtype in dtypes.items():
+        assert is_dtype_equal(dtype, dtype)
 
-    assert not DatetimeTZDtype.is_dtype(np.int64)
-    assert not PeriodDtype.is_dtype(np.int64)
+        for name2, dtype2 in dtypes.items():
+            if name != name2:
+                assert not is_dtype_equal(dtype, dtype2)
+
+    # we are strict even within a kind: differing widths are not equal
+    for dtype in [np.int8, np.int16, np.int32]:
+        assert not is_dtype_equal(dtypes['i'], dtype)
+
+    for dtype in [np.float32]:
+        assert not is_dtype_equal(dtypes['f'], dtype)
+
+    # strict w.r.t. PeriodDtype
+    assert not is_dtype_equal(dtypes['p'], PeriodDtype('2D'))
+
+    # strict w.r.t. datetime64
+    assert not is_dtype_equal(dtypes['dt_tz'],
+                              pandas_dtype('datetime64[ns, CET]'))
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -12,14 +12,16 @@
 
 import pandas as pd
 from pandas import (Categorical, Series, DataFrame,
-                    Index, MultiIndex, Timedelta)
+                    Index, MultiIndex, Timedelta, lib)
 from pandas.core.frame import _merge_doc
 from pandas.types.common import (is_datetime64tz_dtype,
                                  is_datetime64_dtype,
                                  needs_i8_conversion,
                                  is_int64_dtype,
+                                 is_categorical_dtype,
                                  is_integer_dtype,
                                  is_float_dtype,
+                                 is_numeric_dtype,
                                  is_integer,
                                  is_int_or_datetime_dtype,
                                  is_dtype_equal,
@@ -567,6 +569,10 @@ def __init__(self, left, right, how='inner', on=None,
          self.right_join_keys,
          self.join_names) = self._get_merge_keys()
 
+        # validate the merge keys dtypes. We may need to coerce
+        # to avoid incompatible dtypes
+        self._maybe_coerce_merge_keys()
+
     def get_result(self):
         if self.indicator:
             self.left, self.right = self._indicator_pre_merge(
@@ -757,26 +763,6 @@ def _get_join_info(self):
             join_index = join_index.astype(object)
         return join_index, left_indexer, right_indexer
 
-    def _get_merge_data(self):
-        """
-        Handles overlapping column names etc.
-        """
-        ldata, rdata = self.left._data, self.right._data
-        lsuf, rsuf = self.suffixes
-
-        llabels, rlabels = items_overlap_with_suffix(
-            ldata.items, lsuf, rdata.items, rsuf)
-
-        if not llabels.equals(ldata.items):
-            ldata = ldata.copy(deep=False)
-            ldata.set_axis(0, llabels)
-
-        if not rlabels.equals(rdata.items):
-            rdata = rdata.copy(deep=False)
-            rdata.set_axis(0, rlabels)
-
-        return ldata, rdata
-
     def _get_merge_keys(self):
         """
         Note: has side effects (copy/delete key columns)
@@ -888,6 +874,51 @@ def _get_merge_keys(self):
 
         return left_keys, right_keys, join_names
 
+    def _maybe_coerce_merge_keys(self):
+        # we have valid merge keys, but we may have to further
+        # coerce these if they are originally incompatible types
+        #
+        # for example if these are categorical, but are not dtype_equal
+        # or if we have object and integer dtypes
+
+        for lk, rk, name in zip(self.left_join_keys,
+                                self.right_join_keys,
+                                self.join_names):
+            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
+                continue
+
+            # if either left or right is a categorical
+            # then they must match exactly in categories & ordered
+            if is_categorical_dtype(lk) and is_categorical_dtype(rk):
+                if lk.is_dtype_equal(rk):
+                    continue
+            elif is_categorical_dtype(lk) or is_categorical_dtype(rk):
+                pass
+
+            elif is_dtype_equal(lk.dtype, rk.dtype):
+                continue
+
+            # if we are numeric, then allow differing widths
+            # of the same kind to proceed, eg. int64 and int8
+            # further if the numeric kinds differ, but we infer
+            # to the same type, then proceed
+            if (is_numeric_dtype(lk) and is_numeric_dtype(rk)):
+                if lk.dtype.kind == rk.dtype.kind:
+                    continue
+
+                # let's infer and see if we are ok
+                if lib.infer_dtype(lk) == lib.infer_dtype(rk):
+                    continue
+
+            # Houston, we have a problem!
+            # let's coerce to object
+            if name in self.left.columns:
+                self.left = self.left.assign(
+                    **{name: self.left[name].astype(object)})
+            if name in self.right.columns:
+                self.right = self.right.assign(
+                    **{name: self.right[name].astype(object)})
+
     def _validate_specification(self):
         # Hm, any way to make this logic less complicated??
         if self.on is None and self.left_on is None and self.right_on is None:
@@ -939,9 +970,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
 
     Parameters
     ----------
+    left_keys: ndarray, Index, Series
+    right_keys: ndarray, Index, Series
+    sort: boolean, default False
+    how: string {'inner', 'outer', 'left', 'right'}, default 'inner'
 
     Returns
     -------
+    tuple of (left_indexer, right_indexer)
+        indexers into the left_keys, right_keys
 
     """
     from functools import partial
@@ -1345,6 +1382,13 @@ def _factorize_keys(lk, rk, sort=True):
     if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
         lk = lk.values
         rk = rk.values
+
+    # if we exactly match in categories, allow us to use codes
+    if (is_categorical_dtype(lk) and
+            is_categorical_dtype(rk) and
+            lk.is_dtype_equal(rk)):
+        return lk.codes, rk.codes, len(lk.categories)
+
     if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
         klass = _hash.Int64Factorizer
         lk = _ensure_int64(com._values_from_object(lk))
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py