Skip to content

Commit 026e748

Browse files
committed
BUG/API: .merge() and .join() on category dtype columns will now preserve category dtype
closes pandas-dev#10409 Author: Jeff Reback <[email protected]> Closes pandas-dev#15321 from jreback/merge_cat and squashes the following commits: 3671dad [Jeff Reback] DOC: merge docs a4b2ee6 [Jeff Reback] BUG/API: .merge() and .join() on category dtype columns will now preserve the category dtype when possible
1 parent 5dee1f1 commit 026e748

File tree

10 files changed

+364
-71
lines changed

10 files changed

+364
-71
lines changed

asv_bench/benchmarks/join_merge.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pandas import ordered_merge as merge_ordered
77

88

9-
#----------------------------------------------------------------------
9+
# ----------------------------------------------------------------------
1010
# Append
1111

1212
class Append(object):
@@ -35,7 +35,7 @@ def time_append_mixed(self):
3535
self.mdf1.append(self.mdf2)
3636

3737

38-
#----------------------------------------------------------------------
38+
# ----------------------------------------------------------------------
3939
# Concat
4040

4141
class Concat(object):
@@ -120,7 +120,7 @@ def time_f_ordered_axis1(self):
120120
concat(self.frames_f, axis=1, ignore_index=True)
121121

122122

123-
#----------------------------------------------------------------------
123+
# ----------------------------------------------------------------------
124124
# Joins
125125

126126
class Join(object):
@@ -202,7 +202,7 @@ def time_join_non_unique_equal(self):
202202
(self.fracofday * self.temp[self.fracofday.index])
203203

204204

205-
#----------------------------------------------------------------------
205+
# ----------------------------------------------------------------------
206206
# Merges
207207

208208
class Merge(object):
@@ -257,7 +257,31 @@ def time_i8merge(self):
257257
merge(self.left, self.right, how='outer')
258258

259259

260-
#----------------------------------------------------------------------
260+
class MergeCategoricals(object):
261+
goal_time = 0.2
262+
263+
def setup(self):
264+
self.left_object = pd.DataFrame(
265+
{'X': np.random.choice(range(0, 10), size=(10000,)),
266+
'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
267+
268+
self.right_object = pd.DataFrame(
269+
{'X': np.random.choice(range(0, 10), size=(10000,)),
270+
'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
271+
272+
self.left_cat = self.left_object.assign(
273+
Y=self.left_object['Y'].astype('category'))
274+
self.right_cat = self.right_object.assign(
275+
Z=self.right_object['Z'].astype('category'))
276+
277+
def time_merge_object(self):
278+
merge(self.left_object, self.right_object, on='X')
279+
280+
def time_merge_cat(self):
281+
merge(self.left_cat, self.right_cat, on='X')
282+
283+
284+
# ----------------------------------------------------------------------
261285
# Ordered merge
262286

263287
class MergeOrdered(object):
@@ -332,7 +356,7 @@ def time_multiby(self):
332356
merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])
333357

334358

335-
#----------------------------------------------------------------------
359+
# ----------------------------------------------------------------------
336360
# data alignment
337361

338362
class Align(object):

doc/source/categorical.rst

+3
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,9 @@ In this case the categories are not the same and so an error is raised:
646646
647647
The same applies to ``df.append(df_different)``.
648648

649+
See also the section on :ref:`merge dtypes<merging.dtypes>` for notes about preserving merge dtypes and performance.
650+
651+
649652
.. _categorical.union:
650653

651654
Unioning

doc/source/merging.rst

+73
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,79 @@ The ``indicator`` argument will also accept string arguments, in which case the
746746
pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
747747
748748
749+
.. _merging.dtypes:
750+
751+
Merge Dtypes
752+
~~~~~~~~~~~~
753+
754+
.. versionadded:: 0.19.0
755+
756+
Merging will preserve the dtype of the join keys.
757+
758+
.. ipython:: python
759+
760+
left = pd.DataFrame({'key': [1], 'v1': [10]})
761+
left
762+
right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]})
763+
right
764+
765+
We are able to preserve the dtype of the join keys:
766+
767+
.. ipython:: python
768+
769+
pd.merge(left, right, how='outer')
770+
pd.merge(left, right, how='outer').dtypes
771+
772+
Of course, if the merge introduces missing values, then the
773+
resulting dtype will be upcast.
774+
775+
.. ipython:: python
776+
777+
pd.merge(left, right, how='outer', on='key')
778+
pd.merge(left, right, how='outer', on='key').dtypes
779+
780+
.. versionadded:: 0.20.0
781+
782+
Merging will preserve the ``category`` dtypes of the frames being merged.
783+
784+
The left frame.
785+
786+
.. ipython:: python
787+
788+
X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
789+
X = X.astype('category', categories=['foo', 'bar'])
790+
791+
left = DataFrame({'X': X,
792+
'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
793+
left
794+
left.dtypes
795+
796+
The right frame.
797+
798+
.. ipython:: python
799+
800+
right = DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
801+
'Z': [1, 2]})
802+
right
803+
right.dtypes
804+
805+
The merged result:
806+
807+
.. ipython:: python
808+
809+
result = pd.merge(left, right, how='outer')
810+
result
811+
result.dtypes
812+
813+
.. note::
814+
815+
The category dtypes must be *exactly* the same, meaning the same categories and the same ``ordered`` attribute.
816+
Otherwise the result will coerce to ``object`` dtype.
817+
818+
.. note::
819+
820+
Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging.
821+
749822
.. _merging.join.index:
750823

751824
Joining on index

doc/source/whatsnew/v0.20.0.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,7 @@ Other API Changes
692692
- Reorganization of timeseries development tests (:issue:`14854`)
693693
- Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`)
694694
- ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
695-
- ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
695+
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
696696

697697
.. _whatsnew_0200.deprecations:
698698

@@ -733,6 +733,7 @@ Removal of prior version deprecations/changes
733733
- ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:`15098`)
734734
- The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed
735735
in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
736+
- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
736737

737738

738739
.. _whatsnew_0200.performance:
@@ -749,6 +750,7 @@ Performance Improvements
749750
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
750751
- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
751752
- Improved performance when using ``.unstack()`` (:issue:`15503`)
753+
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
752754

753755

754756
.. _whatsnew_0200.bug_fixes:

pandas/core/internals.py

+2
Original file line numberDiff line numberDiff line change
@@ -5227,6 +5227,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
52275227
# External code requested filling/upcasting, bool values must
52285228
# be upcasted to object to avoid being upcasted to numeric.
52295229
values = self.block.astype(np.object_).values
5230+
elif self.block.is_categorical:
5231+
values = self.block.values
52305232
else:
52315233
# No dtype upcasting is done here, it will be performed during
52325234
# concatenation itself.

pandas/tests/test_categorical.py

+3
Original file line numberDiff line numberDiff line change
@@ -4097,9 +4097,12 @@ def test_merge(self):
40974097
expected = df.copy()
40984098

40994099
# object-cat
4100+
# note that we propagate the category
4101+
# because we don't have any matching rows
41004102
cright = right.copy()
41014103
cright['d'] = cright['d'].astype('category')
41024104
result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
4105+
expected['d'] = expected['d'].astype('category', categories=['null'])
41034106
tm.assert_frame_equal(result, expected)
41044107

41054108
# cat-object

0 commit comments

Comments
 (0)