Skip to content

Commit 8f309db

Browse files
dwillmer authored and jreback committed
BUG in merging categorical dates
closes #16900

Author: Dave Willmer <[email protected]>

This patch had conflicts when merged, resolved by
Committer: Jeff Reback <[email protected]>

Closes #16986 from dwillmer/cat_fix and squashes the following commits:

1ea1977 [Dave Willmer] Minor tweaks + comment
21a35a0 [Dave Willmer] Merge branch 'cat_fix' of https://github.com/dwillmer/pandas into cat_fix
04d5404 [Dave Willmer] Update tests
3cc5c24 [Dave Willmer] Merge branch 'master' into cat_fix
5e8e23b [Dave Willmer] Add whatsnew item
b82d117 [Dave Willmer] Lint fixes
a81933d [Dave Willmer] Remove unused import
218da66 [Dave Willmer] Generic solution to categorical problem
48e7163 [Dave Willmer] Test inner join
8843c10 [Dave Willmer] Fix TypeError when merging categorical dates
1 parent a2d03d4 commit 8f309db

File tree

3 files changed

+55
-9
lines changed

3 files changed

+55
-9
lines changed

doc/source/whatsnew/v0.21.0.txt

+2-1
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,9 @@ Sparse
224224

225225
Reshaping
226226
^^^^^^^^^
227-
- Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`)
227+
- Joining/Merging with a non unique ``PeriodIndex`` raised a ``TypeError`` (:issue:`16871`)
228228
- Bug in :func:`crosstab` where non-aligned series of integers were casted to float (:issue:`17005`)
229+
- Bug in merging with categorical dtypes with datetimelikes incorrectly raised a ``TypeError`` (:issue:`16900`)
229230
- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`)
230231
- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`)
231232

pandas/core/reshape/merge.py

+18-7
Original file line number | Diff line number | Diff line change
@@ -878,7 +878,7 @@ def _get_merge_keys(self):
878878
return left_keys, right_keys, join_names
879879

880880
def _maybe_coerce_merge_keys(self):
881-
# we have valid mergee's but we may have to further
881+
# we have valid mergees but we may have to further
882882
# coerce these if they are originally incompatible types
883883
#
884884
# for example if these are categorical, but are not dtype_equal
@@ -890,12 +890,16 @@ def _maybe_coerce_merge_keys(self):
890890
if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
891891
continue
892892

893+
lk_is_cat = is_categorical_dtype(lk)
894+
rk_is_cat = is_categorical_dtype(rk)
895+
893896
# if either left or right is a categorical
894897
# then the must match exactly in categories & ordered
895-
if is_categorical_dtype(lk) and is_categorical_dtype(rk):
898+
if lk_is_cat and rk_is_cat:
896899
if lk.is_dtype_equal(rk):
897900
continue
898-
elif is_categorical_dtype(lk) or is_categorical_dtype(rk):
901+
902+
elif lk_is_cat or rk_is_cat:
899903
pass
900904

901905
elif is_dtype_equal(lk.dtype, rk.dtype):
@@ -905,7 +909,7 @@ def _maybe_coerce_merge_keys(self):
905909
# kinds to proceed, eg. int64 and int8
906910
# further if we are object, but we infer to
907911
# the same, then proceed
908-
if (is_numeric_dtype(lk) and is_numeric_dtype(rk)):
912+
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
909913
if lk.dtype.kind == rk.dtype.kind:
910914
continue
911915

@@ -914,13 +918,20 @@ def _maybe_coerce_merge_keys(self):
914918
continue
915919

916920
# Houston, we have a problem!
917-
# let's coerce to object
921+
# let's coerce to object if the dtypes aren't
922+
# categorical, otherwise coerce to the category
923+
# dtype. If we coerced categories to object,
924+
# then we would lose type information on some
925+
# columns, and end up trying to merge
926+
# incompatible dtypes. See GH 16900.
918927
if name in self.left.columns:
928+
typ = lk.categories.dtype if lk_is_cat else object
919929
self.left = self.left.assign(
920-
**{name: self.left[name].astype(object)})
930+
**{name: self.left[name].astype(typ)})
921931
if name in self.right.columns:
932+
typ = rk.categories.dtype if rk_is_cat else object
922933
self.right = self.right.assign(
923-
**{name: self.right[name].astype(object)})
934+
**{name: self.right[name].astype(typ)})
924935

925936
def _validate_specification(self):
926937
# Hm, any way to make this logic less complicated??

pandas/tests/reshape/test_merge.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# pylint: disable=E1103
22

33
import pytest
4-
from datetime import datetime
4+
from datetime import datetime, date
55
from numpy.random import randn
66
from numpy import nan
77
import numpy as np
@@ -1515,6 +1515,40 @@ def test_self_join_multiple_categories(self):
15151515

15161516
assert_frame_equal(result, df)
15171517

1518+
def test_dtype_on_categorical_dates(self):
1519+
# GH 16900
1520+
# dates should not be coerced to ints
1521+
1522+
df = pd.DataFrame(
1523+
[[date(2001, 1, 1), 1.1],
1524+
[date(2001, 1, 2), 1.3]],
1525+
columns=['date', 'num2']
1526+
)
1527+
df['date'] = df['date'].astype('category')
1528+
1529+
df2 = pd.DataFrame(
1530+
[[date(2001, 1, 1), 1.3],
1531+
[date(2001, 1, 3), 1.4]],
1532+
columns=['date', 'num4']
1533+
)
1534+
df2['date'] = df2['date'].astype('category')
1535+
1536+
expected_outer = pd.DataFrame([
1537+
[pd.Timestamp('2001-01-01'), 1.1, 1.3],
1538+
[pd.Timestamp('2001-01-02'), 1.3, np.nan],
1539+
[pd.Timestamp('2001-01-03'), np.nan, 1.4]],
1540+
columns=['date', 'num2', 'num4']
1541+
)
1542+
result_outer = pd.merge(df, df2, how='outer', on=['date'])
1543+
assert_frame_equal(result_outer, expected_outer)
1544+
1545+
expected_inner = pd.DataFrame(
1546+
[[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
1547+
columns=['date', 'num2', 'num4']
1548+
)
1549+
result_inner = pd.merge(df, df2, how='inner', on=['date'])
1550+
assert_frame_equal(result_inner, expected_inner)
1551+
15181552

15191553
@pytest.fixture
15201554
def left_df():

0 commit comments

Comments
 (0)