Skip to content

Commit a478fde

Browse files
authored
BUG: Categorical with non-nano dt64 (#38791)
1 parent f5ef132 commit a478fde

File tree

4 files changed

+112
-35
lines changed

4 files changed

+112
-35
lines changed

pandas/core/arrays/categorical.py

+4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
coerce_indexer_dtype,
3131
maybe_cast_to_extension_array,
3232
maybe_infer_to_datetimelike,
33+
sanitize_to_nanoseconds,
3334
)
3435
from pandas.core.dtypes.common import (
3536
ensure_int64,
@@ -366,6 +367,9 @@ def __init__(
366367
values = [values[idx] for idx in np.where(~null_mask)[0]]
367368
values = sanitize_array(values, None, dtype=sanitize_dtype)
368369

370+
else:
371+
values = sanitize_to_nanoseconds(values)
372+
369373
if dtype.categories is None:
370374
try:
371375
codes, categories = factorize(values, sort=True)

pandas/core/dtypes/cast.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -1521,13 +1521,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
15211521
# catch a datetime/timedelta that is not of ns variety
15221522
# and no coercion specified
15231523
if is_array and value.dtype.kind in ["M", "m"]:
1524-
dtype = value.dtype
1525-
1526-
if dtype.kind == "M" and dtype != DT64NS_DTYPE:
1527-
value = conversion.ensure_datetime64ns(value)
1528-
1529-
elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
1530-
value = conversion.ensure_timedelta64ns(value)
1524+
value = sanitize_to_nanoseconds(value)
15311525

15321526
# only do this if we have an array and the dtype of the array is not
15331527
# setup already we are not an integer/object, so don't bother with this
@@ -1543,6 +1537,20 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]):
15431537
return value
15441538

15451539

1540+
def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray:
    """
    Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond.

    Parameters
    ----------
    values : np.ndarray
        Array to inspect. Only its ``dtype`` decides whether a conversion
        is performed.

    Returns
    -------
    np.ndarray
        The result of ``conversion.ensure_datetime64ns`` for datetime64
        input whose dtype differs from ``DT64NS_DTYPE``, the result of
        ``conversion.ensure_timedelta64ns`` for timedelta64 input whose
        dtype differs from ``TD64NS_DTYPE``, and otherwise the very same
        ``values`` object that was passed in.
    """
    kind = values.dtype.kind

    if kind == "M":
        # datetime64: convert unless the dtype already equals DT64NS_DTYPE
        if values.dtype != DT64NS_DTYPE:
            return conversion.ensure_datetime64ns(values)
    elif kind == "m":
        # timedelta64: convert unless the dtype already equals TD64NS_DTYPE
        if values.dtype != TD64NS_DTYPE:
            return conversion.ensure_timedelta64ns(values)

    # Any other dtype kind, or already the target dtype: pass through as-is.
    return values
1552+
1553+
15461554
def find_common_type(types: List[DtypeObj]) -> DtypeObj:
15471555
"""
15481556
Find a common data type among the given dtypes.

pandas/tests/arrays/categorical/test_constructors.py

+13
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.compat import IS64, is_platform_windows
7+
68
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
79
from pandas.core.dtypes.dtypes import CategoricalDtype
810

@@ -723,3 +725,14 @@ def test_from_sequence_copy(self):
723725
result = Categorical._from_sequence(cat, dtype=None, copy=True)
724726

725727
assert not np.shares_memory(result._codes, cat._codes)
728+
729+
@pytest.mark.xfail(
    not IS64 or is_platform_windows(),
    reason="Incorrectly raising in ensure_datetime64ns",
)
def test_constructor_datetime64_non_nano(self):
    # Categories are the integers 0..9 reinterpreted as datetime64[D].
    all_days = np.arange(10).view("M8[D]")
    # Every other category, copied so the values do not alias the categories.
    every_other = all_days[::2].copy()

    result = Categorical(every_other, categories=all_days)

    # The categorical must compare equal, element-wise, to the values
    # it was constructed from.
    matches = result == every_other
    assert matches.all()

pandas/tests/series/methods/test_drop_duplicates.py

+80-28
Original file line numberDiff line numberDiff line change
@@ -67,72 +67,124 @@ def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
6767

6868

6969
class TestSeriesDropDuplicates:
70-
@pytest.mark.parametrize(
71-
"dtype",
72-
["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"],
70+
@pytest.fixture(
71+
params=["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"]
7372
)
74-
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
75-
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
73+
def dtype(self, request):
74+
return request.param
7675

76+
@pytest.fixture
77+
def cat_series1(self, dtype, ordered):
7778
# Test case 1
79+
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
80+
7881
input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
79-
tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered))
80-
if dtype == "datetime64[D]":
81-
# pre-empty flaky xfail, tc1 values are seemingly-random
82-
if not (np.array(tc1) == input1).all():
83-
pytest.xfail(reason="GH#7996")
82+
cat = Categorical(input1, categories=cat_array, ordered=ordered)
83+
tc1 = Series(cat)
84+
return tc1
85+
86+
def test_drop_duplicates_categorical_non_bool(self, cat_series1):
87+
tc1 = cat_series1
8488

8589
expected = Series([False, False, False, True])
86-
tm.assert_series_equal(tc1.duplicated(), expected)
87-
tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
90+
91+
result = tc1.duplicated()
92+
tm.assert_series_equal(result, expected)
93+
94+
result = tc1.drop_duplicates()
95+
tm.assert_series_equal(result, tc1[~expected])
96+
8897
sc = tc1.copy()
8998
return_value = sc.drop_duplicates(inplace=True)
9099
assert return_value is None
91100
tm.assert_series_equal(sc, tc1[~expected])
92101

102+
def test_drop_duplicates_categorical_non_bool_keeplast(self, cat_series1):
103+
tc1 = cat_series1
104+
93105
expected = Series([False, False, True, False])
94-
tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
95-
tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected])
106+
107+
result = tc1.duplicated(keep="last")
108+
tm.assert_series_equal(result, expected)
109+
110+
result = tc1.drop_duplicates(keep="last")
111+
tm.assert_series_equal(result, tc1[~expected])
112+
96113
sc = tc1.copy()
97114
return_value = sc.drop_duplicates(keep="last", inplace=True)
98115
assert return_value is None
99116
tm.assert_series_equal(sc, tc1[~expected])
100117

118+
def test_drop_duplicates_categorical_non_bool_keepfalse(self, cat_series1):
119+
tc1 = cat_series1
120+
101121
expected = Series([False, False, True, True])
102-
tm.assert_series_equal(tc1.duplicated(keep=False), expected)
103-
tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
122+
123+
result = tc1.duplicated(keep=False)
124+
tm.assert_series_equal(result, expected)
125+
126+
result = tc1.drop_duplicates(keep=False)
127+
tm.assert_series_equal(result, tc1[~expected])
128+
104129
sc = tc1.copy()
105130
return_value = sc.drop_duplicates(keep=False, inplace=True)
106131
assert return_value is None
107132
tm.assert_series_equal(sc, tc1[~expected])
108133

109-
# Test case 2
134+
@pytest.fixture
135+
def cat_series2(self, dtype, ordered):
136+
# Test case 2; TODO: better name
137+
cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
138+
110139
input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
111-
tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered))
112-
if dtype == "datetime64[D]":
113-
# pre-empty flaky xfail, tc2 values are seemingly-random
114-
if not (np.array(tc2) == input2).all():
115-
pytest.xfail(reason="GH#7996")
140+
cat = Categorical(input2, categories=cat_array, ordered=ordered)
141+
tc2 = Series(cat)
142+
return tc2
143+
144+
def test_drop_duplicates_categorical_non_bool2(self, cat_series2):
145+
# Test case 2; TODO: better name
146+
tc2 = cat_series2
116147

117148
expected = Series([False, False, False, False, True, True, False])
118-
tm.assert_series_equal(tc2.duplicated(), expected)
119-
tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
149+
150+
result = tc2.duplicated()
151+
tm.assert_series_equal(result, expected)
152+
153+
result = tc2.drop_duplicates()
154+
tm.assert_series_equal(result, tc2[~expected])
155+
120156
sc = tc2.copy()
121157
return_value = sc.drop_duplicates(inplace=True)
122158
assert return_value is None
123159
tm.assert_series_equal(sc, tc2[~expected])
124160

161+
def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series2):
162+
tc2 = cat_series2
163+
125164
expected = Series([False, True, True, False, False, False, False])
126-
tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
127-
tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected])
165+
166+
result = tc2.duplicated(keep="last")
167+
tm.assert_series_equal(result, expected)
168+
169+
result = tc2.drop_duplicates(keep="last")
170+
tm.assert_series_equal(result, tc2[~expected])
171+
128172
sc = tc2.copy()
129173
return_value = sc.drop_duplicates(keep="last", inplace=True)
130174
assert return_value is None
131175
tm.assert_series_equal(sc, tc2[~expected])
132176

177+
def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series2):
178+
tc2 = cat_series2
179+
133180
expected = Series([False, True, True, False, True, True, False])
134-
tm.assert_series_equal(tc2.duplicated(keep=False), expected)
135-
tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
181+
182+
result = tc2.duplicated(keep=False)
183+
tm.assert_series_equal(result, expected)
184+
185+
result = tc2.drop_duplicates(keep=False)
186+
tm.assert_series_equal(result, tc2[~expected])
187+
136188
sc = tc2.copy()
137189
return_value = sc.drop_duplicates(keep=False, inplace=True)
138190
assert return_value is None

0 commit comments

Comments
 (0)