Skip to content

Commit 6a13450

Browse files
authored
BUG/API: preserve non-nano in factorize/unique (#51978)
* BUG/API: preserve non-nano in factorize/unique
* test
1 parent b02ffe2 commit 6a13450

File tree

8 files changed

+48
-50
lines changed

8 files changed

+48
-50
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Other API changes
768768
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
769769
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
770770
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)
771+
- :func:`factorize` and :func:`unique` preserve the original dtype when passed numpy timedelta64 or datetime64 with non-nanosecond resolution (:issue:`48670`)
771772

772773
.. note::
773774

pandas/core/algorithms.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
from pandas.core.dtypes.cast import (
3636
construct_1d_object_array_from_listlike,
3737
infer_dtype_from_array,
38-
sanitize_to_nanoseconds,
3938
)
4039
from pandas.core.dtypes.common import (
4140
ensure_float64,
@@ -45,7 +44,6 @@
4544
is_bool_dtype,
4645
is_categorical_dtype,
4746
is_complex_dtype,
48-
is_datetime64_dtype,
4947
is_dict_like,
5048
is_extension_array_dtype,
5149
is_float_dtype,
@@ -56,7 +54,6 @@
5654
is_object_dtype,
5755
is_scalar,
5856
is_signed_integer_dtype,
59-
is_timedelta64_dtype,
6057
needs_i8_conversion,
6158
)
6259
from pandas.core.dtypes.concat import concat_compat
@@ -175,8 +172,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
175172

176173
# datetimelike
177174
elif needs_i8_conversion(values.dtype):
178-
if isinstance(values, np.ndarray):
179-
values = sanitize_to_nanoseconds(values)
180175
npvalues = values.view("i8")
181176
npvalues = cast(np.ndarray, npvalues)
182177
return npvalues
@@ -214,11 +209,6 @@ def _reconstruct_data(
214209
values = cls._from_sequence(values, dtype=dtype)
215210

216211
else:
217-
if is_datetime64_dtype(dtype):
218-
dtype = np.dtype("datetime64[ns]")
219-
elif is_timedelta64_dtype(dtype):
220-
dtype = np.dtype("timedelta64[ns]")
221-
222212
values = values.astype(dtype, copy=False)
223213

224214
return values
@@ -769,7 +759,8 @@ def factorize(
769759
codes, uniques = values.factorize(sort=sort)
770760
return codes, uniques
771761

772-
elif not isinstance(values.dtype, np.dtype):
762+
elif not isinstance(values, np.ndarray):
763+
# i.e. ExtensionArray
773764
codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
774765

775766
else:

pandas/core/dtypes/cast.py

-20
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
OutOfBoundsTimedelta,
3232
Timedelta,
3333
Timestamp,
34-
astype_overflowsafe,
3534
get_unit_from_dtype,
3635
is_supported_unit,
3736
)
@@ -42,8 +41,6 @@
4241
)
4342

4443
from pandas.core.dtypes.common import (
45-
DT64NS_DTYPE,
46-
TD64NS_DTYPE,
4744
ensure_int8,
4845
ensure_int16,
4946
ensure_int32,
@@ -1232,23 +1229,6 @@ def maybe_cast_to_datetime(
12321229
return dta
12331230

12341231

1235-
def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray:
1236-
"""
1237-
Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond.
1238-
"""
1239-
dtype = values.dtype
1240-
if dtype.kind == "M" and dtype != DT64NS_DTYPE:
1241-
values = astype_overflowsafe(values, dtype=DT64NS_DTYPE)
1242-
1243-
elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
1244-
values = astype_overflowsafe(values, dtype=TD64NS_DTYPE)
1245-
1246-
elif copy:
1247-
values = values.copy()
1248-
1249-
return values
1250-
1251-
12521232
def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
12531233
"""
12541234
Convert dtypes with granularity less than nanosecond to nanosecond

pandas/core/tools/datetimes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ def _convert_and_box_cache(
307307
"""
308308
from pandas import Series
309309

310-
result = Series(arg).map(cache_array)
310+
result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
311311
return _box_as_indexlike(result._values, utc=False, name=name)
312312

313313

pandas/tests/indexes/datetimes/methods/test_factorize.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import numpy as np
2+
import pytest
23

34
from pandas import (
45
DatetimeIndex,
@@ -105,3 +106,20 @@ def test_factorize_dst(self, index_or_series):
105106
tm.assert_index_equal(res, idx)
106107
if index_or_series is Index:
107108
assert res.freq == idx.freq
109+
110+
@pytest.mark.parametrize("sort", [True, False])
111+
def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort):
112+
# GH#51978 case that does not go through the fastpath based on
113+
# non-None freq
114+
tz = tz_naive_fixture
115+
idx = date_range("2016-11-06", freq="H", periods=5, tz=tz)[[0, 4, 1, 3, 2]]
116+
exp_codes, exp_uniques = idx.factorize(sort=sort)
117+
118+
res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort)
119+
120+
tm.assert_numpy_array_equal(res_codes, exp_codes)
121+
tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s"))
122+
123+
res_codes, res_uniques = idx.as_unit("s").to_series().factorize(sort=sort)
124+
tm.assert_numpy_array_equal(res_codes, exp_codes)
125+
tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s"))

pandas/tests/io/parser/test_parse_dates.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1657,8 +1657,8 @@ def date_parser(dt, time):
16571657
datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
16581658
expected = DataFrame(
16591659
data={"rxstatus": ["00E80000"] * 3},
1660-
index=MultiIndex.from_tuples(
1661-
[(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)],
1660+
index=MultiIndex.from_arrays(
1661+
[datetimes, [126, 23, 13]],
16621662
names=["datetime", "prn"],
16631663
),
16641664
)

pandas/tests/test_algos.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ def test_object_factorize(self, writable):
327327

328328
def test_datetime64_factorize(self, writable):
329329
# GH35650 Verify whether read-only datetime64 array can be factorized
330-
data = np.array([np.datetime64("2020-01-01T00:00:00.000")])
330+
data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
331331
data.setflags(write=writable)
332332
expected_codes = np.array([0], dtype=np.intp)
333333
expected_uniques = np.array(
@@ -620,13 +620,13 @@ def test_datetime64_dtype_array_returned(self):
620620
def test_datetime_non_ns(self):
621621
a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
622622
result = pd.unique(a)
623-
expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
623+
expected = np.array(["2000", "2001"], dtype="datetime64[s]")
624624
tm.assert_numpy_array_equal(result, expected)
625625

626626
def test_timedelta_non_ns(self):
627627
a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
628628
result = pd.unique(a)
629-
expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
629+
expected = np.array([2000, 2001], dtype="timedelta64[s]")
630630
tm.assert_numpy_array_equal(result, expected)
631631

632632
def test_timedelta64_dtype_array_returned(self):

pandas/tests/tools/test_to_datetime.py

+21-13
Original file line numberDiff line numberDiff line change
@@ -1076,31 +1076,39 @@ def test_to_datetime_array_of_dt64s(self, cache, unit):
10761076
# Assuming all datetimes are in bounds, to_datetime() returns
10771077
# an array that is equal to Timestamp() parsing
10781078
result = to_datetime(dts, cache=cache)
1079-
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")
1079+
if cache:
1080+
# FIXME: behavior should not depend on cache
1081+
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]")
1082+
else:
1083+
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")
1084+
10801085
tm.assert_index_equal(result, expected)
10811086

10821087
# A list of datetimes where the last one is out of bounds
10831088
dts_with_oob = dts + [np.datetime64("9999-01-01")]
10841089

1085-
msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00"
1086-
with pytest.raises(OutOfBoundsDatetime, match=msg):
1087-
to_datetime(dts_with_oob, errors="raise")
1090+
# As of GH#51978 we do not raise in this case
1091+
to_datetime(dts_with_oob, errors="raise")
10881092

1089-
tm.assert_index_equal(
1090-
to_datetime(dts_with_oob, errors="coerce", cache=cache),
1091-
DatetimeIndex(
1093+
result = to_datetime(dts_with_oob, errors="coerce", cache=cache)
1094+
if not cache:
1095+
# FIXME: shouldn't depend on cache!
1096+
expected = DatetimeIndex(
10921097
[Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30
10931098
+ [NaT],
1094-
),
1095-
)
1099+
)
1100+
else:
1101+
expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]"))
1102+
tm.assert_index_equal(result, expected)
10961103

10971104
# With errors='ignore', out of bounds datetime64s
10981105
# are converted to their .item(), which depending on the version of
10991106
# numpy is either a python datetime.datetime or datetime.date
1100-
tm.assert_index_equal(
1101-
to_datetime(dts_with_oob, errors="ignore", cache=cache),
1102-
Index(dts_with_oob),
1103-
)
1107+
result = to_datetime(dts_with_oob, errors="ignore", cache=cache)
1108+
if not cache:
1109+
# FIXME: shouldn't depend on cache!
1110+
expected = Index(dts_with_oob)
1111+
tm.assert_index_equal(result, expected)
11041112

11051113
def test_out_of_bounds_errors_ignore(self):
11061114
# https://github.com/pandas-dev/pandas/issues/50587

0 commit comments

Comments
 (0)