Skip to content

Commit c29c176

Browse files
authored
API: preserve freq in DTI/TDI.factorize (#38120)
1 parent aad85ad commit c29c176

File tree

8 files changed

+104
-24
lines changed

8 files changed

+104
-24
lines changed

doc/source/whatsnew/v1.1.5.rst

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when a one-dimensional tuple was given to select from a :class:`MultiIndex` (:issue:`37711`)
2020
- Fixed regression in in-place operations on a :class:`Series` with an ``ExtensionDtype`` and a NumPy-dtyped operand (:issue:`37910`)
2121
- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
22+
- Fixed regression where a :class:`MultiIndex` constructed from a :class:`DatetimeIndex` did not retain the frequency of that index (:issue:`35563`)
2223
- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
2324
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
2425
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)

pandas/core/algorithms.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,13 @@
4848
pandas_dtype,
4949
)
5050
from pandas.core.dtypes.generic import (
51+
ABCDatetimeArray,
5152
ABCExtensionArray,
5253
ABCIndexClass,
5354
ABCMultiIndex,
5455
ABCRangeIndex,
5556
ABCSeries,
57+
ABCTimedeltaArray,
5658
)
5759
from pandas.core.dtypes.missing import isna, na_value_for_dtype
5860

@@ -199,8 +201,16 @@ def _reconstruct_data(
199201
-------
200202
ExtensionArray or np.ndarray
201203
"""
204+
if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
205+
# Catch DatetimeArray/TimedeltaArray
206+
return values
207+
202208
if is_extension_array_dtype(dtype):
203-
values = dtype.construct_array_type()._from_sequence(values)
209+
cls = dtype.construct_array_type()
210+
if isinstance(values, cls) and values.dtype == dtype:
211+
return values
212+
213+
values = cls._from_sequence(values)
204214
elif is_bool_dtype(dtype):
205215
values = values.astype(dtype, copy=False)
206216

@@ -674,8 +684,13 @@ def factorize(
674684
# responsible only for factorization. All data coercion, sorting and boxing
675685
# should happen here.
676686

687+
if isinstance(values, ABCRangeIndex):
688+
return values.factorize(sort=sort)
689+
677690
values = _ensure_arraylike(values)
678691
original = values
692+
if not isinstance(values, ABCMultiIndex):
693+
values = extract_array(values, extract_numpy=True)
679694

680695
# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
681696
# of values, assign na_sentinel=-1 to replace code value for NaN.
@@ -684,10 +699,20 @@ def factorize(
684699
na_sentinel = -1
685700
dropna = False
686701

687-
if isinstance(values, ABCRangeIndex):
688-
return values.factorize(sort=sort)
689-
elif is_extension_array_dtype(values.dtype):
690-
values = extract_array(values)
702+
if (
703+
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
704+
and values.freq is not None
705+
):
706+
codes, uniques = values.factorize(sort=sort)
707+
if isinstance(original, ABCIndexClass):
708+
uniques = original._shallow_copy(uniques, name=None)
709+
elif isinstance(original, ABCSeries):
710+
from pandas import Index
711+
712+
uniques = Index(uniques)
713+
return codes, uniques
714+
715+
if is_extension_array_dtype(values.dtype):
691716
codes, uniques = values.factorize(na_sentinel=na_sentinel)
692717
dtype = original.dtype
693718
else:

pandas/core/arrays/datetimelike.py

+18
Original file line numberDiff line numberDiff line change
@@ -1645,6 +1645,24 @@ def _with_freq(self, freq):
16451645
arr._freq = freq
16461646
return arr
16471647

1648+
# --------------------------------------------------------------
1649+
1650+
def factorize(self, na_sentinel=-1, sort: bool = False):
1651+
if self.freq is not None:
1652+
# We must be unique, so can short-circuit (and retain freq)
1653+
codes = np.arange(len(self), dtype=np.intp)
1654+
uniques = self.copy() # TODO: copy or view?
1655+
if sort and self.freq.n < 0:
1656+
codes = codes[::-1]
1657+
# TODO: overload __getitem__, a slice indexer returns same type as self
1658+
# error: Incompatible types in assignment (expression has type
1659+
# "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
1660+
# has type "TimelikeOps") [assignment]
1661+
uniques = uniques[::-1] # type: ignore[assignment]
1662+
return codes, uniques
1663+
# FIXME: shouldn't get here; we are ignoring sort
1664+
return super().factorize(na_sentinel=na_sentinel)
1665+
16481666

16491667
# -------------------------------------------------------------------
16501668
# Shared Constructor Helpers

pandas/tests/indexes/datetimes/test_datetime.py

+35-16
Original file line numberDiff line numberDiff line change
@@ -265,10 +265,12 @@ def test_factorize(self):
265265
arr, idx = idx1.factorize()
266266
tm.assert_numpy_array_equal(arr, exp_arr)
267267
tm.assert_index_equal(idx, exp_idx)
268+
assert idx.freq == exp_idx.freq
268269

269270
arr, idx = idx1.factorize(sort=True)
270271
tm.assert_numpy_array_equal(arr, exp_arr)
271272
tm.assert_index_equal(idx, exp_idx)
273+
assert idx.freq == exp_idx.freq
272274

273275
# tz must be preserved
274276
idx1 = idx1.tz_localize("Asia/Tokyo")
@@ -277,6 +279,7 @@ def test_factorize(self):
277279
arr, idx = idx1.factorize()
278280
tm.assert_numpy_array_equal(arr, exp_arr)
279281
tm.assert_index_equal(idx, exp_idx)
282+
assert idx.freq == exp_idx.freq
280283

281284
idx2 = DatetimeIndex(
282285
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"]
@@ -287,49 +290,65 @@ def test_factorize(self):
287290
arr, idx = idx2.factorize(sort=True)
288291
tm.assert_numpy_array_equal(arr, exp_arr)
289292
tm.assert_index_equal(idx, exp_idx)
293+
assert idx.freq == exp_idx.freq
290294

291295
exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
292296
exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"])
293297
arr, idx = idx2.factorize()
294298
tm.assert_numpy_array_equal(arr, exp_arr)
295299
tm.assert_index_equal(idx, exp_idx)
300+
assert idx.freq == exp_idx.freq
296301

297-
# freq must be preserved
302+
def test_factorize_preserves_freq(self):
303+
# GH#38120 freq should be preserved
298304
idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo")
299305
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
306+
300307
arr, idx = idx3.factorize()
301308
tm.assert_numpy_array_equal(arr, exp_arr)
302309
tm.assert_index_equal(idx, idx3)
310+
assert idx.freq == idx3.freq
311+
312+
arr, idx = pd.factorize(idx3)
313+
tm.assert_numpy_array_equal(arr, exp_arr)
314+
tm.assert_index_equal(idx, idx3)
315+
assert idx.freq == idx3.freq
303316

304-
def test_factorize_tz(self, tz_naive_fixture):
317+
def test_factorize_tz(self, tz_naive_fixture, index_or_series):
305318
tz = tz_naive_fixture
306319
# GH#13750
307320
base = date_range("2016-11-05", freq="H", periods=100, tz=tz)
308321
idx = base.repeat(5)
309322

310323
exp_arr = np.arange(100, dtype=np.intp).repeat(5)
311324

312-
for obj in [idx, pd.Series(idx)]:
313-
arr, res = obj.factorize()
314-
tm.assert_numpy_array_equal(arr, exp_arr)
315-
expected = base._with_freq(None)
316-
tm.assert_index_equal(res, expected)
325+
obj = index_or_series(idx)
326+
327+
arr, res = obj.factorize()
328+
tm.assert_numpy_array_equal(arr, exp_arr)
329+
expected = base._with_freq(None)
330+
tm.assert_index_equal(res, expected)
331+
assert res.freq == expected.freq
317332

318-
def test_factorize_dst(self):
333+
def test_factorize_dst(self, index_or_series):
319334
# GH 13750
320335
idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern")
336+
obj = index_or_series(idx)
321337

322-
for obj in [idx, pd.Series(idx)]:
323-
arr, res = obj.factorize()
324-
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
325-
tm.assert_index_equal(res, idx)
338+
arr, res = obj.factorize()
339+
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
340+
tm.assert_index_equal(res, idx)
341+
if index_or_series is Index:
342+
assert res.freq == idx.freq
326343

327344
idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern")
345+
obj = index_or_series(idx)
328346

329-
for obj in [idx, pd.Series(idx)]:
330-
arr, res = obj.factorize()
331-
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
332-
tm.assert_index_equal(res, idx)
347+
arr, res = obj.factorize()
348+
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
349+
tm.assert_index_equal(res, idx)
350+
if index_or_series is Index:
351+
assert res.freq == idx.freq
333352

334353
@pytest.mark.parametrize(
335354
"arr, expected",

pandas/tests/indexes/timedeltas/test_timedelta.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -75,17 +75,26 @@ def test_factorize(self):
7575
arr, idx = idx1.factorize()
7676
tm.assert_numpy_array_equal(arr, exp_arr)
7777
tm.assert_index_equal(idx, exp_idx)
78+
assert idx.freq == exp_idx.freq
7879

7980
arr, idx = idx1.factorize(sort=True)
8081
tm.assert_numpy_array_equal(arr, exp_arr)
8182
tm.assert_index_equal(idx, exp_idx)
83+
assert idx.freq == exp_idx.freq
8284

83-
# freq must be preserved
85+
def test_factorize_preserves_freq(self):
86+
# GH#38120 freq should be preserved
8487
idx3 = timedelta_range("1 day", periods=4, freq="s")
8588
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
8689
arr, idx = idx3.factorize()
8790
tm.assert_numpy_array_equal(arr, exp_arr)
8891
tm.assert_index_equal(idx, idx3)
92+
assert idx.freq == idx3.freq
93+
94+
arr, idx = pd.factorize(idx3)
95+
tm.assert_numpy_array_equal(arr, exp_arr)
96+
tm.assert_index_equal(idx, idx3)
97+
assert idx.freq == idx3.freq
8998

9099
def test_sort_values(self):
91100

pandas/tests/indexing/multiindex/test_multiindex.py

+10
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,13 @@ def test_nested_tuples_duplicates(self):
8383
df3 = df.copy(deep=True)
8484
df3.loc[[(dti[0], "a")], "c2"] = 1.0
8585
tm.assert_frame_equal(df3, expected)
86+
87+
def test_multiindex_with_datatime_level_preserves_freq(self):
88+
# https://github.com/pandas-dev/pandas/issues/35563
89+
idx = Index(range(2), name="A")
90+
dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
91+
mi = MultiIndex.from_product([idx, dti])
92+
df = DataFrame(np.random.randn(14, 2), index=mi)
93+
result = df.loc[0].index
94+
tm.assert_index_equal(result, dti)
95+
assert result.freq == dti.freq

pandas/tests/window/moments/test_moments_consistency_ewm.py

-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ def test_ewm_pairwise_cov_corr(func, frame):
1111
result = result.loc[(slice(None), 1), 5]
1212
result.index = result.index.droplevel(1)
1313
expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5])
14-
expected.index = expected.index._with_freq(None)
1514
tm.assert_series_equal(result, expected, check_names=False)
1615

1716

pandas/tests/window/moments/test_moments_consistency_rolling.py

-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ def test_rolling_pairwise_cov_corr(func, frame):
5151
result = result.loc[(slice(None), 1), 5]
5252
result.index = result.index.droplevel(1)
5353
expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5])
54-
expected.index = expected.index._with_freq(None)
5554
tm.assert_series_equal(result, expected, check_names=False)
5655

5756

0 commit comments

Comments
 (0)