diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index eca1733b61a52..7ac9cef0d2412 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -184,8 +184,16 @@ def _reconstruct_data(values, dtype, original): ------- Index for extension types, otherwise ndarray casted to dtype """ + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -613,9 +621,11 @@ def factorize( values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) - if is_extension_array_dtype(values.dtype): - values = extract_array(values) + if isinstance(values, ABCExtensionArray): + # Includes DatetimeArray, TimedeltaArray codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c325ec0d0bf7c..22ffd00ca1393 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -437,6 +437,13 @@ def _with_freq(self, freq): arr._freq = freq return arr + def factorize(self, na_sentinel=-1): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + return codes, self.copy() + return ExtensionArray.factorize(self, na_sentinel=na_sentinel) + DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e3fbb906ed6b1..222e9d8edd454 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -112,7 +112,7 @@ def f(self): return property(f) -class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): +class DatetimeArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a460d07e1f6f2..cc5ad95036a1c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -57,7 +57,7 @@ def f(self): return property(f) -class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): +class TimedeltaArray(dtl.TimelikeOps, dtl.DatetimeLikeArrayMixin): """ Pandas ExtensionArray for timedelta data. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index decb17cdf6672..c6f79de61053e 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -602,6 +602,15 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): result._cache = cache return result + def factorize(self, sort=False, na_sentinel=-1): + if self.freq is not None and sort is False: + # we are unique, so can short-circuit, also can preserve freq + codes = np.arange(len(self), dtype=np.intp) + return codes, self.copy() + # TODO: In the sort=True case we could check for montonic_decreasing + # and operate on self[::-1] + return super().factorize(sort=sort, na_sentinel=na_sentinel) + # -------------------------------------------------------------------- # Set Operation Methods diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e0e5beaf48e20..aa3a963efb735 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -328,10 +328,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -340,6 +342,7 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq idx2 = pd.DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -350,21 +353,31 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#33836 freq should be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture): + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -372,27 +385,33 @@ def test_factorize_tz(self, tz_naive_fixture): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq - def test_factorize_dst(self): + def test_factorize_dst(self, index_or_series): # GH 13750 idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 5efa1a75700e0..fa82ccb68989b 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -76,17 +76,26 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#33836 freq should be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq def test_sort_values(self):