diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index edc2f7327abfc..4770ab37e08d2 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 713d58b4df5be..840e79c6c9ebe 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -48,11 +48,13 @@ pandas_dtype, ) from pandas.core.dtypes.generic import ( + ABCDatetimeArray, ABCExtensionArray, ABCIndexClass, ABCMultiIndex, ABCRangeIndex, ABCSeries, + ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -199,8 +201,16 @@ def _reconstruct_data( ------- ExtensionArray or np.ndarray """ + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -674,8 +684,13 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) + values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques # of values, assign na_sentinel=-1 to replace code value for NaN. @@ -684,10 +699,20 @@ def factorize( na_sentinel = -1 dropna = False - if isinstance(values, ABCRangeIndex): - return values.factorize(sort=sort) - elif is_extension_array_dtype(values.dtype): - values = extract_array(values) + if ( + isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) + and values.freq is not None + ): + codes, uniques = values.factorize(sort=sort) + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index + + uniques = Index(uniques) + return codes, uniques + + if is_extension_array_dtype(values.dtype): codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8fa2c734092f4..979ca70bad9b3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1645,6 +1645,24 @@ def _with_freq(self, freq): arr._freq = freq return arr + # -------------------------------------------------------------- + + def factorize(self, na_sentinel=-1, sort: bool = False): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? + if sort and self.freq.n < 0: + codes = codes[::-1] + # TODO: overload __getitem__, a slice indexer returns same type as self + # error: Incompatible types in assignment (expression has type + # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable + # has type "TimelikeOps") [assignment] + uniques = uniques[::-1] # type: ignore[assignment] + return codes, uniques + # FIXME: shouldn't get here; we are ignoring sort + return super().factorize(na_sentinel=na_sentinel) + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 2657fc817ec3a..789510b452969 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -265,10 +265,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -277,6 +279,7 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq idx2 = DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -287,21 +290,31 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture): + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 base = date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -309,27 +322,33 @@ def test_factorize_tz(self, tz_naive_fixture): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq - def test_factorize_dst(self): + def test_factorize_dst(self, index_or_series): # GH 13750 idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 774370ed866da..f0e730eecf3d5 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -75,17 +75,26 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq def test_sort_values(self): diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index a3b8d66c92024..9a3039c28416c 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -83,3 +83,13 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) + + def test_multiindex_with_datatime_level_preserves_freq(self): + # https://github.com/pandas-dev/pandas/issues/35563 + idx = Index(range(2), name="A") + dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") + mi = MultiIndex.from_product([idx, dti]) + df = DataFrame(np.random.randn(14, 2), index=mi) + result = df.loc[0].index + tm.assert_index_equal(result, dti) + assert result.freq == dti.freq diff --git a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py index aa3453680190b..57665b47dea7f 100644 --- a/pandas/tests/window/moments/test_moments_consistency_ewm.py +++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py @@ -11,7 +11,6 @@ def test_ewm_pairwise_cov_corr(func, frame): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5]) - expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 802ece77fd36d..53e5354340dcc 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -51,7 +51,6 @@ def test_rolling_pairwise_cov_corr(func, frame): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5]) - expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False)