From da47af8a33f089e58d700cf866e10ea438e33467 Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 15:20:35 -0400 Subject: [PATCH 1/6] fix difference functionality for period indexes, add additional tests --- pandas/core/indexes/base.py | 8 +++++++- pandas/core/indexes/period.py | 6 ++++++ pandas/tests/indexes/period/test_setops.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 71dfff520113c..12cc16e232d91 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3406,7 +3406,7 @@ def difference(self, other, sort=None): return self._wrap_difference_result(other, result) def _difference(self, other, sort): - # overridden by RangeIndex + # overridden by RangeIndex, PeriodIndex this = self if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: this = this.dropna() @@ -6209,6 +6209,12 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # let's instead try with a straight Index self = Index(self._values) + elif self.inferred_type == 'string' and isinstance(other, ABCPeriodIndex): + try: + return self.astype(other.dtype), other + except: + return self, other + if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_downcast_for_indexing(self) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index edd1fdd4da943..ea345c0d10aa4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -514,6 +514,12 @@ def shift(self, periods: int = 1, freq=None) -> Self: ) return self + periods + def _difference(self, other, sort=None): + if not isinstance(other, PeriodIndex): + self, other = self._maybe_downcast_for_indexing(other) + + return super()._difference(other, sort=sort) + def period_range( start=None, diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 2fa7e8cd0d2df..94b9e40279dba 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -314,6 +314,19 @@ def test_difference(self, sort): expected = expected.sort_values() tm.assert_index_equal(result_difference, expected) + def test_difference_mismatched_dtypes(self, sort): + # GH58971 + index = pd.period_range('2022-01-01', periods=5, freq='M') + other = pd.Index(['2022-02', '2022-03']) + + idx_diff = index.difference(other, sort) + expected = pd.PeriodIndex(['2022-01', '2022-04', '2022-05'], freq='M') + tm.assert_index_equal(idx_diff, expected) + + idx_diff = other.difference(index, sort) + expected = pd.Index([]) + tm.assert_index_equal(idx_diff, expected) + def test_difference_freq(self, sort): # GH14323: difference of Period MUST preserve frequency # but the ability to union results must be preserved From 324704871105cf5c28876caf935c42bdc0208b79 Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 15:37:57 -0400 Subject: [PATCH 2/6] update whatsnew doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index afb2f91f65ccd..a6e8c7d7f1480 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -625,6 +625,7 @@ Other - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) +- Bug in :meth:`Index.difference` returning too many values / incorrect values for period indexes (:issue:`58971`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) From 7c50a76699e2e794e04db6cd27092756242a45f1 Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 16:26:28 -0400 Subject: [PATCH 3/6] add type annotation, fix namespace usage --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/indexes/period.py | 2 +- pandas/tests/indexes/period/test_setops.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a6e8c7d7f1480..7639d75af89e1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -624,8 +624,8 @@ Other - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) -- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Index.difference` returning too many values / incorrect values for period indexes (:issue:`58971`) +- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ea345c0d10aa4..5d9ad6ac19aff 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -514,7 +514,7 @@ def shift(self, periods: int = 1, freq=None) -> Self: ) return self + periods - def _difference(self, other, sort=None): + def _difference(self, other, sort=None) -> PeriodIndex: if not isinstance(other, PeriodIndex): self, other = self._maybe_downcast_for_indexing(other) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 94b9e40279dba..f02f3785f19d9 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -316,15 +316,15 @@ def test_difference(self, sort): def test_difference_mismatched_dtypes(self, sort): # GH58971 - index = pd.period_range('2022-01-01', periods=5, freq='M') - other = pd.Index(['2022-02', '2022-03']) + index = period_range('2022-01-01', periods=5, freq='M') + other = Index(['2022-02', '2022-03']) idx_diff = index.difference(other, sort) - expected = pd.PeriodIndex(['2022-01', '2022-04', '2022-05'], freq='M') + expected = PeriodIndex(['2022-01', '2022-04', '2022-05'], freq='M') tm.assert_index_equal(idx_diff, expected) idx_diff = other.difference(index, sort) - expected = pd.Index([]) + expected = Index([]) tm.assert_index_equal(idx_diff, expected) def test_difference_freq(self, sort): From ba27811e32516c9d31d442390080e3091149eab1 Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 17:41:48 -0400 Subject: [PATCH 4/6] fix namespace usage --- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/period/test_setops.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12cc16e232d91..d290b9d6f2fae 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6209,10 +6209,10 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # let's instead try with a straight Index self = Index(self._values) - elif self.inferred_type == 'string' and isinstance(other, ABCPeriodIndex): + elif self.inferred_type == "string" and isinstance(other, ABCPeriodIndex): try: return self.astype(other.dtype), other - except: + except (TypeError, ValueError): return self, other if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index f02f3785f19d9..3d31f85975592 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -316,15 +316,15 @@ def test_difference(self, sort): def test_difference_mismatched_dtypes(self, sort): # GH58971 - index = period_range('2022-01-01', periods=5, freq='M') - other = Index(['2022-02', '2022-03']) + index = period_range("2022-01-01", periods=5, freq="M") + other = pd.Index(["2022-02", "2022-03"]) idx_diff = index.difference(other, sort) - expected = PeriodIndex(['2022-01', '2022-04', '2022-05'], freq='M') + expected = PeriodIndex(["2022-01", "2022-04", "2022-05"], freq="M") tm.assert_index_equal(idx_diff, expected) idx_diff = other.difference(index, sort) - expected = Index([]) + expected = pd.Index([]) tm.assert_index_equal(idx_diff, expected) def test_difference_freq(self, sort): From 4d0c9b5b09f85bc5fc9fb32bb03b92f50fa7c701 Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 19:05:37 -0400 Subject: [PATCH 5/6] fix casting --- pandas/core/indexes/base.py | 6 ------ pandas/core/indexes/period.py | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d290b9d6f2fae..405491f2644f4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6209,12 +6209,6 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # let's instead try with a straight Index self = Index(self._values) - elif self.inferred_type == "string" and isinstance(other, ABCPeriodIndex): - try: - return self.astype(other.dtype), other - except (TypeError, ValueError): - return self, other - if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_downcast_for_indexing(self) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5d9ad6ac19aff..440e42b06a812 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -515,8 +515,8 @@ def shift(self, periods: int = 1, freq=None) -> Self: return self + periods def _difference(self, other, sort=None) -> PeriodIndex: - if not isinstance(other, PeriodIndex): - self, other = self._maybe_downcast_for_indexing(other) + if isinstance(other, Index) and other.inferred_type == "string": + other = other.astype(self.dtype) return super()._difference(other, sort=sort) From 2081a0342b996fd1d4a7aba46e2be223ecf88bbe Mon Sep 17 00:00:00 2001 From: TiffL Date: Sat, 29 Jun 2024 19:40:14 -0400 Subject: [PATCH 6/6] add back missed try block --- pandas/core/indexes/period.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 440e42b06a812..e673b31b56755 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -516,7 +516,10 @@ def shift(self, periods: int = 1, freq=None) -> Self: def _difference(self, other, sort=None) -> PeriodIndex: if isinstance(other, Index) and other.inferred_type == "string": - other = other.astype(self.dtype) + try: + other = other.astype(self.dtype) + except (TypeError, ValueError): + pass return super()._difference(other, sort=sort)