From 7523babdfd1d0a8331efc217e643819035da0db4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Aug 2024 10:22:57 +0200 Subject: [PATCH 1/2] String dtype: fix alignment sorting in case of python storage --- pandas/core/generic.py | 1 - pandas/core/indexes/base.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8a6fc69d47cc3..73e83434027dc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9542,7 +9542,6 @@ def _align_series( join_index, lidx, ridx = self.index.join( other.index, how=join, level=level, return_indexers=True ) - if is_series: left = self._reindex_indexer(join_index, lidx) elif lidx is None or join_index is None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 50f44cc728aea..d39c337fbb4b2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4884,7 +4884,10 @@ def _can_use_libjoin(self) -> bool: return ( isinstance(self.dtype, np.dtype) or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) - or self.dtype == "string[python]" + or ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ) ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin From de6ab171b0ab75f2bb5156a45e2d0a52668eca3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Aug 2024 13:58:21 +0200 Subject: [PATCH 2/2] add test --- pandas/core/generic.py | 1 + pandas/tests/series/methods/test_align.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 73e83434027dc..8a6fc69d47cc3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9542,6 +9542,7 @@ def _align_series( join_index, lidx, ridx = self.index.join( other.index, how=join, level=level, return_indexers=True ) + if is_series: left = self._reindex_indexer(join_index, lidx) elif lidx is None or join_index is None: diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index 620a8ae54fee3..f7e4dbe11f3d3 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -148,6 +148,19 @@ def test_align_periodindex(join_type): ts.align(ts[::2], join=join_type) +def test_align_stringindex(any_string_dtype): + left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype)) + right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype)) + result_left, result_right = left.align(right) + + expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype) + expected_left = Series([0, 1, np.nan, 2], index=expected_idx) + expected_right = Series([0, 1, 2, np.nan], index=expected_idx) + + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + def test_align_left_fewer_levels(): # GH#45224 left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"]))