From d5c7e9a7fb1be07b61d6ca2b71782231bccceadb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:55:35 +0100 Subject: [PATCH 1/5] BUG: get_indexer round-tripping through string dtype --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/indexes/base.py | 3 +++ pandas/tests/indexes/object/test_indexing.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..9f58217bee4f7 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1b4e14f075f22..4d0092892e05e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6580,6 +6580,9 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. 
""" + if not hasattr(target, "dtype") and self.dtype == object: + # Avoid inference for object since we are casting back later anyway + return Index(target, dtype=self.dtype) return ensure_index(target) @final diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..da898887e0487 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -55,6 +56,14 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + @td.skip_if_no("pyarrow") + def test_get_indexer_infer_string_missing_values(self): + # GH#55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1]) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): From bc87192d0ce66b8f9f8a60cd7fa36fe842550e17 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 00:43:45 +0100 Subject: [PATCH 2/5] Update test_indexing.py --- pandas/tests/indexes/object/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index da898887e0487..e6ab4ce734b69 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,7 +62,7 @@ def test_get_indexer_infer_string_missing_values(self): idx = Index(["a", "b", None], dtype="object") result = idx.get_indexer([None, "x"]) expected = np.array([2, -1]) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, 
check_dtype=False) class TestGetIndexerNonUnique: From cfb2f0915c3f38338a26cab1f86ed56b6f621f3c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 07:39:47 +0100 Subject: [PATCH 3/5] update test --- pandas/tests/indexes/object/test_indexing.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index cc9e783e68404..4afe39ab699a0 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -65,11 +65,13 @@ def test_get_indexer_with_NA_values( @td.skip_if_no("pyarrow") def test_get_indexer_infer_string_missing_values(self): - # GH#55834 + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 idx = Index(["a", "b", None], dtype="object") result = idx.get_indexer([None, "x"]) - expected = np.array([2, -1]) - tm.assert_numpy_array_equal(result, expected, check_dtype=False) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) class TestGetIndexerNonUnique: From d202057f437dbc95612635f3690d644ffd5371f2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 20:48:52 +0100 Subject: [PATCH 4/5] remove pyarrow skip --- pandas/tests/indexes/object/test_indexing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 4afe39ab699a0..2c5968314e5cf 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,7 +4,6 @@ import pytest from pandas._libs.missing import is_matching_na -import pandas.util._test_decorators as td from pandas import Index import pandas._testing as tm @@ -63,7 +62,6 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) 
tm.assert_numpy_array_equal(result, expected) - @td.skip_if_no("pyarrow") def test_get_indexer_infer_string_missing_values(self): # ensure the passed list is not cast to string but to object so that # the None value is matched in the index From 1609e891ebca5d54442b42ee67273d3050a685aa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 20:58:17 +0100 Subject: [PATCH 5/5] more specific check for object/string combo --- pandas/core/indexes/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8d8cdfce7cfe2..d4ba7e01ebfa9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6556,10 +6556,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - if not hasattr(target, "dtype") and self.dtype == object: - # Avoid inference for object since we are casting back later anyway - return Index(target, dtype=self.dtype) - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer(