From ff6e271961434063b277e723e918cc7f3cbc5dea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 4 Nov 2023 22:47:43 -0400 Subject: [PATCH 01/13] BUG: Index.getitem returning wrong result with negative step for arrow --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/arrays/arrow/array.py | 7 +++++++ pandas/tests/indexes/object/test_indexing.py | 14 +++++++++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..f4c32b6ecc056 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..43927f554a875 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -553,6 +553,13 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + if item.start == item.stop: + pass + elif item.start == -len(self) - 1: + item = slice(None, item.stop, item.step) + elif item.stop == -len(self) - 1: + item = slice(item.start, None, item.step) value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..7a6e93a4605d7 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -167,12 +175,12 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): From 7684911a70609559f7efcb84c73662ba0579a169 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:18:25 -0500 Subject: [PATCH 02/13] Update --- pandas/core/arrays/arrow/array.py | 5 +++-- pandas/tests/indexes/object/test_indexing.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 43927f554a875..e073b8c20acbe 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,10 +556,11 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start == -len(self) - 1: + elif item.start <= -len(self) - 1: item = slice(None, item.stop, item.step) - elif item.stop == -len(self) - 1: + elif item.stop <= -len(self) - 1: item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 7a6e93a4605d7..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -183,6 +183,17 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + def test_slice_locs_dup(self): index = Index(["a", "a", "b", "c", "d", "d"]) assert index.slice_locs("a", "d") == (0, 6) From 7474cb2b70e548a6000dd797c8f5c524b08624cb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:21:00 -0500 Subject: [PATCH 03/13] Update --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e073b8c20acbe..87d355ef79142 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start <= -len(self) - 1: + elif item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop <= -len(self) - 1: + elif item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From b27b0f82b85e20d0c3ae724acbece252999a8b26 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:26:59 -0500 Subject: [PATCH 04/13] Fix --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 87d355ef79142..9a6b91f21d90f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start < -len(self): + elif item.start is not None and item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop < -len(self): + elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From 659577e6c43dfe5125b865f707ec537f66b89cd3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 10:13:53 -0500 Subject: [PATCH 05/13] Update array.py --- pandas/core/arrays/arrow/array.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9a6b91f21d90f..7e5452a21af9a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,8 +556,6 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start is not None and item.start < -len(self): - item = slice(None, item.stop, item.step) elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) From 86fe4f129a648fa2f765e9f3587d8829eedbc7ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:35:46 +0100 Subject: [PATCH 06/13] Fix --- pandas/core/arrays/arrow/array.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e5452a21af9a..820a3856d48fc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,7 +556,12 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.stop is not None and item.stop < -len(self): + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): item = slice(item.start, None, item.step) value = self._pa_array[item] From 242552937fe6d8e12c2af90e2d55b91a9d9bdb5c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:35:18 +0100 Subject: [PATCH 07/13] Add gh ref --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 820a3856d48fc..d162b66e5d369 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -554,6 +554,7 @@ def __getitem__(self, item: PositionalIndexer): # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 if item.start == item.stop: pass elif ( From 06b4f8909fef759bdda8877602b40f4be3b9925e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:17:28 +0100 Subject: [PATCH 08/13] Update --- doc/source/whatsnew/v2.1.3.rst | 1 - doc/source/whatsnew/v2.1.4.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index a7f20aa67685f..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) - Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..e52c42dd31211 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - .. --------------------------------------------------------------------------- From a015f483d7a88af30b447e11c6e7c2ebc8ce290c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 00:19:47 +0100 Subject: [PATCH 09/13] Fix string option tests in indexing --- pandas/core/config_init.py | 2 +- pandas/tests/indexing/multiindex/test_loc.py | 10 ++-- pandas/tests/indexing/test_at.py | 7 ++- pandas/tests/indexing/test_categorical.py | 2 +- .../indexing/test_chaining_and_caching.py | 4 +- pandas/tests/indexing/test_coercion.py | 26 ++++++---- pandas/tests/indexing/test_iloc.py | 18 +++++-- pandas/tests/indexing/test_indexing.py | 48 ++++++++++++------ pandas/tests/indexing/test_loc.py | 49 +++++++++++++------ pandas/tests/indexing/test_partial.py | 40 ++++++++++----- 10 files changed, 145 insertions(+), 61 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..bdbab78a443de 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 873c4e3e60f4c..5508153322adb 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -566,7 +566,7 @@ def test_loc_setitem_single_column_slice(): tm.assert_frame_equal(df, expected) -def test_loc_nan_multiindex(): +def test_loc_nan_multiindex(using_infer_string): # GH 5286 tups = [ ("Good Things", "C", np.nan), @@ -586,8 +586,12 @@ def test_loc_nan_multiindex(): result = df.loc["Good Things"].loc["C"] expected = DataFrame( np.ones((1, 4)), - index=Index([np.nan], dtype="object", name="u3"), - columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + index=Index( + [np.nan], + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + name="u3", + ), + columns=Index(["d1", "d2", "d3", "d4"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index 7504c984794e8..d78694018749c 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -13,6 +13,7 @@ CategoricalIndex, DataFrame, DatetimeIndex, + Index, MultiIndex, Series, Timestamp, @@ -70,7 +71,11 @@ def test_at_setitem_item_cache_cleared(self): df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame({"x": [4], "cost": 789}, index=[0]) + expected = DataFrame( + {"x": [4], "cost": 789}, + index=[0], + columns=Index(["x", "cost"], dtype=object), + ) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 0432c8856e5c5..6f0ef0b357269 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -273,7 +273,7 @@ def test_slicing_doc_examples(self): tm.assert_frame_equal(result, expected) result = df.iloc[2:4, :].dtypes - expected = Series(["category", "int64"], ["cats", "values"]) + expected = Series(["category", "int64"], ["cats", "values"], dtype=object) tm.assert_series_equal(result, expected) result = df.loc["h":"j", "cats"] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 21aab6652a300..bf0975a803dce 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -339,7 +339,9 @@ def test_detect_chained_assignment_object_dtype( self, using_array_manager, using_copy_on_write, warn_copy_on_write ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) - df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame( + {"A": Series(["aaa", "bbb", "ccc"], dtype=object), "B": [1, 2, 3]} + ) df_original = df.copy() if not using_copy_on_write and not warn_copy_on_write: diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index c743166030048..95db295f8fc2c 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -111,7 +113,7 @@ def _assert_setitem_index_conversion( "val,exp_dtype", [("x", object), (5, IndexError), (1.1, object)] ) def test_setitem_index_object(self, val, exp_dtype): - obj = pd.Series([1, 2, 3, 4], index=list("abcd")) + obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object if exp_dtype is IndexError: @@ -122,7 +124,7 @@ def test_setitem_index_object(self, val, exp_dtype): with tm.assert_produces_warning(FutureWarning, match=warn_msg): temp[5] = 5 else: - exp_index = pd.Index(list("abcd") + [val]) + exp_index = pd.Index(list("abcd") + [val], dtype=object) self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( @@ -195,10 +197,10 @@ def _assert_insert_conversion(self, original, value, expected, expected_dtype): ], ) def test_insert_index_object(self, insert, coerced_val, coerced_dtype): - obj = pd.Index(list("abcd")) + obj = pd.Index(list("abcd"), dtype=object) assert obj.dtype == object - exp = pd.Index(["a", coerced_val, "b", "c", "d"]) + exp = pd.Index(["a", coerced_val, "b", "c", "d"], dtype=object) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) @pytest.mark.parametrize( @@ -397,7 +399,7 @@ def _run_test(self, obj, fill_val, klass, exp_dtype): ) def test_where_object(self, index_or_series, fill_val, exp_dtype): klass = index_or_series - obj = klass(list("abcd")) + obj = klass(list("abcd"), dtype=object) assert obj.dtype == object self._run_test(obj, fill_val, klass, exp_dtype) @@ -559,10 +561,10 @@ def _assert_fillna_conversion(self, original, value, expected, expected_dtype): ) def test_fillna_object(self, index_or_series, fill_val, fill_dtype): klass = index_or_series - obj = klass(["a", np.nan, "c", "d"]) + obj = klass(["a", np.nan, "c", "d"], dtype=object) assert obj.dtype == object - exp = klass(["a", fill_val, "c", "d"]) + exp = klass(["a", fill_val, "c", "d"], dtype=object) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.parametrize( @@ -824,6 +826,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer + @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") @@ -870,13 +873,18 @@ def test_replace_series(self, how, to_key, from_key, replacer): @pytest.mark.parametrize( "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], indirect=True ) - def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): + def test_replace_series_datetime_tz( + self, how, to_key, from_key, replacer, using_infer_string + ): index = pd.Index([3, 4], name="xyz") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key + if using_infer_string and to_key == "object": + assert exp.dtype == "string" + else: + assert exp.dtype == to_key msg = "Downcasting behavior in `replace`" warn = FutureWarning if exp.dtype != object else None diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index cbcbf3396363a..a285356e2563f 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -100,9 +100,8 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage # we retain the object dtype. frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() - orig_vals = df.values indexer(df)[key, 0] = cat - expected = DataFrame({0: cat.astype(object), 1: range(3)}) + expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)}) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) @@ -451,12 +450,16 @@ def test_iloc_setitem(self): def test_iloc_setitem_axis_argument(self): # GH45032 df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 10], [7, "d", 11], [5, 5, 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=0)[2] = 5 tm.assert_frame_equal(df, expected) df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 5], [7, "d", 5], [8, "e", 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=1)[2] = 5 tm.assert_frame_equal(df, expected) @@ -615,7 +618,7 @@ def test_iloc_getitem_labelled_frame(self): assert result == exp # out-of-bounds exception - msg = "index 5 is out of bounds for axis 0 with size 4" + msg = "index 5 is out of bounds for axis 0 with size 4|index out of bounds" with pytest.raises(IndexError, match=msg): df.iloc[10, 5] @@ -1313,7 +1316,9 @@ def test_iloc_setitem_dtypes_duplicate_columns( self, dtypes, init_value, expected_value ): # GH#22035 - df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) + df = DataFrame( + [[init_value, "str", "str2"]], columns=["a", "b", "b"], dtype=object + ) # with the enforcement of GH#45333 in 2.0, this sets values inplace, # so we retain object dtype @@ -1360,7 +1365,10 @@ def test_frame_iloc_getitem_callable(self): def test_frame_iloc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return location res = df.copy() diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index bdbbcabcaab0e..d6ec7ac3e4185 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -189,7 +191,7 @@ def test_setitem_dtype_upcast(self): ): df.loc[0, "c"] = "foo" expected = DataFrame( - [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] + {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} ) tm.assert_frame_equal(df, expected) @@ -284,18 +286,27 @@ def test_dups_fancy_indexing_not_in_order(self): with pytest.raises(KeyError, match="not in index"): df.loc[rows] - def test_dups_fancy_indexing_only_missing_label(self): + def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): # List containing only missing label dfnu = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=list("AABCD") ) - with pytest.raises( - KeyError, - match=re.escape( - "\"None of [Index(['E'], dtype='object')] are in the [index]\"" - ), - ): - dfnu.loc[["E"]] + if using_infer_string: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] + else: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='object')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")]) def test_dups_fancy_indexing_missing_label(self, vals): @@ -451,6 +462,9 @@ def test_set_index_nan(self): ) tm.assert_frame_equal(result, df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't multiply arrow strings" + ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -553,7 +567,7 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - def test_astype_assignment(self): + def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") @@ -567,8 +581,9 @@ def test_astype_assignment(self): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -577,7 +592,8 @@ def test_astype_assignment(self): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -585,8 +601,9 @@ def test_astype_assignment(self): expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + if not using_infer_string: + expected["B"] = expected["B"].astype(object) + expected["C"] = expected["C"].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): @@ -673,6 +690,7 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 96fd3f4e6fca0..8459cc5a30130 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -469,7 +471,7 @@ def test_loc_to_fail2(self): msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" with pytest.raises(KeyError, match=msg): - s.loc[["4"]] + s.loc[Index(["4"], dtype=object)] s.loc[-1] = 3 with pytest.raises(KeyError, match="not in index"): @@ -781,7 +783,9 @@ def test_loc_setitem_empty_frame(self): # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame({"A": sera, "B": serb}).reindex(index=index) + expected = DataFrame( + {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) + ).reindex(index=index) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -979,7 +983,7 @@ def test_setitem_new_key_tz(self, indexer_sl): to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=["foo", "bar"]) + expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1254,6 +1258,7 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 @@ -1439,7 +1444,7 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_loc_setitem_single_row_categorical(self): + def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) @@ -1449,7 +1454,9 @@ def test_loc_setitem_single_row_categorical(self): df.loc[:, "Alpha"] = categories result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha").astype(object) + expected = Series(categories, index=df.index, name="Alpha").astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) # double-check that the non-loc setting retains categoricalness @@ -1615,7 +1622,7 @@ def test_loc_getitem_index_namedtuple(self): result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 - def test_loc_setitem_single_column_mixed(self): + def test_loc_setitem_single_column_mixed(self, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=["a", "b", "c", "d", "e"], @@ -1623,7 +1630,10 @@ def test_loc_setitem_single_column_mixed(self): ) df["str"] = "qux" df.loc[df.index[::2], "str"] = np.nan - expected = np.array([np.nan, "qux", np.nan, "qux", np.nan], dtype=object) + expected = Series( + [np.nan, "qux", np.nan, "qux", np.nan], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ).values tm.assert_almost_equal(df["str"].values, expected) def test_loc_setitem_cast2(self): @@ -2016,11 +2026,15 @@ def test_loc_setitem_empty_series_str_idx(self): # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=["foo"])) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) ser.loc["bar"] = 3 - tm.assert_series_equal(ser, Series([1, 3], index=["foo", "bar"])) + tm.assert_series_equal( + ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) + ) ser.loc[3] = 4 - tm.assert_series_equal(ser, Series([1, 3, 4], index=["foo", "bar", 3])) + tm.assert_series_equal( + ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) + ) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -2050,7 +2064,11 @@ def test_loc_setitem_datetime_keys_cast(self, conv): df.loc[conv(dt1), "one"] = 100 df.loc[conv(dt2), "one"] = 200 - expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) + expected = DataFrame( + {"one": [100.0, 200.0]}, + index=[dt1, dt2], + columns=Index(["one"], dtype=object), + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_categorical_column_retains_dtype(self, ordered): @@ -2168,11 +2186,11 @@ def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser._values - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) def test_loc_setitem_ea_not_full_column(self): # GH#39163 @@ -2262,7 +2280,10 @@ def test_frame_loc_getitem_callable_labels(self): def test_frame_loc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return label res = df.copy() diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 3d04cc764563f..2f9018112c03b 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -28,7 +28,9 @@ def test_empty_frame_setitem_index_name_retained(self): df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="df_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -39,7 +41,9 @@ def test_empty_frame_setitem_index_name_inherited(self): series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="series_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -92,7 +96,9 @@ def test_partial_set_empty_frame2(self): # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="object") + ) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -110,7 +116,9 @@ def test_partial_set_empty_frame2(self): tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -127,7 +135,9 @@ def test_partial_set_empty_frame4(self): df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -200,10 +210,10 @@ def test_partial_set_empty_frame_empty_copy_assignment(self): df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=["a"]) + expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) tm.assert_frame_equal(df, expected) - def test_partial_set_empty_frame_empty_consistencies(self): + def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): # GH#6171 # consistency on empty frames df = DataFrame(columns=["x", "y"]) @@ -213,7 +223,15 @@ def test_partial_set_empty_frame_empty_consistencies(self): df = DataFrame(columns=["x", "y"]) df["x"] = ["1", "2"] - expected = DataFrame({"x": ["1", "2"], "y": [np.nan, np.nan]}, dtype=object) + expected = DataFrame( + { + "x": Series( + ["1", "2"], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ), + "y": Series([np.nan, np.nan], dtype=object), + } + ) tm.assert_frame_equal(df, expected) df = DataFrame(columns=["x", "y"]) @@ -618,7 +636,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( [ ( period_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -626,7 +644,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( ), ( date_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -634,7 +652,7 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( ), ( pd.timedelta_range(start="1 day", periods=20), - ["2000-01-04", "2000-01-08"], + Index(["2000-01-04", "2000-01-08"], dtype=object), ( r"None of \[Index\(\['2000-01-04', '2000-01-08'\], " r"dtype='object'\)\] are in the \[index\]" From dfda6cfaa4b6d3a27cb9d84b1d5ae181e334d696 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 00:39:52 +0100 Subject: [PATCH 10/13] Fix string option tests in indexing --- pandas/tests/indexing/test_iloc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index a285356e2563f..baf2cdae43fe4 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -231,7 +231,10 @@ def test_iloc_exceeds_bounds(self): dfl = DataFrame( np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB") ) - tm.assert_frame_equal(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[])) + tm.assert_frame_equal( + dfl.iloc[:, 2:3], + DataFrame(index=dfl.index, columns=Index([], dtype=dfl.columns.dtype)), + ) tm.assert_frame_equal(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) tm.assert_frame_equal(dfl.iloc[4:6], dfl.iloc[[4]]) From 88f0957d5e39a588e2a8620a540dc397e9b20a09 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 22 Nov 2023 00:40:56 +0100 Subject: [PATCH 11/13] Fix string option tests in indexing --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bdbab78a443de..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True, + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From b5bed5dc5840554b1a2f54ec839569b4a284d0a8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 12:41:47 +0100 Subject: [PATCH 12/13] Update test_coercion.py --- pandas/tests/indexing/test_coercion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 95db295f8fc2c..029926e714e75 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -826,6 +826,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer + # Needs adjustment for the infer string option @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") From abadce8865ae1a141753d36398fdf8304b1e948a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 12:42:13 +0100 Subject: [PATCH 13/13] Update test_coercion.py --- pandas/tests/indexing/test_coercion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 029926e714e75..03691ca318037 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -826,7 +826,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - # Needs adjustment for the infer string option + # Expected needs adjustment for the infer string option, seems to work as expecetd @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx")