From 2f5221fe72055bcc818b8433da535df0b2edb23c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Aug 2024 21:41:45 +0200 Subject: [PATCH 1/4] TST (string-dtype): Adjust indexing string tests --- pandas/core/arrays/string_.py | 4 +++ pandas/tests/indexing/test_iloc.py | 31 ++++++++-------- pandas/tests/indexing/test_indexing.py | 49 +++++++++++++++----------- pandas/tests/indexing/test_loc.py | 48 ++++++++++++++----------- 4 files changed, 76 insertions(+), 56 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 823084c3e9982..03aa676c4e24c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -688,6 +688,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b05b5d3dea2dc..dc95e1bb1b8a0 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index ef8c0e432ca49..9a76dc6814171 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -528,7 +526,6 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( @@ -539,34 +536,44 @@ def test_astype_assignment(self, using_infer_string): # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace, # so object dtype is retained - df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame( - [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - if not using_infer_string: + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, 0] = df.iloc[:, 0].astype(np.int64) + else: + df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) expected["A"] = expected["A"].astype(object) expected["B"] = expected["B"].astype(object) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() - df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) - expected = DataFrame( - [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - if not using_infer_string: + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + else: + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + expected = DataFrame( + [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) expected["A"] = expected["A"].astype(object) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) - expected = DataFrame( - [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - if not using_infer_string: + + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) + else: + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) + expected = DataFrame( + [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) expected["B"] = expected["B"].astype(object) expected["C"] = expected["C"].astype(object) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): # full replacements / no nans diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e007b8c4e97ac..51b10b6eedddc 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,6 +1,7 @@ """test label based indexing with loc""" from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -13,10 +14,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import index as libindex -from pandas.compat import HAS_PYARROW from pandas.errors import IndexingError import pandas as pd @@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + df = df.infer_objects() # Adding a new key @@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Scalar must"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) From cf3c44eff080effece4ef79efd465900fddde427 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Aug 2024 09:15:17 +0200 Subject: [PATCH 2/4] Update --- pandas/tests/indexing/test_indexing.py | 53 ++++++++++---------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9a76dc6814171..c793c75695a86 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -531,49 +531,38 @@ def test_astype_assignment(self, using_infer_string): df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace, # so object dtype is retained - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - df.iloc[:, 0] = df.iloc[:, 0].astype(np.int64) - else: - df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame( - [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) - tm.assert_frame_equal(df, expected) + df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) + tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) - else: - df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) - expected = DataFrame( - [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - expected["A"] = expected["A"].astype(object) - tm.assert_frame_equal(df, expected) + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + expected = DataFrame( + [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) + tm.assert_frame_equal(df, expected) df = df_orig.copy() - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) - else: - df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) - expected = DataFrame( - [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") - ) - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) - tm.assert_frame_equal(df, expected) + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) + expected = DataFrame( + [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) + tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): # full replacements / no nans From 4bb986ab14c4862f641fe0b9ce89136bd6fde72a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 11:36:26 +0200 Subject: [PATCH 3/4] make error message consistent --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/indexing/test_loc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97381b82ceab9..1e5adf106752f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -240,7 +240,7 @@ def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 51b10b6eedddc..36b08ee1df790 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1230,7 +1230,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful if using_infer_string: - with pytest.raises(TypeError, match="Scalar must"): + with pytest.raises(TypeError, match="Must provide strings"): result.loc[result.index, "A"] = [float(x) for x in col_data] else: result.loc[result.index, "A"] = [float(x) for x in col_data] From 677e06389f04ce835d4dc164f521e031b5e0f225 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 12:08:33 +0200 Subject: [PATCH 4/4] update test --- pandas/tests/arrays/string_/test_string.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index dd87dbf8e9a43..87bd1d5921caa 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -102,10 +102,7 @@ def test_setitem_validates(cls, dtype): with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage == "python": - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Must provide strings" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2])