From 5dd7d939c6db76dda21b7da95e0a9bbd8983c88c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 1 Oct 2023 21:56:14 +0100 Subject: [PATCH 01/36] Start fixing string tests --- pandas/conftest.py | 8 ++++++++ pandas/core/config_init.py | 2 +- pandas/tests/frame/methods/test_astype.py | 20 +++++++++++++------ .../tests/frame/methods/test_combine_first.py | 6 ++++-- .../frame/methods/test_convert_dtypes.py | 6 +++++- pandas/tests/frame/methods/test_drop.py | 2 +- .../frame/methods/test_drop_duplicates.py | 2 +- pandas/tests/frame/methods/test_duplicated.py | 2 +- pandas/tests/frame/methods/test_equals.py | 4 ++-- pandas/tests/frame/methods/test_explode.py | 2 +- pandas/tests/frame/methods/test_fillna.py | 4 ++++ .../frame/methods/test_get_numeric_data.py | 6 +++--- .../tests/frame/methods/test_interpolate.py | 10 ++++++---- .../methods/test_is_homogeneous_dtype.py | 3 ++- pandas/tests/frame/methods/test_reindex.py | 6 +++--- 15 files changed, 56 insertions(+), 27 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 62f22921f0482..415f8a8168607 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1999,6 +1999,14 @@ def using_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if Copy-on-Write is enabled. + """ + return pd.options.future.infer_string + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4652acdcae287..c0201e13ed1bf 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -903,7 +903,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + os.environ.get("PANDAS_INFER_STRING", "0") == "0", "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 6590f10c6b967..af0010beeb46d 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -157,7 +157,8 @@ def test_astype_str(self): "c": [Timedelta(x)._repr_base() for x in c._values], "d": list(map(str, d._values)), "e": list(map(str, e._values)), - } + }, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -165,13 +166,13 @@ def test_astype_str(self): def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"]) + expected = DataFrame(["nan"], dtype="object") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val]) + expected = DataFrame([val], dtype="object") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -190,7 +191,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"]), + "b": Series(["0", "1", "2", "3", "4"], dtype="object"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -606,6 +607,7 @@ def test_astype_arg_for_errors_dictlist(self): {"a": 2.2, "b": "15.3", "c": "another_test"}, ] ) + expected["c"] = expected["c"].astype("object") type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, errors="ignore") @@ -666,6 +668,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -740,7 +743,9 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): + def test_astype_dt64_to_string( + self, frame_or_series, tz_naive_fixture, using_infer_string + ): # GH#41409 tz = tz_naive_fixture @@ -758,7 +763,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - assert item is pd.NA + if using_infer_string: + assert item is np.nan + else: + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 156e50d50a9ef..71ffbf20eb430 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -30,7 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self, float_frame): + def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -76,7 +76,9 @@ def test_combine_first(self, float_frame): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - comb = float_frame.combine_first(DataFrame()) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="empty entries"): + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index c2b1016e88402..c3b04787ff5c1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -11,9 +11,13 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected, string_storage): + def test_convert_dtypes( + self, convert_integer, expected, string_storage, using_infer_string + ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here + if using_infer_string and string_storage == "python": + string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 9a4882f11e961..694752be49ff1 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self): def test_drop_inplace_no_leftover_column_reference(self): # GH 13934 - df = DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) a = df.a df.drop(["a"], axis=1, inplace=True) tm.assert_index_equal(df.columns, Index([], dtype="object")) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index df12139258a6d..6bea97b2cf189 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -16,7 +16,7 @@ def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 788aede805110..6052b61ea8db5 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -16,7 +16,7 @@ def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.duplicated(subset) diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index 6fcf670f96ef0..d0b9d96cafa0d 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self): df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager): + def test_equals_different_blocks(self, using_array_manager, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager: + if not using_array_manager and not using_infer_string: # this assert verifies that the above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index d1e4a603c5710..5cd54db62d783 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -203,7 +203,7 @@ def test_usecase(): ) def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): # GH 28005 - df = pd.DataFrame(input_dict, index=input_index) + df = pd.DataFrame(input_dict, index=input_index, dtype=object) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 52b4b64ee279f..6cf859ddfce84 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -84,6 +86,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -644,6 +647,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index ec1c768603a59..c5d32d56d03c1 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -15,12 +15,12 @@ class TestGetNumericData: def test_get_numeric_data_preserve_dtype(self): # get the numeric data - obj = DataFrame({"A": [1, "2", 3.0]}) + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) result = obj._get_numeric_data() expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) tm.assert_frame_equal(result, expected) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_infer_string): datetime64name = np.dtype("M8[s]").name objectname = np.dtype(np.object_).name @@ -33,7 +33,7 @@ def test_get_numeric_data(self): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname), + np.dtype(objectname) if not using_infer_string else "string", np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..bfaecbcdab818 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -67,7 +67,7 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - def test_interp_basic(self, using_copy_on_write): + def test_interp_basic(self, using_copy_on_write, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -85,7 +85,8 @@ def test_interp_basic(self, using_copy_on_write): } ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.interpolate() tm.assert_frame_equal(result, expected) @@ -108,7 +109,7 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) - def test_interp_basic_with_non_range_index(self): + def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -119,7 +120,8 @@ def test_interp_basic_with_non_range_index(self): ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.set_index("C").interpolate() expected = df.set_index("C") expected.loc[3, "A"] = 3 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index a5f285d31301b..1fe28cb8eb856 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -25,7 +25,8 @@ { "A": np.array([1, 2], dtype=object), "B": np.array(["a", "b"], dtype=object), - } + }, + dtype="object", ), True, ), diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0105c41bd0eca..00e7ed1bf6487 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -119,7 +119,7 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( "index" ) - exp = exp.astype(object) + exp = exp.astype(df.vals.dtype) tm.assert_frame_equal( df, exp, @@ -837,8 +837,8 @@ def test_reindex_fill_value(self): # other dtypes df["foo"] = "foo" - result = df.reindex(range(15), fill_value=0) - expected = df.reindex(range(15)).fillna(0) + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") tm.assert_frame_equal(result, expected) def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): From 9320144a66611e027ec95aa52383a7e22164654d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 1 Oct 2023 22:23:06 +0100 Subject: [PATCH 02/36] BUG: interpolate raising wrong error for ea --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/base.py | 4 ++-- pandas/tests/frame/methods/test_interpolate.py | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..87d95f0dee07f 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 933944dbd4632..001e0a5f4996e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -891,8 +891,8 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, - copy: bool, + fill_value=None, + copy: bool = True, **kwargs, ) -> Self: """ diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() From be20fb20e4d2842d450badf12a5ceacec2a1dca8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 1 Oct 2023 22:45:01 +0100 Subject: [PATCH 03/36] Fix more tests --- pandas/tests/frame/methods/test_astype.py | 2 +- pandas/tests/frame/methods/test_convert_dtypes.py | 2 +- pandas/tests/frame/methods/test_cov_corr.py | 12 +++++++++--- pandas/tests/frame/methods/test_interpolate.py | 13 ++++++++++--- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index af0010beeb46d..c5924090b070d 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -274,7 +274,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: vals[:, 0].astype(str), + 0: Series(vals[:, 0].astype(str), dtype=object), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index c3b04787ff5c1..c00d0418137f0 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -16,7 +16,7 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string and string_storage == "python": + if using_infer_string: string_storage = "pyarrow_numpy" df = pd.DataFrame( { diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 23a9656193d2c..652b441a56eb3 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -323,7 +323,7 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - def test_corrwith_with_objects(self): + def test_corrwith_with_objects(self, using_infer_string): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() cols = ["A", "B", "C", "D"] @@ -331,8 +331,14 @@ def test_corrwith_with_objects(self): df1["obj"] = "foo" df2["obj"] = "bar" - with pytest.raises(TypeError, match="Could not convert"): - df1.corrwith(df2) + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1.corrwith(df2) + else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) result = df1.corrwith(df2, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 45ce8eaa741a1..1300d0808cedd 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -67,7 +69,10 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 - def test_interp_basic(self, using_copy_on_write, using_infer_string): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic(self, using_copy_on_write): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -85,8 +90,7 @@ def test_interp_basic(self, using_copy_on_write, using_infer_string): } ) msg = "DataFrame.interpolate with object dtype" - warning = FutureWarning if not using_infer_string else None - with tm.assert_produces_warning(warning, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg): result = df.interpolate() tm.assert_frame_equal(result, expected) @@ -109,6 +113,9 @@ def test_interp_basic(self, using_copy_on_write, using_infer_string): assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { From 2a3af77d003a829132c431f0ebc9b86fb35ca3af Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 1 Oct 2023 22:58:33 +0100 Subject: [PATCH 04/36] REGR: join segfaulting for arrow string with nulls --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/reshape/merge.py | 2 ++ pandas/tests/frame/methods/test_join.py | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..a5ba365f2d456 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -15,7 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) -- +- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) .. --------------------------------------------------------------------------- .. _whatsnew_212.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4b9fcc80af4bb..ba6579a739f54 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2443,6 +2443,8 @@ def _factorize_keys( .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 if how == "right": return rlab, llab, count return llab, rlab, count diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 2d4ac1d4a4444..d131aa6be22f2 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -158,9 +158,12 @@ def test_join_invalid_validate(left_no_dup, right_no_dup): left_no_dup.merge(right_no_dup, on="a", validate="invalid") -def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups): +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): # GH 46622 # Dups on right allowed by one_to_many constraint + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) left_no_dup.join( right_w_dups, on="a", From f48d3843514a7b3a9e5501d2ee4dd05863fd9cb3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 1 Oct 2023 23:56:48 +0100 Subject: [PATCH 05/36] Fix more tests --- pandas/tests/frame/methods/test_nlargest.py | 2 +- pandas/tests/frame/methods/test_reset_index.py | 12 ++++++++++-- pandas/tests/frame/methods/test_select_dtypes.py | 12 +++++++++--- pandas/tests/frame/methods/test_to_csv.py | 5 +++-- pandas/tests/frame/methods/test_update.py | 12 +++++++++--- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 0bdf9a0e5c007..c5b10481781a2 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype object, " + f"Column 'b' has dtype (object|string), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 339e19254fd10..8e8cfd6ba3885 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -683,10 +683,14 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): ), ], ) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = "string" expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -708,7 +712,9 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) -def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): # https://github.com/pandas-dev/pandas/issues/35657 df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")}) df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() @@ -718,6 +724,8 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): ) expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 67dd5b6217187..b5b9b90598470 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -282,7 +282,7 @@ def test_select_dtypes_duplicate_columns(self): result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) - def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -296,11 +296,17 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - e = df[["a", "b"]] + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - e = df[["a", "b", "g"]] + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 9f45347c31165..83d0e68f28221 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -612,7 +612,7 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) @@ -622,7 +622,8 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - expected.index = expected.index.astype(str) + if not using_infer_string: + expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 5738a25f26fcb..803c7770d005d 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -150,11 +150,17 @@ def test_update_with_different_dtype(self, using_copy_on_write): with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write): + def test_update_modify_view(self, using_copy_on_write, using_infer_string): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) @@ -163,7 +169,7 @@ def test_update_modify_view(self, using_copy_on_write): df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - if using_copy_on_write: + if using_copy_on_write or using_infer_string: tm.assert_frame_equal(result_view, df2_orig) else: tm.assert_frame_equal(result_view, expected) From 1cbeced2a865278a36110a90643c7bcc3cecc76e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 00:21:13 +0100 Subject: [PATCH 06/36] Fix more tests --- pandas/tests/frame/methods/test_fillna.py | 22 ++++- pandas/tests/frame/methods/test_replace.py | 106 ++++++++++++++++++--- 2 files changed, 109 insertions(+), 19 deletions(-) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 6cf859ddfce84..29dec49ac9e23 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -120,19 +120,27 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self): + def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - result = df.fillna({2: "foo"}) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) tm.assert_frame_equal(result, expected) - return_value = df.fillna({2: "foo"}, inplace=True) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -356,7 +364,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self): + def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -371,7 +379,11 @@ def test_fillna_dtype_conversion(self): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index f07c53060a06b..318bbdd4ec337 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -28,6 +30,9 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -278,14 +283,25 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) - result = df.replace({"a": "."}, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -294,6 +310,9 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -322,6 +341,9 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -337,6 +359,9 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -415,12 +440,23 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, data, to_replace, expected, frame_or_series, any_string_dtype + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - result = obj.replace(to_replace, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -522,6 +558,9 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -533,6 +572,9 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -588,7 +630,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self): + def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( { @@ -607,11 +649,15 @@ def test_replace_mixed2(self): expected = DataFrame( { - "A": Series(["foo", "bar"], dtype="object"), + "A": Series(["foo", "bar"]), "B": Series([0, "foo"], dtype="object"), } ) - result = df.replace([1, 2], ["foo", "bar"]) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -894,6 +940,9 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -922,6 +971,9 @@ def test_replace_limit(self): # TODO pass + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_dict_no_regex(self): answer = Series( { @@ -945,6 +997,9 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_series_no_regex(self): answer = Series( { @@ -1051,7 +1106,10 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - def test_replace_swapping_bug(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) @@ -1062,6 +1120,9 @@ def test_replace_swapping_bug(self): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_period(self): d = { "fname": { @@ -1098,6 +1159,9 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_datetime(self): d = { "fname": { @@ -1318,6 +1382,9 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize( "replacer", [ @@ -1478,10 +1545,12 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self): + def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) @@ -1582,6 +1651,9 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 @@ -1619,9 +1691,15 @@ def test_replace_categorical_no_replacement(self): result = df.replace(to_replace=[".", "def"], value=["_", None]) tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self): + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 From 9dddb80915f24f3c915ae0203af90e5a8f41c84f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:08:47 +0100 Subject: [PATCH 07/36] BUG: rank raising for arrow string dtypes --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 31 ++++++++++++++++++++----- pandas/core/arrays/string_arrow.py | 23 ++++++++++++++++++ pandas/tests/frame/methods/test_rank.py | 11 +++++++++ 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..a2fa7cf32746b 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`TODO`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..37ebb69f1d73c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1708,7 +1708,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1717,9 +1717,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. - """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1734,7 +1731,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1773,7 +1770,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..b0ddd082e9c8e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from collections.abc import Sequence from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b037169a8241d 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,14 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH# + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) From e07f63930e455e1698e41413fc309587b7d3074a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:31:26 +0100 Subject: [PATCH 08/36] BUG: eq not implemented for categorical and arrow backed strings --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 5 ++++- pandas/tests/indexes/categorical/test_equals.py | 6 ++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..7bee10d687def 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`TODO`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..4937581b0abdd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -32,6 +32,7 @@ from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -627,7 +628,9 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) elif is_scalar(other): try: diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..c693e1181b9d0 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH# + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) From 4c074c11783e617180e396f39ce28ca00001e39a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:31:44 +0100 Subject: [PATCH 09/36] More tests --- pandas/tests/frame/methods/test_dtypes.py | 2 +- pandas/tests/frame/methods/test_rank.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 4bdf16977dae6..00ced2978d85b 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -146,5 +146,5 @@ def test_frame_apply_np_array_return_type(self): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - expected = Series(["bar"]) + expected = Series([np.array(["bar"])]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..7b191bad7f2f7 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -13,6 +13,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -478,12 +479,15 @@ def test_rank_object_first(self, frame_or_series, na_option, ascending, expected @pytest.mark.parametrize( "data,expected", [ - ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), ], ) def test_rank_mixed_axis_zero(self, data, expected): - df = DataFrame(data) + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) with pytest.raises(TypeError, match="'<' not supported between instances of"): df.rank() result = df.rank(numeric_only=True) From bf4b3cae85c68b73e6b459ab9bc8b582e728cc0c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:43:40 +0100 Subject: [PATCH 10/36] BUG: ndim of string block incorrect with string inference --- pandas/core/internals/construction.py | 2 +- pandas/tests/frame/test_constructors.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6f30bc650aa36..d6aeda3d418ed 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -383,7 +383,7 @@ def ndarray_to_mgr( new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), BlockPlacement(slice(i, i + 1)), - ndim=1, + ndim=2, ) for i, data in enumerate(obj_columns) ] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd851ab244cb8..01fb8cf53c7cd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2743,6 +2743,12 @@ def test_frame_string_inference_array_string_dtype(self): df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) tm.assert_frame_equal(df, expected) + def test_frame_string_inference_block_dim(self): + # GH# + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): From 3b5974d6247d85fe668e3ca13f5ba4fadb82fe91 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 15:46:39 +0100 Subject: [PATCH 11/36] Fix test --- pandas/tests/frame/methods/test_rank.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index ededae5cec77f..d4b572d96a677 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -470,10 +470,14 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string + ): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) + if using_infer_string: + expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( From df81cc03e36814187b5c14b1ecc0ebe91905a174 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:05:38 +0100 Subject: [PATCH 12/36] Fix tests --- pandas/tests/frame/methods/test_replace.py | 8 ++++++++ pandas/tests/frame/methods/test_to_csv.py | 4 +++- pandas/tests/frame/methods/test_value_counts.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 318bbdd4ec337..e825931669f84 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -447,11 +447,19 @@ def test_regex_replace_string_types( frame_or_series, any_string_dtype, using_infer_string, + request, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) with tm.assert_produces_warning(FutureWarning, match="Downcasting"): result = obj.replace(to_replace, regex=True) dtype = "string[pyarrow_numpy]" diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 83d0e68f28221..8fb6a13be8899 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -622,7 +622,9 @@ def test_to_csv_interval_index(self, using_infer_string): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - if not using_infer_string: + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index f30db91f82b60..fa177754b4ff3 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -147,7 +147,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): index=pd.MultiIndex( levels=[ pd.Index(["Anne", "Beth", "John"]), - pd.Index(["Louise", "Smith", nulls_fixture]), + pd.Index(["Louise", "Smith", np.nan]), # GH#48476 ], codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], From 74e09e4e8a62c81ebdf94016b06a7c31acacb39b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:09:10 +0100 Subject: [PATCH 13/36] Fix tests --- pandas/tests/frame/methods/test_align.py | 4 ++-- pandas/tests/frame/methods/test_rank.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 87a56c0736287..cf111ba740861 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) msg = ( "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " @@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) # Try to align DataFrame to Series along bad axis msg = "No axis named 2 for object type DataFrame" diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index d4b572d96a677..3ed8b3d455546 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -476,7 +476,7 @@ def test_rank_object_first( obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string: + if using_infer_string and isinstance(obj, Series): expected = expected.astype("uint64") tm.assert_equal(result, expected) From 255267f0adda46454778acc58fcc9b5ea4ec1e7c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:25:34 +0100 Subject: [PATCH 14/36] Fix more indexing tests --- pandas/tests/frame/indexing/test_getitem.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 33 ++++++++++++++------ pandas/tests/frame/indexing/test_setitem.py | 2 +- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 9d9324f557c8d..ed66bdf6c5a25 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -106,7 +106,7 @@ def test_getitem_list_duplicates(self): def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + msg = "\"None of [Index(['baf'], dtype=.*)] are in the [columns]\"" with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..1fd27b057cbdc 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -288,7 +288,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame, using_copy_on_write): + def test_setitem(self, float_frame, using_copy_on_write, using_infer_string): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -331,7 +331,10 @@ def test_setitem(self, float_frame, using_copy_on_write): with pytest.raises(SettingWithCopyError, match=msg): smaller["col10"] = ["1", "2"] - assert smaller["col10"].dtype == np.object_ + if using_infer_string: + assert smaller["col10"].dtype == "string" + else: + assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() def test_setitem2(self): @@ -426,7 +429,7 @@ def test_setitem_cast(self, float_frame): float_frame["something"] = 2.5 assert float_frame["something"].dtype == np.float64 - def test_setitem_corner(self, float_frame): + def test_setitem_corner(self, float_frame, using_infer_string): # corner case df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) del df["B"] @@ -463,10 +466,16 @@ def test_setitem_corner(self, float_frame): dm["foo"] = "bar" del dm["foo"] dm["foo"] = "bar" - assert dm["foo"].dtype == np.object_ + if using_infer_string: + assert dm["foo"].dtype == "string" + else: + assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] - assert dm["coercible"].dtype == np.object_ + if using_infer_string: + assert dm["coercible"].dtype == "string" + else: + assert dm["coercible"].dtype == np.object_ def test_setitem_corner2(self): data = { @@ -483,7 +492,7 @@ def test_setitem_corner2(self): assert df.loc[1, "title"] == "foobar" assert df.loc[1, "cruft"] == 0 - def test_setitem_ambig(self): + def test_setitem_ambig(self, using_infer_string): # Difficulties with mixed-type data # Created as float type dm = DataFrame(index=range(3), columns=range(3)) @@ -499,18 +508,22 @@ def test_setitem_ambig(self): dm[2] = uncoercable_series assert len(dm.columns) == 3 - assert dm[2].dtype == np.object_ + if using_infer_string: + assert dm[2].dtype == "string" + else: + assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame): + def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] + key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, None], float_frame["A"], check_names=False + float_frame.loc[:, key], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) repr(float_frame) def test_loc_setitem_boolean_mask_allfalse(self): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..0ccd181ebfc6c 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1298,7 +1298,7 @@ def test_setitem_column_frame_as_category(self): df["col2"] = Series([1, 2, 3], dtype="category") expected_types = Series( - ["int64", "category", "category"], index=[0, "col1", "col2"] + ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object ) tm.assert_series_equal(df.dtypes, expected_types) From a62342106b27860624d38c239330872881eb10c3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:31:58 +0100 Subject: [PATCH 15/36] BUG: Index.insert raising when inserting None into new string dtype --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_arrow.py | 5 +++++ pandas/tests/indexes/base_class/test_reshape.py | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..5a5380177620e 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,6 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (issue:`TODO`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..8cb6e5a0e9db2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -613,3 +613,8 @@ def _reduce( ) else: return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..886793b299ba2 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH# + pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "pos,expected", [ From 3cf79ef7b911ea4177f559705d3a76cfb0904073 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:55:08 +0100 Subject: [PATCH 16/36] Fix tests --- pandas/tests/frame/indexing/test_getitem.py | 2 +- pandas/tests/frame/indexing/test_where.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index ed66bdf6c5a25..bf503cb090b45 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -106,7 +106,7 @@ def test_getitem_list_duplicates(self): def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype=.*)] are in the [columns]\"" + msg = "\"None of [Index(['baf'], dtype=" with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 4576a86ad27cd..0572798b8ce61 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1079,9 +1079,13 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement): +def test_where_int_overflow(replacement, using_infer_string, request): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) + if using_infer_string and replacement not in (None, "snake"): + request.node.add_marker( + pytest.mark.xfail(reason="Can't set non-string into string column") + ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) From e823c97a39615193265ac8d96718c0a47edcd11b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:58:28 +0100 Subject: [PATCH 17/36] BUG: Inserting ndim=0 array does not infer string dtype --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/construction.py | 9 ++++++++- pandas/tests/frame/indexing/test_indexing.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..b6c86621537f8 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`TODO`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..bb1b4978d2367 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -562,7 +562,14 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") + data = dtype.construct_array_type()._from_sequence_of_strings([data]) + else: + data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..ac4a820b1b319 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1905,6 +1905,18 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH# + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. From c89da874207cd95abcfe96ec894738385387c019 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 17:06:32 +0100 Subject: [PATCH 18/36] Fix tests --- pandas/core/construction.py | 4 +++- pandas/tests/frame/indexing/test_set_value.py | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bb1b4978d2367..1b0150dbab602 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -566,7 +566,9 @@ def sanitize_array( from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") - data = dtype.construct_array_type()._from_sequence_of_strings([data]) + data = dtype.construct_array_type()._from_sequence_of_strings( + [data] * len(index) + ) else: data = construct_1d_arraylike_from_scalar(data, len(index), dtype) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index 32312868adacb..1e3c793c8449f 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -16,7 +16,7 @@ def test_set_value(self, float_frame): float_frame._set_value(idx, col, 1) assert float_frame[col][idx] == 1 - def test_set_value_resize(self, float_frame): + def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame._set_value("foobar", "B", 0) assert res is None assert float_frame.index[-1] == "foobar" @@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame): res = float_frame.copy() res._set_value("foobar", "baz", "sam") - assert res["baz"].dtype == np.object_ - + if using_infer_string: + assert res["baz"].dtype == "string" + else: + assert res["baz"].dtype == np.object_ res = float_frame.copy() with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" From 04f3d9dc95daaca2e94099c1905625451c3632e6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 17:44:04 +0100 Subject: [PATCH 19/36] Fix tests --- pandas/tests/frame/test_reductions.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e66557f132c1d..a4bcdbc702e65 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -590,14 +590,15 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self): + def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - with tm.assert_produces_warning(UserWarning): + warning = None if using_infer_string else UserWarning + with tm.assert_produces_warning(warning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -1273,7 +1274,8 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 df = DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}, + columns=Index(["col1", "col2", "col3"], dtype=object), ) result = df.all(bool_only=True) From 28f5411cdf5f7a1290caa9754a319b60c019d5fd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 18:04:05 +0100 Subject: [PATCH 20/36] Fix more tests --- pandas/tests/frame/test_reductions.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a4bcdbc702e65..94e3f40b61d52 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1237,7 +1237,9 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): + def test_any_all_object_dtype( + self, axis, bool_agg_func, skipna, using_infer_string + ): # GH#35450 df = DataFrame( data=[ @@ -1247,8 +1249,13 @@ def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): [np.nan, np.nan, "5", np.nan], ] ) + if using_infer_string: + # na in object is True while in string pyarrow numpy it's false + val = False if axis == 0 and not skipna and bool_agg_func == "all" else True + else: + val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, True, True]) + expected = Series([True, True, val, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. From ca296ec5b4efc412155adc285219a6a64ab24b92 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 22:41:42 +0200 Subject: [PATCH 21/36] Fix more tests --- pandas/tests/frame/test_reductions.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3fe2076f91356..0ef230e448a75 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -167,11 +167,17 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no_scipy), ], ) - def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if (opname in ("sum", "min", "max") and axis == 0) or opname in ( - "count", - "nunique", - ): + def test_stat_op_api_float_string_frame( + self, float_string_frame, axis, opname, using_infer_string + ): + if ( + (opname in ("sum", "min", "max") and axis == 0) + or opname + in ( + "count", + "nunique", + ) + ) and not (using_infer_string and opname == "sum"): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -197,7 +203,11 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): elif opname in ["min", "max"]: msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": - msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) + msg = re.compile( + r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + ) + if not isinstance(msg, re.Pattern): + msg = msg + "|does not support" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -358,6 +368,7 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -369,6 +380,7 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -887,7 +899,8 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert|does not support" + with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it works... From ab35982e4bba6b49a32e619c733a5507484f8fe3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 22:50:07 +0200 Subject: [PATCH 22/36] BUG: idxmax raising for arrow strings --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/core/arrays/string_arrow.py | 8 ++++++++ pandas/tests/frame/test_reductions.py | 9 +++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2c788411eb089..9743ca891d4b8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1627,6 +1627,15 @@ def _reduce( ------ TypeError : subclass does not define reductions """ + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _reduce_calc( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: @@ -1637,7 +1646,7 @@ def _reduce( [pa_result], type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), ) - return type(self)(result) + return result if pc.is_null(pa_result).as_py(): return self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e904123849821..33fcdf56d31cb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -501,6 +501,14 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if name in ("argmin", "argmax") and isinstance(result, pa.Array): + return self._convert_int_dtype(result) + return type(self)(result) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 77f64b18a82f8..1fcc08946cb04 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1069,6 +1069,15 @@ def test_idxmax_arrow_types(self): expected = Series([2, 1], index=["a", "b"]) tm.assert_series_equal(result, expected) + df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]") + result = df.idxmax(numeric_only=False) + expected = Series([1], index=["a"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin(numeric_only=False) + expected = Series([2], index=["a"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" From d910efac8126b1845dad9f64269b09ba525a41b1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 22:51:10 +0200 Subject: [PATCH 23/36] Fix --- pandas/core/arrays/string_arrow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 33fcdf56d31cb..5f800e781d2fa 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -507,7 +507,10 @@ def _reduce( result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_dtype(result) - return type(self)(result) + elif isinstance(result, pa.Array): + return type(self)(result) + else: + return result class ArrowStringArrayNumpySemantics(ArrowStringArray): From d0221e35a05132447ad6a79d6db675b5148bd287 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 23:01:53 +0200 Subject: [PATCH 24/36] Fix more tests --- pandas/tests/frame/test_arithmetic.py | 11 ++++++++--- pandas/tests/frame/test_query_eval.py | 17 ++++++++++++----- pandas/tests/frame/test_reductions.py | 4 ++-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bb9a76829c77d..f27e51cb98ea7 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -417,8 +417,8 @@ def test_bool_flex_frame_complex_dtype(self): def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object) result = df1.ne(df2) exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) @@ -1997,7 +1997,12 @@ def test_dataframe_blockwise_slicelike(): "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"), - (DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")), "object"), + ( + DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( + {"b": object} + ), + "object", + ), ], ) def test_dataframe_operation_with_non_numeric_types(df, col_dtype): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 72e8236159bda..880775f709dcb 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1031,7 +1031,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine): + def test_object_array_eq_ne(self, parser, engine, using_infer_string): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1040,11 +1040,14 @@ def test_object_array_eq_ne(self, parser, engine): "d": np.random.default_rng(2).integers(9, size=12), } ) - res = df.query("a == b", parser=parser, engine=engine) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - res = df.query("a != b", parser=parser, engine=engine) + with tm.assert_produces_warning(warning): + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1083,12 +1086,16 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings(self, parser, engine, op, func): + def test_query_lex_compare_strings( + self, parser, engine, op, func, using_infer_string + ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 7f7ab4b8989c4..1fe2b3c8ff9b8 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1888,7 +1888,7 @@ def test_minmax_extensionarray(method, numeric_only): expected = Series( [getattr(int64_info, method)], dtype="Int64", - index=Index(["Int64"], dtype="object"), + index=Index(["Int64"]), ) tm.assert_series_equal(result, expected) @@ -1906,7 +1906,7 @@ def test_prod_sum_min_count_mixed_object(): df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) - expected = Series(["a"]) + expected = Series(["a"], dtype=object) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") From 18d5e62da95b06c07eefc4477a039daead8636e3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 23:21:39 +0200 Subject: [PATCH 25/36] Fix more tests --- pandas/tests/frame/test_query_eval.py | 2 +- pandas/tests/frame/test_stack_unstack.py | 23 +++++++++++++++-------- pandas/tests/frame/test_unary.py | 1 + 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 880775f709dcb..bbff4a507d8f4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1169,7 +1169,7 @@ def test_bool_arith_expr(self, frame, parser, engine): @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) - msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" + msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot" with pytest.raises(TypeError, match=msg): df.eval(f"a {op} b", engine=engine, parser=parser) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 9b76ae093e8c4..bbb3d08e0e566 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -602,7 +602,7 @@ def test_unstack_to_series(self, float_frame): data = data.unstack() tm.assert_frame_equal(old_data, data) - def test_unstack_dtypes(self): + def test_unstack_dtypes(self, using_infer_string): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] @@ -638,8 +638,9 @@ def test_unstack_dtypes(self): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes + dtype = "string" if using_infer_string else np.dtype("object") expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), @@ -1321,14 +1322,16 @@ def test_unstack_fill_frame_object(): # By default missing values will be NaN result = data.unstack() expected = DataFrame( - {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, + index=list("xyz"), + dtype=object, ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = DataFrame( - {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object ) tm.assert_frame_equal(result, expected) @@ -2013,7 +2016,7 @@ def test_stack_multiple_bug(self, future_stack): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2210,7 +2213,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): tm.assert_frame_equal(result, expected) def test_unstack_preserve_types( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_infer_string ): # GH#403 ymd = multiindex_year_month_day_dataframe_random_data @@ -2219,7 +2222,11 @@ def test_unstack_preserve_types( unstacked = ymd.unstack("month") assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ + assert ( + unstacked["E", 1].dtype == np.object_ + if not using_infer_string + else "string" + ) assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self, future_stack): @@ -2279,7 +2286,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): expected = DataFrame( [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], - index=Index(["A", "B"], dtype="object", name="a"), + index=Index(["A", "B"], name="a"), columns=MultiIndex.from_tuples( [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 5e29d3c868983..a6f1f87db3010 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -51,6 +51,7 @@ def test_neg_object(self, df, expected): def test_neg_raises(self, df): msg = ( "bad operand type for unary -: 'str'|" + "has no kernel matching input types|" r"bad operand type for unary -: 'DatetimeArray'" ) with pytest.raises(TypeError, match=msg): From 130eeb3d3300e2e2cc341a917c0321a1c75a0e58 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Oct 2023 21:43:07 +0200 Subject: [PATCH 26/36] Fix more tests --- pandas/tests/frame/test_reductions.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1fe2b3c8ff9b8..e0765c3fab145 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -386,6 +388,9 @@ def test_mixed_ops(self, op): with pytest.raises(TypeError, match=msg): getattr(df, op)() + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -452,7 +457,9 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) - with pytest.raises(TypeError, match="unsupported operand type"): + with pytest.raises( + TypeError, match="unsupported operand type|does not support" + ): df.mean() result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) @@ -584,7 +591,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, "a", np.nan], + "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -1868,6 +1875,9 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" +) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) From 1e7b93e5c3ea7bf957c7308ed79985bb2e400bbe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Oct 2023 22:51:20 +0200 Subject: [PATCH 27/36] Fix remaining tests --- .../frame/constructors/test_from_dict.py | 5 +++ .../frame/constructors/test_from_records.py | 5 +++ pandas/tests/frame/test_api.py | 2 + pandas/tests/frame/test_arithmetic.py | 5 +++ pandas/tests/frame/test_block_internals.py | 4 +- pandas/tests/frame/test_constructors.py | 39 +++++++++++-------- pandas/tests/frame/test_logical_ops.py | 12 ++++-- pandas/tests/frame/test_repr_info.py | 3 ++ pandas/tests/frame/test_unary.py | 21 +++++++--- 9 files changed, 68 insertions(+), 28 deletions(-) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index d78924ff9d046..631709bf713bf 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( DataFrame, Index, @@ -42,6 +44,9 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="columns inferring logic broken" + ) def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 7dffa7bb242d5..36549fdd9d7df 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import is_platform_little_endian from pandas import ( @@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + ) def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..0f1f4018a069c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype from pandas._config.config import option_context from pandas.util._test_decorators import async_mark @@ -114,6 +115,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f27e51cb98ea7..b93d2cd3111d7 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -236,6 +238,9 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't compare string and int" + ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 9e8d92e832d01..b57b639c955e4 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -176,7 +176,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - def test_construction_with_mixed(self, float_string_frame): + def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround @@ -199,7 +199,7 @@ def test_construction_with_mixed(self, float_string_frame): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 4b41050c86467..b558b303069de 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,6 +21,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -79,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. "[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str)) + expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -261,8 +263,9 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self, float_string_frame): - assert float_string_frame["foo"].dtype == np.object_ + def test_constructor_mixed(self, float_string_frame, using_infer_string): + dtype = "string" if using_infer_string else np.object_ + assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): # as of 2.0, we raise if we can't respect "dtype", previously we @@ -318,6 +321,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -325,6 +329,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -764,7 +769,7 @@ def test_constructor_dict_block(self): ) tm.assert_numpy_array_equal(df.values, expected) - def test_constructor_dict_cast(self): + def test_constructor_dict_cast(self, using_infer_string): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) @@ -774,7 +779,7 @@ def test_constructor_dict_cast(self): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ + assert frame["B"].dtype == np.object_ if not using_infer_string else "string" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1186,7 +1191,7 @@ def test_constructor_dtype_nullable_extension_arrays( df = DataFrame({"a": data}, dtype=input_dtype) assert df["a"].dtype == expected_dtype() - def test_constructor_scalar_inference(self): + def test_constructor_scalar_inference(self, using_infer_string): data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"} df = DataFrame(data, index=np.arange(10)) @@ -1194,7 +1199,7 @@ def test_constructor_scalar_inference(self): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ + assert df["object"].dtype == np.object_ if not using_infer_string else "string" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1273,11 +1278,11 @@ def empty_gen(): df = DataFrame(empty_gen(), columns=["A", "B"]) tm.assert_frame_equal(df, expected) - def test_constructor_list_of_lists(self): + def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ + assert df["str"].dtype == np.object_ if not using_infer_string else "string" # GH 4851 # list of 0-dim ndarrays @@ -1822,7 +1827,7 @@ def test_constructor_single_value(self): with pytest.raises(TypeError, match=msg): DataFrame("a", [1, 2], ["a", "c"], float) - def test_constructor_with_datetimes(self): + def test_constructor_with_datetimes(self, using_infer_string): intname = np.dtype(np.int_).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -1841,7 +1846,7 @@ def test_constructor_with_datetimes(self): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname)] * 2 + + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1863,7 +1868,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1885,7 +1890,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1922,13 +1927,13 @@ def test_constructor_with_datetimes3(self): df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -2053,7 +2058,7 @@ def test_constructor_timedelta_non_ns(self, order, unit): # dtype=exp_dtype. tm.assert_frame_equal(df, expected) - def test_constructor_for_list_with_dtypes(self): + def test_constructor_for_list_with_dtypes(self, using_infer_string): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes @@ -2104,7 +2109,7 @@ def test_constructor_for_list_with_dtypes(self): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index a15d7d7f93f01..16ca3a202f1e0 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -96,7 +96,7 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - def test_logical_ops_invalid(self): + def test_logical_ops_invalid(self, using_infer_string): # GH#5808 df1 = DataFrame(1.0, index=[1], columns=["A"]) @@ -108,8 +108,14 @@ def test_logical_ops_invalid(self): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - with pytest.raises(TypeError, match=msg): - df1 | df2 + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): + df1 | df2 + else: + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 64d516e484991..c0cdf4853c4fe 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( NA, Categorical, @@ -167,6 +169,7 @@ def test_repr_mixed_big(self): repr(biggie) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self, float_frame): buf = StringIO() diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index a6f1f87db3010..f79534bc08c93 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -48,16 +48,25 @@ def test_neg_object(self, df, expected): pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), ], ) - def test_neg_raises(self, df): + def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" - "has no kernel matching input types|" r"bad operand type for unary -: 'DatetimeArray'" ) - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df) + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df["a"]) + + else: + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame From 6e55ce22836ac40c321cfabc494b20e6c2f5681a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Oct 2023 22:54:46 +0200 Subject: [PATCH 28/36] Fix remaining tests --- pandas/tests/frame/test_unary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index f79534bc08c93..850c92013694f 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -53,7 +53,7 @@ def test_neg_raises(self, df, using_infer_string): "bad operand type for unary -: 'str'|" r"bad operand type for unary -: 'DatetimeArray'" ) - if using_infer_string: + if using_infer_string and df.dtypes.iloc[0] == "string": import pyarrow as pa msg = "has no kernel" From 70483617f0a4a33cbd77ae0e8ace832d69f9a9d4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Oct 2023 22:56:19 +0200 Subject: [PATCH 29/36] Change default --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index c0201e13ed1bf..8c98f9fc25756 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -903,7 +903,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - os.environ.get("PANDAS_INFER_STRING", "0") == "0", + os.environ.get("PANDAS_INFER_STRING", "0") == "1", "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From 0cb459c5763c794c669cc217aec6e07dc9cae97a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 20:07:18 +0200 Subject: [PATCH 30/36] BUG: Groupby not keeping string dtype for empty objects --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/base.py | 2 ++ pandas/core/groupby/ops.py | 20 +++++++++++++------- pandas/tests/groupby/test_reductions.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..0f8ba33160e72 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 05e6fc09a5ef6..b48250ca95df6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,6 +2352,8 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 607059e5183ec..e4cba7ce8f1cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -33,6 +33,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, maybe_downcast_to_dtype, @@ -837,10 +838,8 @@ def agg_series( ------- np.ndarray or ExtensionArray """ - # test_groupby_empty_with_category gets here with self.ngroups == 0 - # and len(obj) > 0 - if len(obj) > 0 and not isinstance(obj._values, np.ndarray): + if not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence @@ -849,11 +848,18 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): + cls = obj.dtype.construct_array_type() + out = cls._from_sequence(result) + else: - out = npvalues + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True + ) + else: + out = npvalues return out @final diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index fdfb211ac2269..35ad8e3f5dc61 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -575,6 +575,19 @@ def test_groupby_min_max_categorical(func): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["min", "max"]) +def test_min_empty_string_dtype(func): + # GH#55619 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] + result = getattr(df.groupby("a"), func)() + expected = DataFrame( + columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a") + ) + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log From aff4f170a465d376c991c0048ad69b075bf0f09a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 20:36:01 +0200 Subject: [PATCH 31/36] Start fixing gb tests --- pandas/core/arrays/base.py | 2 ++ pandas/core/config_init.py | 2 +- pandas/tests/groupby/test_apply.py | 27 +++++++++++++----------- pandas/tests/groupby/test_categorical.py | 5 +++-- pandas/tests/groupby/test_function.py | 18 ++++++++++++---- pandas/tests/groupby/test_groupby.py | 25 ++++++++++++++-------- 6 files changed, 51 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 177c105688d0c..254b38cd59f13 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2300,6 +2300,8 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 8c98f9fc25756..06979f90a571f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -903,7 +903,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - os.environ.get("PANDAS_INFER_STRING", "0") == "1", + os.environ.get("PANDAS_INFER_STRING", "1") == "1", "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index abcb9f68e0f5c..b3dbe9f778573 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -38,7 +38,7 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_issues(): +def test_apply_issues(using_infer_string): # GH 5788 s = """2011.05.16,00:00,1.40893 @@ -69,8 +69,9 @@ def test_apply_issues(): # GH 5789 # don't auto coerce dates df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=dtype, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -81,14 +82,15 @@ def test_apply_issues(): tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -98,13 +100,14 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -901,7 +904,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -912,9 +915,8 @@ def test_apply_datetime_issue(group_column_dtlike): with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -981,7 +983,7 @@ def test_apply_multi_level_name(category): assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -995,8 +997,9 @@ def test_groupby_apply_datetime_result_dtypes(): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b11240c841420..51e585c06d5b1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -129,7 +129,8 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a92880c87b847..ac15e0489d45a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -275,6 +275,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -291,6 +292,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -545,7 +547,7 @@ def test_idxmin_idxmax_axis1(): @pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("corrwith", "skew"): @@ -607,8 +609,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): "can't multiply sequence by non-int of type 'float'", # cumsum, diff, pct_change "unsupported operand type", + "has no kernel", ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): method(*args, **kwargs) else: @@ -1657,7 +1666,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, ], ) -def test_groupby_empty_dataset(dtype, kwargs): +def test_groupby_empty_dataset(dtype, kwargs, using_infer_string): # GH#41575 df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) df["B"] = df["B"].astype(int) @@ -1669,7 +1678,8 @@ def test_groupby_empty_dataset(dtype, kwargs): result = df.iloc[:0].groupby("A").B.describe(**kwargs) expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) + dtype = "string[pyarrow_numpy]" if using_infer_string else None + expected.index = Index([], dtype=dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4ca8b0e317bd2..ced688c951d8d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,8 @@ SpecificationError, ) +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -672,7 +674,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -961,7 +963,7 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1017,7 +1019,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1042,7 +1044,7 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1085,7 +1087,7 @@ def test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1174,7 +1176,7 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1183,10 +1185,11 @@ def test_groupby_complex_numbers(): {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1697,14 +1700,18 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) From 620555efa85e1952bb08c350dc5192c5eb0d6a9e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 21 Oct 2023 21:25:15 +0200 Subject: [PATCH 32/36] Fix tests --- pandas/tests/groupby/test_groupby.py | 10 ++++++++-- pandas/tests/groupby/test_raises.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6e8f2e0391c7d..17e8bdff942ca 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2008,7 +2008,9 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): +def test_empty_groupby( + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string +): # GH8093 & GH26411 override_dtype = None @@ -2049,7 +2051,11 @@ def get_categorical_invalid_expected(): # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 46bf324fad1d7..613ecd30cb158 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -189,7 +189,7 @@ def test_groupby_raises_string( "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed [how->var,dtype->"), ), }[groupby_func] From 05f1c958f242c17aebac40dd53145938b58fb93b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 22:39:39 +0100 Subject: [PATCH 33/36] Merge main --- pandas/core/arrays/string_arrow.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e079580302b7e..2a10e87981bc3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -535,17 +535,6 @@ def _rank( ) ) - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) - if name in ("argmin", "argmax") and isinstance(result, pa.Array): - return self._convert_int_dtype(result) - elif isinstance(result, pa.Array): - return type(self)(result) - else: - return result - class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" From c7ba71774ffa96db92ddc04b46a59ea1b70435c9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 23:30:13 +0100 Subject: [PATCH 34/36] Update config_init.py --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 112f889d99b06..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - os.environ.get("PANDAS_INFER_STRING", "1") == "1", + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From 15fb683d6f2ff0b92fed2bbbf0350c87710e4a73 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 9 Dec 2023 00:11:15 +0100 Subject: [PATCH 35/36] Fixup --- pandas/tests/frame/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index cb26977e140e2..66145c32c18d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1376,7 +1376,7 @@ def test_any_all_object_dtype( ) if using_infer_string: # na in object is True while in string pyarrow numpy it's false - val = False if axis == 0 and not skipna and bool_agg_func == "all" else True + val = not axis == 0 and not skipna and bool_agg_func == "all" else: val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) From fcc8245fbe6a8c7540e2f2accfe1a86f427cc9cf Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 9 Dec 2023 00:12:56 +0100 Subject: [PATCH 36/36] Update --- pandas/tests/frame/methods/test_dtypes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 00ced2978d85b..ab632ac17318e 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - def test_frame_apply_np_array_return_type(self): + def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - expected = Series([np.array(["bar"])]) + if using_infer_string: + expected = Series([np.array(["bar"])]) + else: + expected = Series(["bar"]) tm.assert_series_equal(result, expected)