diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..222aefb4afda8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1272,6 +1272,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d86eeadbaa0fe..825d295043e69 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b042cf632288b..d4363171788d4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index bbd9b150b88a8..7819b7b75f065 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -183,9 +183,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8ce4e8725d632..0723c3c70091c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 4b1435babe6b1..c1cdeaa6c10dd 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..3d46e03547c38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index edeac642551a0..91200f53e36bd 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index da3d626f2d777..8f8f7f64aba75 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -398,9 +388,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -417,6 +408,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 11b874d0b1608..6393468fb8ccd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8a421654cdf9b..a6ea1502103c5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 6036eddce7a01..0896b97e8a40e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index e17e39a334acc..56cdca49cb2b0 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 2e9ba007a45c1..ea3d068a673e8 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -191,24 +182,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected = Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 486b24845d2ff..2b62b384930d6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -933,10 +933,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 75284a8f8fd47..cd3d599abd30e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 76910db941d36..38961345dc1f2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -465,7 +465,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -528,7 +528,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3d07c0219691e..1c54232b8b510 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2245,18 +2245,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 07f29518b7881..b664423364f6b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index dd3a0eabe95ae..8ae87d4bab52d 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -310,7 +310,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -318,8 +317,8 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a1f3babb1ae3b..9721d045b7b91 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -243,5 +243,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 90133344fdfc9..efb3dffecd856 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -436,7 +436,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a29e479b7c9f1..4c2ea036f08dc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1109,8 +1109,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1140,8 +1140,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 980c88f070b89..c28a33069d23f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3809,7 +3809,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3817,7 +3816,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index f07c6845366da..9ce2c925a368b 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -708,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 4a12404f6775a..95aa5291cb45a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1242,9 +1242,9 @@ def test_missing_stubname(self, any_string_dtype): tm.assert_frame_equal(result, expected) -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1254,7 +1254,7 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) @@ -1264,7 +1264,7 @@ def test_wide_to_long_pyarrow_string_columns(): index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", "test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 1586195e79a9d..8516018e8aa93 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -514,7 +515,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bf01c4996bb32..ea9f89ed129aa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))