Skip to content

Commit 000ea36

Browse files
TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758)
1 parent d1052cf commit 000ea36

29 files changed

+119
-134
lines changed

pandas/conftest.py

+28
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,34 @@ def string_dtype(request):
12281228
return request.param
12291229

12301230

1231+
@pytest.fixture(
1232+
params=[
1233+
("python", pd.NA),
1234+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1235+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1236+
("python", np.nan),
1237+
],
1238+
ids=[
1239+
"string=string[python]",
1240+
"string=string[pyarrow]",
1241+
"string=str[pyarrow]",
1242+
"string=str[python]",
1243+
],
1244+
)
1245+
def string_dtype_no_object(request):
1246+
"""
1247+
Parametrized fixture for string dtypes.
1248+
* 'string[python]' (NA variant)
1249+
* 'string[pyarrow]' (NA variant)
1250+
* 'str' (NaN variant, with pyarrow)
1251+
* 'str' (NaN variant, without pyarrow)
1252+
"""
1253+
# need to instantiate the StringDtype here instead of in the params
1254+
# to avoid importing pyarrow during test collection
1255+
storage, na_value = request.param
1256+
return pd.StringDtype(storage, na_value)
1257+
1258+
12311259
@pytest.fixture(
12321260
params=[
12331261
"string[python]",

pandas/tests/apply/test_numba.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas.util._test_decorators as td
77

8+
import pandas as pd
89
from pandas import (
910
DataFrame,
1011
Index,
@@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
2930

3031
def test_numba_vs_python_string_index():
3132
# GH#56189
32-
pytest.importorskip("pyarrow")
3333
df = DataFrame(
3434
1,
35-
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
36-
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
35+
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
36+
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
3737
)
3838
func = lambda x: x
3939
result = df.apply(func, engine="numba", axis=0)

pandas/tests/arrays/string_/test_string_arrow.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
241241
arr[[0, 1]] = ["foo", "bar", "baz"]
242242

243243

244-
@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
245-
def test_pickle_roundtrip(dtype):
244+
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
245+
def test_pickle_roundtrip(na_value):
246246
# GH 42600
247247
pytest.importorskip("pyarrow")
248+
dtype = StringDtype("pyarrow", na_value=na_value)
248249
expected = pd.Series(range(10), dtype=dtype)
249250
expected_sliced = expected.head(2)
250251
full_pickled = pickle.dumps(expected)

pandas/tests/base/test_misc.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,7 @@ def test_access_by_position(index_flat):
180180
assert index[-1] == index[size - 1]
181181

182182
msg = f"index {size} is out of bounds for axis 0 with size {size}"
183-
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
184-
index.dtype, "string[pyarrow_numpy]"
185-
):
183+
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
186184
msg = "index out of bounds"
187185
with pytest.raises(IndexError, match=msg):
188186
index[size]

pandas/tests/frame/indexing/test_indexing.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -1955,13 +1955,11 @@ def test_adding_new_conditional_column() -> None:
19551955
("dtype", "infer_string"),
19561956
[
19571957
(object, False),
1958-
("string[pyarrow_numpy]", True),
1958+
(pd.StringDtype(na_value=np.nan), True),
19591959
],
19601960
)
19611961
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
19621962
# https://github.com/pandas-dev/pandas/issues/56204
1963-
pytest.importorskip("pyarrow")
1964-
19651963
df = DataFrame({"a": [1, 2], "b": [3, 4]})
19661964
with pd.option_context("future.infer_string", infer_string):
19671965
df.loc[df["a"] == 1, "c"] = "1"
@@ -1971,16 +1969,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
19711969
tm.assert_frame_equal(df, expected)
19721970

19731971

1974-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
19751972
def test_add_new_column_infer_string():
19761973
# GH#55366
1977-
pytest.importorskip("pyarrow")
19781974
df = DataFrame({"x": [1]})
19791975
with pd.option_context("future.infer_string", True):
19801976
df.loc[df["x"] == 1, "y"] = "1"
19811977
expected = DataFrame(
1982-
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
1983-
columns=Index(["x", "y"], dtype=object),
1978+
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
1979+
columns=Index(["x", "y"], dtype="str"),
19841980
)
19851981
tm.assert_frame_equal(df, expected)
19861982

pandas/tests/frame/methods/test_rank.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from pandas.compat import HAS_PYARROW
1616

17+
import pandas as pd
1718
from pandas import (
1819
DataFrame,
1920
Index,
@@ -509,14 +510,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
509510
result = df.rank(numeric_only=True)
510511
tm.assert_frame_equal(result, expected)
511512

512-
@pytest.mark.parametrize(
513-
"dtype, exp_dtype",
514-
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
515-
)
516-
def test_rank_string_dtype(self, dtype, exp_dtype):
513+
def test_rank_string_dtype(self, string_dtype_no_object):
517514
# GH#55362
518-
pytest.importorskip("pyarrow")
519-
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
515+
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
520516
result = obj.rank(method="first")
517+
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
518+
if string_dtype_no_object.storage == "python":
519+
# TODO nullable string[python] should also return nullable Int64
520+
exp_dtype = "float64"
521521
expected = Series([1, 2, None, 3], dtype=exp_dtype)
522522
tm.assert_series_equal(result, expected)

pandas/tests/frame/test_constructors.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -2721,8 +2721,7 @@ def test_construct_with_strings_and_none(self):
27212721

27222722
def test_frame_string_inference(self):
27232723
# GH#54430
2724-
pytest.importorskip("pyarrow")
2725-
dtype = "string[pyarrow_numpy]"
2724+
dtype = pd.StringDtype(na_value=np.nan)
27262725
expected = DataFrame(
27272726
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27282727
)
@@ -2756,8 +2755,7 @@ def test_frame_string_inference(self):
27562755

27572756
def test_frame_string_inference_array_string_dtype(self):
27582757
# GH#54496
2759-
pytest.importorskip("pyarrow")
2760-
dtype = "string[pyarrow_numpy]"
2758+
dtype = pd.StringDtype(na_value=np.nan)
27612759
expected = DataFrame(
27622760
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27632761
)
@@ -2781,7 +2779,6 @@ def test_frame_string_inference_array_string_dtype(self):
27812779

27822780
def test_frame_string_inference_block_dim(self):
27832781
# GH#55363
2784-
pytest.importorskip("pyarrow")
27852782
with pd.option_context("future.infer_string", True):
27862783
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
27872784
assert df._mgr.blocks[0].ndim == 2

pandas/tests/groupby/methods/test_size.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from pandas._config import using_string_dtype
55

6-
import pandas.util._test_decorators as td
7-
86
from pandas.core.dtypes.common import is_integer_dtype
97

108
from pandas import (
@@ -111,16 +109,9 @@ def test_size_series_masked_type_returns_Int64(dtype):
111109

112110

113111
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
114-
@pytest.mark.parametrize(
115-
"dtype",
116-
[
117-
object,
118-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
119-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
120-
],
121-
)
122-
def test_size_strings(dtype):
112+
def test_size_strings(any_string_dtype):
123113
# GH#55627
114+
dtype = any_string_dtype
124115
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
125116
result = df.groupby("a")["b"].size()
126117
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"

pandas/tests/groupby/methods/test_value_counts.py

+3-11
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
import pandas.util._test_decorators as td
12-
1311
from pandas import (
1412
Categorical,
1513
CategoricalIndex,
@@ -389,14 +387,6 @@ def test_against_frame_and_seriesgroupby(
389387
tm.assert_frame_equal(result, expected)
390388

391389

392-
@pytest.mark.parametrize(
393-
"dtype",
394-
[
395-
object,
396-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
397-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
398-
],
399-
)
400390
@pytest.mark.parametrize("normalize", [True, False])
401391
@pytest.mark.parametrize(
402392
"sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -414,9 +404,10 @@ def test_compound(
414404
expected_rows,
415405
expected_count,
416406
expected_group_size,
417-
dtype,
407+
any_string_dtype,
418408
using_infer_string,
419409
):
410+
dtype = any_string_dtype
420411
education_df = education_df.astype(dtype)
421412
education_df.columns = education_df.columns.astype(dtype)
422413
# Multiple groupby keys and as_index=False
@@ -433,6 +424,7 @@ def test_compound(
433424
expected["proportion"] = expected_count
434425
expected["proportion"] /= expected_group_size
435426
if dtype == "string[pyarrow]":
427+
# TODO(nullable) also string[python] should return nullable dtypes
436428
expected["proportion"] = expected["proportion"].convert_dtypes()
437429
else:
438430
expected["count"] = expected_count

pandas/tests/groupby/test_groupby.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -2832,20 +2832,13 @@ def test_rolling_wrong_param_min_period():
28322832
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
28332833

28342834

2835-
@pytest.mark.parametrize(
2836-
"dtype",
2837-
[
2838-
object,
2839-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
2840-
],
2841-
)
2842-
def test_by_column_values_with_same_starting_value(dtype):
2835+
def test_by_column_values_with_same_starting_value(any_string_dtype):
28432836
# GH29635
28442837
df = DataFrame(
28452838
{
28462839
"Name": ["Thomas", "Thomas", "Thomas John"],
28472840
"Credit": [1200, 1300, 900],
2848-
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
2841+
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
28492842
}
28502843
)
28512844
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}

pandas/tests/groupby/test_reductions.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -702,10 +702,9 @@ def test_groupby_min_max_categorical(func):
702702

703703

704704
@pytest.mark.parametrize("func", ["min", "max"])
705-
def test_min_empty_string_dtype(func):
705+
def test_min_empty_string_dtype(func, string_dtype_no_object):
706706
# GH#55619
707-
pytest.importorskip("pyarrow")
708-
dtype = "string[pyarrow_numpy]"
707+
dtype = string_dtype_no_object
709708
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
710709
result = getattr(df.groupby("a"), func)()
711710
expected = DataFrame(

pandas/tests/indexes/base_class/test_constructors.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list):
4747

4848
def test_index_string_inference(self):
4949
# GH#54430
50-
pytest.importorskip("pyarrow")
51-
dtype = "string[pyarrow_numpy]"
52-
expected = Index(["a", "b"], dtype=dtype)
50+
expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
5351
with pd.option_context("future.infer_string", True):
5452
ser = Index(["a", "b"])
5553
tm.assert_index_equal(ser, expected)

pandas/tests/indexes/base_class/test_reshape.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,11 @@ def test_insert_datetime_into_object(self, loc, val):
5959
tm.assert_index_equal(result, expected)
6060
assert type(expected[2]) is type(val)
6161

62-
def test_insert_none_into_string_numpy(self):
62+
def test_insert_none_into_string_numpy(self, string_dtype_no_object):
6363
# GH#55365
64-
pytest.importorskip("pyarrow")
65-
index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
64+
index = Index(["a", "b", "c"], dtype=string_dtype_no_object)
6665
result = index.insert(-1, None)
67-
expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]")
66+
expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object)
6867
tm.assert_index_equal(result, expected)
6968

7069
@pytest.mark.parametrize(

pandas/tests/indexes/object/test_indexing.py

+6-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
NA,
88
is_matching_na,
99
)
10-
import pandas.util._test_decorators as td
1110

1211
import pandas as pd
1312
from pandas import Index
@@ -159,14 +158,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):
159158

160159

161160
class TestSliceLocs:
162-
# TODO(infer_string) parametrize over multiple string dtypes
163-
@pytest.mark.parametrize(
164-
"dtype",
165-
[
166-
"object",
167-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
168-
],
169-
)
170161
@pytest.mark.parametrize(
171162
"in_slice,expected",
172163
[
@@ -190,24 +181,22 @@ class TestSliceLocs:
190181
(pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
191182
],
192183
)
193-
def test_slice_locs_negative_step(self, in_slice, expected, dtype):
194-
index = Index(list("bcdxy"), dtype=dtype)
184+
def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
185+
index = Index(list("bcdxy"), dtype=any_string_dtype)
195186

196187
s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
197188
result = index[s_start : s_stop : in_slice.step]
198-
expected = Index(list(expected), dtype=dtype)
189+
expected = Index(list(expected), dtype=any_string_dtype)
199190
tm.assert_index_equal(result, expected)
200191

201-
# TODO(infer_string) parametrize over multiple string dtypes
202-
@td.skip_if_no("pyarrow")
203-
def test_slice_locs_negative_step_oob(self):
204-
index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
192+
def test_slice_locs_negative_step_oob(self, any_string_dtype):
193+
index = Index(list("bcdxy"), dtype=any_string_dtype)
205194

206195
result = index[-10:5:1]
207196
tm.assert_index_equal(result, index)
208197

209198
result = index[4:-10:-1]
210-
expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
199+
expected = Index(list("yxdcb"), dtype=any_string_dtype)
211200
tm.assert_index_equal(result, expected)
212201

213202
def test_slice_locs_dup(self):

pandas/tests/indexes/test_base.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -971,10 +971,9 @@ def test_isin_empty(self, empty):
971971
result = index.isin(empty)
972972
tm.assert_numpy_array_equal(expected, result)
973973

974-
@td.skip_if_no("pyarrow")
975-
def test_isin_arrow_string_null(self):
974+
def test_isin_string_null(self, string_dtype_no_object):
976975
# GH#55821
977-
index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
976+
index = Index(["a", "b"], dtype=string_dtype_no_object)
978977
result = index.isin([None])
979978
expected = np.array([False, False])
980979
tm.assert_numpy_array_equal(result, expected)

pandas/tests/indexes/test_old_base.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,10 @@ def test_ensure_copied_data(self, index):
301301
tm.assert_numpy_array_equal(
302302
index._values._ndarray, result._values._ndarray, check_same="same"
303303
)
304-
elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
304+
elif (
305+
isinstance(index.dtype, StringDtype)
306+
and index.dtype.storage == "pyarrow"
307+
):
305308
assert tm.shares_memory(result._values, index._values)
306309
else:
307310
raise NotImplementedError(index.dtype)

0 commit comments

Comments
 (0)