Skip to content

Commit 83fd9ba

Browse files
TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758)
1 parent b717abb commit 83fd9ba

29 files changed

+119
-134
lines changed

pandas/conftest.py

+28
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,34 @@ def string_dtype(request):
12721272
return request.param
12731273

12741274

1275+
@pytest.fixture(
1276+
params=[
1277+
("python", pd.NA),
1278+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1279+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1280+
("python", np.nan),
1281+
],
1282+
ids=[
1283+
"string=string[python]",
1284+
"string=string[pyarrow]",
1285+
"string=str[pyarrow]",
1286+
"string=str[python]",
1287+
],
1288+
)
1289+
def string_dtype_no_object(request):
1290+
"""
1291+
Parametrized fixture for string dtypes.
1292+
* 'string[python]' (NA variant)
1293+
* 'string[pyarrow]' (NA variant)
1294+
* 'str' (NaN variant, with pyarrow)
1295+
* 'str' (NaN variant, without pyarrow)
1296+
"""
1297+
# need to instantiate the StringDtype here instead of in the params
1298+
# to avoid importing pyarrow during test collection
1299+
storage, na_value = request.param
1300+
return pd.StringDtype(storage, na_value)
1301+
1302+
12751303
@pytest.fixture(
12761304
params=[
12771305
"string[python]",

pandas/tests/apply/test_numba.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas.util._test_decorators as td
77

8+
import pandas as pd
89
from pandas import (
910
DataFrame,
1011
Index,
@@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
2930

3031
def test_numba_vs_python_string_index():
3132
# GH#56189
32-
pytest.importorskip("pyarrow")
3333
df = DataFrame(
3434
1,
35-
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
36-
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
35+
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
36+
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
3737
)
3838
func = lambda x: x
3939
result = df.apply(func, engine="numba", axis=0)

pandas/tests/arrays/string_/test_string_arrow.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
241241
arr[[0, 1]] = ["foo", "bar", "baz"]
242242

243243

244-
@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
245-
def test_pickle_roundtrip(dtype):
244+
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
245+
def test_pickle_roundtrip(na_value):
246246
# GH 42600
247247
pytest.importorskip("pyarrow")
248+
dtype = StringDtype("pyarrow", na_value=na_value)
248249
expected = pd.Series(range(10), dtype=dtype)
249250
expected_sliced = expected.head(2)
250251
full_pickled = pickle.dumps(expected)

pandas/tests/base/test_misc.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,7 @@ def test_access_by_position(index_flat):
183183
assert index[-1] == index[size - 1]
184184

185185
msg = f"index {size} is out of bounds for axis 0 with size {size}"
186-
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
187-
index.dtype, "string[pyarrow_numpy]"
188-
):
186+
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
189187
msg = "index out of bounds"
190188
with pytest.raises(IndexError, match=msg):
191189
index[size]

pandas/tests/frame/indexing/test_indexing.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None:
18641864
("dtype", "infer_string"),
18651865
[
18661866
(object, False),
1867-
("string[pyarrow_numpy]", True),
1867+
(pd.StringDtype(na_value=np.nan), True),
18681868
],
18691869
)
18701870
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
18711871
# https://github.com/pandas-dev/pandas/issues/56204
1872-
pytest.importorskip("pyarrow")
1873-
18741872
df = DataFrame({"a": [1, 2], "b": [3, 4]})
18751873
with pd.option_context("future.infer_string", infer_string):
18761874
df.loc[df["a"] == 1, "c"] = "1"
@@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
18801878
tm.assert_frame_equal(df, expected)
18811879

18821880

1883-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
18841881
def test_add_new_column_infer_string():
18851882
# GH#55366
1886-
pytest.importorskip("pyarrow")
18871883
df = DataFrame({"x": [1]})
18881884
with pd.option_context("future.infer_string", True):
18891885
df.loc[df["x"] == 1, "y"] = "1"
18901886
expected = DataFrame(
1891-
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
1892-
columns=Index(["x", "y"], dtype=object),
1887+
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
1888+
columns=Index(["x", "y"], dtype="str"),
18931889
)
18941890
tm.assert_frame_equal(df, expected)
18951891

pandas/tests/frame/methods/test_rank.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from pandas.compat import HAS_PYARROW
1616

17+
import pandas as pd
1718
from pandas import (
1819
DataFrame,
1920
Index,
@@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
502503
result = df.rank(numeric_only=True)
503504
tm.assert_frame_equal(result, expected)
504505

505-
@pytest.mark.parametrize(
506-
"dtype, exp_dtype",
507-
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
508-
)
509-
def test_rank_string_dtype(self, dtype, exp_dtype):
506+
def test_rank_string_dtype(self, string_dtype_no_object):
510507
# GH#55362
511-
pytest.importorskip("pyarrow")
512-
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
508+
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
513509
result = obj.rank(method="first")
510+
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
511+
if string_dtype_no_object.storage == "python":
512+
# TODO nullable string[python] should also return nullable Int64
513+
exp_dtype = "float64"
514514
expected = Series([1, 2, None, 3], dtype=exp_dtype)
515515
tm.assert_series_equal(result, expected)

pandas/tests/frame/test_constructors.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self):
26552655

26562656
def test_frame_string_inference(self):
26572657
# GH#54430
2658-
pytest.importorskip("pyarrow")
2659-
dtype = "string[pyarrow_numpy]"
2658+
dtype = pd.StringDtype(na_value=np.nan)
26602659
expected = DataFrame(
26612660
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26622661
)
@@ -2690,8 +2689,7 @@ def test_frame_string_inference(self):
26902689

26912690
def test_frame_string_inference_array_string_dtype(self):
26922691
# GH#54496
2693-
pytest.importorskip("pyarrow")
2694-
dtype = "string[pyarrow_numpy]"
2692+
dtype = pd.StringDtype(na_value=np.nan)
26952693
expected = DataFrame(
26962694
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
26972695
)
@@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self):
27152713

27162714
def test_frame_string_inference_block_dim(self):
27172715
# GH#55363
2718-
pytest.importorskip("pyarrow")
27192716
with pd.option_context("future.infer_string", True):
27202717
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
27212718
assert df._mgr.blocks[0].ndim == 2

pandas/tests/groupby/methods/test_size.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from pandas._config import using_string_dtype
55

6-
import pandas.util._test_decorators as td
7-
86
from pandas import (
97
DataFrame,
108
Index,
@@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype):
7977

8078

8179
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
82-
@pytest.mark.parametrize(
83-
"dtype",
84-
[
85-
object,
86-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
87-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
88-
],
89-
)
90-
def test_size_strings(dtype):
80+
def test_size_strings(any_string_dtype):
9181
# GH#55627
82+
dtype = any_string_dtype
9283
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
9384
result = df.groupby("a")["b"].size()
9485
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"

pandas/tests/groupby/methods/test_value_counts.py

+3-11
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
import pandas.util._test_decorators as td
11-
1210
from pandas import (
1311
Categorical,
1412
CategoricalIndex,
@@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby(
373371
tm.assert_frame_equal(result, expected)
374372

375373

376-
@pytest.mark.parametrize(
377-
"dtype",
378-
[
379-
object,
380-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
381-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
382-
],
383-
)
384374
@pytest.mark.parametrize("normalize", [True, False])
385375
@pytest.mark.parametrize(
386376
"sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -398,9 +388,10 @@ def test_compound(
398388
expected_rows,
399389
expected_count,
400390
expected_group_size,
401-
dtype,
391+
any_string_dtype,
402392
using_infer_string,
403393
):
394+
dtype = any_string_dtype
404395
education_df = education_df.astype(dtype)
405396
education_df.columns = education_df.columns.astype(dtype)
406397
# Multiple groupby keys and as_index=False
@@ -417,6 +408,7 @@ def test_compound(
417408
expected["proportion"] = expected_count
418409
expected["proportion"] /= expected_group_size
419410
if dtype == "string[pyarrow]":
411+
# TODO(nullable) string[python] should also return nullable dtypes
420412
expected["proportion"] = expected["proportion"].convert_dtypes()
421413
else:
422414
expected["count"] = expected_count

pandas/tests/groupby/test_groupby.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period():
24662466
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
24672467

24682468

2469-
@pytest.mark.parametrize(
2470-
"dtype",
2471-
[
2472-
object,
2473-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
2474-
],
2475-
)
2476-
def test_by_column_values_with_same_starting_value(dtype):
2469+
def test_by_column_values_with_same_starting_value(any_string_dtype):
24772470
# GH29635
24782471
df = DataFrame(
24792472
{
24802473
"Name": ["Thomas", "Thomas", "Thomas John"],
24812474
"Credit": [1200, 1300, 900],
2482-
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
2475+
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
24832476
}
24842477
)
24852478
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}

pandas/tests/groupby/test_reductions.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func):
714714

715715

716716
@pytest.mark.parametrize("func", ["min", "max"])
717-
def test_min_empty_string_dtype(func):
717+
def test_min_empty_string_dtype(func, string_dtype_no_object):
718718
# GH#55619
719-
pytest.importorskip("pyarrow")
720-
dtype = "string[pyarrow_numpy]"
719+
dtype = string_dtype_no_object
721720
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
722721
result = getattr(df.groupby("a"), func)()
723722
expected = DataFrame(

pandas/tests/indexes/base_class/test_constructors.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list):
4747

4848
def test_index_string_inference(self):
4949
# GH#54430
50-
pytest.importorskip("pyarrow")
51-
dtype = "string[pyarrow_numpy]"
52-
expected = Index(["a", "b"], dtype=dtype)
50+
expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
5351
with pd.option_context("future.infer_string", True):
5452
ser = Index(["a", "b"])
5553
tm.assert_index_equal(ser, expected)

pandas/tests/indexes/base_class/test_reshape.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val):
5757
tm.assert_index_equal(result, expected)
5858
assert type(expected[2]) is type(val)
5959

60-
def test_insert_none_into_string_numpy(self):
60+
def test_insert_none_into_string_numpy(self, string_dtype_no_object):
6161
# GH#55365
62-
pytest.importorskip("pyarrow")
63-
index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
62+
index = Index(["a", "b", "c"], dtype=string_dtype_no_object)
6463
result = index.insert(-1, None)
65-
expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]")
64+
expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object)
6665
tm.assert_index_equal(result, expected)
6766

6867
@pytest.mark.parametrize(

pandas/tests/indexes/object/test_indexing.py

+6-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
NA,
88
is_matching_na,
99
)
10-
import pandas.util._test_decorators as td
1110

1211
import pandas as pd
1312
from pandas import Index
@@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):
160159

161160

162161
class TestSliceLocs:
163-
# TODO(infer_string) parametrize over multiple string dtypes
164-
@pytest.mark.parametrize(
165-
"dtype",
166-
[
167-
"object",
168-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
169-
],
170-
)
171162
@pytest.mark.parametrize(
172163
"in_slice,expected",
173164
[
@@ -191,24 +182,22 @@ class TestSliceLocs:
191182
(pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
192183
],
193184
)
194-
def test_slice_locs_negative_step(self, in_slice, expected, dtype):
195-
index = Index(list("bcdxy"), dtype=dtype)
185+
def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
186+
index = Index(list("bcdxy"), dtype=any_string_dtype)
196187

197188
s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
198189
result = index[s_start : s_stop : in_slice.step]
199-
expected = Index(list(expected), dtype=dtype)
190+
expected = Index(list(expected), dtype=any_string_dtype)
200191
tm.assert_index_equal(result, expected)
201192

202-
# TODO(infer_string) parametrize over multiple string dtypes
203-
@td.skip_if_no("pyarrow")
204-
def test_slice_locs_negative_step_oob(self):
205-
index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
193+
def test_slice_locs_negative_step_oob(self, any_string_dtype):
194+
index = Index(list("bcdxy"), dtype=any_string_dtype)
206195

207196
result = index[-10:5:1]
208197
tm.assert_index_equal(result, index)
209198

210199
result = index[4:-10:-1]
211-
expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
200+
expected = Index(list("yxdcb"), dtype=any_string_dtype)
212201
tm.assert_index_equal(result, expected)
213202

214203
def test_slice_locs_dup(self):

pandas/tests/indexes/test_base.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -933,10 +933,9 @@ def test_isin_empty(self, empty):
933933
result = index.isin(empty)
934934
tm.assert_numpy_array_equal(expected, result)
935935

936-
@td.skip_if_no("pyarrow")
937-
def test_isin_arrow_string_null(self):
936+
def test_isin_string_null(self, string_dtype_no_object):
938937
# GH#55821
939-
index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
938+
index = Index(["a", "b"], dtype=string_dtype_no_object)
940939
result = index.isin([None])
941940
expected = np.array([False, False])
942941
tm.assert_numpy_array_equal(result, expected)

pandas/tests/indexes/test_old_base.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index):
295295
tm.assert_numpy_array_equal(
296296
index._values._ndarray, result._values._ndarray, check_same="same"
297297
)
298-
elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
298+
elif (
299+
isinstance(index.dtype, StringDtype)
300+
and index.dtype.storage == "pyarrow"
301+
):
299302
assert tm.shares_memory(result._values, index._values)
300303
else:
301304
raise NotImplementedError(index.dtype)

0 commit comments

Comments
 (0)