Skip to content

TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias #59758

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,34 @@ def string_dtype(request):
return request.param


@pytest.fixture(
    params=[
        ("python", pd.NA),
        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
        ("python", np.nan),
    ],
    ids=[
        "string=string[python]",
        "string=string[pyarrow]",
        "string=str[pyarrow]",
        "string=str[python]",
    ],
)
def string_dtype_no_object(request):
    """
    Parametrized fixture returning ``pd.StringDtype`` instances.

    Each parameter is a ``(storage, na_value)`` pair:

    * ``("python", pd.NA)``: 'string[python]' (NA variant)
    * ``("pyarrow", pd.NA)``: 'string[pyarrow]' (NA variant)
    * ``("pyarrow", np.nan)``: 'str' (NaN variant, with pyarrow)
    * ``("python", np.nan)``: 'str' (NaN variant, without pyarrow)

    The two pyarrow-backed parameters carry the
    ``td.skip_if_no("pyarrow")`` mark.
    """
    # The StringDtype is built here rather than inside ``params`` so that
    # pyarrow is not imported during test collection.
    backend, missing = request.param
    return pd.StringDtype(backend, na_value=missing)


@pytest.fixture(
params=[
"string[python]",
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/apply/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand All @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):

def test_numba_vs_python_string_index():
# GH#56189
pytest.importorskip("pyarrow")
df = DataFrame(
1,
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
)
func = lambda x: x
result = df.apply(func, engine="numba", axis=0)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
arr[[0, 1]] = ["foo", "bar", "baz"]


@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
def test_pickle_roundtrip(dtype):
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
def test_pickle_roundtrip(na_value):
# GH 42600
pytest.importorskip("pyarrow")
dtype = StringDtype("pyarrow", na_value=na_value)
expected = pd.Series(range(10), dtype=dtype)
expected_sliced = expected.head(2)
full_pickled = pickle.dumps(expected)
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,7 @@ def test_access_by_position(index_flat):
assert index[-1] == index[size - 1]

msg = f"index {size} is out of bounds for axis 0 with size {size}"
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
index.dtype, "string[pyarrow_numpy]"
):
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
msg = "index out of bounds"
with pytest.raises(IndexError, match=msg):
index[size]
Expand Down
10 changes: 3 additions & 7 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None:
("dtype", "infer_string"),
[
(object, False),
("string[pyarrow_numpy]", True),
(pd.StringDtype(na_value=np.nan), True),
],
)
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
# https://github.com/pandas-dev/pandas/issues/56204
pytest.importorskip("pyarrow")

df = DataFrame({"a": [1, 2], "b": [3, 4]})
with pd.option_context("future.infer_string", infer_string):
df.loc[df["a"] == 1, "c"] = "1"
Expand All @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_add_new_column_infer_string():
# GH#55366
pytest.importorskip("pyarrow")
df = DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame(
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
columns=Index(["x", "y"], dtype=object),
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
columns=Index(["x", "y"], dtype="str"),
)
tm.assert_frame_equal(df, expected)

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from pandas.compat import HAS_PYARROW

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
result = df.rank(numeric_only=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype, exp_dtype",
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
)
def test_rank_string_dtype(self, dtype, exp_dtype):
def test_rank_string_dtype(self, string_dtype_no_object):
# GH#55362
pytest.importorskip("pyarrow")
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
result = obj.rank(method="first")
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, why are these Int64 for some types now? I think float is required for the return type to allow for some tiebreakers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good point! In that case it is an existing bug in the ArrowExtensionArray implementation, because this test was already asserting Int64 (I only rewrote it a bit; the removed parametrization a few lines above already contains this dtype).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok...interesting!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened a separate PR to address this -> #59768

if string_dtype_no_object.storage == "python":
# TODO nullable string[python] should also return nullable Int64
exp_dtype = "float64"
expected = Series([1, 2, None, 3], dtype=exp_dtype)
tm.assert_series_equal(result, expected)
7 changes: 2 additions & 5 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self):

def test_frame_string_inference(self):
# GH#54430
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand Down Expand Up @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self):

def test_frame_string_inference_array_string_dtype(self):
# GH#54496
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand All @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self):

def test_frame_string_inference_block_dim(self):
# GH#55363
pytest.importorskip("pyarrow")
with pd.option_context("future.infer_string", True):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2
Expand Down
13 changes: 2 additions & 11 deletions pandas/tests/groupby/methods/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype):


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_size_strings(dtype):
def test_size_strings(any_string_dtype):
# GH#55627
dtype = any_string_dtype
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
Expand Down
14 changes: 3 additions & 11 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
Categorical,
CategoricalIndex,
Expand Down Expand Up @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby(
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
"sort, ascending, expected_rows, expected_count, expected_group_size",
Expand All @@ -398,9 +388,10 @@ def test_compound(
expected_rows,
expected_count,
expected_group_size,
dtype,
any_string_dtype,
using_infer_string,
):
dtype = any_string_dtype
education_df = education_df.astype(dtype)
education_df.columns = education_df.columns.astype(dtype)
# Multiple groupby keys and as_index=False
Expand All @@ -417,6 +408,7 @@ def test_compound(
expected["proportion"] = expected_count
expected["proportion"] /= expected_group_size
if dtype == "string[pyarrow]":
# TODO(nullable) also string[python] should return nullable dtypes
expected["proportion"] = expected["proportion"].convert_dtypes()
else:
expected["count"] = expected_count
Expand Down
11 changes: 2 additions & 9 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period():
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()


@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
],
)
def test_by_column_values_with_same_starting_value(dtype):
def test_by_column_values_with_same_starting_value(any_string_dtype):
# GH29635
df = DataFrame(
{
"Name": ["Thomas", "Thomas", "Thomas John"],
"Credit": [1200, 1300, 900],
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
}
)
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func):


@pytest.mark.parametrize("func", ["min", "max"])
def test_min_empty_string_dtype(func):
def test_min_empty_string_dtype(func, string_dtype_no_object):
# GH#55619
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = string_dtype_no_object
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
result = getattr(df.groupby("a"), func)()
expected = DataFrame(
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list):

def test_index_string_inference(self):
# GH#54430
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = Index(["a", "b"], dtype=dtype)
expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
with pd.option_context("future.infer_string", True):
ser = Index(["a", "b"])
tm.assert_index_equal(ser, expected)
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val):
tm.assert_index_equal(result, expected)
assert type(expected[2]) is type(val)

def test_insert_none_into_string_numpy(self):
def test_insert_none_into_string_numpy(self, string_dtype_no_object):
# GH#55365
pytest.importorskip("pyarrow")
index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
index = Index(["a", "b", "c"], dtype=string_dtype_no_object)
result = index.insert(-1, None)
expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]")
expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
23 changes: 6 additions & 17 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
NA,
is_matching_na,
)
import pandas.util._test_decorators as td

import pandas as pd
from pandas import Index
Expand Down Expand Up @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):


class TestSliceLocs:
# TODO(infer_string) parametrize over multiple string dtypes
@pytest.mark.parametrize(
"dtype",
[
"object",
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
],
)
@pytest.mark.parametrize(
"in_slice,expected",
[
Expand All @@ -191,24 +182,22 @@ class TestSliceLocs:
(pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
],
)
def test_slice_locs_negative_step(self, in_slice, expected, dtype):
index = Index(list("bcdxy"), dtype=dtype)
def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
index = Index(list("bcdxy"), dtype=any_string_dtype)

s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
result = index[s_start : s_stop : in_slice.step]
expected = Index(list(expected), dtype=dtype)
expected = Index(list(expected), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)

# TODO(infer_string) parametrize over multiple string dtypes
@td.skip_if_no("pyarrow")
def test_slice_locs_negative_step_oob(self):
index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
def test_slice_locs_negative_step_oob(self, any_string_dtype):
index = Index(list("bcdxy"), dtype=any_string_dtype)

result = index[-10:5:1]
tm.assert_index_equal(result, index)

result = index[4:-10:-1]
expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
expected = Index(list("yxdcb"), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)

def test_slice_locs_dup(self):
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,10 +940,9 @@ def test_isin_empty(self, empty):
result = index.isin(empty)
tm.assert_numpy_array_equal(expected, result)

@td.skip_if_no("pyarrow")
def test_isin_arrow_string_null(self):
def test_isin_string_null(self, string_dtype_no_object):
# GH#55821
index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
index = Index(["a", "b"], dtype=string_dtype_no_object)
result = index.isin([None])
expected = np.array([False, False])
tm.assert_numpy_array_equal(result, expected)
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,10 @@ def test_ensure_copied_data(self, index):
tm.assert_numpy_array_equal(
index._values._ndarray, result._values._ndarray, check_same="same"
)
elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
elif (
isinstance(index.dtype, pd.StringDtype)
and index.dtype.storage == "pyarrow"
):
assert tm.shares_memory(result._values, index._values)
else:
raise NotImplementedError(index.dtype)
Expand Down
Loading
Loading