Skip to content

TST (string dtype): clean up construction of expected string arrays #59481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 14 additions & 30 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,6 @@
read_csv,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
Expand Down Expand Up @@ -692,43 +688,31 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
)
tm.assert_frame_equal(result, df)

@pytest.mark.xfail(
using_string_dtype(), reason="infer_string takes precedence", strict=False
)
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
# GH#36712
if read_ext in (".xlsb", ".xls"):
pytest.skip(f"No engine for filetype: '{read_ext}'")

pa = pytest.importorskip("pyarrow")
df = DataFrame(
{
"a": np.array(["a", "b"], dtype=np.object_),
"b": np.array(["x", pd.NA], dtype=np.object_),
}
)
df.to_excel(tmp_excel, sheet_name="test", index=False)

with pd.option_context("mode.string_storage", string_storage):
df = DataFrame(
{
"a": np.array(["a", "b"], dtype=np.object_),
"b": np.array(["x", pd.NA], dtype=np.object_),
}
)
df.to_excel(tmp_excel, sheet_name="test", index=False)
result = pd.read_excel(
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
)

if string_storage == "python":
expected = DataFrame(
{
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
}
)
else:
expected = DataFrame(
{
"a": ArrowStringArray(pa.array(["a", "b"])),
"b": ArrowStringArray(pa.array(["x", None])),
}
)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
{
"a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
"b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
}
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
Expand Down
36 changes: 9 additions & 27 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,6 @@
read_json,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

from pandas.io.json import ujson_dumps

Expand Down Expand Up @@ -2143,12 +2138,10 @@ def test_json_uint64(self):
result = df.to_json(orient="split")
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_read_json_dtype_backend(
self, string_storage, dtype_backend, orient, using_infer_string
):
# GH#50750
pa = pytest.importorskip("pyarrow")
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
Expand All @@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend(
}
)

if using_infer_string:
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
elif string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

out = df.to_json(orient=orient)
with pd.option_context("mode.string_storage", string_storage):
result = read_json(
StringIO(out), dtype_backend=dtype_backend, orient=orient
)

if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)
Comment on lines +2164 to +2168
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually different logic from the lines removed just above: it no longer uses using_infer_string, because when the user explicitly passes dtype_backend="pyarrow"|"numpy_nullable", they should always get the NA variants of the string dtype (regardless of whether the future (NaN-based) string dtype is enabled or not).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably useful in quite a few places in testing right? Maybe even in the core codebase? I wonder if this shouldn't be a helper function instead, or maybe the StringDtype __new__ should handle this and return a pd.ArrowDtype for the pyarrow backend.

This is kind of an in-between spot, since pd.ArrowDtype is still a separate concept from pd.StringDtype, but I think it will be easier to refactor in the future if we centralize this logic in one place.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably useful in quite a few places in testing right?

I haven't yet encountered many places in the code base itself that use that logic, IIRC (although #59487 is actually an example of where we maybe should do that, and the fact that we currently don't use ArrowDtype there is a bit of a bug/missing piece).

maybe the StringDtype `__new__` should handle this and return a pd.ArrowDtype for the pyarrow backend.

Given those are still separate concepts, as you mention (with different arrays that also behave differently in various places), I personally think it would make the current situation more confusing in the short term if some invocation of StringDtype(..) returned ArrowDtype(string) (it would also require a new keyword, because StringDtype("pyarrow") already exists). And in the longer term we should see what the outcome will be for PDEP-13 on logical types.
Especially for testing here, I think it is also good to be explicit about which dtype class is the expected one, for readability / understand-ability of the tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair points. If we start seeing this in the core library, I think a common function to generate this would make more sense.


expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
Expand All @@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend(
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
"g": Series(["a", "b", "c"], dtype=string_dtype),
"h": Series(["a", "b", None], dtype=string_dtype),
}
)

if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

expected = DataFrame(
Expand Down
30 changes: 8 additions & 22 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
IntegerArray,
StringArray,
)
from pandas.core.arrays import IntegerArray

pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
Expand Down Expand Up @@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers):

def test_dtype_backend_string(all_parsers, string_storage):
# GH#36712
pa = pytest.importorskip("pyarrow")

with pd.option_context("mode.string_storage", string_storage):
parser = all_parsers

Expand All @@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

if string_storage == "python":
expected = DataFrame(
{
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
}
)
else:
expected = DataFrame(
{
"a": ArrowStringArray(pa.array(["a", "b"])),
"b": ArrowStringArray(pa.array(["x", None])),
}
)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
{
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
}
)
tm.assert_frame_equal(result, expected)


def test_dtype_backend_ea_dtype_specified(all_parsers):
Expand Down
31 changes: 8 additions & 23 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import EmptyDataError

import pandas as pd
Expand All @@ -23,10 +21,6 @@
DatetimeIndex,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

from pandas.io.common import urlopen
from pandas.io.parsers import (
Expand Down Expand Up @@ -941,39 +935,30 @@ def test_widths_and_usecols():
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_dtype_backend(string_storage, dtype_backend):
# GH#50289
if string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
elif dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

arr = ArrowExtensionArray(pa.array(["a", "b"]))
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
else:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArray(pa.array(["a", "b"]))
arr_na = ArrowStringArray(pa.array([None, "a"]))

data = """a b c d e f g h i
1 2.5 True a
3 4.5 False b True 6 7.5 a"""
with pd.option_context("mode.string_storage", string_storage):
result = read_fwf(StringIO(data), dtype_backend=dtype_backend)

if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)

expected = DataFrame(
{
"a": pd.Series([1, 3], dtype="Int64"),
"b": pd.Series([2.5, 4.5], dtype="Float64"),
"c": pd.Series([True, False], dtype="boolean"),
"d": arr,
"d": pd.Series(["a", "b"], dtype=string_dtype),
"e": pd.Series([pd.NA, True], dtype="boolean"),
"f": pd.Series([pd.NA, 6], dtype="Int64"),
"g": pd.Series([pd.NA, 7.5], dtype="Float64"),
"h": arr_na,
"h": pd.Series([None, "a"], dtype=string_dtype),
"i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
}
)
Expand Down
30 changes: 9 additions & 21 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
read_clipboard,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

from pandas.io.clipboard import (
CheckedCall,
Expand Down Expand Up @@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
self, clipboard, string_storage, dtype_backend, engine
):
# GH#50502
if string_storage == "pyarrow" or dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")

if string_storage == "python":
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

elif dtype_backend == "pyarrow" and engine != "c":
if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["x", "y"]))
string_array_na = ArrowExtensionArray(pa.array(["x", None]))

if engine == "c" and string_storage == "pyarrow":
# TODO avoid this exception?
string_dtype = pd.ArrowDtype(pa.large_string())
else:
string_dtype = pd.ArrowDtype(pa.string())
else:
string_array = ArrowStringArray(pa.array(["x", "y"]))
string_array_na = ArrowStringArray(pa.array(["x", None]))
string_dtype = pd.StringDtype(string_storage)

text = """a,b,c,d,e,f,g,h,i
x,1,4.0,x,2,4.0,,True,False
Expand All @@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(

expected = DataFrame(
{
"a": string_array,
"a": Series(["x", "y"], dtype=string_dtype),
"b": Series([1, 2], dtype="Int64"),
"c": Series([4.0, 5.0], dtype="Float64"),
"d": string_array_na,
"d": Series(["x", None], dtype=string_dtype),
"e": Series([2, NA], dtype="Int64"),
"f": Series([4.0, NA], dtype="Float64"),
"g": Series([NA, NA], dtype="Int64"),
Expand Down
28 changes: 8 additions & 20 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

from pandas.io.feather_format import read_feather, to_feather # isort:skip

Expand Down Expand Up @@ -184,25 +180,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
}
)

if string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))

elif dtype_backend == "pyarrow":
from pandas.arrays import ArrowExtensionArray

string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))

else:
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

with tm.ensure_clean() as path:
to_feather(df, path)
with pd.option_context("mode.string_storage", string_storage):
result = read_feather(path, dtype_backend=dtype_backend)

if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)

expected = pd.DataFrame(
{
"a": pd.Series([1, np.nan, 3], dtype="Int64"),
Expand All @@ -211,8 +199,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
"f": pd.Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
"g": pd.Series(["a", "b", "c"], dtype=string_dtype),
"h": pd.Series(["a", "b", None], dtype=string_dtype),
}
)

Expand Down
Loading
Loading