Skip to content

Commit 7d74bdb

Browse files
jorisvandenbossche authored and WillAyd committed
TST (string dtype): clean up construction of expected string arrays (pandas-dev#59481)
1 parent fec69ce commit 7d74bdb

File tree

9 files changed

+82
-225
lines changed

9 files changed

+82
-225
lines changed

pandas/tests/io/excel/test_readers.py

+17-31
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,6 @@
3030
read_csv,
3131
)
3232
import pandas._testing as tm
33-
from pandas.core.arrays import (
34-
ArrowStringArray,
35-
StringArray,
36-
)
3733

3834
if is_platform_windows():
3935
pytestmark = pytest.mark.single_cpu
@@ -663,41 +659,31 @@ def test_dtype_backend_and_dtype(self, read_ext):
663659
@pytest.mark.xfail(
664660
using_string_dtype(), reason="infer_string takes precedence", strict=False
665661
)
666-
def test_dtype_backend_string(self, read_ext, string_storage):
662+
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
667663
# GH#36712
668664
if read_ext in (".xlsb", ".xls"):
669665
pytest.skip(f"No engine for filetype: '{read_ext}'")
670666

671-
pa = pytest.importorskip("pyarrow")
667+
df = DataFrame(
668+
{
669+
"a": np.array(["a", "b"], dtype=np.object_),
670+
"b": np.array(["x", pd.NA], dtype=np.object_),
671+
}
672+
)
673+
df.to_excel(tmp_excel, sheet_name="test", index=False)
672674

673675
with pd.option_context("mode.string_storage", string_storage):
674-
df = DataFrame(
675-
{
676-
"a": np.array(["a", "b"], dtype=np.object_),
677-
"b": np.array(["x", pd.NA], dtype=np.object_),
678-
}
676+
result = pd.read_excel(
677+
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
679678
)
680-
with tm.ensure_clean(read_ext) as file_path:
681-
df.to_excel(file_path, sheet_name="test", index=False)
682-
result = pd.read_excel(
683-
file_path, sheet_name="test", dtype_backend="numpy_nullable"
684-
)
685679

686-
if string_storage == "python":
687-
expected = DataFrame(
688-
{
689-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
690-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
691-
}
692-
)
693-
else:
694-
expected = DataFrame(
695-
{
696-
"a": ArrowStringArray(pa.array(["a", "b"])),
697-
"b": ArrowStringArray(pa.array(["x", None])),
698-
}
699-
)
700-
tm.assert_frame_equal(result, expected)
680+
expected = DataFrame(
681+
{
682+
"a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
683+
"b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
684+
}
685+
)
686+
tm.assert_frame_equal(result, expected)
701687

702688
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
703689
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):

pandas/tests/io/json/test_pandas.py

+9-29
Original file line number | Diff line number | Diff line change
@@ -31,11 +31,6 @@
3131
read_json,
3232
)
3333
import pandas._testing as tm
34-
from pandas.core.arrays import (
35-
ArrowStringArray,
36-
StringArray,
37-
)
38-
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
3934

4035
from pandas.io.json import ujson_dumps
4136

@@ -2037,14 +2032,10 @@ def test_json_uint64(self):
20372032
assert result == expected
20382033

20392034
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
2040-
@pytest.mark.parametrize(
2041-
"orient", ["split", "records", "values", "index", "columns"]
2042-
)
20432035
def test_read_json_dtype_backend(
20442036
self, string_storage, dtype_backend, orient, using_infer_string
20452037
):
20462038
# GH#50750
2047-
pa = pytest.importorskip("pyarrow")
20482039
df = DataFrame(
20492040
{
20502041
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2058,30 +2049,18 @@ def test_read_json_dtype_backend(
20582049
}
20592050
)
20602051

2061-
if using_infer_string:
2062-
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
2063-
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
2064-
elif string_storage == "python":
2065-
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
2066-
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
2067-
2068-
elif dtype_backend == "pyarrow":
2069-
pa = pytest.importorskip("pyarrow")
2070-
from pandas.arrays import ArrowExtensionArray
2071-
2072-
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
2073-
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
2074-
2075-
else:
2076-
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
2077-
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
2078-
20792052
out = df.to_json(orient=orient)
20802053
with pd.option_context("mode.string_storage", string_storage):
20812054
result = read_json(
20822055
StringIO(out), dtype_backend=dtype_backend, orient=orient
20832056
)
20842057

2058+
if dtype_backend == "pyarrow":
2059+
pa = pytest.importorskip("pyarrow")
2060+
string_dtype = pd.ArrowDtype(pa.string())
2061+
else:
2062+
string_dtype = pd.StringDtype(string_storage)
2063+
20852064
expected = DataFrame(
20862065
{
20872066
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2090,12 +2069,13 @@ def test_read_json_dtype_backend(
20902069
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
20912070
"e": Series([True, False, NA], dtype="boolean"),
20922071
"f": Series([True, False, True], dtype="boolean"),
2093-
"g": string_array,
2094-
"h": string_array_na,
2072+
"g": Series(["a", "b", "c"], dtype=string_dtype),
2073+
"h": Series(["a", "b", None], dtype=string_dtype),
20952074
}
20962075
)
20972076

20982077
if dtype_backend == "pyarrow":
2078+
pa = pytest.importorskip("pyarrow")
20992079
from pandas.arrays import ArrowExtensionArray
21002080

21012081
expected = DataFrame(

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+8-22
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,7 @@
1818
Timestamp,
1919
)
2020
import pandas._testing as tm
21-
from pandas.core.arrays import (
22-
ArrowStringArray,
23-
IntegerArray,
24-
StringArray,
25-
)
21+
from pandas.core.arrays import IntegerArray
2622

2723
pytestmark = pytest.mark.filterwarnings(
2824
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers):
465461

466462
def test_dtype_backend_string(all_parsers, string_storage):
467463
# GH#36712
468-
pa = pytest.importorskip("pyarrow")
469-
470464
with pd.option_context("mode.string_storage", string_storage):
471465
parser = all_parsers
472466

@@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
476470
"""
477471
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
478472

479-
if string_storage == "python":
480-
expected = DataFrame(
481-
{
482-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
483-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
484-
}
485-
)
486-
else:
487-
expected = DataFrame(
488-
{
489-
"a": ArrowStringArray(pa.array(["a", "b"])),
490-
"b": ArrowStringArray(pa.array(["x", None])),
491-
}
492-
)
493-
tm.assert_frame_equal(result, expected)
473+
expected = DataFrame(
474+
{
475+
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
476+
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
477+
}
478+
)
479+
tm.assert_frame_equal(result, expected)
494480

495481

496482
def test_dtype_backend_ea_dtype_specified(all_parsers):

pandas/tests/io/parser/test_read_fwf.py

+8-23
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,6 @@
1414
import numpy as np
1515
import pytest
1616

17-
from pandas._config import using_string_dtype
18-
1917
from pandas.errors import EmptyDataError
2018

2119
import pandas as pd
@@ -24,10 +22,6 @@
2422
DatetimeIndex,
2523
)
2624
import pandas._testing as tm
27-
from pandas.core.arrays import (
28-
ArrowStringArray,
29-
StringArray,
30-
)
3125

3226
from pandas.io.common import urlopen
3327
from pandas.io.parsers import (
@@ -968,39 +962,30 @@ def test_widths_and_usecols():
968962
tm.assert_frame_equal(result, expected)
969963

970964

971-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
972965
def test_dtype_backend(string_storage, dtype_backend):
973966
# GH#50289
974-
if string_storage == "python":
975-
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
976-
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
977-
elif dtype_backend == "pyarrow":
978-
pa = pytest.importorskip("pyarrow")
979-
from pandas.arrays import ArrowExtensionArray
980-
981-
arr = ArrowExtensionArray(pa.array(["a", "b"]))
982-
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
983-
else:
984-
pa = pytest.importorskip("pyarrow")
985-
arr = ArrowStringArray(pa.array(["a", "b"]))
986-
arr_na = ArrowStringArray(pa.array([None, "a"]))
987-
988967
data = """a b c d e f g h i
989968
1 2.5 True a
990969
3 4.5 False b True 6 7.5 a"""
991970
with pd.option_context("mode.string_storage", string_storage):
992971
result = read_fwf(StringIO(data), dtype_backend=dtype_backend)
993972

973+
if dtype_backend == "pyarrow":
974+
pa = pytest.importorskip("pyarrow")
975+
string_dtype = pd.ArrowDtype(pa.string())
976+
else:
977+
string_dtype = pd.StringDtype(string_storage)
978+
994979
expected = DataFrame(
995980
{
996981
"a": pd.Series([1, 3], dtype="Int64"),
997982
"b": pd.Series([2.5, 4.5], dtype="Float64"),
998983
"c": pd.Series([True, False], dtype="boolean"),
999-
"d": arr,
984+
"d": pd.Series(["a", "b"], dtype=string_dtype),
1000985
"e": pd.Series([pd.NA, True], dtype="boolean"),
1001986
"f": pd.Series([pd.NA, 6], dtype="Int64"),
1002987
"g": pd.Series([pd.NA, 7.5], dtype="Float64"),
1003-
"h": arr_na,
988+
"h": pd.Series([None, "a"], dtype=string_dtype),
1004989
"i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
1005990
}
1006991
)

pandas/tests/io/test_clipboard.py

+9-21
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,6 @@
1919
read_clipboard,
2020
)
2121
import pandas._testing as tm
22-
from pandas.core.arrays import (
23-
ArrowStringArray,
24-
StringArray,
25-
)
2622

2723
from pandas.io.clipboard import (
2824
CheckedCall,
@@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
358354
self, clipboard, string_storage, dtype_backend, engine
359355
):
360356
# GH#50502
361-
if string_storage == "pyarrow" or dtype_backend == "pyarrow":
362-
pa = pytest.importorskip("pyarrow")
363-
364-
if string_storage == "python":
365-
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
366-
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
367-
368-
elif dtype_backend == "pyarrow" and engine != "c":
357+
if dtype_backend == "pyarrow":
369358
pa = pytest.importorskip("pyarrow")
370-
from pandas.arrays import ArrowExtensionArray
371-
372-
string_array = ArrowExtensionArray(pa.array(["x", "y"]))
373-
string_array_na = ArrowExtensionArray(pa.array(["x", None]))
374-
359+
if engine == "c" and string_storage == "pyarrow":
360+
# TODO avoid this exception?
361+
string_dtype = pd.ArrowDtype(pa.large_string())
362+
else:
363+
string_dtype = pd.ArrowDtype(pa.string())
375364
else:
376-
string_array = ArrowStringArray(pa.array(["x", "y"]))
377-
string_array_na = ArrowStringArray(pa.array(["x", None]))
365+
string_dtype = pd.StringDtype(string_storage)
378366

379367
text = """a,b,c,d,e,f,g,h,i
380368
x,1,4.0,x,2,4.0,,True,False
@@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(
386374

387375
expected = DataFrame(
388376
{
389-
"a": string_array,
377+
"a": Series(["x", "y"], dtype=string_dtype),
390378
"b": Series([1, 2], dtype="Int64"),
391379
"c": Series([4.0, 5.0], dtype="Float64"),
392-
"d": string_array_na,
380+
"d": Series(["x", None], dtype=string_dtype),
393381
"e": Series([2, NA], dtype="Int64"),
394382
"f": Series([4.0, NA], dtype="Float64"),
395383
"g": Series([NA, NA], dtype="Int64"),

pandas/tests/io/test_feather.py

+8-20
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,6 @@
66

77
import pandas as pd
88
import pandas._testing as tm
9-
from pandas.core.arrays import (
10-
ArrowStringArray,
11-
StringArray,
12-
)
139

1410
from pandas.io.feather_format import read_feather, to_feather # isort:skip
1511

@@ -188,25 +184,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
188184
}
189185
)
190186

191-
if string_storage == "python":
192-
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
193-
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
194-
195-
elif dtype_backend == "pyarrow":
196-
from pandas.arrays import ArrowExtensionArray
197-
198-
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
199-
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
200-
201-
else:
202-
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
203-
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
204-
205187
with tm.ensure_clean() as path:
206188
to_feather(df, path)
207189
with pd.option_context("mode.string_storage", string_storage):
208190
result = read_feather(path, dtype_backend=dtype_backend)
209191

192+
if dtype_backend == "pyarrow":
193+
pa = pytest.importorskip("pyarrow")
194+
string_dtype = pd.ArrowDtype(pa.string())
195+
else:
196+
string_dtype = pd.StringDtype(string_storage)
197+
210198
expected = pd.DataFrame(
211199
{
212200
"a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -215,8 +203,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
215203
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
216204
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
217205
"f": pd.Series([True, False, True], dtype="boolean"),
218-
"g": string_array,
219-
"h": string_array_na,
206+
"g": pd.Series(["a", "b", "c"], dtype=string_dtype),
207+
"h": pd.Series(["a", "b", None], dtype=string_dtype),
220208
}
221209
)
222210

0 commit comments

Comments
 (0)