Skip to content

Commit 883151a

Browse files
TST (string dtype): clean up construction of expected string arrays
1 parent 9c776ae commit 883151a

File tree

9 files changed

+78 additions, -222 deletions (lines changed)

pandas/tests/io/excel/test_readers.py

+14 -30
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,6 @@
3030
read_csv,
3131
)
3232
import pandas._testing as tm
33-
from pandas.core.arrays import (
34-
ArrowStringArray,
35-
StringArray,
36-
)
3733

3834
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
3935
engine_params = [
@@ -692,43 +688,31 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
692688
)
693689
tm.assert_frame_equal(result, df)
694690

695-
@pytest.mark.xfail(
696-
using_string_dtype(), reason="infer_string takes precedence", strict=False
697-
)
698691
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
699692
# GH#36712
700693
if read_ext in (".xlsb", ".xls"):
701694
pytest.skip(f"No engine for filetype: '{read_ext}'")
702695

703-
pa = pytest.importorskip("pyarrow")
696+
df = DataFrame(
697+
{
698+
"a": np.array(["a", "b"], dtype=np.object_),
699+
"b": np.array(["x", pd.NA], dtype=np.object_),
700+
}
701+
)
702+
df.to_excel(tmp_excel, sheet_name="test", index=False)
704703

705704
with pd.option_context("mode.string_storage", string_storage):
706-
df = DataFrame(
707-
{
708-
"a": np.array(["a", "b"], dtype=np.object_),
709-
"b": np.array(["x", pd.NA], dtype=np.object_),
710-
}
711-
)
712-
df.to_excel(tmp_excel, sheet_name="test", index=False)
713705
result = pd.read_excel(
714706
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
715707
)
716708

717-
if string_storage == "python":
718-
expected = DataFrame(
719-
{
720-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
721-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
722-
}
723-
)
724-
else:
725-
expected = DataFrame(
726-
{
727-
"a": ArrowStringArray(pa.array(["a", "b"])),
728-
"b": ArrowStringArray(pa.array(["x", None])),
729-
}
730-
)
731-
tm.assert_frame_equal(result, expected)
709+
expected = DataFrame(
710+
{
711+
"a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
712+
"b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
713+
}
714+
)
715+
tm.assert_frame_equal(result, expected)
732716

733717
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
734718
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):

pandas/tests/io/json/test_pandas.py

+9 -27
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,6 @@
2828
read_json,
2929
)
3030
import pandas._testing as tm
31-
from pandas.core.arrays import (
32-
ArrowStringArray,
33-
StringArray,
34-
)
35-
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
3631

3732
from pandas.io.json import ujson_dumps
3833

@@ -2143,12 +2138,10 @@ def test_json_uint64(self):
21432138
result = df.to_json(orient="split")
21442139
assert result == expected
21452140

2146-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
21472141
def test_read_json_dtype_backend(
21482142
self, string_storage, dtype_backend, orient, using_infer_string
21492143
):
21502144
# GH#50750
2151-
pa = pytest.importorskip("pyarrow")
21522145
df = DataFrame(
21532146
{
21542147
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend(
21622155
}
21632156
)
21642157

2165-
if using_infer_string:
2166-
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
2167-
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
2168-
elif string_storage == "python":
2169-
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
2170-
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
2171-
2172-
elif dtype_backend == "pyarrow":
2173-
pa = pytest.importorskip("pyarrow")
2174-
from pandas.arrays import ArrowExtensionArray
2175-
2176-
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
2177-
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
2178-
2179-
else:
2180-
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
2181-
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
2182-
21832158
out = df.to_json(orient=orient)
21842159
with pd.option_context("mode.string_storage", string_storage):
21852160
result = read_json(
21862161
StringIO(out), dtype_backend=dtype_backend, orient=orient
21872162
)
21882163

2164+
if dtype_backend == "pyarrow":
2165+
pa = pytest.importorskip("pyarrow")
2166+
string_dtype = pd.ArrowDtype(pa.string())
2167+
else:
2168+
string_dtype = pd.StringDtype(string_storage)
2169+
21892170
expected = DataFrame(
21902171
{
21912172
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend(
21942175
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
21952176
"e": Series([True, False, NA], dtype="boolean"),
21962177
"f": Series([True, False, True], dtype="boolean"),
2197-
"g": string_array,
2198-
"h": string_array_na,
2178+
"g": Series(["a", "b", "c"], dtype=string_dtype),
2179+
"h": Series(["a", "b", None], dtype=string_dtype),
21992180
}
22002181
)
22012182

22022183
if dtype_backend == "pyarrow":
2184+
pa = pytest.importorskip("pyarrow")
22032185
from pandas.arrays import ArrowExtensionArray
22042186

22052187
expected = DataFrame(

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+8 -22
Original file line number | Diff line number | Diff line change
@@ -19,11 +19,7 @@
1919
Timestamp,
2020
)
2121
import pandas._testing as tm
22-
from pandas.core.arrays import (
23-
ArrowStringArray,
24-
IntegerArray,
25-
StringArray,
26-
)
22+
from pandas.core.arrays import IntegerArray
2723

2824
pytestmark = pytest.mark.filterwarnings(
2925
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers):
465461

466462
def test_dtype_backend_string(all_parsers, string_storage):
467463
# GH#36712
468-
pa = pytest.importorskip("pyarrow")
469-
470464
with pd.option_context("mode.string_storage", string_storage):
471465
parser = all_parsers
472466

@@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
476470
"""
477471
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
478472

479-
if string_storage == "python":
480-
expected = DataFrame(
481-
{
482-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
483-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
484-
}
485-
)
486-
else:
487-
expected = DataFrame(
488-
{
489-
"a": ArrowStringArray(pa.array(["a", "b"])),
490-
"b": ArrowStringArray(pa.array(["x", None])),
491-
}
492-
)
493-
tm.assert_frame_equal(result, expected)
473+
expected = DataFrame(
474+
{
475+
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
476+
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
477+
}
478+
)
479+
tm.assert_frame_equal(result, expected)
494480

495481

496482
def test_dtype_backend_ea_dtype_specified(all_parsers):

pandas/tests/io/parser/test_read_fwf.py

+8 -23
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,6 @@
1313
import numpy as np
1414
import pytest
1515

16-
from pandas._config import using_string_dtype
17-
1816
from pandas.errors import EmptyDataError
1917

2018
import pandas as pd
@@ -23,10 +21,6 @@
2321
DatetimeIndex,
2422
)
2523
import pandas._testing as tm
26-
from pandas.core.arrays import (
27-
ArrowStringArray,
28-
StringArray,
29-
)
3024

3125
from pandas.io.common import urlopen
3226
from pandas.io.parsers import (
@@ -941,39 +935,30 @@ def test_widths_and_usecols():
941935
tm.assert_frame_equal(result, expected)
942936

943937

944-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
945938
def test_dtype_backend(string_storage, dtype_backend):
946939
# GH#50289
947-
if string_storage == "python":
948-
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
949-
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
950-
elif dtype_backend == "pyarrow":
951-
pa = pytest.importorskip("pyarrow")
952-
from pandas.arrays import ArrowExtensionArray
953-
954-
arr = ArrowExtensionArray(pa.array(["a", "b"]))
955-
arr_na = ArrowExtensionArray(pa.array([None, "a"]))
956-
else:
957-
pa = pytest.importorskip("pyarrow")
958-
arr = ArrowStringArray(pa.array(["a", "b"]))
959-
arr_na = ArrowStringArray(pa.array([None, "a"]))
960-
961940
data = """a b c d e f g h i
962941
1 2.5 True a
963942
3 4.5 False b True 6 7.5 a"""
964943
with pd.option_context("mode.string_storage", string_storage):
965944
result = read_fwf(StringIO(data), dtype_backend=dtype_backend)
966945

946+
if dtype_backend == "pyarrow":
947+
pa = pytest.importorskip("pyarrow")
948+
string_dtype = pd.ArrowDtype(pa.string())
949+
else:
950+
string_dtype = pd.StringDtype(string_storage)
951+
967952
expected = DataFrame(
968953
{
969954
"a": pd.Series([1, 3], dtype="Int64"),
970955
"b": pd.Series([2.5, 4.5], dtype="Float64"),
971956
"c": pd.Series([True, False], dtype="boolean"),
972-
"d": arr,
957+
"d": pd.Series(["a", "b"], dtype=string_dtype),
973958
"e": pd.Series([pd.NA, True], dtype="boolean"),
974959
"f": pd.Series([pd.NA, 6], dtype="Int64"),
975960
"g": pd.Series([pd.NA, 7.5], dtype="Float64"),
976-
"h": arr_na,
961+
"h": pd.Series([None, "a"], dtype=string_dtype),
977962
"i": pd.Series([pd.NA, pd.NA], dtype="Int64"),
978963
}
979964
)

pandas/tests/io/test_clipboard.py

+9 -21
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,6 @@
1919
read_clipboard,
2020
)
2121
import pandas._testing as tm
22-
from pandas.core.arrays import (
23-
ArrowStringArray,
24-
StringArray,
25-
)
2622

2723
from pandas.io.clipboard import (
2824
CheckedCall,
@@ -358,23 +354,15 @@ def test_read_clipboard_dtype_backend(
358354
self, clipboard, string_storage, dtype_backend, engine
359355
):
360356
# GH#50502
361-
if string_storage == "pyarrow" or dtype_backend == "pyarrow":
362-
pa = pytest.importorskip("pyarrow")
363-
364-
if string_storage == "python":
365-
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
366-
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
367-
368-
elif dtype_backend == "pyarrow" and engine != "c":
357+
if dtype_backend == "pyarrow":
369358
pa = pytest.importorskip("pyarrow")
370-
from pandas.arrays import ArrowExtensionArray
371-
372-
string_array = ArrowExtensionArray(pa.array(["x", "y"]))
373-
string_array_na = ArrowExtensionArray(pa.array(["x", None]))
374-
359+
if engine == "c" and string_storage == "pyarrow":
360+
# TODO avoid this exception?
361+
string_dtype = pd.ArrowDtype(pa.large_string())
362+
else:
363+
string_dtype = pd.ArrowDtype(pa.string())
375364
else:
376-
string_array = ArrowStringArray(pa.array(["x", "y"]))
377-
string_array_na = ArrowStringArray(pa.array(["x", None]))
365+
string_dtype = pd.StringDtype(string_storage)
378366

379367
text = """a,b,c,d,e,f,g,h,i
380368
x,1,4.0,x,2,4.0,,True,False
@@ -386,10 +374,10 @@ def test_read_clipboard_dtype_backend(
386374

387375
expected = DataFrame(
388376
{
389-
"a": string_array,
377+
"a": Series(["x", "y"], dtype=string_dtype),
390378
"b": Series([1, 2], dtype="Int64"),
391379
"c": Series([4.0, 5.0], dtype="Float64"),
392-
"d": string_array_na,
380+
"d": Series(["x", None], dtype=string_dtype),
393381
"e": Series([2, NA], dtype="Int64"),
394382
"f": Series([4.0, NA], dtype="Float64"),
395383
"g": Series([NA, NA], dtype="Int64"),

pandas/tests/io/test_feather.py

+8 -20
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,6 @@
99

1010
import pandas as pd
1111
import pandas._testing as tm
12-
from pandas.core.arrays import (
13-
ArrowStringArray,
14-
StringArray,
15-
)
1612

1713
from pandas.io.feather_format import read_feather, to_feather # isort:skip
1814

@@ -184,25 +180,17 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
184180
}
185181
)
186182

187-
if string_storage == "python":
188-
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
189-
string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
190-
191-
elif dtype_backend == "pyarrow":
192-
from pandas.arrays import ArrowExtensionArray
193-
194-
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
195-
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
196-
197-
else:
198-
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
199-
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
200-
201183
with tm.ensure_clean() as path:
202184
to_feather(df, path)
203185
with pd.option_context("mode.string_storage", string_storage):
204186
result = read_feather(path, dtype_backend=dtype_backend)
205187

188+
if dtype_backend == "pyarrow":
189+
pa = pytest.importorskip("pyarrow")
190+
string_dtype = pd.ArrowDtype(pa.string())
191+
else:
192+
string_dtype = pd.StringDtype(string_storage)
193+
206194
expected = pd.DataFrame(
207195
{
208196
"a": pd.Series([1, np.nan, 3], dtype="Int64"),
@@ -211,8 +199,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
211199
"d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
212200
"e": pd.Series([True, False, pd.NA], dtype="boolean"),
213201
"f": pd.Series([True, False, True], dtype="boolean"),
214-
"g": string_array,
215-
"h": string_array_na,
202+
"g": pd.Series(["a", "b", "c"], dtype=string_dtype),
203+
"h": pd.Series(["a", "b", None], dtype=string_dtype),
216204
}
217205
)
218206

0 commit comments

Comments
 (0)