Skip to content

Commit 171112e

Browse files
moinknofarmish
authored and committed
TST GH26807 Break up test_strings (pandas-dev#39215)
1 parent ad32e00 commit 171112e

11 files changed

+3776
-3680
lines changed

pandas/tests/strings/__init__.py

Whitespace-only changes.

pandas/tests/strings/conftest.py

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import Series
5+
from pandas.core import strings as strings
6+
7+
# Sample invocations of the public `StringMethods` methods, stored as
# (method_name, positional_args, keyword_args) tuples.
_any_string_method = [
    ("cat", (), {"sep": ","}),
    ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
    ("center", (10,), {}),
    ("contains", ("a",), {}),
    ("count", ("a",), {}),
    ("decode", ("UTF-8",), {}),
    ("encode", ("UTF-8",), {}),
    ("endswith", ("a",), {}),
    ("endswith", ("a",), {"na": True}),
    ("endswith", ("a",), {"na": False}),
    ("extract", ("([a-z]*)",), {"expand": False}),
    ("extract", ("([a-z]*)",), {"expand": True}),
    ("extractall", ("([a-z]*)",), {}),
    ("find", ("a",), {}),
    ("findall", ("a",), {}),
    ("get", (0,), {}),
    # because "index" (and "rindex") fail intentionally
    # if the string is not found, search only for empty string
    ("index", ("",), {}),
    ("join", (",",), {}),
    ("ljust", (10,), {}),
    ("match", ("a",), {}),
    ("fullmatch", ("a",), {}),
    ("normalize", ("NFC",), {}),
    ("pad", (10,), {}),
    ("partition", (" ",), {"expand": False}),
    ("partition", (" ",), {"expand": True}),
    ("repeat", (3,), {}),
    ("replace", ("a", "z"), {}),
    ("rfind", ("a",), {}),
    ("rindex", ("",), {}),
    ("rjust", (10,), {}),
    ("rpartition", (" ",), {"expand": False}),
    ("rpartition", (" ",), {"expand": True}),
    ("slice", (0, 1), {}),
    ("slice_replace", (0, 1, "z"), {}),
    ("split", (" ",), {"expand": False}),
    ("split", (" ",), {"expand": True}),
    ("startswith", ("a",), {}),
    ("startswith", ("a",), {"na": True}),
    ("startswith", ("a",), {"na": False}),
    # translating unicode points of "a" to "d"
    ("translate", ({97: 100},), {}),
    ("wrap", (2,), {}),
    ("zfill", (10,), {}),
] + [
    # methods without positional arguments: pair each name with an empty
    # tuple and its *own* empty dict, so that no two entries share a mutable
    # kwargs object and the list of names is not capped at a fixed length
    (method_name, (), {})
    for method_name in [
        "capitalize",
        "cat",
        "get_dummies",
        "isalnum",
        "isalpha",
        "isdecimal",
        "isdigit",
        "islower",
        "isnumeric",
        "isspace",
        "istitle",
        "isupper",
        "len",
        "lower",
        "lstrip",
        "partition",
        "rpartition",
        "rsplit",
        "rstrip",
        "slice",
        "slice_replace",
        "split",
        "strip",
        "swapcase",
        "title",
        "upper",
        "casefold",
    ]
]
89+
# use method name as fixture-id (first element of every entry)
ids = tuple(name for name, *_rest in _any_string_method)

# public attributes of StringMethods that have no entry in the list above
missing_methods = set(
    filter(lambda attr: not attr.startswith("_"), dir(strings.StringMethods))
).difference(ids)

# test that the above list captures all methods of StringMethods
assert not missing_methods
96+
97+
98+
@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
    """
    Parametrized fixture over the entries of ``_any_string_method``.

    Every parameter describes one sample call of a public method of
    `StringMethods`; the fixture hands that description back unchanged.

    Returns
    -------
    method_name : str
        The name of the method in `StringMethods`
    args : tuple
        Sample values for the positional arguments
    kwargs : dict
        Sample values for the keyword arguments

    Examples
    --------
    >>> def test_something(any_string_method):
    ...     s = Series(['a', 'b', np.nan, 'd'])
    ...
    ...     method_name, args, kwargs = any_string_method
    ...     method = getattr(s.str, method_name)
    ...     # will not raise
    ...     method(*args, **kwargs)
    """
    sample_call = request.param
    return sample_call
126+
127+
128+
# subset of the full set from pandas/conftest.py
# each entry is (inferred dtype label, sample values)
_any_allowed_skipna_inferred_dtype = [
    ("string", ["a", np.nan, "c"]),
    ("bytes", [b"a", np.nan, b"c"]),
    ("empty", [np.nan, np.nan, np.nan]),
    ("empty", []),
    ("mixed-integer", ["a", np.nan, 2]),
]
# use inferred type as id
ids = tuple(label for label, _values in _any_allowed_skipna_inferred_dtype)
137+
138+
139+
@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Fixture for all (inferred) dtypes allowed in StringMethods.__init__

    The covered (inferred) types are:
    * 'string'
    * 'bytes'
    * 'empty' (both an all-NaN and a zero-length sample)
    * 'mixed-integer'

    Returns
    -------
    inferred_dtype : str
        The string for the inferred dtype from _libs.lib.infer_dtype
    values : np.ndarray
        An array of object dtype that will be inferred to have
        `inferred_dtype`

    Examples
    --------
    >>> import pandas._libs.lib as lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    ...
    ...     # constructor for .str-accessor will also pass
    ...     Series(values).str
    """
    inferred_dtype, values = request.param
    values = np.array(values, dtype=object)  # object dtype to avoid casting

    # correctness of inference tested in tests/dtypes/test_inference.py
    return inferred_dtype, values

pandas/tests/strings/test_api.py

+132
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import pytest
2+
3+
from pandas import DataFrame, Index, MultiIndex, Series, _testing as tm
4+
from pandas.core import strings as strings
5+
6+
7+
def test_api():
    """Check that `Series.str` is the `StringMethods` accessor class."""
    # GH 6106, GH 9322
    accessor_cls = strings.StringMethods

    assert Series.str is accessor_cls
    assert isinstance(Series([""]).str, accessor_cls)
12+
13+
14+
def test_api_mi_raises():
    """Check that accessing `.str` on a MultiIndex raises AttributeError."""
    # GH 23679
    multi_index = MultiIndex.from_arrays([["a", "b", "c"]])
    expected_message = "Can only use .str accessor with Index, not MultiIndex"

    with pytest.raises(AttributeError, match=expected_message):
        multi_index.str

    # hasattr must report the accessor as absent as well
    assert hasattr(multi_index, "str") is False
21+
22+
23+
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
    """
    Check for which inferred dtypes the `.str` accessor can be constructed.

    For the inferred dtypes listed below the accessor must be a
    `StringMethods` instance; for every other one, accessing `.str` must
    raise AttributeError.
    """
    # one instance of parametrized fixture
    inferred_dtype, values = any_skipna_inferred_dtype

    # explicit dtype to avoid casting
    obj = index_or_series(values, dtype=dtype)

    types_passing_constructor = (
        "string",
        "unicode",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    )

    if inferred_dtype not in types_passing_constructor:
        # GH 9184, GH 23011, GH 23163
        expected_message = "Can only use .str accessor with string values.*"
        with pytest.raises(AttributeError, match=expected_message):
            obj.str
        assert not hasattr(obj, "str")
        return

    # GH 6106
    assert isinstance(obj.str, strings.StringMethods)
48+
49+
50+
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
    index_or_series,
    dtype,
    any_allowed_skipna_inferred_dtype,
    any_string_method,
    request,
):
    """
    Check which `.str` methods accept which (inferred) dtypes.

    This test does not check correctness of the different methods, just
    that the methods work on the specified (inferred) dtypes, and raise
    TypeError on all others.
    """
    box = index_or_series

    # one instance of each parametrized fixture
    inferred_dtype, values = any_allowed_skipna_inferred_dtype
    method_name, args, kwargs = any_string_method

    # TODO: get rid of these xfails
    xfail_reason = None
    if box is Index:
        if values.size == 0:
            # zero-length Index: several methods are known to fail
            if method_name in ("partition", "rpartition") and kwargs.get(
                "expand", True
            ):
                xfail_reason = "Method cannot deal with empty Index"
            elif method_name == "split" and kwargs.get("expand", None):
                xfail_reason = "Split fails on empty Series when expand=True"
            elif method_name == "get_dummies":
                xfail_reason = "Need to fortify get_dummies corner cases"
        elif (
            inferred_dtype == "empty"
            and dtype == object
            and method_name == "get_dummies"
        ):
            xfail_reason = "Need to fortify get_dummies corner cases"

    if xfail_reason is not None:
        request.node.add_marker(pytest.mark.xfail(reason=xfail_reason))

    # explicit dtype to avoid casting
    obj = box(values, dtype=dtype)
    method = getattr(obj.str, method_name)

    allowed_types = ["string", "unicode", "empty"]
    if method_name in ("decode", "get", "len", "slice"):
        # only these methods are expected to accept bytes
        allowed_types.append("bytes")
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    if method_name != "cat":
        allowed_types.extend(["mixed", "mixed-integer"])

    if inferred_dtype not in allowed_types:
        # GH 23011, GH 23163
        msg = (
            f"Cannot use .str.{method_name} with values of "
            f"inferred dtype {inferred_dtype!r}."
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    # xref GH 23555, GH 23556
    method(*args, **kwargs)  # works!
112+
113+
114+
def test_api_for_categorical(any_string_method):
    """
    Check that each `.str` method gives the same result on a categorical
    Series as on the Series it was converted from.
    """
    # https://github.com/pandas-dev/pandas/issues/10661
    letters = Series(list("aabb"))
    plain = letters + " " + letters
    categorical = plain.astype("category")
    assert isinstance(categorical.str, strings.StringMethods)

    method_name, args, kwargs = any_string_method

    def call_on(obj):
        # invoke the sampled method through the .str accessor of `obj`
        return getattr(obj.str, method_name)(*args, **kwargs)

    result = call_on(categorical)
    expected = call_on(plain)

    if isinstance(result, DataFrame):
        tm.assert_frame_equal(result, expected)
        return
    if isinstance(result, Series):
        tm.assert_series_equal(result, expected)
        return

    # str.cat(others=None) returns string, for example
    assert result == expected

0 commit comments

Comments
 (0)