Skip to content

TST GH26807 Break up test_strings #39215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
175 changes: 175 additions & 0 deletions pandas/tests/strings/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import numpy as np
import pytest

from pandas import Series
from pandas.core import strings as strings

_any_string_method = [
("cat", (), {"sep": ","}),
("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
("center", (10,), {}),
("contains", ("a",), {}),
("count", ("a",), {}),
("decode", ("UTF-8",), {}),
("encode", ("UTF-8",), {}),
("endswith", ("a",), {}),
("endswith", ("a",), {"na": True}),
("endswith", ("a",), {"na": False}),
("extract", ("([a-z]*)",), {"expand": False}),
("extract", ("([a-z]*)",), {"expand": True}),
("extractall", ("([a-z]*)",), {}),
("find", ("a",), {}),
("findall", ("a",), {}),
("get", (0,), {}),
# because "index" (and "rindex") fail intentionally
# if the string is not found, search only for empty string
("index", ("",), {}),
("join", (",",), {}),
("ljust", (10,), {}),
("match", ("a",), {}),
("fullmatch", ("a",), {}),
("normalize", ("NFC",), {}),
("pad", (10,), {}),
("partition", (" ",), {"expand": False}),
("partition", (" ",), {"expand": True}),
("repeat", (3,), {}),
("replace", ("a", "z"), {}),
("rfind", ("a",), {}),
("rindex", ("",), {}),
("rjust", (10,), {}),
("rpartition", (" ",), {"expand": False}),
("rpartition", (" ",), {"expand": True}),
("slice", (0, 1), {}),
("slice_replace", (0, 1, "z"), {}),
("split", (" ",), {"expand": False}),
("split", (" ",), {"expand": True}),
("startswith", ("a",), {}),
("startswith", ("a",), {"na": True}),
("startswith", ("a",), {"na": False}),
# translating unicode points of "a" to "d"
("translate", ({97: 100},), {}),
("wrap", (2,), {}),
("zfill", (10,), {}),
] + list(
zip(
[
# methods without positional arguments: zip with empty tuple and empty dict
"capitalize",
"cat",
"get_dummies",
"isalnum",
"isalpha",
"isdecimal",
"isdigit",
"islower",
"isnumeric",
"isspace",
"istitle",
"isupper",
"len",
"lower",
"lstrip",
"partition",
"rpartition",
"rsplit",
"rstrip",
"slice",
"slice_replace",
"split",
"strip",
"swapcase",
"title",
"upper",
"casefold",
],
[()] * 100,
[{}] * 100,
)
)
ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
missing_methods = {
f for f in dir(strings.StringMethods) if not f.startswith("_")
} - set(ids)

# test that the above list captures all methods of StringMethods
assert not missing_methods


@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
"""
Fixture for all public methods of `StringMethods`

This fixture returns a tuple of the method name and sample arguments
necessary to call the method.

Returns
-------
method_name : str
The name of the method in `StringMethods`
args : tuple
Sample values for the positional arguments
kwargs : dict
Sample values for the keyword arguments

Examples
--------
>>> def test_something(any_string_method):
... s = Series(['a', 'b', np.nan, 'd'])
...
... method_name, args, kwargs = any_string_method
... method = getattr(s.str, method_name)
... # will not raise
... method(*args, **kwargs)
"""
return request.param


# subset of the full set from pandas/conftest.py
_any_allowed_skipna_inferred_dtype = [
("string", ["a", np.nan, "c"]),
("bytes", [b"a", np.nan, b"c"]),
("empty", [np.nan, np.nan, np.nan]),
("empty", []),
("mixed-integer", ["a", np.nan, 2]),
]
ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id


@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
"""
Fixture for all (inferred) dtypes allowed in StringMethods.__init__

The covered (inferred) types are:
* 'string'
* 'empty'
* 'bytes'
* 'mixed'
* 'mixed-integer'

Returns
-------
inferred_dtype : str
The string for the inferred dtype from _libs.lib.infer_dtype
values : np.ndarray
An array of object dtype that will be inferred to have
`inferred_dtype`

Examples
--------
>>> import pandas._libs.lib as lib
>>>
>>> def test_something(any_allowed_skipna_inferred_dtype):
... inferred_dtype, values = any_allowed_skipna_inferred_dtype
... # will pass
... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
...
... # constructor for .str-accessor will also pass
... Series(values).str
"""
inferred_dtype, values = request.param
values = np.array(values, dtype=object) # object dtype to avoid casting

# correctness of inference tested in tests/dtypes/test_inference.py
return inferred_dtype, values
132 changes: 132 additions & 0 deletions pandas/tests/strings/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import pytest

from pandas import DataFrame, Index, MultiIndex, Series, _testing as tm
from pandas.core import strings as strings


def test_api():

# GH 6106, GH 9322
assert Series.str is strings.StringMethods
assert isinstance(Series([""]).str, strings.StringMethods)


def test_api_mi_raises():
# GH 23679
mi = MultiIndex.from_arrays([["a", "b", "c"]])
msg = "Can only use .str accessor with Index, not MultiIndex"
with pytest.raises(AttributeError, match=msg):
mi.str
assert not hasattr(mi, "str")


@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
# one instance of parametrized fixture
box = index_or_series
inferred_dtype, values = any_skipna_inferred_dtype

t = box(values, dtype=dtype) # explicit dtype to avoid casting

types_passing_constructor = [
"string",
"unicode",
"empty",
"bytes",
"mixed",
"mixed-integer",
]
if inferred_dtype in types_passing_constructor:
# GH 6106
assert isinstance(t.str, strings.StringMethods)
else:
# GH 9184, GH 23011, GH 23163
msg = "Can only use .str accessor with string values.*"
with pytest.raises(AttributeError, match=msg):
t.str
assert not hasattr(t, "str")


@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
index_or_series,
dtype,
any_allowed_skipna_inferred_dtype,
any_string_method,
request,
):
# this test does not check correctness of the different methods,
# just that the methods work on the specified (inferred) dtypes,
# and raise on all others
box = index_or_series

# one instance of each parametrized fixture
inferred_dtype, values = any_allowed_skipna_inferred_dtype
method_name, args, kwargs = any_string_method

# TODO: get rid of these xfails
reason = None
if box is Index and values.size == 0:
if method_name in ["partition", "rpartition"] and kwargs.get("expand", True):
reason = "Method cannot deal with empty Index"
elif method_name == "split" and kwargs.get("expand", None):
reason = "Split fails on empty Series when expand=True"
elif method_name == "get_dummies":
reason = "Need to fortify get_dummies corner cases"

elif box is Index and inferred_dtype == "empty" and dtype == object:
if method_name == "get_dummies":
reason = "Need to fortify get_dummies corner cases"

if reason is not None:
mark = pytest.mark.xfail(reason=reason)
request.node.add_marker(mark)

t = box(values, dtype=dtype) # explicit dtype to avoid casting
method = getattr(t.str, method_name)

bytes_allowed = method_name in ["decode", "get", "len", "slice"]
# as of v0.23.4, all methods except 'cat' are very lenient with the
# allowed data types, just returning NaN for entries that error.
# This could be changed with an 'errors'-kwarg to the `str`-accessor,
# see discussion in GH 13877
mixed_allowed = method_name not in ["cat"]

allowed_types = (
["string", "unicode", "empty"]
+ ["bytes"] * bytes_allowed
+ ["mixed", "mixed-integer"] * mixed_allowed
)

if inferred_dtype in allowed_types:
# xref GH 23555, GH 23556
method(*args, **kwargs) # works!
else:
# GH 23011, GH 23163
msg = (
f"Cannot use .str.{method_name} with values of "
f"inferred dtype {repr(inferred_dtype)}."
)
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)


def test_api_for_categorical(any_string_method):
# https://github.com/pandas-dev/pandas/issues/10661
s = Series(list("aabb"))
s = s + " " + s
c = s.astype("category")
assert isinstance(c.str, strings.StringMethods)

method_name, args, kwargs = any_string_method

result = getattr(c.str, method_name)(*args, **kwargs)
expected = getattr(s.str, method_name)(*args, **kwargs)

if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
elif isinstance(result, Series):
tm.assert_series_equal(result, expected)
else:
# str.cat(others=None) returns string, for example
assert result == expected
Loading