pandas-dev · jreback · Jan 19, 2021 · Jan 16, 2021 · Jan 16, 2021 · Jan 16, 2021
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
@@ -0,0 +1,175 @@
+import numpy as np
+import pytest
+
+from pandas import Series
+from pandas.core import strings as strings
+
+# Sample calls for the public methods of StringMethods. Each entry is a tuple
+# ``(method_name, positional_args, keyword_args)``; a method may be listed
+# more than once to exercise different argument combinations.
+_any_string_method = [
+    ("cat", (), {"sep": ","}),
+    ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
+    ("center", (10,), {}),
+    ("contains", ("a",), {}),
+    ("count", ("a",), {}),
+    ("decode", ("UTF-8",), {}),
+    ("encode", ("UTF-8",), {}),
+    ("endswith", ("a",), {}),
+    ("endswith", ("a",), {"na": True}),
+    ("endswith", ("a",), {"na": False}),
+    ("extract", ("([a-z]*)",), {"expand": False}),
+    ("extract", ("([a-z]*)",), {"expand": True}),
+    ("extractall", ("([a-z]*)",), {}),
+    ("find", ("a",), {}),
+    ("findall", ("a",), {}),
+    ("get", (0,), {}),
+    # because "index" (and "rindex") fail intentionally
+    # if the string is not found, search only for empty string
+    ("index", ("",), {}),
+    ("join", (",",), {}),
+    ("ljust", (10,), {}),
+    ("match", ("a",), {}),
+    ("fullmatch", ("a",), {}),
+    ("normalize", ("NFC",), {}),
+    ("pad", (10,), {}),
+    ("partition", (" ",), {"expand": False}),
+    ("partition", (" ",), {"expand": True}),
+    ("repeat", (3,), {}),
+    ("replace", ("a", "z"), {}),
+    ("rfind", ("a",), {}),
+    ("rindex", ("",), {}),
+    ("rjust", (10,), {}),
+    ("rpartition", (" ",), {"expand": False}),
+    ("rpartition", (" ",), {"expand": True}),
+    ("slice", (0, 1), {}),
+    ("slice_replace", (0, 1, "z"), {}),
+    ("split", (" ",), {"expand": False}),
+    ("split", (" ",), {"expand": True}),
+    ("startswith", ("a",), {}),
+    ("startswith", ("a",), {"na": True}),
+    ("startswith", ("a",), {"na": False}),
+    # translating unicode points of "a" to "d"
+    ("translate", ({97: 100},), {}),
+    ("wrap", (2,), {}),
+    ("zfill", (10,), {}),
+] + [
+    # methods without positional arguments: pair with an empty tuple and an
+    # empty dict. The dict is created anew for every entry so that no mutable
+    # kwargs object is shared between fixture params.
+    (method_name, (), {})
+    for method_name in [
+        "capitalize",
+        "cat",
+        "get_dummies",
+        "isalnum",
+        "isalpha",
+        "isdecimal",
+        "isdigit",
+        "islower",
+        "isnumeric",
+        "isspace",
+        "istitle",
+        "isupper",
+        "len",
+        "lower",
+        "lstrip",
+        "partition",
+        "rpartition",
+        "rsplit",
+        "rstrip",
+        "slice",
+        "slice_replace",
+        "split",
+        "strip",
+        "swapcase",
+        "title",
+        "upper",
+        "casefold",
+    ]
+]
+ids, _, _ = zip(*_any_string_method)  # use method name as fixture-id
+missing_methods = {
+    name for name in dir(strings.StringMethods) if not name.startswith("_")
+} - set(ids)
+
+# test that the above list captures all methods of StringMethods; the message
+# names the offenders so a newly added method is easy to spot
+assert not missing_methods, (
+    f"StringMethods methods without sample arguments: {sorted(missing_methods)}"
+)
+
+
+@pytest.fixture(params=_any_string_method, ids=ids)
+def any_string_method(request):
+    """
+    Parametrized fixture with one sample call per public `StringMethods` method
+
+    Each param bundles a method name with arguments that are sufficient to
+    invoke that method through the ``.str`` accessor.
+
+    Returns
+    -------
+    method_name : str
+        Name of the method on `StringMethods`
+    args : tuple
+        Sample positional arguments for the method
+    kwargs : dict
+        Sample keyword arguments for the method
+
+    Examples
+    --------
+    >>> def test_something(any_string_method):
+    ...     method_name, args, kwargs = any_string_method
+    ...     ser = Series(['a', 'b', np.nan, 'd'])
+    ...
+    ...     # calling with the sample arguments will not raise
+    ...     getattr(ser.str, method_name)(*args, **kwargs)
+    """
+    sample_call = request.param
+    return sample_call
+
+
+# subset of the full set from pandas/conftest.py
+# each entry pairs an inferred-dtype label with values that produce it
+_any_allowed_skipna_inferred_dtype = [
+    ("string", ["a", np.nan, "c"]),
+    ("bytes", [b"a", np.nan, b"c"]),
+    ("empty", [np.nan, np.nan, np.nan]),
+    ("empty", []),
+    ("mixed-integer", ["a", np.nan, 2]),
+]
+# use inferred type as id
+ids = tuple(inferred for inferred, _ in _any_allowed_skipna_inferred_dtype)
+
+
+@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
+def any_allowed_skipna_inferred_dtype(request):
+    """
+    Fixture for (inferred) dtypes allowed in StringMethods.__init__
+
+    The covered (inferred) types are:
+    * 'string'
+    * 'bytes'
+    * 'empty' (sampled twice: an all-NaN array and a zero-length array)
+    * 'mixed-integer'
+
+    Returns
+    -------
+    inferred_dtype : str
+        The string for the inferred dtype from _libs.lib.infer_dtype
+    values : np.ndarray
+        An array of object dtype that will be inferred to have
+        `inferred_dtype`
+
+    Examples
+    --------
+    >>> import pandas._libs.lib as lib
+    >>>
+    >>> def test_something(any_allowed_skipna_inferred_dtype):
+    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
+    ...     # will pass
+    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
+    ...
+    ...     # constructor for .str-accessor will also pass
+    ...     Series(values).str
+    """
+    inferred_dtype, raw_values = request.param
+    # object dtype to avoid casting the sample values to a common type
+    values = np.array(raw_values, dtype=object)
+
+    # correctness of inference tested in tests/dtypes/test_inference.py
+    return inferred_dtype, values
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
@@ -0,0 +1,132 @@
+import pytest
+
+from pandas import DataFrame, Index, MultiIndex, Series, _testing as tm
+from pandas.core import strings as strings
+
+
+def test_api():
+    # GH 6106, GH 9322
+    # the accessor is exposed on the class as StringMethods itself ...
+    assert Series.str is strings.StringMethods
+
+    # ... and accessing it on an instance yields a StringMethods object
+    accessor = Series([""]).str
+    assert isinstance(accessor, strings.StringMethods)
+
+
+def test_api_mi_raises():
+    # GH 23679
+    # the .str accessor must be rejected on a MultiIndex, both on direct
+    # attribute access and when probed through hasattr
+    multi_index = MultiIndex.from_arrays([["a", "b", "c"]])
+    expected_msg = "Can only use .str accessor with Index, not MultiIndex"
+
+    with pytest.raises(AttributeError, match=expected_msg):
+        multi_index.str
+
+    has_str_accessor = hasattr(multi_index, "str")
+    assert not has_str_accessor
+
+
+@pytest.mark.parametrize("dtype", [object, "category"])
+def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
+    # the .str accessor may only be constructed for the inferred dtypes listed
+    # below; every other inferred dtype must raise an AttributeError
+
+    # one instance of parametrized fixture
+    inferred_dtype, values = any_skipna_inferred_dtype
+    obj = index_or_series(values, dtype=dtype)  # explicit dtype to avoid casting
+
+    passes_constructor = inferred_dtype in (
+        "string",
+        "unicode",
+        "empty",
+        "bytes",
+        "mixed",
+        "mixed-integer",
+    )
+
+    if not passes_constructor:
+        # GH 9184, GH 23011, GH 23163
+        msg = "Can only use .str accessor with string values.*"
+        with pytest.raises(AttributeError, match=msg):
+            obj.str
+        assert not hasattr(obj, "str")
+    else:
+        # GH 6106
+        assert isinstance(obj.str, strings.StringMethods)
+
+
+@pytest.mark.parametrize("dtype", [object, "category"])
+def test_api_per_method(
+    index_or_series,
+    dtype,
+    any_allowed_skipna_inferred_dtype,
+    any_string_method,
+    request,
+):
+    # this test does not check correctness of the different methods,
+    # just that the methods work on the specified (inferred) dtypes,
+    # and raise on all others
+    box = index_or_series
+
+    # one instance of each parametrized fixture
+    method_name, args, kwargs = any_string_method
+    inferred_dtype, values = any_allowed_skipna_inferred_dtype
+
+    # TODO: get rid of these xfails
+    # all known failures concern Index; Series never gets an xfail marker here
+    xfail_reason = None
+    if box is Index:
+        empty_values = values.size == 0
+        if (
+            empty_values
+            and method_name in ["partition", "rpartition"]
+            and kwargs.get("expand", True)
+        ):
+            xfail_reason = "Method cannot deal with empty Index"
+        elif empty_values and method_name == "split" and kwargs.get("expand", None):
+            xfail_reason = "Split fails on empty Series when expand=True"
+        elif method_name == "get_dummies" and (
+            empty_values or (inferred_dtype == "empty" and dtype == object)
+        ):
+            xfail_reason = "Need to fortify get_dummies corner cases"
+
+    if xfail_reason is not None:
+        request.applymarker(pytest.mark.xfail(reason=xfail_reason))
+
+    obj = box(values, dtype=dtype)  # explicit dtype to avoid casting
+    method = getattr(obj.str, method_name)
+
+    # inferred dtypes on which the method is expected to run without raising
+    allowed_types = ["string", "unicode", "empty"]
+    if method_name in ["decode", "get", "len", "slice"]:
+        # only these methods accept bytes
+        allowed_types.append("bytes")
+    # as of v0.23.4, all methods except 'cat' are very lenient with the
+    # allowed data types, just returning NaN for entries that error.
+    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
+    # see discussion in GH 13877
+    if method_name != "cat":
+        allowed_types.extend(["mixed", "mixed-integer"])
+
+    if inferred_dtype not in allowed_types:
+        # GH 23011, GH 23163
+        msg = (
+            f"Cannot use .str.{method_name} with values of "
+            f"inferred dtype {repr(inferred_dtype)}."
+        )
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+    else:
+        # xref GH 23555, GH 23556
+        method(*args, **kwargs)  # works!
+
+
+def test_api_for_categorical(any_string_method):
+    # https://github.com/pandas-dev/pandas/issues/10661
+    # a categorical Series must give the same result for every .str method
+    # as the Series it was cast from
+    method_name, args, kwargs = any_string_method
+
+    letters = Series(list("aabb"))
+    ser = letters + " " + letters
+    cat_ser = ser.astype("category")
+    assert isinstance(cat_ser.str, strings.StringMethods)
+
+    result = getattr(cat_ser.str, method_name)(*args, **kwargs)
+    expected = getattr(ser.str, method_name)(*args, **kwargs)
+
+    # pick the comparison helper that matches the type of the result
+    if isinstance(result, DataFrame):
+        tm.assert_frame_equal(result, expected)
+        return
+    if isinstance(result, Series):
+        tm.assert_series_equal(result, expected)
+        return
+
+    # str.cat(others=None) returns string, for example
+    assert result == expected