From 9e90d4e7b07ea4a6975af4153f2d595bd18919e2 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 01/24] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 484 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 125 +++++ setup.py | 2 +- 4 files changed, 615 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0c8efda5fc588..e8d00807ad70f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -457,9 +457,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? + if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..8248a3e91c0fe --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,484 @@ +from collections.abc import Iterable +from typing import Any, Optional, Sequence, Tuple, Type, Union + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +import pandas as pd +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer + + +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. 
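+
+        For example (illustrative only): ``ArrowStringDtype() == "arrow_string"``
+        and equality with another ``ArrowStringDtype`` instance are both True,
+        while ``ArrowStringDtype() == "string"`` is False.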
+ + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. 
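+        (e.g. ``arr[1:2]`` is a length-1 ``ArrowStringArray``, not a scalar).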
+ For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + raise IndexError("index out of bounds") + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return _as_pandas_scalar(value) + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self.data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values + + def copy(self) -> ExtensionArray: + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = pc.equal(self.data, other.data) + elif is_scalar(other): + result = pc.equal(self.data, pa.scalar(other)) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. 
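+            # Rough illustration of the cost (hypothetical values):
+            #
+            #   arr = pd.array(["a", "b", "c"], dtype="arrow_string")
+            #   arr[np.array([True, False, True])] = "x"
+            #
+            # decomposes into one scalar __setitem__ per True position, and
+            # each scalar assignment below rebuilds the ChunkedArray around
+            # the replaced element.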
+ + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self.data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.max() >= len(self.data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=indices_array < 0) + result = self.data.take(indices_array) + if pd.isna(fill_value): + return type(self)(result) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
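+            # For example, with len(self) == 4, indices [-1, 1] become [3, 1]
+            # below, mirroring NumPy's wraparound rule for negative positions.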
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self.data) + return type(self)(self.data.take(indices_array)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..437d51060fb7f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,125 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +# class TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index f6f0cd9aabc0e..4033ea2935de5 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = 
[] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From 92f1d2635ea6fc34da38383de852d194b864bec0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Sep 2020 15:32:45 -0500 Subject: [PATCH 02/24] Refactor to use parametrized StringDtype --- pandas/core/arrays/base.py | 13 +- pandas/core/arrays/string_.py | 90 +++++++++- pandas/core/arrays/string_arrow.py | 166 +++++++----------- pandas/core/config_init.py | 13 ++ pandas/core/strings.py | 10 +- .../tests/arrays/string_/test_string_arrow.py | 26 +++ pandas/tests/extension/arrow/test_string.py | 7 +- pandas/tests/extension/test_string_arrow.py | 103 +++++++---- setup.py | 2 +- 9 files changed, 261 insertions(+), 169 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e8d00807ad70f..4258bcc956cdd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). """ + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented return ~(self == other) def to_numpy( @@ -457,13 +459,10 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -928,9 +927,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + * True: ``-1`` in `indices` indicate missing values. + These values are set to `fill_value`. Any other other negative + value raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..0e7c5a8036bcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,8 +1,10 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Any, Type, Union import numpy as np +from pandas._config import get_option + from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype @@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype): StringDtype """ - name = "string" - #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + self.storage = storage + + @property + def name(self): + return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_array_type(cls) -> Type["StringArray"]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. 
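+
+        For example (a sketch; valid options are listed in the table below)::
+
+            >>> pd.StringDtype.construct_from_string("string[pyarrow]").storage
+            'pyarrow'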
+
+        Parameters
+        ----------
+        string : str
+            The type of the name. The storage type will be taken from `string`.
+            Valid options and their storage types are
+
+            ========================== ==============
+            string                     result storage
+            ========================== ==============
+            ``'string'``               global default
+            ``'string[python]'``       python
+            ``'StringDtype[python]'``  python
+            ``'string[pyarrow]'``      pyarrow
+            ``'StringDtype[pyarrow]'`` pyarrow
+            ========================== ==============
+
+        Returns
+        -------
+        StringDtype
+
+        Raises
+        ------
+        TypeError
+            If the string is not a valid option.
+
+        """
+        if not isinstance(string, str):
+            raise TypeError(
+                f"'construct_from_string' expects a string, got {type(string)}"
+            )
+        if string == "string":
+            # TODO: use global default
+            return cls()
+        elif string in {"string[python]", "StringDtype[python]"}:
+            return cls(storage="python")
+        elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}:
+            return cls(storage="pyarrow")
+        else:
+            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, str) and other == "string":
+            return True
+        return super().__eq__(other)
+
+    def __hash__(self) -> int:
+        # custom __eq__ so have to override __hash__
+        return super().__hash__()
+
+    # XXX: this is a classmethod, but we need to know the storage type.
+    def construct_array_type(self) -> Type["StringArray"]:
         """
         Return the array type associated with this dtype.
@@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]:
         -------
         type
         """
-        return StringArray
+        from .string_arrow import ArrowStringArray
+
+        if self.storage == "python":
+            return StringArray
+        else:
+            return ArrowStringArray
 
-    def __repr__(self) -> str:
-        return "StringDtype"
+    def __repr__(self):
+        return self.name
 
     def __from_arrow__(
         self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
@@ -80,6 +153,7 @@ def __from_arrow__(
         Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -93,7 +167,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return ArrowStringArray._concat_same_type(results) class StringArray(PandasArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..c0831a65b3644 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa @@ -8,18 +8,19 @@ from pandas._libs import missing as libmissing from pandas._typing import ArrayLike -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_scalar, ) +from pandas.core.algorithms import factorize from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import check_array_indexer @@ -31,89 +32,6 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: return scalar -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.1.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> pd.ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
@@ -165,19 +83,20 @@ def __init__(self, values):
             self.data = values
         else:
             raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")
+        self._dtype = StringDtype(storage="pyarrow")
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         # TODO(ARROW-9407): Accept pd.NA in Arrow
-        scalars_corrected = [None if pd.isna(x) else x for x in scalars]
+        scalars_corrected = [None if isna(x) else x for x in scalars]
         return cls(pa.array(scalars_corrected, type=pa.string()))
 
     @property
-    def dtype(self) -> ArrowStringDtype:
+    def dtype(self) -> StringDtype:
         """
-        An instance of 'ArrowStringDtype'.
+        An instance of 'StringDtype'.
         """
-        return ArrowStringDtype()
+        return self._dtype
 
     def __array__(self, *args, **kwargs) -> "np.ndarray":
         """Correctly construct numpy arrays when passed to `np.asarray()`."""
@@ -276,15 +195,6 @@ def __getitem__(self, item):
         else:
             return _as_pandas_scalar(value)
 
-    def fillna(self, value=None, method=None, limit=None):
-        raise NotImplementedError("fillna")
-
-    def _reduce(self, name, skipna=True, **kwargs):
-        if name in ["min", "max"]:
-            return getattr(self, name)(skipna=skipna)
-
-        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
-
     @property
     def nbytes(self) -> int:
         """
@@ -320,7 +230,9 @@ def __eq__(self, other: Any) -> ArrayLike:
         """
         Return for `self == other` (element-wise equality).
         """
-        if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)):
+        from pandas import array, Series, DataFrame, Index
+
+        if isinstance(other, (Series, DataFrame, Index)):
             return NotImplemented
         if isinstance(other, ArrowStringArray):
             result = pc.equal(self.data, other.data)
@@ -330,7 +242,7 @@ def __eq__(self, other: Any) -> ArrayLike:
             raise NotImplementedError("Neither scalar nor ArrowStringArray")
 
         # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
-        return pd.array(result.to_pandas().values)
+        return array(result.to_pandas().values, dtype="boolean")
 
     def __setitem__(self, key, value):
         # type: (Union[int, np.ndarray], Any) -> None
@@ -357,9 +269,9 @@ def __setitem__(self, key, value):
         key = check_array_indexer(self, key)
 
         if is_integer(key):
-            if not pd.api.types.is_scalar(value):
+            if not is_scalar(value):
                 raise ValueError("Must pass scalars with scalar indexer")
-            elif pd.isna(value):
+            elif isna(value):
                 value = None
             elif not isinstance(value, str):
                 raise ValueError("Scalar must be NA or str")
@@ -386,7 +298,7 @@ def __setitem__(self, key, value):
             # TODO(ARROW-9431): Directly support setitem(integers)
             key_array = np.asanyarray(key)
 
-        if pd.api.types.is_scalar(value):
+        if is_scalar(value):
             value = np.broadcast_to(value, len(key_array))
         else:
             value = np.asarray(value)
@@ -461,15 +373,20 @@ def take(
 
         if len(self.data) == 0 and (indices_array >= 0).any():
             raise IndexError("cannot do a non-empty take")
-        if indices_array.max() >= len(self.data):
+        if len(indices_array) > 0 and indices_array.max() >= len(self.data):
             raise IndexError("out of bounds value in 'indices'.")
 
         if allow_fill:
             if (indices_array < 0).any():
+                if indices_array.min() < -1:
+                    raise ValueError(
+                        "'indices' contains negative values other than "
+                        "-1 with 'allow_fill=True'." 
+ ) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -482,3 +399,38 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna=True): + from pandas import Series + + if dropna: + na = self.isna() + self = self[~na] + counts = self.data.value_counts() + return Series(counts.field(1), counts.field(0)) + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py + # doesn't handle dictionary types. + if self.data.num_chunks == 1: + encoded = self.data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self.data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence["ArrowStringArray"] + ) -> "ArrowStringArray": + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bfe20551cbcfc..c7e0e7ef19010 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,6 +504,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..59aa8fc5cfa0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -901,8 +901,10 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. 
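+    # With the parametrized StringDtype, the str accessor keeps the storage
+    # of the input: e.g. a Series backed by "string[pyarrow]" data gets
+    # "string[pyarrow]" results back (illustrative; see the change below).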
-    if arr.dtype.name == "string":
-        return "string"
+    from pandas.core.arrays.string_ import StringDtype
+
+    if isinstance(arr.dtype, StringDtype):
+        return arr.dtype
     else:
         return object
 
@@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin):
     """
 
     def __init__(self, data):
+        from pandas.core.arrays.string_ import StringDtype
+
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
-        self._is_string = data.dtype.name == "string"
+        self._is_string = isinstance(data.dtype, StringDtype)
 
         # ._values.categories works for both Series/Index
         self._parent = data._values.categories if self._is_categorical else data
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
new file mode 100644
index 0000000000000..40e3f21670ea0
--- /dev/null
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -0,0 +1,26 @@
+import pytest
+
+import pandas as pd
+import pandas.testing as tm
+
+
+def test_eq_all_na():
+    a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow"))
+    result = a == a
+    expected = pd.array([pd.NA, pd.NA], dtype="boolean")
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_config():
+    # python by default
+    assert pd.StringDtype().storage == "python"
+    arr = pd.array(["a", "b"])
+    assert arr.dtype.storage == "python"
+
+    with pd.option_context("mode.string_storage", "pyarrow"):
+        assert pd.StringDtype().storage == "pyarrow"
+        arr = pd.array(["a", "b"])
+        assert arr.dtype.storage == "pyarrow"
+
+    with pytest.raises(ValueError):
+        pd.options.mode.string_storage = "foo"
diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py
index abd5c1f386dc5..f32f1e415ddc7 100644
--- a/pandas/tests/extension/arrow/test_string.py
+++ b/pandas/tests/extension/arrow/test_string.py
@@ -4,10 +4,9 @@
 
 pytest.importorskip("pyarrow", minversion="0.13.0")
 
-from .arrays import ArrowStringDtype  # isort:skip
-
 
 def test_constructor_from_list():
     # GH 27673
-    result = pd.Series(["E"], dtype=ArrowStringDtype())
-    assert isinstance(result.dtype, ArrowStringDtype)
+    result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow"))
+    assert isinstance(result.dtype, pd.StringDtype)
+    assert result.dtype.storage == "pyarrow"
diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py
index 437d51060fb7f..848e8a435b530 100644
--- a/pandas/tests/extension/test_string_arrow.py
+++ b/pandas/tests/extension/test_string_arrow.py
@@ -4,13 +4,13 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype
+from pandas.core.arrays.string_arrow import ArrowStringArray
 from pandas.tests.extension import base
 
 
 @pytest.fixture
 def dtype():
-    return ArrowStringDtype()
+    return pd.StringDtype(storage="pyarrow")
 
 
 @pytest.fixture
@@ -62,64 +62,89 @@ class TestConstructors(base.BaseConstructorsTests):
     pass
 
 
-# class TestReshaping(base.BaseReshapingTests):
-#     pass
+class TestReshaping(base.BaseReshapingTests):
+    pass
 
 
 class TestGetitem(base.BaseGetitemTests):
-    pass
+    @pytest.mark.xfail(
+        reason="pyarrow.lib.ArrowNotImplementedError: Function "
+        "fill_null has no kernel matching input types "
+        "(array[string], scalar[string])"
+    )
+    def test_take_non_na_fill_value(self, data_missing):
+        super().test_take_non_na_fill_value(data_missing)
+
+    @pytest.mark.xfail(
+        reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no "
+        "kernel matching input types 
(array[string], scalar[string])"
+    )
+    def test_reindex_non_na_fill_value(self, data_missing):
+        super().test_reindex_non_na_fill_value(data_missing)
 
 
 class TestSetitem(base.BaseSetitemTests):
+    @pytest.mark.xfail(reason="TODO")
+    def test_setitem_preserves_views(self, data):
+        # Unclear where the issue is (pyarrow getitem, our getitem, our slice)
+        # and what to do here.
+        super().test_setitem_preserves_views(data)
+
+
+class TestMissing(base.BaseMissingTests):
     pass
 
 
-# class TestMissing(base.BaseMissingTests):
-#     pass
+class TestNoReduce(base.BaseNoReduceTests):
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
 
+        if op_name in ["min", "max"]:
+            return None
 
-# class TestNoReduce(base.BaseNoReduceTests):
-#     @pytest.mark.parametrize("skipna", [True, False])
-#     def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
-#         op_name = all_numeric_reductions
-#
-#         if op_name in ["min", "max"]:
-#             return None
-#
-#         s = pd.Series(data)
-#         with pytest.raises(TypeError):
-#             getattr(s, op_name)(skipna=skipna)
+        s = pd.Series(data)
+        with pytest.raises(TypeError):
+            getattr(s, op_name)(skipna=skipna)
 
 
-# class TestMethods(base.BaseMethodsTests):
-#     @pytest.mark.skip(reason="returns nullable")
-#     def test_value_counts(self, all_data, dropna):
-#         return super().test_value_counts(all_data, dropna)
+class TestMethods(base.BaseMethodsTests):
+    @pytest.mark.skip(reason="returns nullable")
+    def test_value_counts(self, all_data, dropna):
+        return super().test_value_counts(all_data, dropna)
 
 
-# class TestCasting(base.BaseCastingTests):
-#     pass
+class TestCasting(base.BaseCastingTests):
+    pass
 
 
-# class TestComparisonOps(base.BaseComparisonOpsTests):
-#     def _compare_other(self, s, data, op_name, other):
-#         result = getattr(s, op_name)(other)
-#         expected = getattr(s.astype(object), op_name)(other).astype("boolean")
-#         self.assert_series_equal(result, expected)
+class TestComparisonOps(base.BaseComparisonOpsTests):
+    def _compare_other(self, s, data, op_name, other):
+        if op_name not in {"__eq__", "__ne__"}:
+            pytest.skip(f"{op_name} is not implemented.")
+        result = getattr(s, op_name)(other)
+        expected = getattr(s.astype(object), op_name)(other).astype("boolean")
+        self.assert_series_equal(result, expected)
 
-#     def test_compare_scalar(self, data, all_compare_operators):
-#         op_name = all_compare_operators
-#         s = pd.Series(data)
-#         self._compare_other(s, data, op_name, "abc")
+    def test_compare_scalar(self, data, all_compare_operators):
+        op_name = all_compare_operators
+        s = pd.Series(data)
+        self._compare_other(s, data, op_name, "abc")
 
+    def test_compare_array(self, data, all_compare_operators):
+        op_name = all_compare_operators
+        s = pd.Series(data)
+        other = pd.Series([data[0]] * len(data), dtype=data.dtype)
+        self._compare_other(s, data, op_name, other)
 
-# class TestParsing(base.BaseParsingTests):
-#     pass
+
+class TestParsing(base.BaseParsingTests):
+    pass
 
 
-# class TestPrinting(base.BasePrintingTests):
-#     pass
+class TestPrinting(base.BasePrintingTests):
+    pass
 
 
-# class TestGroupBy(base.BaseGroupbyTests):
-#     pass
+class TestGroupBy(base.BaseGroupbyTests):
+    pass
diff --git a/setup.py b/setup.py
index 4033ea2935de5..f6f0cd9aabc0e 100755
--- a/setup.py
+++ b/setup.py
@@ -432,7 +432,7 @@ def run(self):
         extra_compile_args.append("/Z7")
         extra_link_args.append("/DEBUG")
     else:
-        extra_compile_args = []
+        extra_compile_args = ["-Werror"]
         extra_link_args = []
         if 
debugging_symbols_requested: extra_compile_args.append("-g") From 00096f099a9b26795c709684b4d241483a77e08d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Sep 2020 11:44:42 -0500 Subject: [PATCH 03/24] wip --- pandas/core/arrays/string_.py | 97 +- pandas/core/arrays/string_arrow.py | 14 + pandas/core/strings.py | 3657 --------------------------- pandas/core/strings/__init__.py | 1 + pandas/core/strings/accessor.py | 2752 ++++++++++++++++++++ pandas/core/strings/base.py | 33 + pandas/core/strings/object_array.py | 145 ++ pandas/core/strings_.py | 775 ++++++ 8 files changed, 3814 insertions(+), 3660 deletions(-) delete mode 100644 pandas/core/strings.py create mode 100644 pandas/core/strings/__init__.py create mode 100644 pandas/core/strings/accessor.py create mode 100644 pandas/core/strings/base.py create mode 100644 pandas/core/strings/object_array.py create mode 100644 pandas/core/strings_.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0e7c5a8036bcf..d8c76d0615d45 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,23 +1,33 @@ import operator -from typing import TYPE_CHECKING, Any, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Type, Union import numpy as np from pandas._config import get_option from pandas._libs import lib, missing as libmissing +from pandas._typing import ArrayLike, Dtype from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) from pandas import compat from pandas.core import ops +from pandas.core.accessor import CachedAccessor from pandas.core.arrays import IntegerArray, PandasArray +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.core.strings.base import BaseStringArrayMethods if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -170,6 +180,86 @@ def __from_arrow__( return ArrowStringArray._concat_same_type(results) +def _map_stringarray( + func: Callable[[str], Any], + arr: "StringArray", + na_value: Any = libmissing.NA, + dtype: Dtype = StringDtype(), +) -> ArrayLike: + """ + Map a callable over valid elements of a StringArray. + + Parameters + ---------- + func : Callable[[str], Any] + Apply to each valid element. + arr : StringArray + na_value : Any + The value to use for missing values. By default, this is + the original value (NA). + dtype : Dtype + The result dtype to use. Specifying this avoids an intermediate + object-dtype allocation. + + Returns + ------- + ArrayLike + An ExtensionArray for integer or string dtypes, otherwise + an ndarray. 
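+
+    Examples
+    --------
+    A sketch of the intended behaviour (the exact repr depends on the
+    configured string storage, hence the doctest is skipped):
+
+    >>> arr = pd.array(["a", None, "c"], dtype="string")
+    >>> _map_stringarray(str.upper, arr)  # doctest: +SKIP
+    <StringArray>
+    ['A', <NA>, 'C']
+    Length: 3, dtype: string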
+ + """ + from pandas.arrays import BooleanArray, IntegerArray, StringArray + + mask = isna(arr) + + assert isinstance(arr, StringArray) + arr = np.asarray(arr) + if na_value is None: + na_value = libmissing.NA + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + func, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, func, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, func, mask.view("uint8")) + + +class StringArrayMethods(BaseStringArrayMethods): + def _map(self, f, na_result=libmissing.NA, dtype=StringDtype()): + return _map_stringarray(f, self._array, na_result, dtype) + + class StringArray(PandasArray): """ Extension array for string data. @@ -417,6 +507,7 @@ def _add_arithmetic_ops(cls): cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) _create_comparison_method = _create_arithmetic_method + _str = CachedAccessor("str", StringArrayMethods) StringArray._add_arithmetic_ops() diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c0831a65b3644..65aa38db4f6f6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -18,6 +18,7 @@ is_integer_dtype, is_scalar, ) +from pandas.core.accessor import CachedAccessor from pandas.core.algorithms import factorize from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.string_ import StringDtype @@ -32,6 +33,17 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: return scalar +class ArrowStringMethods: + def __init__(self, arr): + self._data = arr + + def upper(self): + import pyarrow.compute as pc + + result = pc.utf8_upper(self._data.data) + return ArrowStringArray(result) + + class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
@@ -434,3 +446,5 @@ def _concat_same_type( [array for ea in to_concat for array in ea.data.iterchunks()] ) ) + + str = CachedAccessor("str", ArrowStringMethods) diff --git a/pandas/core/strings.py b/pandas/core/strings.py deleted file mode 100644 index 59aa8fc5cfa0e..0000000000000 --- a/pandas/core/strings.py +++ /dev/null @@ -1,3657 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.algorithms import take_1d -from pandas.core.base import NoNewAttributesMixin -from pandas.core.construction import extract_array - -if TYPE_CHECKING: - from pandas.arrays import StringArray - -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - -_shared_docs: Dict[str, str] = dict() - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). 
Offending values found in " - f"column {dtype}" - ) from None - return result - - -def _na_map(f, arr, na_result=None, dtype=np.dtype(object)): - if is_extension_array_dtype(arr.dtype): - if na_result is None: - na_result = libmissing.NA - # just StringDtype - arr = extract_array(arr) - return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) - if na_result is None: - na_result = np.nan - return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) - - -def _map_stringarray( - func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype -) -> ArrayLike: - """ - Map a callable over valid elements of a StringArray. - - Parameters - ---------- - func : Callable[[str], Any] - Apply to each valid element. - arr : StringArray - na_value : Any - The value to use for missing values. By default, this is - the original value (NA). - dtype : Dtype - The result dtype to use. Specifying this avoids an intermediate - object-dtype allocation. - - Returns - ------- - ArrayLike - An ExtensionArray for integer or string dtypes, otherwise - an ndarray. - - """ - from pandas.arrays import BooleanArray, IntegerArray, StringArray - - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) - - -def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=np.dtype(object)): - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" - ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return _map_object(g, arr, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - else: - return lib.map_infer(arr, f) - - -def str_count(arr, pat, flags=0): - """ - Count occurrences of pattern in each string of the Series/Index. 
- - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. - flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. - - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - """ - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. - - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. - - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. 
- - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. - - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. - - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. - - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. 
-
-    Examples
-    --------
-    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
-    >>> s
-    0     bat
-    1    bear
-    2     caT
-    3     NaN
-    dtype: object
-
-    >>> s.str.endswith('t')
-    0     True
-    1    False
-    2    False
-    3      NaN
-    dtype: object
-
-    Specifying `na` to be `False` instead of `NaN`.
-
-    >>> s.str.endswith('t', na=False)
-    0     True
-    1    False
-    2    False
-    3    False
-    dtype: bool
-    """
-    f = lambda x: x.endswith(pat)
-    return _na_map(f, arr, na, dtype=np.dtype(bool))
-
-
-def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
-    r"""
-    Replace each occurrence of pattern/regex in the Series/Index.
-
-    Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value.
-
-    Parameters
-    ----------
-    pat : str or compiled regex
-        String can be a character sequence or regular expression.
-    repl : str or callable
-        Replacement string or a callable. The callable is passed the regex
-        match object and must return a replacement string to be used.
-        See :func:`re.sub`.
-    n : int, default -1 (all)
-        Number of replacements to make from start.
-    case : bool, default None
-        Determines if replace is case sensitive:
-
-        - If True, case sensitive (the default if `pat` is a string)
-        - Set to False for case insensitive
-        - Cannot be set if `pat` is a compiled regex.
-
-    flags : int, default 0 (no flags)
-        Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
-        regex.
-    regex : bool, default True
-        Determines if the passed-in pattern is a regular expression:
-
-        - If True, assumes the passed-in pattern is a regular expression.
-        - If False, treats the pattern as a literal string
-        - Cannot be set to False if `pat` is a compiled regex or `repl` is
-          a callable.
-
-        .. versionadded:: 0.23.0
-
-    Returns
-    -------
-    Series or Index of object
-        A copy of the object with all matching occurrences of `pat` replaced by
-        `repl`.
-
-    Raises
-    ------
-    ValueError
-        * if `regex` is False and `repl` is a callable or `pat` is a compiled
-          regex
-        * if `pat` is a compiled regex and `case` or `flags` is set
-
-    Notes
-    -----
-    When `pat` is a compiled regex, all flags should be included in the
-    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
-    regex will raise an error.
-
-    Examples
-    --------
-    When `pat` is a string and `regex` is True (the default), the given `pat`
-    is compiled as a regex. When `repl` is a string, it replaces matching
-    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
-    left as is:
-
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
-    0    bao
-    1    baz
-    2    NaN
-    dtype: object
-
-    When `pat` is a string and `regex` is False, every `pat` is replaced with
-    `repl` as with :meth:`str.replace`:
-
-    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
-    0    bao
-    1    fuz
-    2    NaN
-    dtype: object
-
-    When `repl` is a callable, it is called on every `pat` using
-    :func:`re.sub`. The callable should expect one positional argument
-    (a regex match object) and return a string.
-
-    To get the idea:
-
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
-    0    <re.Match object; span=(0, 1), match='f'>oo
-    1    <re.Match object; span=(0, 1), match='f'>uz
-    2                                            NaN
-    dtype: object
-
-    Reverse every lowercase alphabetic word:
-
-    >>> repl = lambda m: m.group(0)[::-1]
-    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
-    0    oof 123
-    1    rab zab
-    2        NaN
-    dtype: object
-
-    Using regex groups (extract second group and swap case):
-
-    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
-    >>> repl = lambda m: m.group('two').swapcase()
-    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
-    0    tWO
-    1    bAR
-    dtype: object
-
-    Using a compiled regex with flags
-
-    >>> import re
-    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
-    0    foo
-    1    bar
-    2    NaN
-    dtype: object
-    """
-    # Check whether repl is valid (GH 13438, GH 15055)
-    if not (isinstance(repl, str) or callable(repl)):
-        raise TypeError("repl must be a string or callable")
-
-    is_compiled_re = is_re(pat)
-    if regex:
-        if is_compiled_re:
-            if (case is not None) or (flags != 0):
-                raise ValueError(
-                    "case and flags cannot be set when pat is a compiled regex"
-                )
-        else:
-            # not a compiled regex
-            # set default case
-            if case is None:
-                case = True
-
-            # add case flag, if provided
-            if case is False:
-                flags |= re.IGNORECASE
-        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
-            n = n if n >= 0 else 0
-            compiled = re.compile(pat, flags=flags)
-            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
-        else:
-            f = lambda x: x.replace(pat, repl, n)
-    else:
-        if is_compiled_re:
-            raise ValueError(
-                "Cannot use a compiled regex as replacement pattern with regex=False"
-            )
-        if callable(repl):
-            raise ValueError("Cannot use a callable replacement when regex=False")
-        f = lambda x: x.replace(pat, repl, n)
-
-    return _na_map(f, arr, dtype=str)
-
-
-def str_repeat(arr, repeats):
-    """
-    Duplicate each string in the Series or Index.
-
-    Parameters
-    ----------
-    repeats : int or sequence of int
-        Same value for all (int) or different value per (sequence).
-
-    Returns
-    -------
-    Series or Index of object
-        Series or Index of repeated string objects specified by
-        input parameter repeats.
-
-    Examples
-    --------
-    >>> s = pd.Series(['a', 'b', 'c'])
-    >>> s
-    0    a
-    1    b
-    2    c
-    dtype: object
-
-    Single int repeats string in Series
-
-    >>> s.str.repeat(repeats=2)
-    0    aa
-    1    bb
-    2    cc
-    dtype: object
-
-    Sequence of int repeats corresponding string in Series
-
-    >>> s.str.repeat(repeats=[1, 2, 3])
-    0      a
-    1     bb
-    2    ccc
-    dtype: object
-    """
-    if is_scalar(repeats):
-
-        def scalar_rep(x):
-            try:
-                return bytes.__mul__(x, repeats)
-            except TypeError:
-                return str.__mul__(x, repeats)
-
-        return _na_map(scalar_rep, arr, dtype=str)
-    else:
-
-        def rep(x, r):
-            if x is libmissing.NA:
-                return x
-            try:
-                return bytes.__mul__(x, r)
-            except TypeError:
-                return str.__mul__(x, r)
-
-        repeats = np.asarray(repeats, dtype=object)
-        result = libops.vec_binop(np.asarray(arr), repeats, rep)
-        return result
-
-
-def str_match(
-    arr: ArrayLike,
-    pat: Union[str, Pattern],
-    case: bool = True,
-    flags: int = 0,
-    na: Scalar = np.nan,
-):
-    """
-    Determine if each string starts with a match of a regular expression.
-
-    Parameters
-    ----------
-    pat : str
-        Character sequence or regular expression.
-    case : bool, default True
-        If True, case sensitive.
-    flags : int, default 0 (no flags)
-        Regex module flags, e.g. re.IGNORECASE.
-    na : default NaN
-        Fill value for missing values.
-
-    Returns
-    -------
-    Series/array of boolean values
-
-    See Also
-    --------
-    fullmatch : Stricter matching that requires the entire string to match.
-    contains : Analogous, but less strict, relying on re.search instead of
-        re.match.
-    extract : Extract matched groups.
-    """
-    if not case:
-        flags |= re.IGNORECASE
-
-    regex = re.compile(pat, flags=flags)
-
-    f = lambda x: regex.match(x) is not None
-
-    return _na_map(f, arr, na, dtype=np.dtype(bool))
-
-
-def str_fullmatch(
-    arr: ArrayLike,
-    pat: Union[str, Pattern],
-    case: bool = True,
-    flags: int = 0,
-    na: Scalar = np.nan,
-):
-    """
-    Determine if each string entirely matches a regular expression.
-
-    .. versionadded:: 1.1.0
-
-    Parameters
-    ----------
-    pat : str
-        Character sequence or regular expression.
-    case : bool, default True
-        If True, case sensitive.
-    flags : int, default 0 (no flags)
-        Regex module flags, e.g. re.IGNORECASE.
-    na : default NaN
-        Fill value for missing values.
-
-    Returns
-    -------
-    Series/array of boolean values
-
-    See Also
-    --------
-    match : Similar, but also returns `True` when only a *prefix* of the string
-        matches the regular expression.
-    extract : Extract matched groups.
-    """
-    if not case:
-        flags |= re.IGNORECASE
-
-    regex = re.compile(pat, flags=flags)
-
-    f = lambda x: regex.fullmatch(x) is not None
-
-    return _na_map(f, arr, na, dtype=np.dtype(bool))
-
-
-def _get_single_group_name(rx):
-    try:
-        return list(rx.groupindex.keys()).pop()
-    except IndexError:
-        return None
-
-
-def _groups_or_na_fun(regex):
-    """Used in both extract_noexpand and extract_frame"""
-    if regex.groups == 0:
-        raise ValueError("pattern contains no capture groups")
-    empty_row = [np.nan] * regex.groups
-
-    def f(x):
-        if not isinstance(x, str):
-            return empty_row
-        m = regex.search(x)
-        if m:
-            return [np.nan if item is None else item for item in m.groups()]
-        else:
-            return empty_row
-
-    return f
-
-
-def _result_dtype(arr):
-    # workaround #27953
-    # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
-    # when the list of values is empty.
-    from pandas.core.arrays.string_ import StringDtype
-
-    if isinstance(arr.dtype, StringDtype):
-        return arr.dtype.name
-    else:
-        return object
-
-
-def _str_extract_noexpand(arr, pat, flags=0):
-    """
-    Find groups in each string in the Series using passed regular
-    expression. This function is called from
-    str_extract(expand=False), and can return Series, DataFrame, or
-    Index.
-
-    """
-    from pandas import DataFrame
-
-    regex = re.compile(pat, flags=flags)
-    groups_or_na = _groups_or_na_fun(regex)
-
-    if regex.groups == 1:
-        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
-        name = _get_single_group_name(regex)
-    else:
-        if isinstance(arr, ABCIndexClass):
-            raise ValueError("only one regex group is supported with Index")
-        name = None
-        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-        columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        if arr.empty:
-            result = DataFrame(columns=columns, dtype=object)
-        else:
-            dtype = _result_dtype(arr)
-            result = DataFrame(
-                [groups_or_na(val) for val in arr],
-                columns=columns,
-                index=arr.index,
-                dtype=dtype,
-            )
-    return result, name
-
-
-def _str_extract_frame(arr, pat, flags=0):
-    """
-    For each subject string in the Series, extract groups from the
-    first match of regular expression pat. This function is called from
-    str_extract(expand=True), and always returns a DataFrame.
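The row-building step shared by both extract paths is `_groups_or_na_fun` above; a minimal standalone sketch of what it returns (pure `re`, with `groups_or_na` as the invented name for the closure it produces):

    import re
    import numpy as np

    regex = re.compile(r"([ab])(\d)")
    empty_row = [np.nan] * regex.groups

    def groups_or_na(x):
        # Non-strings and non-matches yield a full row of NaNs; an
        # unmatched optional group becomes NaN within a matched row.
        if not isinstance(x, str):
            return empty_row
        m = regex.search(x)
        return [np.nan if g is None else g for g in m.groups()] if m else empty_row

    [groups_or_na(v) for v in ["a1", "b2", "c3", np.nan]]
    # [['a', '1'], ['b', '2'], [nan, nan], [nan, nan]]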
-
-    """
-    from pandas import DataFrame
-
-    regex = re.compile(pat, flags=flags)
-    groups_or_na = _groups_or_na_fun(regex)
-    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-    columns = [names.get(1 + i, i) for i in range(regex.groups)]
-
-    if len(arr) == 0:
-        return DataFrame(columns=columns, dtype=object)
-    try:
-        result_index = arr.index
-    except AttributeError:
-        result_index = None
-    dtype = _result_dtype(arr)
-    return DataFrame(
-        [groups_or_na(val) for val in arr],
-        columns=columns,
-        index=result_index,
-        dtype=dtype,
-    )
-
-
-def str_extract(arr, pat, flags=0, expand=True):
-    r"""
-    Extract capture groups in the regex `pat` as columns in a DataFrame.
-
-    For each subject string in the Series, extract groups from the
-    first match of regular expression `pat`.
-
-    Parameters
-    ----------
-    pat : str
-        Regular expression pattern with capturing groups.
-    flags : int, default 0 (no flags)
-        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
-        modify regular expression matching for things like case,
-        spaces, etc. For more details, see :mod:`re`.
-    expand : bool, default True
-        If True, return DataFrame with one column per capture group.
-        If False, return a Series/Index if there is one capture group
-        or DataFrame if there are multiple capture groups.
-
-    Returns
-    -------
-    DataFrame or Series or Index
-        A DataFrame with one row for each subject string, and one
-        column for each group. Any capture group names in regular
-        expression pat will be used for column names; otherwise
-        capture group numbers will be used. The dtype of each result
-        column is always object, even when no match is found. If
-        ``expand=False`` and pat has only one capture group, then
-        return a Series (if subject is a Series) or Index (if subject
-        is an Index).
-
-    See Also
-    --------
-    extractall : Returns all matches (not just the first match).
-
-    Examples
-    --------
-    A pattern with two groups will return a DataFrame with two columns.
-    Non-matches will be NaN.
-
-    >>> s = pd.Series(['a1', 'b2', 'c3'])
-    >>> s.str.extract(r'([ab])(\d)')
-         0    1
-    0    a    1
-    1    b    2
-    2  NaN  NaN
-
-    A pattern may contain optional groups.
-
-    >>> s.str.extract(r'([ab])?(\d)')
-         0  1
-    0    a  1
-    1    b  2
-    2  NaN  3
-
-    Named groups will become column names in the result.
-
-    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
-      letter digit
-    0      a     1
-    1      b     2
-    2    NaN   NaN
-
-    A pattern with one group will return a DataFrame with one column
-    if expand=True.
-
-    >>> s.str.extract(r'[ab](\d)', expand=True)
-         0
-    0    1
-    1    2
-    2  NaN
-
-    A pattern with one group will return a Series if expand=False.
-
-    >>> s.str.extract(r'[ab](\d)', expand=False)
-    0      1
-    1      2
-    2    NaN
-    dtype: object
-    """
-    if not isinstance(expand, bool):
-        raise ValueError("expand must be True or False")
-    if expand:
-        return _str_extract_frame(arr._orig, pat, flags=flags)
-    else:
-        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
-        return arr._wrap_result(result, name=name, expand=expand)
-
-
-def str_extractall(arr, pat, flags=0):
-    r"""
-    Extract capture groups in the regex `pat` as columns in DataFrame.
-
-    For each subject string in the Series, extract groups from all
-    matches of regular expression pat. When each subject string in the
-    Series has exactly one match, extractall(pat).xs(0, level='match')
-    is the same as extract(pat).
-
-    Parameters
-    ----------
-    pat : str
-        Regular expression pattern with capturing groups.
-    flags : int, default 0 (no flags)
-        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
-        you to modify regular expression matching for things like case, spaces,
-        etc. Multiple flags can be combined with the bitwise OR operator,
-        for example ``re.IGNORECASE | re.MULTILINE``.
-
-    Returns
-    -------
-    DataFrame
-        A ``DataFrame`` with one row for each match, and one column for each
-        group. Its rows have a ``MultiIndex`` with first levels that come from
-        the subject ``Series``. The last level is named 'match' and indexes the
-        matches in each item of the ``Series``. Any capture group names in
-        regular expression pat will be used for column names; otherwise capture
-        group numbers will be used.
-
-    See Also
-    --------
-    extract : Returns first match only (not all matches).
-
-    Examples
-    --------
-    A pattern with one group will return a DataFrame with one column.
-    Indices with no matches will not appear in the result.
-
-    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
-    >>> s.str.extractall(r"[ab](\d)")
-             0
-      match
-    A 0      1
-      1      2
-    B 0      1
-
-    Capture group names are used for column names of the result.
-
-    >>> s.str.extractall(r"[ab](?P<digit>\d)")
-            digit
-      match
-    A 0         1
-      1         2
-    B 0         1
-
-    A pattern with two groups will return a DataFrame with two columns.
-
-    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
-            letter digit
-      match
-    A 0          a     1
-      1          a     2
-    B 0          b     1
-
-    Optional groups that do not match are NaN in the result.
-
-    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
-            letter digit
-      match
-    A 0          a     1
-      1          a     2
-    B 0          b     1
-    C 0        NaN     1
-    """
-    regex = re.compile(pat, flags=flags)
-    # the regex must contain capture groups.
-    if regex.groups == 0:
-        raise ValueError("pattern contains no capture groups")
-
-    if isinstance(arr, ABCIndexClass):
-        arr = arr.to_series().reset_index(drop=True)
-
-    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-    columns = [names.get(1 + i, i) for i in range(regex.groups)]
-    match_list = []
-    index_list = []
-    is_mi = arr.index.nlevels > 1
-
-    for subject_key, subject in arr.items():
-        if isinstance(subject, str):
-
-            if not is_mi:
-                subject_key = (subject_key,)
-
-            for match_i, match_tuple in enumerate(regex.findall(subject)):
-                if isinstance(match_tuple, str):
-                    match_tuple = (match_tuple,)
-                na_tuple = [np.NaN if group == "" else group for group in match_tuple]
-                match_list.append(na_tuple)
-                result_key = tuple(subject_key + (match_i,))
-                index_list.append(result_key)
-
-    from pandas import MultiIndex
-
-    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
-    dtype = _result_dtype(arr)
-
-    result = arr._constructor_expanddim(
-        match_list, index=index, columns=columns, dtype=dtype
-    )
-    return result
-
-
-def str_get_dummies(arr, sep="|"):
-    """
-    Return DataFrame of dummy/indicator variables for Series.
-
-    Each string in Series is split by sep and returned as a DataFrame
-    of dummy/indicator variables.
-
-    Parameters
-    ----------
-    sep : str, default "|"
-        String to split on.
-
-    Returns
-    -------
-    DataFrame
-        Dummy variables corresponding to values of the Series.
-
-    See Also
-    --------
-    get_dummies : Convert categorical variable into dummy/indicator
-        variables.
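The implementation that follows the examples wraps each row in `sep` so that every tag is delimited on both sides before the membership test. A self-contained sketch of that indicator construction (`get_dummies_sketch` is an invented name; the real function works on a Series and returns the tags for column labels):

    import numpy as np

    def get_dummies_sketch(values, sep="|"):
        # Missing rows become all-zero rows, mirroring the fillna("") step.
        filled = [v if isinstance(v, str) else "" for v in values]
        wrapped = [sep + v + sep for v in filled]
        tags = sorted({t for v in filled for t in v.split(sep)} - {""})
        dummies = np.empty((len(values), len(tags)), dtype=np.int64)
        for i, t in enumerate(tags):
            dummies[:, i] = [sep + t + sep in row for row in wrapped]
        return dummies, tags

    get_dummies_sketch(["a|b", np.nan, "a|c"])
    # (array([[1, 1, 0],
    #         [0, 0, 0],
    #         [1, 0, 1]]), ['a', 'b', 'c'])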
- - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. - Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. 
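Element-wise this reduces to :func:`re.findall` with NA passthrough, as the implementation after the examples shows; a quick sketch of that per-element step (plain Python in place of `_na_map`):

    import re
    import numpy as np

    regex = re.compile("on")
    # Mirrors _na_map(regex.findall, arr): map findall, skip missing values.
    [regex.findall(x) if isinstance(x, str) else np.nan
     for x in ["Lion", "Monkey", np.nan]]
    # [['on'], ['on'], nan]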
- - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - """ - Return indexes in each strings in the Series/Index where the - substring is fully contained between [start:end]. Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind``. - - Returns - ------- - Series or Index - Indexes where substring is found. - """ - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. 
- Series.str.center : Fills boths sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. - - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. - - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. 
- stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. - - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. - - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. - expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). 
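All of these keywords are forwarded verbatim to :class:`textwrap.TextWrapper`, which does the actual wrapping (see the implementation after this docstring); a minimal sketch of the per-element step:

    import textwrap

    tw = textwrap.TextWrapper(width=12)
    # Each element is wrapped to a list of lines, then re-joined with "\n".
    ["\n".join(tw.wrap(s)) for s in ["line to be wrapped", "another line to be wrapped"]]
    # ['line to be\nwrapped', 'another line\nto be\nwrapped']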
- - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.encode`. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. 
- However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." 
- ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - - return do_copy - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - def __init__(self, data): - from pandas.core.arrays.string_ import StringDtype - - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, StringDtype) - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). 
- - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - - from pandas import Index, MultiIndex, Series - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... - if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. 
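-        # For example, .str.upper() on a StringDtype input round-trips as
-        # "string", while .str.len() yields numeric data and is wrapped
-        # with returns_string=False so it is not cast back to "string".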
- if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import DataFrame, Series - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. - if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... 
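-            # (a nested list-like is unpacked by recursing into
-            # _get_series_list one element at a time in the loop below)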
- if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. - - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionadded:: 0.23.0 - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. - - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. 
- - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. - - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." - ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. 
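-                # (the concatenated strings need not all be existing
-                # categories, so let the constructor infer the dtype)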
- dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... ) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. 
- - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
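Element-wise this is :meth:`str.partition` / :meth:`str.rpartition`, which always return a 3-tuple; with ``expand=True`` those tuples become columns. A quick sketch of the per-element step:

    ["Linda van der Berg".partition(" "), "George Pitt-Rivers".partition("-")]
    # [('Linda', ' ', 'van der Berg'), ('George Pitt', '-', 'Rivers')]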
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - f = lambda x: x.partition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - f = lambda x: x.rpartition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @copy(str_get) - def get(self, i): - result = str_get(self._parent, i) - return self._wrap_result(result) - - @copy(str_join) - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - result = str_join(self._parent, sep) - return self._wrap_result(result) - - @copy(str_contains) - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains( - self._parent, pat, case=case, flags=flags, na=na, regex=regex - ) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_match) - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): - result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_fullmatch) - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): - result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_replace) - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace( - self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @copy(str_repeat) - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - result = str_repeat(self._parent, repeats) - return 
self._wrap_result(result) - - @copy(str_pad) - @forbid_nonstring_types(["bytes"]) - def pad(self, width, side="left", fillchar=" "): - result = str_pad(self._parent, width, side=side, fillchar=fillchar) - return self._wrap_result(result) - - _shared_docs[ - "str_pad" - ] = """ - Pad %(side)s side of strings in the Series/Index. - - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with ``fillchar``. - fillchar : str - Additional character for filling, default is whitespace. - - Returns - ------- - filled : Series/Index of objects. - """ - - @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) - @forbid_nonstring_types(["bytes"]) - def center(self, width, fillchar=" "): - return self.pad(width, side="both", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) - @forbid_nonstring_types(["bytes"]) - def ljust(self, width, fillchar=" "): - return self.pad(width, side="right", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) - @forbid_nonstring_types(["bytes"]) - def rjust(self, width, fillchar=" "): - return self.pad(width, side="left", fillchar=fillchar) - - @forbid_nonstring_types(["bytes"]) - def zfill(self, width): - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters on the - left of the string to reach a total string length `width`. Strings - in the Series/Index with length greater or equal to `width` are - unchanged. - - Parameters - ---------- - width : int - Minimum length of resulting string; strings with length less - than `width` be prepended with '0' characters. - - Returns - ------- - Series/Index of objects. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. - Series.str.pad : Fills the specified sides of strings with an arbitrary - character. - Series.str.center : Fills boths sides of strings with an arbitrary - character. - - Notes - ----- - Differs from :meth:`str.zfill` which has special handling - for '+'/'-' in the string. - - Examples - -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 10 - 4 NaN - dtype: object - - Note that ``10`` and ``NaN`` are not strings, therefore they are - converted to ``NaN``. The minus sign in ``'-1'`` is treated as a - regular character and the zero is added to the left of it - (:meth:`str.zfill` would have moved it to the left). ``1000`` - remains unchanged as it is longer than `width`. - - >>> s.str.zfill(3) - 0 0-1 - 1 001 - 2 1000 - 3 NaN - 4 NaN - dtype: object - """ - result = str_pad(self._parent, width, side="left", fillchar="0") - return self._wrap_result(result) - - @copy(str_slice) - def slice(self, start=None, stop=None, step=None): - result = str_slice(self._parent, start, stop, step) - return self._wrap_result(result) - - @copy(str_slice_replace) - @forbid_nonstring_types(["bytes"]) - def slice_replace(self, start=None, stop=None, repl=None): - result = str_slice_replace(self._parent, start, stop, repl) - return self._wrap_result(result) - - @copy(str_decode) - def decode(self, encoding, errors="strict"): - # need to allow bytes here - result = str_decode(self._parent, encoding, errors) - # TODO: Not sure how to handle this. 
- return self._wrap_result(result, returns_string=False) - - @copy(str_encode) - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % dict( - side="left and right sides", method="strip", position="leading and trailing" - ) - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="both") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="left side", method="lstrip", position="leading") - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="left") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="right side", method="rstrip", position="trailing") - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="right") - return self._wrap_result(result) - - @copy(str_wrap) - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - result = str_wrap(self._parent, width, **kwargs) - return self._wrap_result(result) - - @copy(str_get_dummies) - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - # we need to cast to Series of strings as only that has all - # methods available for making the dummies... 
- data = self._orig.astype(str) if self._is_categorical else self._parent - result, name = str_get_dummies(data, sep) - return self._wrap_result( - result, - use_codes=(not self._is_categorical), - name=name, - expand=True, - returns_string=False, - ) - - @copy(str_translate) - @forbid_nonstring_types(["bytes"]) - def translate(self, table): - result = str_translate(self._parent, table) - return self._wrap_result(result) - - count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) - startswith = _pat_wrapper( - str_startswith, na=True, name="startswith", returns_string=False - ) - endswith = _pat_wrapper( - str_endswith, na=True, name="endswith", returns_string=False - ) - findall = _pat_wrapper( - str_findall, flags=True, name="findall", returns_string=False - ) - - @copy(str_extract) - @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): - return str_extract(self, pat, flags=flags, expand=expand) - - @copy(str_extractall) - @forbid_nonstring_types(["bytes"]) - def extractall(self, pat, flags=0): - return str_extractall(self._orig, pat, flags=flags) - - _shared_docs[ - "find" - ] = """ - Return %(side)s indexes in each strings in the Series/Index. - - Each of returned indexes corresponds to the position where the - substring is fully contained between [start:end]. Return -1 on - failure. Equivalent to standard :meth:`str.%(method)s`. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of int. - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["find"] - % dict( - side="lowest", - method="find", - also="rfind : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["find"] - % dict( - side="highest", - method="rfind", - also="find : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def normalize(self, form): - """ - Return the Unicode normal form for the strings in the Series/Index. - - For more information on the forms, see the - :func:`unicodedata.normalize`. - - Parameters - ---------- - form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form. - - Returns - ------- - normalized : Series/Index of objects - """ - import unicodedata - - f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent, dtype=str) - return self._wrap_result(result) - - _shared_docs[ - "index" - ] = """ - Return %(side)s indexes in each string in Series/Index. - - Each of the returned indexes corresponds to the position where the - substring is fully contained between [start:end]. This is the same - as ``str.%(similar)s`` except instead of returning -1, it raises a - ValueError when the substring is not found. Equivalent to standard - ``str.%(method)s``. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. 
- - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % dict( - side="lowest", - similar="find", - method="index", - also="rindex : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % dict( - side="highest", - similar="rfind", - method="rindex", - also="index : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "len" - ] = """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. - - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - len = _noarg_wrapper( - len, - docstring=_shared_docs["len"], - forbidden_types=None, - dtype=np.dtype("int64"), - returns_string=False, - ) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. 
- - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = dict(type="lowercase", method="lower", version="") - _doc_args["upper"] = dict(type="uppercase", method="upper", version="") - _doc_args["title"] = dict(type="titlecase", method="title", version="") - _doc_args["capitalize"] = dict( - type="be capitalized", method="capitalize", version="" - ) - _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") - _doc_args["casefold"] = dict( - type="be casefolded", - method="casefold", - version="\n .. versionadded:: 0.25.0\n", - ) - lower = _noarg_wrapper( - lambda x: x.lower(), - name="lower", - docstring=_shared_docs["casemethods"] % _doc_args["lower"], - dtype=str, - ) - upper = _noarg_wrapper( - lambda x: x.upper(), - name="upper", - docstring=_shared_docs["casemethods"] % _doc_args["upper"], - dtype=str, - ) - title = _noarg_wrapper( - lambda x: x.title(), - name="title", - docstring=_shared_docs["casemethods"] % _doc_args["title"], - dtype=str, - ) - capitalize = _noarg_wrapper( - lambda x: x.capitalize(), - name="capitalize", - docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], - dtype=str, - ) - swapcase = _noarg_wrapper( - lambda x: x.swapcase(), - name="swapcase", - docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], - dtype=str, - ) - casefold = _noarg_wrapper( - lambda x: x.casefold(), - name="casefold", - docstring=_shared_docs["casemethods"] % _doc_args["casefold"], - dtype=str, - ) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. - Series.str.istitle : Check whether all characters are titlecase. 
- - Examples - -------- - **Checks for Alphabetic and Numeric Characters** - - >>> s1 = pd.Series(['one', 'one1', '1', '']) - - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with any additional punctuation - or whitespace will evaluate to false for an alphanumeric check. - - >>> s2 = pd.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. - - >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. - - >>> s3.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - **Checks for Whitespace** - - >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) - >>> s4.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - - **Checks for Character Case** - - >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - - >>> s5.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s5.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s5.str.istitle`` method checks for whether all words are in title - case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters separated by - whitespace characters. 
- - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") - _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") - _doc_args["isdigit"] = dict(type="digits", method="isdigit") - _doc_args["isspace"] = dict(type="whitespace", method="isspace") - _doc_args["islower"] = dict(type="lowercase", method="islower") - _doc_args["isupper"] = dict(type="uppercase", method="isupper") - _doc_args["istitle"] = dict(type="titlecase", method="istitle") - _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") - _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - isalnum = _noarg_wrapper( - lambda x: x.isalnum(), - name="isalnum", - docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], - returns_string=False, - dtype=np.dtype(bool), - ) - isalpha = _noarg_wrapper( - lambda x: x.isalpha(), - name="isalpha", - docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdigit = _noarg_wrapper( - lambda x: x.isdigit(), - name="isdigit", - docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], - returns_string=False, - dtype=np.dtype(bool), - ) - isspace = _noarg_wrapper( - lambda x: x.isspace(), - name="isspace", - docstring=_shared_docs["ismethods"] % _doc_args["isspace"], - returns_string=False, - dtype=np.dtype(bool), - ) - islower = _noarg_wrapper( - lambda x: x.islower(), - name="islower", - docstring=_shared_docs["ismethods"] % _doc_args["islower"], - returns_string=False, - dtype=np.dtype(bool), - ) - isupper = _noarg_wrapper( - lambda x: x.isupper(), - name="isupper", - docstring=_shared_docs["ismethods"] % _doc_args["isupper"], - returns_string=False, - dtype=np.dtype(bool), - ) - istitle = _noarg_wrapper( - lambda x: x.istitle(), - name="istitle", - docstring=_shared_docs["ismethods"] % _doc_args["istitle"], - returns_string=False, - dtype=np.dtype(bool), - ) - isnumeric = _noarg_wrapper( - lambda x: x.isnumeric(), - name="isnumeric", - docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdecimal = _noarg_wrapper( - lambda x: x.isdecimal(), - name="isdecimal", - docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], - returns_string=False, - dtype=np.dtype(bool), - ) - - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py new file mode 100644 index 0000000000000..c36a96edf7125 --- /dev/null +++ b/pandas/core/strings/__init__.py @@ -0,0 +1 @@ +from .accessor import StringMethods diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py new file mode 100644 index 0000000000000..84630066ef871 --- /dev/null +++ b/pandas/core/strings/accessor.py @@ -0,0 +1,2752 @@ +import codecs +from functools import wraps +import operator +from typing import Dict +import warnings + +import numpy as np + +import pandas._libs.lib as lib +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_bool_dtype, is_categorical_dtype, is_list_like +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) + +from pandas.core.algorithms import take_1d +from pandas.core.arrays.numpy_ import PandasArray +from pandas.core.base import NoNewAttributesMixin +from 
pandas.core.strings.object_array import ObjectProxy + +_shared_docs: Dict[str, str] = dict() +_cpython_optimized_encoders = ( + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") + + +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." + ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return wrapper + + return _forbid_nonstring_types + + +def _map_and_wrap(name, docstring): + def wrapper(self): + result = operator.methodcaller(name)(self._array._str) + return self._wrap_result(result) + + wrapper.__doc__ = docstring + return wrapper + + +class StringMethods(NoNewAttributesMixin): + """ + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. 
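+
+    All string methods dispatch to the ``_str`` namespace of the backing
+    array and wrap the result back into a Series or Index via
+    ``_wrap_result``.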
+ + Examples + -------- + >>> s = pd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: object + + >>> s.str.split("_") + 0 [A, Str, Series] + dtype: object + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: object + """ + + def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + + self._inferred_dtype = self._validate(data) + self._is_categorical = is_categorical_dtype(data.dtype) + self._is_string = isinstance(data.dtype, StringDtype) + array = data.array + + if isinstance(array, PandasArray): + # wrap in an object proxy to get the str methods. + # Alternatively, just add _str to PandasArray. + array = ObjectProxy(array._ndarray) + self._array = array + + if isinstance(data, ABCSeries): + self._index = data.index + self._name = data.name + else: + self._index = self._name = None + + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data + self._freeze() + + @staticmethod + def _validate(data): + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). + + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + from pandas import StringDtype + + if isinstance(data, ABCMultiIndex): + raise AttributeError( + "Can only use .str accessor with Index, not MultiIndex" + ) + + # see _libs/lib.pyx for list of inferred types + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] + + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal + + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): + return "string" + + try: + inferred_dtype = lib.infer_dtype(values, skipna=True) + except ValueError: + # GH#27571 mostly occurs with ExtensionArray + inferred_dtype = None + + if inferred_dtype not in allowed_types: + raise AttributeError("Can only use .str accessor with string values!") + return inferred_dtype + + def __getitem__(self, key): + return self._array._str[key] + # if isinstance(key, slice): + # return self.slice(start=key.start, stop=key.stop, step=key.step) + # else: + # return self.get(key) + + def __iter__(self): + warnings.warn( + "Columnar iteration over characters will be deprecated in future releases.", + FutureWarning, + stacklevel=2, + ) + i = 0 + g = self.get(i) + while g.notna().any(): + yield g + i += 1 + g = self.get(i) + + def _wrap_result( + self, + result, + use_codes=True, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, + ): + from pandas import Index, MultiIndex, Series + + # for category, we do the stuff on the categories, so blow it up + # to the full series again + # But for some operations, we have to do the stuff on the full values, + # so make it possible to skip this step as the method already did this + # before the transformation... 
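+        #
+        # A small illustration (comment only): for a categorical
+        # Series(["a", "b", "a"], dtype="category"), a string method runs on
+        # the two categories ["a", "b"] only; take_1d below then uses the
+        # codes [0, 1, 0] to re-expand the two results to all three rows.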
+ if use_codes and self._is_categorical: + # if self._orig is a CategoricalIndex, there is no .cat-accessor + result = take_1d( + result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value + ) + + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): + return result + assert result.ndim < 3 + + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. + if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + + if expand is None: + # infer from ndim if expand is not specified + expand = result.ndim != 1 + + elif expand is True and not isinstance(self._orig, ABCIndexClass): + # required when expand=True is explicitly specified + # not needed when inferred + + def cons_row(x): + if is_list_like(x): + return x + else: + return [x] + + result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + if expand is False: + # if expand is False, result should have the same name + # as the original otherwise specified + if name is None: + name = getattr(result, "name", None) + if name is None: + # do not use logical or, _orig may be a DataFrame + # which has "name" column + name = self._orig.name + + # Wait until we are sure result is a Series or Index before + # checking attributes (GH 12180) + if isinstance(self._orig, ABCIndexClass): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + + if expand: + result = list(result) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out + else: + return Index(result, name=name) + else: + index = self._orig.index + if expand: + cons = self._orig._constructor_expanddim + result = cons(result, columns=name, index=index, dtype=dtype) + else: + # Must be a Series + cons = self._orig._constructor + result = cons(result, name=name, index=index, dtype=dtype) + return result + + def _get_series_list(self, others): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into a list of Series (elements without an index must match the length + of the calling Series/Index). + + Parameters + ---------- + others : Series, DataFrame, np.ndarray, list-like or list-like of + Objects that are either Series, Index or np.ndarray (1-dim). + + Returns + ------- + list of Series + Others transformed into list of Series. + """ + from pandas import DataFrame, Series + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index + + # Generally speaking, all objects without an index inherit the index + # `idx` of the calling Series/Index - i.e. must have matching length. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
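+        #
+        # For example, with a caller of length 3 a bare list ["x", "y", "z"]
+        # becomes [Series(["x", "y", "z"], index=idx)], while a DataFrame is
+        # unpacked into one Series per column, each keeping its own index.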
+ if isinstance(others, ABCSeries): + return [others] + elif isinstance(others, ABCIndexClass): + return [Series(others._values, index=idx)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return [others[x] for x in others] + elif is_list_like(others, allow_sets=False): + others = list(others) # ensure iterators do not get read twice etc + + # in case of list-like `others`, all elements must be + # either Series/Index/np.ndarray (1-dim)... + if all( + isinstance(x, (ABCSeries, ABCIndexClass)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): + los = [] + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings + elif all(not is_list_like(x) for x in others): + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarray " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) + def cat(self, others=None, sep=None, na_rep=None, join="left"): + """ + Concatenate strings in the Series/Index with given separator. + + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains a combination of Series, + Index or np.ndarray (1-dim), then all elements will be unpacked and + must satisfy the above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. + - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + Changed default of `join` from None to `'left'`. + + Returns + ------- + str, Series or Index + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. + + See Also + -------- + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. 
+
+        Examples
+        --------
+        When not passing `others`, all values are concatenated into a single
+        string:
+
+        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
+        >>> s.str.cat(sep=' ')
+        'a b d'
+
+        By default, NA values in the Series are ignored. Using `na_rep`, they
+        can be given a representation:
+
+        >>> s.str.cat(sep=' ', na_rep='?')
+        'a b ? d'
+
+        If `others` is specified, corresponding values are concatenated with
+        the separator. Result will be a Series of strings.
+
+        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
+        0    a,A
+        1    b,B
+        2    NaN
+        3    d,D
+        dtype: object
+
+        Missing values will remain missing in the result, but can again be
+        represented using `na_rep`
+
+        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
+        0    a,A
+        1    b,B
+        2    -,C
+        3    d,D
+        dtype: object
+
+        If `sep` is not specified, the values are concatenated without
+        separation.
+
+        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
+        0    aA
+        1    bB
+        2    -C
+        3    dD
+        dtype: object
+
+        Series with different indexes can be aligned before concatenation. The
+        `join`-keyword works as in other methods.
+
+        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
+        >>> s.str.cat(t, join='left', na_rep='-')
+        0    aa
+        1    b-
+        2    -c
+        3    dd
+        dtype: object
+        >>>
+        >>> s.str.cat(t, join='outer', na_rep='-')
+        0    aa
+        1    b-
+        2    -c
+        3    dd
+        4    -e
+        dtype: object
+        >>>
+        >>> s.str.cat(t, join='inner', na_rep='-')
+        0    aa
+        2    -c
+        3    dd
+        dtype: object
+        >>>
+        >>> s.str.cat(t, join='right', na_rep='-')
+        3    dd
+        0    aa
+        4    -e
+        2    -c
+        dtype: object
+
+        For more examples, see :ref:`here <text.concat>`.
+        """
+        return self._array._str.cat(others, sep, na_rep, join)
+
+    _shared_docs[
+        "str_split"
+    ] = r"""
+    Split strings around given separator/delimiter.
+
+    Splits the string in the Series/Index from the %(side)s,
+    at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
+
+    Parameters
+    ----------
+    pat : str, optional
+        String or regular expression to split on.
+        If not specified, split on whitespace.
+    n : int, default -1 (all)
+        Limit number of splits in output.
+        ``None``, 0 and -1 will be interpreted as return all splits.
+    expand : bool, default False
+        Expand the split strings into separate columns.
+
+        * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
+        * If ``False``, return Series/Index, containing lists of strings.
+
+    Returns
+    -------
+    Series, Index, DataFrame or MultiIndex
+        Type matches caller unless ``expand=True`` (see Notes).
+
+    See Also
+    --------
+    Series.str.split : Split strings around given separator/delimiter.
+    Series.str.rsplit : Splits string around given separator/delimiter,
+        starting from the right.
+    Series.str.join : Join lists contained as elements in the Series/Index
+        with passed delimiter.
+    str.split : Standard library version for split.
+    str.rsplit : Standard library version for rsplit.
+
+    Notes
+    -----
+    The handling of the `n` keyword depends on the number of found splits:
+
+    - If found splits > `n`, make first `n` splits only
+    - If found splits <= `n`, make all splits
+    - If for a certain row the number of found splits < `n`,
+      append `None` for padding up to `n` if ``expand=True``
+
+    If using ``expand=True``, Series and Index callers return DataFrame and
+    MultiIndex objects, respectively.
+
+    Examples
+    --------
+    >>> s = pd.Series(
+    ...     [
+    ...         "this is a regular sentence",
+    ...         "https://docs.python.org/3/tutorial/index.html",
+    ...         np.nan
+    ...     ]
+    ... )
+    >>> s
+    0                       this is a regular sentence
+    1    https://docs.python.org/3/tutorial/index.html
+    2                                              NaN
+    dtype: object
+
+    In the default setting, the string is split by whitespace.
+
+    >>> s.str.split()
+    0                   [this, is, a, regular, sentence]
+    1    [https://docs.python.org/3/tutorial/index.html]
+    2                                                NaN
+    dtype: object
+
+    Without the `n` parameter, the outputs of `rsplit` and `split`
+    are identical.
+
+    >>> s.str.rsplit()
+    0                   [this, is, a, regular, sentence]
+    1    [https://docs.python.org/3/tutorial/index.html]
+    2                                                NaN
+    dtype: object
+
+    The `n` parameter can be used to limit the number of splits on the
+    delimiter. The outputs of `split` and `rsplit` are different.
+
+    >>> s.str.split(n=2)
+    0                     [this, is, a regular sentence]
+    1    [https://docs.python.org/3/tutorial/index.html]
+    2                                                NaN
+    dtype: object
+
+    >>> s.str.rsplit(n=2)
+    0                     [this is a, regular, sentence]
+    1    [https://docs.python.org/3/tutorial/index.html]
+    2                                                NaN
+    dtype: object
+
+    The `pat` parameter can be used to split by other characters.
+
+    >>> s.str.split(pat="/")
+    0                         [this is a regular sentence]
+    1    [https:, , docs.python.org, 3, tutorial, index...
+    2                                                  NaN
+    dtype: object
+
+    When using ``expand=True``, the split elements will expand out into
+    separate columns. If NaN is present, it is propagated throughout
+    the columns during the split.
+
+    >>> s.str.split(expand=True)
+                                                   0     1     2        3         4
+    0                                           this    is     a  regular  sentence
+    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
+    2                                            NaN   NaN   NaN      NaN       NaN
+
+    For slightly more complex use cases like splitting the html document name
+    from a url, a combination of parameter settings can be used.
+
+    >>> s.str.rsplit("/", n=1, expand=True)
+                                        0           1
+    0          this is a regular sentence        None
+    1  https://docs.python.org/3/tutorial  index.html
+    2                                 NaN         NaN
+
+    Remember to escape special characters when explicitly using regular
+    expressions.
+
+    >>> s = pd.Series(["1+1=2"])
+    >>> s
+    0    1+1=2
+    dtype: object
+    >>> s.str.split(r"\+|=", expand=True)
+       0  1  2
+    0  1  1  2
+    """
+
+    @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
+    @forbid_nonstring_types(["bytes"])
+    def split(self, pat=None, n=-1, expand=False):
+        result = self._array._str.split(pat, n, expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
+
+    @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
+    @forbid_nonstring_types(["bytes"])
+    def rsplit(self, pat=None, n=-1, expand=False):
+        result = self._array._str.rsplit(pat, n=n)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
+
+    _shared_docs[
+        "str_partition"
+    ] = """
+    Split the string at the %(side)s occurrence of `sep`.
+
+    This method splits the string at the %(side)s occurrence of `sep`,
+    and returns 3 elements containing the part before the separator,
+    the separator itself, and the part after the separator.
+    If the separator is not found, return %(return)s.
+
+    Parameters
+    ----------
+    sep : str, default whitespace
+        String to split on.
+    expand : bool, default True
+        If True, return DataFrame/MultiIndex expanding dimensionality.
+        If False, return Series/Index.
+
+    Returns
+    -------
+    DataFrame/MultiIndex or Series/Index of objects
+
+    See Also
+    --------
+    %(also)s
+    Series.str.split : Split strings around given separators.
+    str.partition : Standard library version.
+
+    Examples
+    --------
+
+    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
+    >>> s
+    0    Linda van der Berg
+    1    George Pitt-Rivers
+    dtype: object
+
+    >>> s.str.partition()
+            0  1             2
+    0    Linda     van der Berg
+    1   George      Pitt-Rivers
+
+    To partition by the last space instead of the first one:
+
+    >>> s.str.rpartition()
+                   0  1            2
+    0  Linda van der            Berg
+    1         George     Pitt-Rivers
+
+    To partition by something different than a space:
+
+    >>> s.str.partition('-')
+                        0  1       2
+    0  Linda van der Berg
+    1         George Pitt  -  Rivers
+
+    To return a Series containing tuples instead of a DataFrame:
+
+    >>> s.str.partition('-', expand=False)
+    0    (Linda van der Berg, , )
+    1    (George Pitt, -, Rivers)
+    dtype: object
+
+    Also available on indices:
+
+    >>> idx = pd.Index(['X 123', 'Y 999'])
+    >>> idx
+    Index(['X 123', 'Y 999'], dtype='object')
+
+    Which will create a MultiIndex:
+
+    >>> idx.str.partition()
+    MultiIndex([('X', ' ', '123'),
+                ('Y', ' ', '999')],
+               )
+
+    Or an index with tuples with ``expand=False``:
+
+    >>> idx.str.partition(expand=False)
+    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
+    """
+
+    @Appender(
+        _shared_docs["str_partition"]
+        % {
+            "side": "first",
+            "return": "3 elements containing the string itself, followed by two "
+            "empty strings",
+            "also": "rpartition : Split the string at the last occurrence of `sep`.",
+        }
+    )
+    @forbid_nonstring_types(["bytes"])
+    def partition(self, sep=" ", expand=True):
+        result = self._array._str.partition(sep, expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
+
+    @Appender(
+        _shared_docs["str_partition"]
+        % {
+            "side": "last",
+            "return": "3 elements containing two empty strings, followed by the "
+            "string itself",
+            "also": "partition : Split the string at the first occurrence of `sep`.",
+        }
+    )
+    @forbid_nonstring_types(["bytes"])
+    def rpartition(self, sep=" ", expand=True):
+        result = self._array._str.rpartition(sep, expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
+
+    def get(self, i):
+        """
+        Extract element from each component at specified position.
+
+        Extract element from lists, tuples, or strings in each element in the
+        Series/Index.
+
+        Parameters
+        ----------
+        i : int
+            Position of element to extract.
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s = pd.Series(["String",
+        ...                (1, 2, 3),
+        ...                ["a", "b", "c"],
+        ...                123,
+        ...                -456,
+        ...                {1: "Hello", "2": "World"}])
+        >>> s
+        0                        String
+        1                     (1, 2, 3)
+        2                     [a, b, c]
+        3                           123
+        4                          -456
+        5    {1: 'Hello', '2': 'World'}
+        dtype: object
+
+        >>> s.str.get(1)
+        0        t
+        1        2
+        2        b
+        3      NaN
+        4      NaN
+        5    Hello
+        dtype: object
+
+        >>> s.str.get(-1)
+        0       g
+        1       3
+        2       c
+        3     NaN
+        4     NaN
+        5    None
+        dtype: object
+        """
+        result = self._array._str.get(i)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def join(self, sep):
+        """
+        Join lists contained as elements in the Series/Index with passed delimiter.
+
+        If the elements of a Series are lists themselves, join the content of these
+        lists using the delimiter passed to the function.
+        This function is an equivalent to :meth:`str.join`.
+
+        Parameters
+        ----------
+        sep : str
+            Delimiter to use between list entries.
+
+        Returns
+        -------
+        Series/Index: object
+            The list entries concatenated by intervening occurrences of the
+            delimiter.
+
+        Raises
+        ------
+        AttributeError
+            If the supplied Series contains neither strings nor lists.
+
+        See Also
+        --------
+        str.join : Standard library version of this method.
+ Series.str.split : Split strings around given separator/delimiter. + + Notes + ----- + If any of the list items is not a string object, the result of the join + will be `NaN`. + + Examples + -------- + Example with a list that contains non-string elements. + + >>> s = pd.Series([['lion', 'elephant', 'zebra'], + ... [1.1, 2.2, 3.3], + ... ['cat', np.nan, 'dog'], + ... ['cow', 4.5, 'goat'], + ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s + 0 [lion, elephant, zebra] + 1 [1.1, 2.2, 3.3] + 2 [cat, nan, dog] + 3 [cow, 4.5, goat] + 4 [duck, [swan, fish], guppy] + dtype: object + + Join all lists using a '-'. The lists containing object(s) of types other + than str will produce a NaN. + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: object + """ + # XXX: Does this use the Series? If so then we can't dispatch. + result = self._array._str.join(sep) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + """ + Test if pattern or regex is contained within a string of a Series or Index. + + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + regex : bool, default True + If True, assumes the pat is a regular expression. + + If False, treats the pat as a literal string. + + Returns + ------- + Series or Index of boolean values + A Series or Index of boolean values indicating whether the + given pattern is contained within the string of each element + of the Series or Index. + + See Also + -------- + match : Analogous, but stricter, relying on re.match instead of re.search. + Series.str.startswith : Test if the start of each string element matches a + pattern. + Series.str.endswith : Same as startswith, but tests the end of string. + + Examples + -------- + Returning a Series of booleans using only a literal pattern. + + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1.str.contains('og', regex=False) + 0 False + 1 True + 2 False + 3 False + 4 NaN + dtype: object + + Returning an Index of booleans using only a literal pattern. + + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind.str.contains('23', regex=False) + Index([False, False, False, True, nan], dtype='object') + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True, regex=True) + 0 False + 1 False + 2 False + 3 False + 4 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN` replaces NaN values + with `False`. If Series or Index does not contain NaN values + the resultant dtype will be `bool`, otherwise, an `object` dtype. + + >>> s1.str.contains('og', na=False, regex=True) + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 NaN + dtype: object + + Ignoring case sensitivity using `flags` with regex. 
+
+        >>> import re
+        >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
+        0    False
+        1    False
+        2     True
+        3    False
+        4      NaN
+        dtype: object
+
+        Returning any digit using regular expression.
+
+        >>> s1.str.contains('\\d', regex=True)
+        0    False
+        1    False
+        2    False
+        3     True
+        4      NaN
+        dtype: object
+
+        Ensure `pat` is not a literal pattern when `regex` is set to True.
+        Note in the following example one might expect only `s2[1]` and `s2[3]` to
+        return `True`. However, '.0' as a regex matches any character
+        followed by a 0.
+
+        >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
+        >>> s2.str.contains('.0', regex=True)
+        0     True
+        1     True
+        2    False
+        3     True
+        4    False
+        dtype: bool
+        """
+        result = self._array._str.contains(pat, case, flags, na, regex)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
+
+    @forbid_nonstring_types(["bytes"])
+    def match(self, pat, case=True, flags=0, na=np.nan):
+        """
+        Determine if each string starts with a match of a regular expression.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence or regular expression.
+        case : bool, default True
+            If True, case sensitive.
+        flags : int, default 0 (no flags)
+            Regex module flags, e.g. re.IGNORECASE.
+        na : default NaN
+            Fill value for missing values.
+
+        Returns
+        -------
+        Series/array of boolean values
+
+        See Also
+        --------
+        fullmatch : Stricter matching that requires the entire string to match.
+        contains : Analogous, but less strict, relying on re.search instead of
+            re.match.
+        extract : Extract matched groups.
+        """
+        result = self._array._str.match(pat, case=case, flags=flags, na=na)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
+
+    @forbid_nonstring_types(["bytes"])
+    def fullmatch(self, pat, case=True, flags=0, na=np.nan):
+        """
+        Determine if each string entirely matches a regular expression.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence or regular expression.
+        case : bool, default True
+            If True, case sensitive.
+        flags : int, default 0 (no flags)
+            Regex module flags, e.g. re.IGNORECASE.
+        na : default NaN
+            Fill value for missing values.
+
+        Returns
+        -------
+        Series/array of boolean values
+
+        See Also
+        --------
+        match : Similar, but also returns `True` when only a *prefix* of the string
+            matches the regular expression.
+        extract : Extract matched groups.
+        """
+        result = self._array._str.fullmatch(pat, case=case, flags=flags, na=na)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
+
+    @forbid_nonstring_types(["bytes"])
+    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
+        r"""
+        Replace each occurrence of pattern/regex in the Series/Index.
+
+        Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
+        the regex value.
+
+        Parameters
+        ----------
+        pat : str or compiled regex
+            String can be a character sequence or regular expression.
+        repl : str or callable
+            Replacement string or a callable. The callable is passed the regex
+            match object and must return a replacement string to be used.
+            See :func:`re.sub`.
+        n : int, default -1 (all)
+            Number of replacements to make from start.
+        case : bool, default None
+            Determines if replace is case sensitive:
+
+            - If True, case sensitive (the default if `pat` is a string)
+            - Set to False for case insensitive
+            - Cannot be set if `pat` is a compiled regex.
+
+        flags : int, default 0 (no flags)
+            Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is
+            a compiled regex.
+        regex : bool, default True
+            Determines if the passed-in pattern is a regular expression:
+
+            - If True, assumes the passed-in pattern is a regular expression.
+            - If False, treats the pattern as a literal string
+            - Cannot be set to False if `pat` is a compiled regex or `repl` is
+              a callable.
+
+            .. versionadded:: 0.23.0
+
+        Returns
+        -------
+        Series or Index of object
+            A copy of the object with all matching occurrences of `pat` replaced by
+            `repl`.
+
+        Raises
+        ------
+        ValueError
+            * if `regex` is False and `repl` is a callable or `pat` is a compiled
+              regex
+            * if `pat` is a compiled regex and `case` or `flags` is set
+
+        Notes
+        -----
+        When `pat` is a compiled regex, all flags should be included in the
+        compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
+        regex will raise an error.
+
+        Examples
+        --------
+        When `pat` is a string and `regex` is True (the default), the given `pat`
+        is compiled as a regex. When `repl` is a string, it replaces matching
+        regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
+        left as is:
+
+        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
+        0    bao
+        1    baz
+        2    NaN
+        dtype: object
+
+        When `pat` is a string and `regex` is False, every `pat` is replaced with
+        `repl` as with :meth:`str.replace`:
+
+        >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
+        0    bao
+        1    fuz
+        2    NaN
+        dtype: object
+
+        When `repl` is a callable, it is called on every `pat` using
+        :func:`re.sub`. The callable should expect one positional argument
+        (a regex object) and return a string.
+
+        To get the idea:
+
+        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
+        0    <re.Match object; span=(0, 1), match='f'>oo
+        1    <re.Match object; span=(0, 1), match='f'>uz
+        2                                             NaN
+        dtype: object
+
+        Reverse every lowercase alphabetic word:
+
+        >>> repl = lambda m: m.group(0)[::-1]
+        >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
+        0    oof 123
+        1    rab zab
+        2        NaN
+        dtype: object
+
+        Using regex groups (extract second group and swap case):
+
+        >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
+        >>> repl = lambda m: m.group('two').swapcase()
+        >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
+        0    tWO
+        1    bAR
+        dtype: object
+
+        Using a compiled regex with flags
+
+        >>> import re
+        >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
+        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
+        0    foo
+        1    bar
+        2    NaN
+        dtype: object
+        """
+        result = self._array._str.replace(
+            pat, repl, n=n, case=case, flags=flags, regex=regex
+        )
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def repeat(self, repeats):
+        """
+        Duplicate each string in the Series or Index.
+
+        Parameters
+        ----------
+        repeats : int or sequence of int
+            Same value for all (int) or different value per (sequence).
+
+        Returns
+        -------
+        Series or Index of object
+            Series or Index of repeated string objects specified by
+            input parameter repeats.
+
+        Examples
+        --------
+        >>> s = pd.Series(['a', 'b', 'c'])
+        >>> s
+        0    a
+        1    b
+        2    c
+        dtype: object
+
+        Single int repeats string in Series
+
+        >>> s.str.repeat(repeats=2)
+        0    aa
+        1    bb
+        2    cc
+        dtype: object
+
+        Sequence of int repeats corresponding string in Series
+
+        >>> s.str.repeat(repeats=[1, 2, 3])
+        0      a
+        1     bb
+        2    ccc
+        dtype: object
+        """
+        result = self._array._str.repeat(repeats)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def pad(self, width, side="left", fillchar=" "):
+        """
+        Pad strings in the Series/Index up to width.
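+
+        Strings shorter than `width` are filled on the given `side` with
+        `fillchar` until they reach `width`; longer strings are returned
+        unchanged.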
+        """
+        result = self._array._str.repeat(repeats)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def pad(self, width, side="left", fillchar=" "):
+        """
+        Pad strings in the Series/Index up to width.
+
+        Parameters
+        ----------
+        width : int
+            Minimum width of resulting string; additional characters will be filled
+            with character defined in `fillchar`.
+        side : {'left', 'right', 'both'}, default 'left'
+            Side from which to fill resulting string.
+        fillchar : str, default ' '
+            Additional character for filling, default is whitespace.
+
+        Returns
+        -------
+        Series or Index of object
+            Returns Series or Index with strings padded to at least `width`
+            characters.
+
+        See Also
+        --------
+        Series.str.rjust : Fills the left side of strings with an arbitrary
+            character. Equivalent to ``Series.str.pad(side='left')``.
+        Series.str.ljust : Fills the right side of strings with an arbitrary
+            character. Equivalent to ``Series.str.pad(side='right')``.
+        Series.str.center : Fills both sides of strings with an arbitrary
+            character. Equivalent to ``Series.str.pad(side='both')``.
+        Series.str.zfill : Pad strings in the Series/Index by prepending '0'
+            character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
+
+        Examples
+        --------
+        >>> s = pd.Series(["caribou", "tiger"])
+        >>> s
+        0    caribou
+        1      tiger
+        dtype: object
+
+        >>> s.str.pad(width=10)
+        0       caribou
+        1         tiger
+        dtype: object
+
+        >>> s.str.pad(width=10, side='right', fillchar='-')
+        0    caribou---
+        1    tiger-----
+        dtype: object
+
+        >>> s.str.pad(width=10, side='both', fillchar='-')
+        0    -caribou--
+        1    --tiger---
+        dtype: object
+        """
+        result = self._array._str.pad(width, side=side, fillchar=fillchar)
+        return self._wrap_result(result)
+
+    _shared_docs[
+        "str_pad"
+    ] = """
+    Pad %(side)s side of strings in the Series/Index.
+
+    Equivalent to :meth:`str.%(method)s`.
+
+    Parameters
+    ----------
+    width : int
+        Minimum width of resulting string; additional characters will be filled
+        with ``fillchar``.
+    fillchar : str
+        Additional character for filling, default is whitespace.
+
+    Returns
+    -------
+    filled : Series/Index of objects.
+    """
+    # XXX: Do we need to dispatch to center, etc, or is it equivalent?
+
+    @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
+    @forbid_nonstring_types(["bytes"])
+    def center(self, width, fillchar=" "):
+        return self.pad(width, side="both", fillchar=fillchar)
+
+    @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
+    @forbid_nonstring_types(["bytes"])
+    def ljust(self, width, fillchar=" "):
+        return self.pad(width, side="right", fillchar=fillchar)
+
+    @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
+    @forbid_nonstring_types(["bytes"])
+    def rjust(self, width, fillchar=" "):
+        return self.pad(width, side="left", fillchar=fillchar)
+
+    @forbid_nonstring_types(["bytes"])
+    def zfill(self, width):
+        """
+        Pad strings in the Series/Index by prepending '0' characters.
+
+        Strings in the Series/Index are padded with '0' characters on the
+        left of the string to reach a total string length `width`. Strings
+        in the Series/Index with length greater or equal to `width` are
+        unchanged.
+
+        Parameters
+        ----------
+        width : int
+            Minimum length of resulting string; strings with length less
+            than `width` will be prepended with '0' characters.
+
+        Returns
+        -------
+        Series/Index of objects.
+
+        See Also
+        --------
+        Series.str.rjust : Fills the left side of strings with an arbitrary
+            character.
+        Series.str.ljust : Fills the right side of strings with an arbitrary
+            character.
+        Series.str.pad : Fills the specified sides of strings with an arbitrary
+            character.
+        Series.str.center : Fills both sides of strings with an arbitrary
+            character.
+
+        Notes
+        -----
+        Differs from :meth:`str.zfill` which has special handling
+        for '+'/'-' in the string.
+
+        Examples
+        --------
+        >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
+        >>> s
+        0      -1
+        1       1
+        2    1000
+        3      10
+        4     NaN
+        dtype: object
+
+        Note that ``10`` and ``NaN`` are not strings, therefore they are
+        converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
+        regular character and the zero is added to the left of it
+        (:meth:`str.zfill` would have moved it to the left). ``1000``
+        remains unchanged as it is longer than `width`.
+
+        >>> s.str.zfill(3)
+        0     0-1
+        1     001
+        2    1000
+        3     NaN
+        4     NaN
+        dtype: object
+        """
+        result = self._array._str.pad(width, side="left", fillchar="0")
+        return self._wrap_result(result)
+
+    def slice(self, start=None, stop=None, step=None):
+        """
+        Slice substrings from each element in the Series or Index.
+
+        Parameters
+        ----------
+        start : int, optional
+            Start position for slice operation.
+        stop : int, optional
+            Stop position for slice operation.
+        step : int, optional
+            Step size for slice operation.
+
+        Returns
+        -------
+        Series or Index of object
+            Series or Index from sliced substring from original string object.
+
+        See Also
+        --------
+        Series.str.slice_replace : Replace a slice with a string.
+        Series.str.get : Return element at position.
+            Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
+            being the position.
+
+        Examples
+        --------
+        >>> s = pd.Series(["koala", "fox", "chameleon"])
+        >>> s
+        0        koala
+        1          fox
+        2    chameleon
+        dtype: object
+
+        >>> s.str.slice(start=1)
+        0        oala
+        1          ox
+        2    hameleon
+        dtype: object
+
+        >>> s.str.slice(start=-1)
+        0    a
+        1    x
+        2    n
+        dtype: object
+
+        >>> s.str.slice(stop=2)
+        0    ko
+        1    fo
+        2    ch
+        dtype: object
+
+        >>> s.str.slice(step=2)
+        0      kaa
+        1       fx
+        2    caeen
+        dtype: object
+
+        >>> s.str.slice(start=0, stop=5, step=3)
+        0    kl
+        1     f
+        2    cm
+        dtype: object
+
+        Equivalent behaviour to:
+
+        >>> s.str[0:5:3]
+        0    kl
+        1     f
+        2    cm
+        dtype: object
+        """
+        result = self._array._str.slice(start, stop, step)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def slice_replace(self, start=None, stop=None, repl=None):
+        """
+        Replace a positional slice of a string with another value.
+
+        Parameters
+        ----------
+        start : int, optional
+            Left index position to use for the slice. If not specified (None),
+            the slice is unbounded on the left, i.e. slice from the start
+            of the string.
+        stop : int, optional
+            Right index position to use for the slice. If not specified (None),
+            the slice is unbounded on the right, i.e. slice until the
+            end of the string.
+        repl : str, optional
+            String for replacement. If not specified (None), the sliced region
+            is replaced with an empty string.
+
+        Returns
+        -------
+        Series or Index
+            Same type as the original object.
+
+        See Also
+        --------
+        Series.str.slice : Just slicing without replacement.
+
+        Examples
+        --------
+        >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
+        >>> s
+        0        a
+        1       ab
+        2      abc
+        3     abdc
+        4    abcde
+        dtype: object
+
+        Specify just `start`, meaning replace `start` until the end of the
+        string with `repl`.
+
+        >>> s.str.slice_replace(1, repl='X')
+        0    aX
+        1    aX
+        2    aX
+        3    aX
+        4    aX
+        dtype: object
+
+        Specify just `stop`, meaning the start of the string to `stop` is replaced
+        with `repl`, and the rest of the string is included.
+
+        >>> s.str.slice_replace(stop=2, repl='X')
+        0       X
+        1       X
+        2      Xc
+        3     Xdc
+        4    Xcde
+        dtype: object
+
+        Specify `start` and `stop`, meaning the slice from `start` to `stop` is
+        replaced with `repl`. Everything before or after `start` and `stop` is
+        included as is.
+
+        >>> s.str.slice_replace(start=1, stop=3, repl='X')
+        0      aX
+        1      aX
+        2      aX
+        3     aXc
+        4    aXde
+        dtype: object
+        """
+        result = self._array._str.slice_replace(start, stop, repl)
+        return self._wrap_result(result)
+
+    def decode(self, encoding, errors="strict"):
+        """
+        Decode character string in the Series/Index using indicated encoding.
+
+        Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
+        python3.
+
+        Parameters
+        ----------
+        encoding : str
+        errors : str, optional
+
+        Returns
+        -------
+        Series or Index
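+
+        Examples
+        --------
+        An illustrative sketch with ASCII bytes:
+
+        >>> pd.Series([b'cow', b'123']).str.decode('ascii')
+        0    cow
+        1    123
+        dtype: object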
+        """
+        # TODO: Add a similar _bytes interface.
+        if encoding in _cpython_optimized_decoders:
+            # CPython optimized implementation
+            f = lambda x: x.decode(encoding, errors)
+        else:
+            decoder = codecs.getdecoder(encoding)
+            f = lambda x: decoder(x, errors)[0]
+        arr = self._array
+        # assert isinstance(arr, (StringArray,))
+        result = arr._str._map(f)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def encode(self, encoding, errors="strict"):
+        """
+        Encode character string in the Series/Index using indicated encoding.
+
+        Equivalent to :meth:`str.encode`.
+
+        Parameters
+        ----------
+        encoding : str
+        errors : str, optional
+
+        Returns
+        -------
+        encoded : Series/Index of objects
+        """
+        result = self._array._str.encode(encoding, errors)
+        return self._wrap_result(result, returns_string=False)
+
+    _shared_docs[
+        "str_strip"
+    ] = r"""
+    Remove %(position)s characters.
+
+    Strip whitespaces (including newlines) or a set of specified characters
+    from each string in the Series/Index from %(side)s.
+    Equivalent to :meth:`str.%(method)s`.
+
+    Parameters
+    ----------
+    to_strip : str or None, default None
+        Specifying the set of characters to be removed.
+        All combinations of this set of characters will be stripped.
+        If None then whitespaces are removed.
+
+    Returns
+    -------
+    Series or Index of object
+
+    See Also
+    --------
+    Series.str.strip : Remove leading and trailing characters in Series/Index.
+    Series.str.lstrip : Remove leading characters in Series/Index.
+    Series.str.rstrip : Remove trailing characters in Series/Index.
+
+    Examples
+    --------
+    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan])
+    >>> s
+    0    1. Ant.
+    1    2. Bee!\n
+    2    3. Cat?\t
+    3          NaN
+    dtype: object
+
+    >>> s.str.strip()
+    0    1. Ant.
+    1    2. Bee!
+    2    3. Cat?
+    3        NaN
+    dtype: object
+
+    >>> s.str.lstrip('123.')
+    0    Ant.
+    1    Bee!\n
+    2    Cat?\t
+    3       NaN
+    dtype: object
+
+    >>> s.str.rstrip('.!? \n\t')
+    0    1. Ant
+    1    2. Bee
+    2    3. Cat
+    3       NaN
+    dtype: object
+
+    >>> s.str.strip('123.!? \n\t')
+    0    Ant
+    1    Bee
+    2    Cat
+    3    NaN
+    dtype: object
+    """
+
+    @Appender(
+        _shared_docs["str_strip"]
+        % dict(
+            side="left and right sides", method="strip", position="leading and trailing"
+        )
+    )
+    @forbid_nonstring_types(["bytes"])
+    def strip(self, to_strip=None):
+        result = self._array._str.strip(to_strip)
+        return self._wrap_result(result)
+
+    @Appender(
+        _shared_docs["str_strip"]
+        % dict(side="left side", method="lstrip", position="leading")
+    )
+    @forbid_nonstring_types(["bytes"])
+    def lstrip(self, to_strip=None):
+        result = self._array._str.lstrip(to_strip)
+        return self._wrap_result(result)
+
+    @Appender(
+        _shared_docs["str_strip"]
+        % dict(side="right side", method="rstrip", position="trailing")
+    )
+    @forbid_nonstring_types(["bytes"])
+    def rstrip(self, to_strip=None):
+        result = self._array._str.rstrip(to_strip)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def wrap(self, width, **kwargs):
+        r"""
+        Wrap strings in Series/Index at specified line width.
+
+        This method has the same keyword parameters and defaults as
+        :class:`textwrap.TextWrapper`.
+
+        Parameters
+        ----------
+        width : int
+            Maximum line width.
+        expand_tabs : bool, optional
+            If True, tab characters will be expanded to spaces (default: True).
+        replace_whitespace : bool, optional
+            If True, each whitespace character (as defined by string.whitespace)
+            remaining after tab expansion will be replaced by a single space
+            (default: True).
+        drop_whitespace : bool, optional
+            If True, whitespace that, after wrapping, happens to end up at the
+            beginning or end of a line is dropped (default: True).
+        break_long_words : bool, optional
+            If True, then words longer than width will be broken in order to ensure
+            that no lines are longer than width. If it is false, long words will
+            not be broken, and some lines may be longer than width (default: True).
+        break_on_hyphens : bool, optional
+            If True, wrapping will occur preferably on whitespace and right after
+            hyphens in compound words, as it is customary in English. If false,
+            only whitespaces will be considered as potentially good places for line
+            breaks, but you need to set break_long_words to false if you want truly
+            insecable words (default: True).
+
+        Returns
+        -------
+        Series or Index
+
+        Notes
+        -----
+        Internally, this method uses a :class:`textwrap.TextWrapper` instance with
+        default settings. To achieve behavior matching R's stringr library str_wrap
+        function, use the arguments:
+
+        - expand_tabs = False
+        - replace_whitespace = True
+        - drop_whitespace = True
+        - break_long_words = False
+        - break_on_hyphens = False
+
+        Examples
+        --------
+        >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
+        >>> s.str.wrap(12)
+        0             line to be\nwrapped
+        1    another line\nto be\nwrapped
+        dtype: object
+        """
+        result = self._array._str.wrap(width, **kwargs)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def get_dummies(self, sep="|"):
+        """
+        Return DataFrame of dummy/indicator variables for Series.
+
+        Each string in Series is split by sep and returned as a DataFrame
+        of dummy/indicator variables.
+
+        Parameters
+        ----------
+        sep : str, default "|"
+            String to split on.
+
+        Returns
+        -------
+        DataFrame
+            Dummy variables corresponding to values of the Series.
+
+        See Also
+        --------
+        get_dummies : Convert categorical variable into dummy/indicator
+            variables.
+
+        Examples
+        --------
+        >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
+           a  b  c
+        0  1  1  0
+        1  1  0  0
+        2  1  0  1
+
+        >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
+           a  b  c
+        0  1  1  0
+        1  0  0  0
+        2  1  0  1
+        """
+        # we need to cast to Series of strings as only that has all
+        # methods available for making the dummies...
+        # XXX: data = self._orig.astype(str) if self._is_categorical else self._parent
+        result, name = self._array._str.get_dummies(sep)
+        # result, name = str_get_dummies(data, sep)
+        return self._wrap_result(
+            result,
+            use_codes=(not self._is_categorical),
+            name=name,
+            expand=True,
+            returns_string=False,
+        )
+
+    # @copy(str_translate)
+    @forbid_nonstring_types(["bytes"])
+    def translate(self, table):
+        """
+        Map all characters in the string through the given mapping table.
+
+        Equivalent to standard :meth:`str.translate`.
+
+        Parameters
+        ----------
+        table : dict
+            Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
+            None. Unmapped characters are left untouched.
+            Characters mapped to None are deleted. :meth:`str.maketrans` is a
+            helper function for making translation tables.
+
+        Returns
+        -------
+        Series or Index
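+
+        Examples
+        --------
+        A sketch using :meth:`str.maketrans` to build the table:
+
+        >>> ser = pd.Series(['El niño', 'Françoise'])
+        >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'})
+        >>> ser.str.translate(mytable)
+        0      El nino
+        1    Francoise
+        dtype: object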
+        """
+        result = self._array._str.translate(table)
+        return self._wrap_result(result)
+
+    @forbid_nonstring_types(["bytes"])
+    def count(self, pat, flags=0):
+        """
+        Count occurrences of pattern in each string of the Series/Index.
+
+        This function is used to count the number of times a particular regex
+        pattern is repeated in each of the string elements of the
+        :class:`~pandas.Series`.
+
+        Parameters
+        ----------
+        pat : str
+            Valid regular expression.
+        flags : int, default 0, meaning no flags
+            Flags for the `re` module. For a complete list, `see here
+            <https://docs.python.org/3/library/re.html#module-contents>`_.
+
+        Returns
+        -------
+        Series or Index
+            Same type as the calling object containing the integer counts.
+
+        See Also
+        --------
+        re : Standard library module for regular expressions.
+        str.count : Standard library version, without regular expression support.
+
+        Notes
+        -----
+        Some characters need to be escaped when passing in `pat`.
+        eg. ``'$'`` has a special meaning in regex and must be escaped when
+        finding this literal character.
+
+        Examples
+        --------
+        >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
+        >>> s.str.count('a')
+        0    0.0
+        1    0.0
+        2    2.0
+        3    2.0
+        4    NaN
+        5    0.0
+        6    1.0
+        dtype: float64
+
+        Escape ``'$'`` to find the literal dollar sign.
+
+        >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
+        >>> s.str.count('\\$')
+        0    1
+        1    0
+        2    1
+        3    2
+        4    2
+        5    0
+        dtype: int64
+
+        This is also available on Index
+
+        >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
+        Int64Index([0, 0, 2, 1], dtype='int64')
+        """
+        result = self._array._str.count(pat, flags)
+        return self._wrap_result(result, returns_string=False)
+
+    def startswith(self, pat, na=None):
+        """
+        Test if the start of each string element matches a pattern.
+
+        Equivalent to :meth:`str.startswith`.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence. Regular expressions are not accepted.
+        na : object, default NaN
+            Object shown if element tested is not a string.
+
+        Returns
+        -------
+        Series or Index of bool
+            A Series of booleans indicating whether the given pattern matches
+            the start of each string element.
+
+        See Also
+        --------
+        str.startswith : Python standard library string method.
+        Series.str.endswith : Same as startswith, but tests the end of string.
+        Series.str.contains : Tests if string element contains a pattern.
+
+        Examples
+        --------
+        >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
+        >>> s
+        0     bat
+        1    Bear
+        2     cat
+        3     NaN
+        dtype: object
+
+        >>> s.str.startswith('b')
+        0     True
+        1    False
+        2    False
+        3      NaN
+        dtype: object
+
+        Specifying `na` to be `False` instead of `NaN`.
+
+        >>> s.str.startswith('b', na=False)
+        0     True
+        1    False
+        2    False
+        3    False
+        dtype: bool
+        """
+        # XXX: changed default na to None
+        result = self._array._str.startswith(pat, na=na)
+        return self._wrap_result(result, returns_string=False)
+
+    def endswith(self, pat, na=None):
+        """
+        Test if the end of each string element matches a pattern.
+
+        Equivalent to :meth:`str.endswith`.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence. Regular expressions are not accepted.
+        na : object, default NaN
+            Object shown if element tested is not a string.
+
+        Returns
+        -------
+        Series or Index of bool
+            A Series of booleans indicating whether the given pattern matches
+            the end of each string element.
+
+        See Also
+        --------
+        str.endswith : Python standard library string method.
+        Series.str.startswith : Same as endswith, but tests the start of string.
+        Series.str.contains : Tests if string element contains a pattern.
+
+        Examples
+        --------
+        >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
+        >>> s
+        0     bat
+        1    bear
+        2     caT
+        3     NaN
+        dtype: object
+
+        >>> s.str.endswith('t')
+        0     True
+        1    False
+        2    False
+        3      NaN
+        dtype: object
+
+        Specifying `na` to be `False` instead of `NaN`.
+
+        >>> s.str.endswith('t', na=False)
+        0     True
+        1    False
+        2    False
+        3    False
+        dtype: bool
+        """
+        # XXX: changed default na to None
+        result = self._array._str.endswith(pat, na=na)
+        return self._wrap_result(result, returns_string=False)
+
+    def findall(self, pat, flags=0):
+        """
+        Find all occurrences of pattern or regular expression in the Series/Index.
+
+        Equivalent to applying :func:`re.findall` to all the elements in the
+        Series/Index.
+
+        Parameters
+        ----------
+        pat : str
+            Pattern or regular expression.
+        flags : int, default 0
+            Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
+            means no flags).
+
+        Returns
+        -------
+        Series/Index of lists of strings
+            All non-overlapping matches of pattern or regular expression in each
+            string of this Series/Index.
+
+        See Also
+        --------
+        count : Count occurrences of pattern or regular expression in each string
+            of the Series/Index.
+        extractall : For each string in the Series, extract groups from all matches
+            of regular expression and return a DataFrame with one row for each
+            match and one column for each group.
+        re.findall : The equivalent ``re`` function to all non-overlapping matches
+            of pattern or regular expression in string, as a list of strings.
+
+        Examples
+        --------
+        >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
+
+        The search for the pattern 'Monkey' returns one match:
+
+        >>> s.str.findall('Monkey')
+        0          []
+        1    [Monkey]
+        2          []
+        dtype: object
+
+        On the other hand, the search for the pattern 'MONKEY' doesn't return any
+        match:
+
+        >>> s.str.findall('MONKEY')
+        0    []
+        1    []
+        2    []
+        dtype: object
+
+        Flags can be added to the pattern or regular expression. For instance,
+        to find the pattern 'MONKEY' ignoring the case:
+
+        >>> import re
+        >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
+        0          []
+        1    [Monkey]
+        2          []
+        dtype: object
+
+        When the pattern matches more than one string in the Series, all matches
+        are returned:
+
+        >>> s.str.findall('on')
+        0    [on]
+        1    [on]
+        2      []
+        dtype: object
+
+        Regular expressions are supported too. For instance, the search for all the
+        strings ending with the word 'on' is shown next:
+
+        >>> s.str.findall('on$')
+        0    [on]
+        1      []
+        2      []
+        dtype: object
+
+        If the pattern is found more than once in the same string, then a list of
+        multiple strings is returned:
+
+        >>> s.str.findall('b')
+        0        []
+        1        []
+        2    [b, b]
+        dtype: object
+        """
+        result = self._array._str.findall(pat, flags)
+        return self._wrap_result(result, returns_string=False)
+
+    @forbid_nonstring_types(["bytes"])
+    def extract(self, pat, flags=0, expand=True):
+        r"""
+        Extract capture groups in the regex `pat` as columns in a DataFrame.
+
+        For each subject string in the Series, extract groups from the
+        first match of regular expression `pat`.
+
+        Parameters
+        ----------
+        pat : str
+            Regular expression pattern with capturing groups.
+        flags : int, default 0 (no flags)
+            Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
+            modify regular expression matching for things like case,
+            spaces, etc. For more details, see :mod:`re`.
+        expand : bool, default True
+            If True, return DataFrame with one column per capture group.
+            If False, return a Series/Index if there is one capture group
+            or DataFrame if there are multiple capture groups.
+
+        Returns
+        -------
+        DataFrame or Series or Index
+            A DataFrame with one row for each subject string, and one
+            column for each group. Any capture group names in regular
+            expression pat will be used for column names; otherwise
+            capture group numbers will be used. The dtype of each result
+            column is always object, even when no match is found. If
+            ``expand=False`` and pat has only one capture group, then
+            return a Series (if subject is a Series) or Index (if subject
+            is an Index).
+
+        See Also
+        --------
+        extractall : Returns all matches (not just the first match).
+
+        Examples
+        --------
+        A pattern with two groups will return a DataFrame with two columns.
+        Non-matches will be NaN.
+
+        >>> s = pd.Series(['a1', 'b2', 'c3'])
+        >>> s.str.extract(r'([ab])(\d)')
+             0    1
+        0    a    1
+        1    b    2
+        2  NaN  NaN
+
+        A pattern may contain optional groups.
+
+        >>> s.str.extract(r'([ab])?(\d)')
+             0  1
+        0    a  1
+        1    b  2
+        2  NaN  3
+
+        Named groups will become column names in the result.
+
+        >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
+          letter digit
+        0      a     1
+        1      b     2
+        2    NaN   NaN
+
+        A pattern with one group will return a DataFrame with one column
+        if expand=True.
+
+        >>> s.str.extract(r'[ab](\d)', expand=True)
+             0
+        0    1
+        1    2
+        2  NaN
+
+        A pattern with one group will return a Series if expand=False.
+
+        >>> s.str.extract(r'[ab](\d)', expand=False)
+        0      1
+        1      2
+        2    NaN
+        dtype: object
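+
+        The ``flags`` argument accepts :mod:`re` flags; here ``re.IGNORECASE``
+        lets an uppercase character class match the lowercase data
+        (illustrative):
+
+        >>> import re
+        >>> s.str.extract(r'([A-B])(\d)', flags=re.IGNORECASE)
+             0    1
+        0    a    1
+        1    b    2
+        2  NaN  NaN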
+        """
+        result = self._array._str.extract(pat, flags)
+        return self._wrap_result(result, expand=expand)
+
+    @forbid_nonstring_types(["bytes"])
+    def extractall(self, pat, flags=0):
+        r"""
+        Extract capture groups in the regex `pat` as columns in DataFrame.
+
+        For each subject string in the Series, extract groups from all
+        matches of regular expression pat. When each subject string in the
+        Series has exactly one match, extractall(pat).xs(0, level='match')
+        is the same as extract(pat).
+
+        Parameters
+        ----------
+        pat : str
+            Regular expression pattern with capturing groups.
+        flags : int, default 0 (no flags)
+            A ``re`` module flag, for example ``re.IGNORECASE``. These allow
+            to modify regular expression matching for things like case, spaces,
+            etc. Multiple flags can be combined with the bitwise OR operator,
+            for example ``re.IGNORECASE | re.MULTILINE``.
+
+        Returns
+        -------
+        DataFrame
+            A ``DataFrame`` with one row for each match, and one column for each
+            group. Its rows have a ``MultiIndex`` with first levels that come from
+            the subject ``Series``. The last level is named 'match' and indexes the
+            matches in each item of the ``Series``. Any capture group names in
+            regular expression pat will be used for column names; otherwise capture
+            group numbers will be used.
+
+        See Also
+        --------
+        extract : Returns first match only (not all matches).
+
+        Examples
+        --------
+        A pattern with one group will return a DataFrame with one column.
+        Indices with no matches will not appear in the result.
+
+        >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
+        >>> s.str.extractall(r"[ab](\d)")
+                 0
+          match
+        A 0      1
+          1      2
+        B 0      1
+
+        Capture group names are used for column names of the result.
+
+        >>> s.str.extractall(r"[ab](?P<digit>\d)")
+                digit
+          match
+        A 0         1
+          1         2
+        B 0         1
+
+        A pattern with two groups will return a DataFrame with two columns.
+
+        >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
+                letter digit
+          match
+        A 0          a     1
+          1          a     2
+        B 0          b     1
+
+        Optional groups that do not match are NaN in the result.
+
+        >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
+                letter digit
+          match
+        A 0          a     1
+          1          a     2
+        B 0          b     1
+        C 0        NaN     1
+        """
+        result = self._array._str.extractall(pat, flags)
+        return self._wrap_result(result, expand=True)
+
+    _shared_docs[
+        "find"
+    ] = """
+    Return %(side)s indexes in each string in the Series/Index.
+
+    Each of the returned indexes corresponds to the position where the
+    substring is fully contained between [start:end]. Return -1 on
+    failure. Equivalent to standard :meth:`str.%(method)s`.
+
+    Parameters
+    ----------
+    sub : str
+        Substring being searched.
+    start : int
+        Left edge index.
+    end : int
+        Right edge index.
+
+    Returns
+    -------
+    Series or Index of int.
+
+    See Also
+    --------
+    %(also)s
+    """
+
+    @Appender(
+        _shared_docs["find"]
+        % dict(
+            side="lowest",
+            method="find",
+            also="rfind : Return highest indexes in each string.",
+        )
+    )
+    @forbid_nonstring_types(["bytes"])
+    def find(self, sub, start=0, end=None):
+        # result = str_find(self._parent, sub, start=start, end=end, side="left")
+        result = self._array._str.find(sub, start=start, end=end)
+        return self._wrap_result(result, returns_string=False)
+
+    @Appender(
+        _shared_docs["find"]
+        % dict(
+            side="highest",
+            method="rfind",
+            also="find : Return lowest indexes in each string.",
+        )
+    )
+    @forbid_nonstring_types(["bytes"])
+    def rfind(self, sub, start=0, end=None):
+        result = self._array._str.rfind(sub, start=start, end=end)
+        return self._wrap_result(result, returns_string=False)
+
+    @forbid_nonstring_types(["bytes"])
+    def normalize(self, form):
+        """
+        Return the Unicode normal form for the strings in the Series/Index.
+
+        For more information on the forms, see
+        :func:`unicodedata.normalize`.
+
+        Parameters
+        ----------
+        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
+            Unicode form.
+
+        Returns
+        -------
+        normalized : Series/Index of objects
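+
+        Examples
+        --------
+        An illustrative sketch: NFC composes 'e' plus a combining acute
+        accent into the single code point 'é'.
+
+        >>> s = pd.Series(['e\u0301'])
+        >>> s.str.len()
+        0    2
+        dtype: int64
+        >>> s.str.normalize('NFC').str.len()
+        0    1
+        dtype: int64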
+        """
+        result = self._array._str.normalize(form)
+        return self._wrap_result(result)
+
+    _shared_docs[
+        "index"
+    ] = """
+    Return %(side)s indexes in each string in Series/Index.
+
+    Each of the returned indexes corresponds to the position where the
+    substring is fully contained between [start:end]. This is the same
+    as ``str.%(similar)s`` except instead of returning -1, it raises a
+    ValueError when the substring is not found. Equivalent to standard
+    ``str.%(method)s``.
+
+    Parameters
+    ----------
+    sub : str
+        Substring being searched.
+    start : int
+        Left edge index.
+    end : int
+        Right edge index.
+
+    Returns
+    -------
+    Series or Index of object
+
+    See Also
+    --------
+    %(also)s
+    """
+
+    @Appender(
+        _shared_docs["index"]
+        % dict(
+            side="lowest",
+            similar="find",
+            method="index",
+            also="rindex : Return highest indexes in each string.",
+        )
+    )
+    @forbid_nonstring_types(["bytes"])
+    def index(self, sub, start=0, end=None):
+        result = self._array._str.index(sub, start=start, end=end)
+        return self._wrap_result(result, returns_string=False)
+
+    @Appender(
+        _shared_docs["index"]
+        % dict(
+            side="highest",
+            similar="rfind",
+            method="rindex",
+            also="index : Return lowest indexes in each string.",
+        )
+    )
+    @forbid_nonstring_types(["bytes"])
+    def rindex(self, sub, start=0, end=None):
+        result = self._array._str.rindex(sub, start=start, end=end)
+        return self._wrap_result(result, returns_string=False)
+
+    def len(self):
+        """
+        Compute the length of each element in the Series/Index.
+
+        The element may be a sequence (such as a string, tuple or list) or a collection
+        (such as a dictionary).
+
+        Returns
+        -------
+        Series or Index of int
+            A Series or Index of integer values indicating the length of each
+            element in the Series or Index.
+
+        See Also
+        --------
+        str.len : Python built-in function returning the length of an object.
+        Series.size : Returns the length of the Series.
+
+        Examples
+        --------
+        Returns the length (number of characters) in a string. Returns the
+        number of entries for dictionaries, lists or tuples.
+
+        >>> s = pd.Series(['dog',
+        ...                 '',
+        ...                 5,
+        ...                 {'foo' : 'bar'},
+        ...                 [2, 3, 5, 7],
+        ...                 ('one', 'two', 'three')])
+        >>> s
+        0                  dog
+        1
+        2                    5
+        3       {'foo': 'bar'}
+        4         [2, 3, 5, 7]
+        5    (one, two, three)
+        dtype: object
+        >>> s.str.len()
+        0    3.0
+        1    0.0
+        2    NaN
+        3    1.0
+        4    4.0
+        5    3.0
+        dtype: float64
+        """
+        result = self._array._str.len()
+        return self._wrap_result(result, returns_string=False)
+
+    _shared_docs[
+        "casemethods"
+    ] = """
+    Convert strings in the Series/Index to %(type)s.
+    %(version)s
+    Equivalent to :meth:`str.%(method)s`.
+
+    Returns
+    -------
+    Series or Index of object
+
+    See Also
+    --------
+    Series.str.lower : Converts all characters to lowercase.
+    Series.str.upper : Converts all characters to uppercase.
+    Series.str.title : Converts first character of each word to uppercase and
+        remaining to lowercase.
+    Series.str.capitalize : Converts first character to uppercase and
+        remaining to lowercase.
+    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
+        uppercase.
+    Series.str.casefold : Removes all case distinctions in the string.
+
+    Examples
+    --------
+    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
+    >>> s
+    0                 lower
+    1              CAPITALS
+    2    this is a sentence
+    3              SwApCaSe
+    dtype: object
+
+    >>> s.str.lower()
+    0                 lower
+    1              capitals
+    2    this is a sentence
+    3              swapcase
+    dtype: object
+
+    >>> s.str.upper()
+    0                 LOWER
+    1              CAPITALS
+    2    THIS IS A SENTENCE
+    3              SWAPCASE
+    dtype: object
+
+    >>> s.str.title()
+    0                 Lower
+    1              Capitals
+    2    This Is A Sentence
+    3              Swapcase
+    dtype: object
+
+    >>> s.str.capitalize()
+    0                 Lower
+    1              Capitals
+    2    This is a sentence
+    3              Swapcase
+    dtype: object
+
+    >>> s.str.swapcase()
+    0                 LOWER
+    1              capitals
+    2    THIS IS A SENTENCE
+    3              sWaPcAsE
+    dtype: object
+    """
+    # Types:
+    #     cases:
+    #         upper, lower, title, capitalize, swapcase, casefold
+    #     boolean:
+    #         isalpha, isnumeric, isalnum, isdigit, isdecimal, isspace,
+    #         islower, isupper, istitle
+    # _doc_args holds dict of strings to use in substituting casemethod docs
+    _doc_args: Dict[str, Dict[str, str]] = {}
+    _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
+    _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
+    _doc_args["title"] = dict(type="titlecase", method="title", version="")
+    _doc_args["capitalize"] = dict(
+        type="be capitalized", method="capitalize", version=""
+    )
+    _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
+    _doc_args["casefold"] = dict(
+        type="be casefolded",
+        method="casefold",
+        version="\n    .. versionadded:: 0.25.0\n",
+    )
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
+    def lower(self):
+        result = self._array._str.lower()
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
+    def upper(self):
+        result = self._array._str.upper()
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["title"])
+    def title(self):
+        result = self._array._str.title()
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
+    def capitalize(self):
+        result = self._array._str.capitalize()
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
+    def swapcase(self):
+        result = self._array._str.swapcase()
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
+    def casefold(self):
+        result = self._array._str.casefold()
+        return self._wrap_result(result)
+
+    _shared_docs[
+        "ismethods"
+    ] = """
+    Check whether all characters in each string are %(type)s.
+
+    This is equivalent to running the Python string method
+    :meth:`str.%(method)s` for each element of the Series/Index. If a string
+    has zero characters, ``False`` is returned for that check.
+
+    Returns
+    -------
+    Series or Index of bool
+        Series or Index of boolean values with the same length as the original
+        Series/Index.
+
+    See Also
+    --------
+    Series.str.isalpha : Check whether all characters are alphabetic.
+    Series.str.isnumeric : Check whether all characters are numeric.
+    Series.str.isalnum : Check whether all characters are alphanumeric.
+    Series.str.isdigit : Check whether all characters are digits.
+    Series.str.isdecimal : Check whether all characters are decimal.
+    Series.str.isspace : Check whether all characters are whitespace.
+    Series.str.islower : Check whether all characters are lowercase.
+    Series.str.isupper : Check whether all characters are uppercase.
+    Series.str.istitle : Check whether all characters are titlecase.
+
+    Examples
+    --------
+    **Checks for Alphabetic and Numeric Characters**
+
+    >>> s1 = pd.Series(['one', 'one1', '1', ''])
+
+    >>> s1.str.isalpha()
+    0     True
+    1    False
+    2    False
+    3    False
+    dtype: bool
+
+    >>> s1.str.isnumeric()
+    0    False
+    1    False
+    2     True
+    3    False
+    dtype: bool
+
+    >>> s1.str.isalnum()
+    0     True
+    1     True
+    2     True
+    3    False
+    dtype: bool
+
+    Note that checks against characters mixed with any additional punctuation
+    or whitespace will evaluate to false for an alphanumeric check.
+
+    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
+    >>> s2.str.isalnum()
+    0    False
+    1    False
+    2    False
+    dtype: bool
+
+    **More Detailed Checks for Numeric Characters**
+
+    There are several different but overlapping sets of numeric characters that
+    can be checked for.
+
+    >>> s3 = pd.Series(['23', '³', '⅕', ''])
+
+    The ``s3.str.isdecimal`` method checks for characters used to form numbers
+    in base 10.
+
+    >>> s3.str.isdecimal()
+    0     True
+    1    False
+    2    False
+    3    False
+    dtype: bool
+
+    The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
+    includes special digits, like superscripted and subscripted digits in
+    unicode.
+
+    >>> s3.str.isdigit()
+    0     True
+    1     True
+    2    False
+    3    False
+    dtype: bool
+
+    The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
+    includes other characters that can represent quantities such as unicode
+    fractions.
+
+    >>> s3.str.isnumeric()
+    0     True
+    1     True
+    2     True
+    3    False
+    dtype: bool
+
+    **Checks for Whitespace**
+
+    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
+    >>> s4.str.isspace()
+    0     True
+    1     True
+    2    False
+    dtype: bool
+
+    **Checks for Character Case**
+
+    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
+
+    >>> s5.str.islower()
+    0     True
+    1    False
+    2    False
+    3    False
+    dtype: bool
+
+    >>> s5.str.isupper()
+    0    False
+    1    False
+    2     True
+    3    False
+    dtype: bool
+
+    The ``s5.str.istitle`` method checks for whether all words are in title
+    case (whether only the first letter of each word is capitalized). Words are
+    assumed to be any sequence of non-numeric characters separated by
+    whitespace characters.
+
+    >>> s5.str.istitle()
+    0    False
+    1     True
+    2    False
+    3    False
+    dtype: bool
+    """
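+    # Each is* accessor below is generated by ``_map_and_wrap``; a sketch of
+    # its contract: map the named ``str`` predicate element-wise through the
+    # backing array's ``_str`` namespace, wrap the boolean result, and attach
+    # the rendered shared docstring.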
+    _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
+    _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
+    _doc_args["isdigit"] = dict(type="digits", method="isdigit")
+    _doc_args["isspace"] = dict(type="whitespace", method="isspace")
+    _doc_args["islower"] = dict(type="lowercase", method="islower")
+    _doc_args["isupper"] = dict(type="uppercase", method="isupper")
+    _doc_args["istitle"] = dict(type="titlecase", method="istitle")
+    _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
+    _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
+    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)
+
+    isalnum = _map_and_wrap(
+        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
+    )
+    isalpha = _map_and_wrap(
+        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
+    )
+    isdigit = _map_and_wrap(
+        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
+    )
+    isspace = _map_and_wrap(
+        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
+    )
+    islower = _map_and_wrap(
+        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
+    )
+    isupper = _map_and_wrap(
+        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
+    )
+    istitle = _map_and_wrap(
+        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
+    )
+    isnumeric = _map_and_wrap(
+        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
+    )
+    isdecimal = _map_and_wrap(
+        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
+    )
+
+    @classmethod
+    def _make_accessor(cls, data):
+        cls._validate(data)
+        return cls(data)
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
new file mode 100644
index 0000000000000..c32cfffa98126
--- /dev/null
+++ b/pandas/core/strings/base.py
@@ -0,0 +1,33 @@
+import unicodedata
+
+
+class BaseStringArrayMethods:
+    """
+    Base class for the array-backed string method implementations.
+
+    Subclasses are expected to provide ``_map``, which applies a callable
+    element-wise over the underlying array.
+    """
+
+    def __init__(self, array):
+        self._array = array
+
+    def upper(self):
+        return self._map(lambda x: x.upper())
+
+    def isalnum(self):
+        return self._map(str.isalnum, dtype="bool")
+
+    def capitalize(self):
+        return self._map(str.capitalize)
+
+    def casefold(self):
+        return self._map(str.casefold)
+
+    def title(self):
+        return self._map(str.title)
+
+    def swapcase(self):
+        return self._map(str.swapcase)
+
+    def lower(self):
+        return self._map(str.lower)
+
+    def normalize(self, form):
+        f = lambda x: unicodedata.normalize(form, x)
+        return self._map(f)
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
new file mode 100644
index 0000000000000..acf234b711062
--- /dev/null
+++ b/pandas/core/strings/object_array.py
@@ -0,0 +1,145 @@
+import re
+
+import numpy as np
+
+import pandas._libs.lib as lib
+
+from pandas.core.dtypes.common import ensure_object, is_categorical_dtype
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.accessor import CachedAccessor
+from pandas.core.arrays.numpy_ import PandasArray
+from pandas.core.strings.base import BaseStringArrayMethods
+from pandas.core.strings_ import cat_safe  # used by cat() below
+
+
+class ObjectArrayMethods(BaseStringArrayMethods):
+    def _map(self, f, na_mask=True, na_value=np.nan, dtype=np.dtype(object)):
+        # n.b.: na_mask is the default.
+        # need to figure out when it was false, maybe split
+        arr = self._array  # object-dtype ndarray.
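+        # Sketch of the approach below: build an NA mask up front, apply ``f``
+        # only to the non-missing entries via ``lib.map_infer_mask``, then fill
+        # the masked positions with ``na_value`` afterwards, so ``f`` itself
+        # never has to handle missing values.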
+
+        if not len(arr):
+            return np.ndarray(0, dtype=dtype)
+        if na_value is None:
+            na_value = np.nan
+
+        if isinstance(arr, ABCSeries):
+            arr = arr._values  # TODO: extract_array?
+        if not isinstance(arr, np.ndarray):
+            arr = np.asarray(arr, dtype=object)
+        if na_mask:
+            mask = isna(arr)
+            convert = not np.all(mask)
+            try:
+                result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
+            except (TypeError, AttributeError) as e:
+                # Reraise the exception if callable `f` got wrong number of args.
+                # The user may want to be warned by this, instead of getting NaN
+                p_err = (
+                    r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
+                    r"(?(3)required )positional arguments?"
+                )
+
+                if len(e.args) >= 1 and re.search(p_err, e.args[0]):
+                    # FIXME: this should be totally avoidable
+                    raise e
+
+                def g(x):
+                    try:
+                        return f(x)
+                    except (TypeError, AttributeError):
+                        return na_value
+
+                return self._map(g, na_value=na_value, dtype=dtype)
+            if na_value is not np.nan:
+                np.putmask(result, mask, na_value)
+                if result.dtype == object:
+                    result = lib.maybe_convert_objects(result)
+            return result
+        else:
+            return lib.map_infer(arr, f)
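+
+    # Illustrative sketch of ``_map`` (hypothetical input):
+    #
+    #   ObjectArrayMethods(np.array(["a", np.nan], dtype=object))._map(str.upper)
+    #   -> array(['A', nan], dtype=object)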
+
+    def cat(self, others=None, sep=None, na_rep=None, join="left"):
+        from pandas import Index, Series, concat
+
+        if isinstance(others, str):
+            raise ValueError("Did you mean to supply a `sep` keyword?")
+        if sep is None:
+            sep = ""
+
+        if isinstance(self._orig, ABCIndexClass):
+            data = Series(self._orig, index=self._orig)
+        else:  # Series
+            data = self._orig
+
+        # concatenate Series/Index with itself if no "others"
+        if others is None:
+            data = ensure_object(data)
+            na_mask = isna(data)
+            if na_rep is None and na_mask.any():
+                data = data[~na_mask]
+            elif na_rep is not None and na_mask.any():
+                data = np.where(na_mask, na_rep, data)
+            return sep.join(data)
+
+        try:
+            # turn anything in "others" into lists of Series
+            others = self._get_series_list(others)
+        except ValueError as err:  # do not catch TypeError raised by _get_series_list
+            raise ValueError(
+                "If `others` contains arrays or lists (or other "
+                "list-likes without an index), these must all be "
+                "of the same length as the calling Series/Index."
+            ) from err
+
+        # align if required
+        if any(not data.index.equals(x.index) for x in others):
+            # Need to add keys for uniqueness in case of duplicate columns
+            others = concat(
+                others,
+                axis=1,
+                join=(join if join == "inner" else "outer"),
+                keys=range(len(others)),
+                sort=False,
+                copy=False,
+            )
+            data, others = data.align(others, join=join)
+            others = [others[x] for x in others]  # again list of Series
+
+        all_cols = [ensure_object(x) for x in [data] + others]
+        na_masks = np.array([isna(x) for x in all_cols])
+        union_mask = np.logical_or.reduce(na_masks, axis=0)
+
+        if na_rep is None and union_mask.any():
+            # no na_rep means NaNs for all rows where any column has a NaN
+            # only necessary if there are actually any NaNs
+            result = np.empty(len(data), dtype=object)
+            np.putmask(result, union_mask, np.nan)
+
+            not_masked = ~union_mask
+            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
+        elif na_rep is not None and union_mask.any():
+            # fill NaNs with na_rep in case there are actually any NaNs
+            all_cols = [
+                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
+            ]
+            result = cat_safe(all_cols, sep)
+        else:
+            # no NaNs - can just concatenate
+            result = cat_safe(all_cols, sep)
+
+        if isinstance(self._orig, ABCIndexClass):
+            # add dtype for case that result is all-NA
+            result = Index(result, dtype=object, name=self._orig.name)
+        else:  # Series
+            if is_categorical_dtype(self._orig.dtype):
+                # We need to infer the new categories.
+                dtype = None
+            else:
+                dtype = self._orig.dtype
+            result = Series(result, dtype=dtype, index=data.index, name=self._orig.name)
+        return result
+
+
+class ObjectProxy(PandasArray):
+    _str = CachedAccessor("_str", ObjectArrayMethods)
diff --git a/pandas/core/strings_.py b/pandas/core/strings_.py
new file mode 100644
index 0000000000000..09ed48e59a489
--- /dev/null
+++ b/pandas/core/strings_.py
@@ -0,0 +1,775 @@
+import codecs
+from functools import wraps
+import re
+import textwrap
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
+import unicodedata
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+import pandas._libs.missing as libmissing
+import pandas._libs.ops as libops
+from pandas._typing import ArrayLike, Dtype, Scalar
+from pandas.util._decorators import Appender
+
+from pandas.core.dtypes.common import (
+    ensure_object,
+    is_bool_dtype,
+    is_categorical_dtype,
+    is_extension_array_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_list_like,
+    is_object_dtype,
+    is_re,
+    is_scalar,
+    is_string_dtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCIndexClass,
+    ABCMultiIndex,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.accessor import CachedAccessor
+from pandas.core.algorithms import take_1d
+from pandas.core.arrays.numpy_ import PandasArray
+
+if TYPE_CHECKING:
+    from pandas.arrays import StringArray
+
+_cpython_optimized_encoders = (
+    "utf-8",
+    "utf8",
+    "latin-1",
+    "latin1",
+    "iso-8859-1",
+    "mbcs",
+    "ascii",
+)
+_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
+
+_shared_docs: Dict[str, str] = dict()
+
+
+def cat_core(list_of_columns: List, sep: str):
+    """
+    Auxiliary function for :meth:`str.cat`
+
+    Parameters
+    ----------
+    list_of_columns : list of numpy arrays
+        List of arrays to be concatenated with sep;
+        these arrays may not contain NaNs!
+    sep : string
+        The separator string for concatenating the columns.
+
+    Returns
+    -------
+    np.ndarray
+        The concatenation of list_of_columns with sep.
+    """
+    if sep == "":
+        # no need to interleave sep if it is empty
+        arr_of_cols = np.asarray(list_of_columns, dtype=object)
+        return np.sum(arr_of_cols, axis=0)
+    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
+    list_with_sep[::2] = list_of_columns
+    arr_with_sep = np.asarray(list_with_sep, dtype=object)
+    return np.sum(arr_with_sep, axis=0)
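+
+
+# Illustrative sketch of the interleaving performed by ``cat_core``:
+#
+#   cat_core([np.array(['a', 'b'], dtype=object),
+#             np.array(['1', '2'], dtype=object)], sep='-')
+#   -> array(['a-1', 'b-2'], dtype=object)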
+
+
+def cat_safe(list_of_columns: List, sep: str):
+    """
+    Auxiliary function for :meth:`str.cat`.
+
+    Same signature as cat_core, but handles TypeErrors in concatenation, which
+    happen if the arrays in list_of columns have the wrong dtypes or content.
+
+    Parameters
+    ----------
+    list_of_columns : list of numpy arrays
+        List of arrays to be concatenated with sep;
+        these arrays may not contain NaNs!
+    sep : string
+        The separator string for concatenating the columns.
+
+    Returns
+    -------
+    np.ndarray
+        The concatenation of list_of_columns with sep.
+    """
+    try:
+        result = cat_core(list_of_columns, sep)
+    except TypeError:
+        # if there are any non-string values (wrong dtype or hidden behind
+        # object dtype), np.sum will fail; catch and return with better message
+        for column in list_of_columns:
+            dtype = lib.infer_dtype(column, skipna=True)
+            if dtype not in ["string", "empty"]:
+                raise TypeError(
+                    "Concatenation requires list-likes containing only "
+                    "strings (or missing values). Offending values found in "
+                    f"column {dtype}"
+                ) from None
+    return result
+
+
+def str_count(arr, pat, flags=0):
+    regex = re.compile(pat, flags=flags)
+    f = lambda x: len(regex.findall(x))
+    return _na_map(f, arr, dtype="int64")
+
+
+def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
+    if regex:
+        if not case:
+            flags |= re.IGNORECASE
+
+        regex = re.compile(pat, flags=flags)
+
+        if regex.groups > 0:
+            warnings.warn(
+                "This pattern has match groups. 
To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, + ) + + f = lambda x: regex.search(x) is not None + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x + uppered = _na_map(lambda x: x.upper(), arr) + return _na_map(f, uppered, na, dtype=np.dtype(bool)) + return _na_map(f, arr, na, dtype=np.dtype(bool)) + + +def str_startswith(arr, pat, na=np.nan): + f = lambda x: x.startswith(pat) + return _na_map(f, arr, na, dtype=np.dtype(bool)) + + +def str_endswith(arr, pat, na=np.nan): + f = lambda x: x.endswith(pat) + return _na_map(f, arr, na, dtype=np.dtype(bool)) + + +def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + else: + if is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + if callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + f = lambda x: x.replace(pat, repl, n) + + return _na_map(f, arr, dtype=str) + + +def str_repeat(arr, repeats): + if is_scalar(repeats): + + def scalar_rep(x): + try: + return bytes.__mul__(x, repeats) + except TypeError: + return str.__mul__(x, repeats) + + return _na_map(scalar_rep, arr, dtype=str) + else: + + def rep(x, r): + if x is libmissing.NA: + return x + try: + return bytes.__mul__(x, r) + except TypeError: + return str.__mul__(x, r) + + repeats = np.asarray(repeats, dtype=object) + result = libops.vec_binop(np.asarray(arr), repeats, rep) + return result + + +def str_match( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.match(x) is not None + + return _na_map(f, arr, na, dtype=np.dtype(bool)) + + +def str_fullmatch( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.fullmatch(x) is not None + + return _na_map(f, arr, na, dtype=np.dtype(bool)) + + +def _get_single_group_name(rx): + try: + return list(rx.groupindex.keys()).pop() + except IndexError: + return None + + +def _groups_or_na_fun(regex): + """Used in both extract_noexpand and extract_frame""" + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row + + return f + + +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this 
fails + # when the list of values is empty. + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype, StringDtype): + return arr.dtype.name + else: + return object + + +def _str_extract_noexpand(arr, pat, flags=0): + """ + Find groups in each string in the Series using passed regular + expression. This function is called from + str_extract(expand=False), and can return Series, DataFrame, or + Index. + + """ + from pandas import DataFrame + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + + if regex.groups == 1: + result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) + else: + if isinstance(arr, ABCIndexClass): + raise ValueError("only one regex group is supported with Index") + name = None + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + if arr.empty: + result = DataFrame(columns=columns, dtype=object) + else: + dtype = _result_dtype(arr) + result = DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=arr.index, + dtype=dtype, + ) + return result, name + + +def _str_extract_frame(arr, pat, flags=0): + """ + For each subject string in the Series, extract groups from the + first match of regular expression pat. This function is called from + str_extract(expand=True), and always returns a DataFrame. + + """ + from pandas import DataFrame + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + + if len(arr) == 0: + return DataFrame(columns=columns, dtype=object) + try: + result_index = arr.index + except AttributeError: + result_index = None + dtype = _result_dtype(arr) + return DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=result_index, + dtype=dtype, + ) + + +def str_extract(arr, pat, flags=0, expand=True): + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + if expand: + return _str_extract_frame(arr._orig, pat, flags=flags) + else: + result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) + return arr._wrap_result(result, name=name, expand=expand) + + +def str_extractall(arr, pat, flags=0): + regex = re.compile(pat, flags=flags) + # the regex must contain capture groups. 
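+    # (each match becomes one output row, indexed by the original label(s)
+    # plus a trailing "match" level, with one column per capture group)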
+ if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if isinstance(arr, ABCIndexClass): + arr = arr.to_series().reset_index(drop=True) + + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + match_list = [] + index_list = [] + is_mi = arr.index.nlevels > 1 + + for subject_key, subject in arr.items(): + if isinstance(subject, str): + + if not is_mi: + subject_key = (subject_key,) + + for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, str): + match_tuple = (match_tuple,) + na_tuple = [np.NaN if group == "" else group for group in match_tuple] + match_list.append(na_tuple) + result_key = tuple(subject_key + (match_i,)) + index_list.append(result_key) + + from pandas import MultiIndex + + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + dtype = _result_dtype(arr) + + result = arr._constructor_expanddim( + match_list, index=index, columns=columns, dtype=dtype + ) + return result + + +def str_get_dummies(arr, sep="|"): + arr = arr.fillna("") + try: + arr = sep + arr + sep + except TypeError: + arr = sep + arr.astype(str) + sep + + tags = set() + for ts in arr.str.split(sep): + tags.update(ts) + tags = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + + for i, t in enumerate(tags): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) + return dummies, tags + + +def str_join(arr, sep): + return _na_map(sep.join, arr, dtype=str) + + +def str_findall(arr, pat, flags=0): + regex = re.compile(pat, flags=flags) + return _na_map(regex.findall, arr) + + +def str_find(arr, sub, start=0, end=None, side="left"): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + if side == "left": + method = "find" + elif side == "right": + method = "rfind" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return _na_map(f, arr, dtype=np.dtype("int64")) + + +def str_index(arr, sub, start=0, end=None, side="left"): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + if side == "left": + method = "index" + elif side == "right": + method = "rindex" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + + return _na_map(f, arr, dtype=np.dtype("int64")) + + +def str_pad(arr, width, side="left", fillchar=" "): + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + + if side == "left": + f = lambda x: x.rjust(width, fillchar) + elif side == "right": + f = lambda x: x.ljust(width, fillchar) + elif side == "both": + f = lambda x: x.center(width, fillchar) + else: # pragma: no cover + raise ValueError("Invalid side") + + return _na_map(f, arr, dtype=str) + + +def str_split(arr, pat=None, n=None): + + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + 
if len(pat) == 1: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if n is None or n == -1: + n = 0 + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + res = _na_map(f, arr) + return res + + +def str_rsplit(arr, pat=None, n=None): + + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + res = _na_map(f, arr) + return res + + +def str_slice(arr, start=None, stop=None, step=None): + obj = slice(start, stop, step) + f = lambda x: x[obj] + return _na_map(f, arr, dtype=str) + + +def str_slice_replace(arr, start=None, stop=None, repl=None): + if repl is None: + repl = "" + + def f(x): + if x[start:stop] == "": + local_stop = start + else: + local_stop = stop + y = "" + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return _na_map(f, arr, dtype=str) + + +def str_strip(arr, to_strip=None, side="both"): + """ + Strip whitespace (including newlines) from each string in the + Series/Index. + + Parameters + ---------- + to_strip : str or unicode + side : {'left', 'right', 'both'}, default 'both' + + Returns + ------- + Series or Index + """ + if side == "both": + f = lambda x: x.strip(to_strip) + elif side == "left": + f = lambda x: x.lstrip(to_strip) + elif side == "right": + f = lambda x: x.rstrip(to_strip) + else: # pragma: no cover + raise ValueError("Invalid side") + return _na_map(f, arr, dtype=str) + + +def str_wrap(arr, width, **kwargs): + kwargs["width"] = width + + tw = textwrap.TextWrapper(**kwargs) + + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) + + +def str_translate(arr, table): + return _na_map(lambda x: x.translate(table), arr, dtype=str) + + +def str_get(arr, i): + def f(x): + if isinstance(x, dict): + return x.get(i) + elif len(x) > i >= -len(x): + return x[i] + return np.nan + + return _na_map(f, arr) + + +def str_decode(arr, encoding, errors="strict"): + """ + Decode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in + python3. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series or Index + """ + if encoding in _cpython_optimized_decoders: + # CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + return _na_map(f, arr) + + +def str_encode(arr, encoding, errors="strict"): + if encoding in _cpython_optimized_encoders: + # CPython optimized implementation + f = lambda x: x.encode(encoding, errors) + else: + encoder = codecs.getencoder(encoding) + f = lambda x: encoder(x, errors)[0] + return _na_map(f, arr) + + +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. 
+ + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." + ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return wrapper + + return _forbid_nonstring_types + + +def _noarg_wrapper( + f, + name=None, + docstring=None, + forbidden_types=["bytes"], + returns_string=True, + **kwargs, +): + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper(self): + result = _na_map(f, self._parent, **kwargs) + return self._wrap_result(result, returns_string=returns_string) + + wrapper.__name__ = f.__name__ if name is None else name + if docstring is not None: + wrapper.__doc__ = docstring + else: + raise ValueError("Provide docstring") + + return wrapper + + +def _pat_wrapper( + f, + flags=False, + na=False, + name=None, + forbidden_types=["bytes"], + returns_string=True, + **kwargs, +): + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper1(self, pat): + result = f(self._parent, pat) + return self._wrap_result(result, returns_string=returns_string) + + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper2(self, pat, flags=0, **kwargs): + result = f(self._parent, pat, flags=flags, **kwargs) + return self._wrap_result(result, returns_string=returns_string) + + @forbid_nonstring_types(forbidden_types, name=name) + def wrapper3(self, pat, na=np.nan): + result = f(self._parent, pat, na=na) + return self._wrap_result(result, returns_string=returns_string) + + wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 + + wrapper.__name__ = f.__name__ if name is None else name + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + + +def copy(source): + """Copy a docstring from another source function (if present)""" + + def do_copy(target): + if source.__doc__: + target.__doc__ = source.__doc__ + return target + + return do_copy From 89f8e6a38066a1cbcdc83d2592faef894100d0fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Sep 2020 13:25:51 -0500 Subject: [PATCH 04/24] annoyed --- pandas/core/arrays/categorical.py | 5 +- pandas/core/arrays/string_.py | 206 ++--- pandas/core/strings/accessor.py | 446 ++++++++-- pandas/core/strings/base.py | 30 +- 
pandas/core/strings/categorical_strings.py | 17 + pandas/core/strings/object_array.py | 492 ++++++++--- pandas/core/strings_.py | 775 ------------------ .../tests/arrays/string_/test_string_arrow.py | 26 - pandas/tests/extension/arrow/test_string.py | 7 +- pandas/tests/extension/test_string_arrow.py | 150 ---- pandas/tests/test_strings.py | 124 +-- 11 files changed, 902 insertions(+), 1376 deletions(-) create mode 100644 pandas/core/strings/categorical_strings.py delete mode 100644 pandas/core/strings_.py delete mode 100644 pandas/tests/arrays/string_/test_string_arrow.py delete mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 81f9456502bf0..33ba2c9cff985 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -41,7 +41,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops -from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.accessor import CachedAccessor, PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -52,6 +52,7 @@ from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort +from pandas.core.strings.categorical_strings import CategoricalStringMethods from pandas.io.formats import console @@ -2336,6 +2337,8 @@ def replace(self, to_replace, value, inplace: bool = False): if not inplace: return cat + _str = CachedAccessor("_str", CategoricalStringMethods) + # The Series.cat accessor diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d8c76d0615d45..7cc70e3660f81 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,12 +1,9 @@ import operator -from typing import TYPE_CHECKING, Any, Callable, Dict, Type, Union +from typing import TYPE_CHECKING, Type, Union import numpy as np -from pandas._config import get_option - from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike, Dtype from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.common import ( @@ -22,12 +19,11 @@ from pandas.core import ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import IntegerArray, PandasArray -from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -from pandas.core.strings.base import BaseStringArrayMethods +from pandas.core.strings.object_array import ObjectArrayMethods if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -62,82 +58,16 @@ class StringDtype(ExtensionDtype): StringDtype """ + name = "string" + #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA - _metadata = ("storage",) - - def __init__(self, storage=None): - if storage is None: - storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: - raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." 
- ) - self.storage = storage - - @property - def name(self): - return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_from_string(cls, string): - """ - Construct a StringDtype from a string. - - Parameters - ---------- - string : str - The type of the name. The storage type will be taking from `string`. - Valid options and their storage types are - - ========================== ============== - string result storage - ========================== ============== - ``'string'`` global default - ``'string[python]'`` python - ``'StringDtype[python]'`` python - ``'string[pyarrow]'`` pyarrow - ``'StringDtype[pyarrow]'`` pyarrow - ========================== ============= - - Returns - ------- - StringDtype - - Raise - ----- - TypeError - If the string is not a valid option. - - """ - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - if string == "string": - # TODO: use global default - return cls() - elif string in {"string[python]", "StringDtype[python]"}: - return cls(storage="python") - elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: - return cls(storage="pyarrow") - else: - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") - - def __eq__(self, other: Any) -> bool: - if isinstance(other, str) and other == "string": - return True - return super().__eq__(other) - - def __hash__(self) -> int: - # custom __eq__ so have to override __hash__ - return super().__hash__() - - # XXX: this is a classmethod, but we need to know the storage type. def construct_array_type(self) -> Type["StringArray"]: """ Return the array type associated with this dtype. @@ -146,14 +76,9 @@ def construct_array_type(self) -> Type["StringArray"]: ------- type """ - from .string_arrow import ArrowStringArray + return StringArray - if self.storage == "python": - return StringArray - else: - return ArrowStringArray - - def __repr__(self): + def __repr__(self) -> str: return self.name def __from_arrow__( @@ -163,7 +88,6 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 - from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -177,87 +101,57 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return ArrowStringArray._concat_same_type(results) - - -def _map_stringarray( - func: Callable[[str], Any], - arr: "StringArray", - na_value: Any = libmissing.NA, - dtype: Dtype = StringDtype(), -) -> ArrayLike: - """ - Map a callable over valid elements of a StringArray. - - Parameters - ---------- - func : Callable[[str], Any] - Apply to each valid element. - arr : StringArray - na_value : Any - The value to use for missing values. By default, this is - the original value (NA). - dtype : Dtype - The result dtype to use. Specifying this avoids an intermediate - object-dtype allocation. - - Returns - ------- - ArrayLike - An ExtensionArray for integer or string dtypes, otherwise - an ndarray. 
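# A plain-NumPy stand-in for the masked-map behavior described above (a hedged
# sketch; the real code uses lib.map_infer_mask and pandas NA semantics, not
# this helper):
import numpy as np

def masked_map_sketch(values, func, na_value):
    values = np.asarray(values, dtype=object)
    mask = np.array([v is None for v in values], dtype=bool)
    result = np.empty(len(values), dtype=object)
    result[mask] = na_value                            # missing slots keep NA
    result[~mask] = [func(v) for v in values[~mask]]   # map valid elements only
    return result

masked_map_sketch(["a", None, "b"], str.upper, None)   # ['A', None, 'B']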
- - """ - from pandas.arrays import BooleanArray, IntegerArray, StringArray + return StringArray._concat_same_type(results) - mask = isna(arr) - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - if na_value is None: - na_value = libmissing.NA +class StringArrayMethods(ObjectArrayMethods): + def _map(self, f, na_value=libmissing.NA, dtype=StringDtype()): + from pandas.arrays import BooleanArray, IntegerArray, StringArray - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) + arr = self._array + mask = isna(arr) - if not na_value_is_na: - mask[:] = False + assert isinstance(arr, StringArray) + arr = np.asarray(arr) + if na_value is None: + na_value = libmissing.NA - return constructor(result, mask) + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) + if not na_value_is_na: + mask[:] = False + return constructor(result, mask) -class StringArrayMethods(BaseStringArrayMethods): - def _map(self, f, na_result=libmissing.NA, dtype=StringDtype()): - return _map_stringarray(f, self._array, na_result, dtype) + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
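        # Recap of the dtype dispatch implemented in this method:
        #   integer / boolean result dtype -> IntegerArray / BooleanArray,
        #       built from the mapped values plus the NA mask
        #   string (non-object) result dtype -> StringArray
        #   anything else -> the object-dtype fallthrough directly below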
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) class StringArray(PandasArray): @@ -507,7 +401,7 @@ def _add_arithmetic_ops(cls): cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) _create_comparison_method = _create_arithmetic_method - _str = CachedAccessor("str", StringArrayMethods) + _str = CachedAccessor("_str", StringArrayMethods) StringArray._add_arithmetic_ops() diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 84630066ef871..334769468e671 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,7 +1,8 @@ import codecs from functools import wraps import operator -from typing import Dict +import re +from typing import Dict, List import warnings import numpy as np @@ -9,15 +10,21 @@ import pandas._libs.lib as lib from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_bool_dtype, is_categorical_dtype, is_list_like +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_integer, + is_list_like, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries, ) +from pandas.core.dtypes.missing import isna -from pandas.core.algorithms import take_1d from pandas.core.arrays.numpy_ import PandasArray from pandas.core.base import NoNewAttributesMixin from pandas.core.strings.object_array import ObjectProxy @@ -103,6 +110,7 @@ def wrapper(self, *args, **kwargs): def _map_and_wrap(name, docstring): + @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = operator.methodcaller(name)(self._array._str) return self._wrap_result(result) @@ -143,7 +151,7 @@ def __init__(self, data): self._is_string = isinstance(data.dtype, StringDtype) array = data.array - if isinstance(array, PandasArray): + if type(array) is PandasArray: # wrap in an object proxy to get the str methods. # Alternatively, just add _str to PandasArray. array = ObjectProxy(array._ndarray) @@ -167,7 +175,7 @@ def _validate(data): Auxiliary function for StringMethods, infers and checks dtype of data. This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the + object, and just checks that the dtype is in the *union* of the allowed types over all string methods below; this restriction is then refined on a per-method basis using the decorator @forbid_nonstring_types (more info in the corresponding docstring). @@ -212,11 +220,8 @@ def _validate(data): return inferred_dtype def __getitem__(self, key): - return self._array._str[key] - # if isinstance(key, slice): - # return self.slice(start=key.start, stop=key.stop, step=key.step) - # else: - # return self.get(key) + result = self._array._str[key] + return self._wrap_result(result) def __iter__(self): warnings.warn( @@ -232,26 +237,9 @@ def __iter__(self): g = self.get(i) def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, + self, result, name=None, expand=None, fill_value=np.nan, returns_string=True, ): - from pandas import Index, MultiIndex, Series - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... 
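# The `_str = CachedAccessor("_str", StringArrayMethods)` hookup above relies
# on pandas' caching-descriptor pattern. A simplified sketch of that pattern
# (illustrative, not the exact pandas source):
class _CachedAccessorSketch:
    def __init__(self, name, accessor):
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        if obj is None:
            return self._accessor                     # accessed on the class
        value = self._accessor(obj)                   # build the accessor once...
        object.__setattr__(obj, self._name, value)    # ...and cache it on the instance
        return value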
- if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) + from pandas import Index, MultiIndex if not hasattr(result, "ndim") or not hasattr(result, "dtype"): return result @@ -261,11 +249,8 @@ def _wrap_result( # case we'll want to return the same dtype as the input. # Or we can be wrapping a numeric output, in which case we don't want # to return a StringArray. - if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - + # XXX: see if this can be removed. + # Ideally the array method returns the right array type. if expand is None: # infer from ndim if expand is not specified expand = result.ndim != 1 @@ -321,13 +306,19 @@ def cons_row(x): return Index(result, name=name) else: index = self._orig.index + # This is a mess. + if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + if expand: cons = self._orig._constructor_expanddim result = cons(result, columns=name, index=index, dtype=dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index) return result def _get_series_list(self, others): @@ -522,7 +513,86 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): For more examples, see :ref:`here `. """ - return self._array._str.cat(others, sep, na_rep, join) + # XXX: not dispatched yet. + from pandas import Index, Series, concat + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = "" + + if isinstance(self._orig, ABCIndexClass): + data = Series(self._orig, index=self._orig) + else: # Series + data = self._orig + + # concatenate Series/Index with itself if no "others" + if others is None: + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) + + try: + # turn anything in "others" into lists of Series + others = self._get_series_list(others) + except ValueError as err: # do not catch TypeError raised by _get_series_list + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index." 
+ ) from err + + # align if required + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) + + if isinstance(self._orig, ABCIndexClass): + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) + else: # Series + if is_categorical_dtype(self._orig.dtype): + # We need to infer the new categories. + dtype = None + else: + dtype = self._orig.dtype + result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) + return result _shared_docs[ "str_split" @@ -663,7 +733,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): result = self._array._str.split(pat, n, expand) - return self._wrap_result(result, expand, returns_string=expand) + return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) @@ -778,7 +848,7 @@ def partition(self, sep=" ", expand=True): ) @forbid_nonstring_types(["bytes"]) def rpartition(self, sep=" ", expand=True): - result = self._array._str.rpartion(sep, expand) + result = self._array._str.rpartition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) def get(self, i): @@ -1058,6 +1128,7 @@ def match(self, pat, case=True, flags=0, na=np.nan): result = self._array._str.match(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) + @forbid_nonstring_types(["bytes"]) def fullmatch(self, pat, case=True, flags=0, na=np.nan): """ Determine if each string entirely matches a regular expression. @@ -1093,7 +1164,8 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): r""" Replace each occurrence of pattern/regex in the Series/Index. - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on + the regex value. 
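# Concretely (a small sketch of the two modes documented above):
#   s.str.replace("f.o", "new", regex=True)    # "f.o" is a regular expression
#   s.str.replace("f.o", "new", regex=False)   # "f.o" is a literal substring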
Parameters ---------- @@ -1309,6 +1381,17 @@ def pad(self, width, side="left", fillchar=" "): 1 --tiger--- dtype: object """ + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + result = self._array._str.pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) @@ -1409,7 +1492,7 @@ def zfill(self, width): 4 NaN dtype: object """ - result = self._array._str.pad(width, side="left", fillchar="0") + result = self.pad(width, side="left", fillchar="0") return self._wrap_result(result) def slice(self, start=None, stop=None, step=None): @@ -1588,7 +1671,8 @@ def decode(self, encoding, errors="strict"): f = lambda x: decoder(x, errors)[0] arr = self._array # assert isinstance(arr, (StringArray,)) - return arr._str._map(f) + result = arr._str._map(f) + return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors="strict"): @@ -1682,7 +1766,7 @@ def encode(self, encoding, errors="strict"): ) @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - result = self._array._str.strip(self._parent, to_strip) + result = self._array._str.strip(to_strip) return self._wrap_result(result) @Appender( @@ -1691,7 +1775,7 @@ def strip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): - result = self._array._str.lstrip(self._parent, to_strip) + result = self._array._str.lstrip(to_strip) return self._wrap_result(result) @Appender( @@ -1700,7 +1784,7 @@ def lstrip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = self._array._str.rstip(self._parent, to_strip) + result = self._array._str.rstrip(to_strip) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1759,7 +1843,7 @@ def wrap(self, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - result = self._array._str.wrap(self._parent, width, **kwargs) + result = self._array._str.wrap(width, **kwargs) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1804,15 +1888,8 @@ def get_dummies(self, sep="|"): # XXX: data = self._orig.astype(str) if self._is_categorical else self._parent result, name = self._array._str.get_dummies(sep) # result, name = str_get_dummies(data, sep) - return self._wrap_result( - result, - use_codes=(not self._is_categorical), - name=name, - expand=True, - returns_string=False, - ) + return self._wrap_result(result, name=name, expand=True, returns_string=False,) - # @copy(str_translate) @forbid_nonstring_types(["bytes"]) def translate(self, table): """ @@ -1835,7 +1912,8 @@ def translate(self, table): result = self._array._str.translate(table) return self._wrap_result(result) - def count(self, pat, flags): + @forbid_nonstring_types(["bytes"]) + def count(self, pat, flags=0): """ Count occurrences of pattern in each string of the Series/Index. @@ -1902,6 +1980,7 @@ def count(self, pat, flags): result = self._array._str.count(pat, flags) return self._wrap_result(result, returns_string=False) + @forbid_nonstring_types(["bytes"]) def startswith(self, pat, na=None): """ Test if the start of each string element matches a pattern. 
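# Note on the zfill change above: it now delegates to
# pad(width, side="left", fillchar="0"), i.e. str.rjust under the hood:
#   "42".rjust(5, "0")    # '00042'
# Unlike str.zfill, this gives no special treatment to a leading sign:
#   "-1".zfill(5)         # '-0001'
#   "-1".rjust(5, "0")    # '000-1'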
@@ -1957,6 +2036,7 @@ def startswith(self, pat, na=None): result = self._array._str.startswith(pat, na=na) return self._wrap_result(result, returns_string=False) + @forbid_nonstring_types(["bytes"]) def endswith(self, pat, na=None): """ Test if the end of each string element matches a pattern. @@ -2012,6 +2092,7 @@ def endswith(self, pat, na=None): result = self._array._str.endswith(pat, na=na) return self._wrap_result(result, returns_string=False) + @forbid_nonstring_types(["bytes"]) def findall(self, pat, flags=0): """ Find all occurrences of pattern or regular expression in the Series/Index. @@ -2186,8 +2267,8 @@ def extract(self, pat, flags=0, expand=True): 2 NaN dtype: object """ - result = self._array._str.extract(pat, flags) - return self._wrap_result(result, expand=expand) + # XXX: not dispatched + return str_extract(self, pat, flags, expand=expand) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): @@ -2264,8 +2345,8 @@ def extractall(self, pat, flags=0): B 0 b 1 C 0 NaN 1 """ - result = self._array._str.extractall(pat, flags) - return self._wrap_result(result, expand=True) + # XXX: not dispatched + return str_extractall(self._orig, pat, flags) _shared_docs[ "find" @@ -2304,8 +2385,11 @@ def extractall(self, pat, flags=0): ) @forbid_nonstring_types(["bytes"]) def find(self, sub, start=0, end=None): - # result = str_find(self._parent, sub, start=start, end=end, side="left") - result = self._array.str.find(sub, start, end) + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + result = self._array._str.find(sub, start, end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2318,6 +2402,10 @@ def find(self, sub, start=0, end=None): ) @forbid_nonstring_types(["bytes"]) def rfind(self, sub, start=0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + result = self._array._str.rfind(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @@ -2381,6 +2469,10 @@ def normalize(self, form): ) @forbid_nonstring_types(["bytes"]) def index(self, sub, start=0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + result = self._array._str.index(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @@ -2395,6 +2487,10 @@ def index(self, sub, start=0, end=None): ) @forbid_nonstring_types(["bytes"]) def rindex(self, sub, start=0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + result = self._array._str.rindex(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @@ -2536,31 +2632,37 @@ def len(self): ) @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) + @forbid_nonstring_types(["bytes"]) def lower(self): result = self._array._str.lower() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) + @forbid_nonstring_types(["bytes"]) def upper(self): result = self._array._str.upper() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["title"]) + @forbid_nonstring_types(["bytes"]) def title(self): result = self._array._str.title() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) + @forbid_nonstring_types(["bytes"]) def capitalize(self): result = 
self._array._str.capitalize() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) + @forbid_nonstring_types(["bytes"]) def swapcase(self): result = self._array._str.swapcase() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) + @forbid_nonstring_types(["bytes"]) def casefold(self): result = self._array._str.casefold() return self._wrap_result(result) @@ -2746,7 +2848,223 @@ def casefold(self): "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] ) - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) + +def cat_safe(list_of_columns: List, sep: str): + """ + Auxiliary function for :meth:`str.cat`. + + Same signature as cat_core, but handles TypeErrors in concatenation, which + happen if the arrays in list_of columns have the wrong dtypes or content. + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + try: + result = cat_core(list_of_columns, sep) + except TypeError: + # if there are any non-string values (wrong dtype or hidden behind + # object dtype), np.sum will fail; catch and return with better message + for column in list_of_columns: + dtype = lib.infer_dtype(column, skipna=True) + if dtype not in ["string", "empty"]: + raise TypeError( + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + f"column {dtype}" + ) from None + return result + + +def cat_core(list_of_columns: List, sep: str): + """ + Auxiliary function for :meth:`str.cat` + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + if sep == "": + # no need to interleave sep if it is empty + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + arr_with_sep = np.asarray(list_with_sep, dtype=object) + return np.sum(arr_with_sep, axis=0) + + +def _groups_or_na_fun(regex): + """Used in both extract_noexpand and extract_frame""" + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row + + return f + + +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype, StringDtype): + return arr.dtype.name + else: + return object + + +def _get_single_group_name(rx): + try: + return list(rx.groupindex.keys()).pop() + except IndexError: + return None + + +def _str_extract_noexpand(arr, pat, flags=0): + """ + Find groups in each string in the Series using passed regular + expression. 
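# How cat_core's separator interleaving (defined above) plays out, as a
# minimal two-column sketch:
import numpy as np

cols = [np.array(["a", "b"], dtype=object), np.array(["x", "y"], dtype=object)]
with_sep = [cols[0], np.array(["-", "-"], dtype=object), cols[1]]  # sep on odd slots
out = np.sum(np.asarray(with_sep, dtype=object), axis=0)  # elementwise str concat
list(out)  # ['a-x', 'b-y']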
This function is called from + str_extract(expand=False), and can return Series, DataFrame, or + Index. + + """ + from pandas import DataFrame, array + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + result_dtype = _result_dtype(arr) + + if regex.groups == 1: + result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) + # not dispatching, so we have to reconstruct here. + result = array(result, dtype=result_dtype) + else: + if isinstance(arr, ABCIndexClass): + raise ValueError("only one regex group is supported with Index") + name = None + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + if arr.size == 0: + result = DataFrame(columns=columns, dtype=object) + else: + dtype = _result_dtype(arr) + result = DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=arr.index, + dtype=dtype, + ) + return result, name + + +def _str_extract_frame(arr, pat, flags=0): + """ + For each subject string in the Series, extract groups from the + first match of regular expression pat. This function is called from + str_extract(expand=True), and always returns a DataFrame. + + """ + from pandas import DataFrame + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + + if len(arr) == 0: + return DataFrame(columns=columns, dtype=object) + try: + result_index = arr.index + except AttributeError: + result_index = None + dtype = _result_dtype(arr) + return DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=result_index, + dtype=dtype, + ) + + +def str_extract(arr, pat, flags=0, expand=True): + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + if expand: + return _str_extract_frame(arr._orig, pat, flags=flags) + else: + # XXX: some dead code now. + # arr = arr._array + # if isinstance(arr, ObjectProxy): + # arr = arr._ndarray + result, name = _str_extract_noexpand(arr._orig, pat, flags=flags) + return arr._wrap_result(result, name=name, expand=expand) + + +def str_extractall(arr, pat, flags=0): + regex = re.compile(pat, flags=flags) + # the regex must contain capture groups. 
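# The `names.get(1 + i, i)` idiom above maps named capture groups to column
# labels and leaves unnamed groups as positional integers (sketch):
import re

regex = re.compile(r"(?P<letter>[ab])(\d)")
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
[names.get(1 + i, i) for i in range(regex.groups)]  # ['letter', 1]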
+ if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if isinstance(arr, ABCIndexClass): + arr = arr.to_series().reset_index(drop=True) + + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + match_list = [] + index_list = [] + is_mi = arr.index.nlevels > 1 + + for subject_key, subject in arr.items(): + if isinstance(subject, str): + + if not is_mi: + subject_key = (subject_key,) + + for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, str): + match_tuple = (match_tuple,) + na_tuple = [np.NaN if group == "" else group for group in match_tuple] + match_list.append(na_tuple) + result_key = tuple(subject_key + (match_i,)) + index_list.append(result_key) + + from pandas import MultiIndex + + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + dtype = _result_dtype(arr) + + result = arr._constructor_expanddim( + match_list, index=index, columns=columns, dtype=dtype + ) + return result diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index c32cfffa98126..9d484f449b14b 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,33 +1,5 @@ -import unicodedata - - class BaseStringArrayMethods: - """Base class for it.""" + """Base class for array _str accessor.""" def __init__(self, array): self._array = array - - def upper(self): - return self._map(lambda x: x.upper()) - - def isalnum(self): - return self._map(str.isalnum, dtype="bool") - - def capitalize(self): - return self._map(str.capitalize) - - def casefold(self): - return self._map(str.casefold) - - def title(self): - return self._map(str.title) - - def swapcase(self): - return self._map(str.swapcase) - - def lower(self): - return self._map(str.lower) - - def normalize(self, form): - f = lambda x: unicodedata.normalize(form, x) - return self._map(f) diff --git a/pandas/core/strings/categorical_strings.py b/pandas/core/strings/categorical_strings.py new file mode 100644 index 0000000000000..3be3825032988 --- /dev/null +++ b/pandas/core/strings/categorical_strings.py @@ -0,0 +1,17 @@ +import numpy as np + +from pandas.core.algorithms import take_1d +from pandas.core.strings.object_array import ObjectArrayMethods + + +class CategoricalStringMethods(ObjectArrayMethods): + def _map(self, f, na_value=np.nan, dtype=np.dtype(object)): + arr = self._array # Categorical + categories = arr.categories + codes = arr.codes + result = ObjectArrayMethods(categories)._map(f, na_value, dtype) + return take_1d(result, codes, fill_value=na_value) + + def get_dummies(self, sep="|"): + # sep may not be in categories. Just bail on this. 
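# The categorical _map above runs the function once per category and then
# re-expands through the codes. A NumPy-only sketch of that take-with-fill:
import numpy as np

categories = np.array(["a", "b"], dtype=object)
codes = np.array([0, 1, -1, 0])                    # -1 encodes a missing value
mapped = np.array([c.upper() for c in categories], dtype=object)
full = mapped[codes]                               # expand back to full length
full[codes == -1] = np.nan                         # restore the missing slots
list(full)                                         # ['A', 'B', nan, 'A']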
+ return ObjectArrayMethods(self._array.astype(str)).get_dummies(sep) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index acf234b711062..116b53778ac9d 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,10 +1,17 @@ import re +import textwrap +from typing import Pattern, Union +import unicodedata +import warnings import numpy as np import pandas._libs.lib as lib +import pandas._libs.missing as libmissing +import pandas._libs.ops as libops +from pandas._typing import Scalar -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.common import is_re, is_scalar from pandas.core.dtypes.missing import isna from pandas.core.accessor import CachedAccessor @@ -13,9 +20,7 @@ class ObjectArrayMethods(BaseStringArrayMethods): - def _map(self, f, na_mask=True, na_value=np.nan, dtype=np.dtype(object)): - # n.b.: na_mask is the default. - # need to figure out when it was false, maybe split + def _map(self, f, na_value=np.nan, dtype=np.dtype(object)): arr = self._array # object-dtype ndarray. if not len(arr): @@ -23,123 +28,388 @@ def _map(self, f, na_mask=True, na_value=np.nan, dtype=np.dtype(object)): if na_value is None: na_value = np.nan - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" + mask = isna(arr) + convert = not np.all(mask) + try: + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) + except (TypeError, AttributeError) as e: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + if len(e.args) >= 1 and re.search(p_err, e.args[0]): + # FIXME: this should be totally avoidable + raise e + + def g(x): + # This type of fallback behavior can be removed once + # we remove object-dtype .str accessor. 
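# The p_err pattern above recognizes CPython's wrong-arity TypeError messages,
# so genuine signature errors re-raise instead of silently becoming NA (sketch):
import re

p_err = (
    r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
    r"(?(3)required )positional arguments?"
)
re.search(p_err, "f() missing 1 required positional argument: 'y'")  # matches
re.search(p_err, "unrelated TypeError")                              # None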
+ try: + return f(x) + except (TypeError, AttributeError): + return na_value + + return self._map(g, na_value=na_value, dtype=dtype) + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + def __getitem__(self, key): + if isinstance(key, slice): + return self.slice(start=key.start, stop=key.stop, step=key.step) + else: + return self.get(key) + + def count(self, pat, flags=0): + regex = re.compile(pat, flags=flags) + f = lambda x: len(regex.findall(x)) + return self._map(f, dtype="int64") + + def pad(self, width, side="left", fillchar=" "): + if side == "left": + f = lambda x: x.rjust(width, fillchar) + elif side == "right": + f = lambda x: x.ljust(width, fillchar) + elif side == "both": + f = lambda x: x.center(width, fillchar) + else: # pragma: no cover + raise ValueError("Invalid side") + return self._map(f) + + def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + if regex: + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + if regex.groups > 0: + warnings.warn( + "This pattern has match groups. To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, ) - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return self._map_object(g, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) + f = lambda x: regex.search(x) is not None + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x.upper() + return self._map(f, na, dtype=np.dtype("bool")) + + def startswith(self, pat, na=np.nan): + f = lambda x: x.startswith(pat) + return self._map(f, na_value=na, dtype=np.dtype(bool)) + + def endswith(self, pat, na=np.nan): + f = lambda x: x.endswith(pat) + return self._map(f, na_value=na, dtype=np.dtype(bool)) + + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + else: + if is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with " + "regex=False" + ) + if callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + f = lambda x: x.replace(pat, repl, n) + + return self._map(f, dtype=str) + + def repeat(self, repeats): + if is_scalar(repeats): + + def scalar_rep(x): + try: + return bytes.__mul__(x, repeats) + except TypeError: + return str.__mul__(x, repeats) + + return self._map(scalar_rep, dtype=str) + else: + from pandas.core.arrays.string_ import StringArray + + def rep(x, r): + if x is 
libmissing.NA: + return x + try: + return bytes.__mul__(x, r) + except TypeError: + return str.__mul__(x, r) + + repeats = np.asarray(repeats, dtype=object) + result = libops.vec_binop(np.asarray(self._array), repeats, rep) + if isinstance(self._array, StringArray): + # Not going through map, so we have to do this here. + result = StringArray._from_sequence(result) return result + + def match( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.match(x) is not None + return self._map(f, na_value=na, dtype=np.dtype(bool)) + + def fullmatch( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.fullmatch(x) is not None + return self._map(f, na_value=na, dtype=np.dtype(bool)) + + def encode(self, encoding, errors="strict"): + f = lambda x: x.encode(encoding, errors=errors) + return self._map(f, dtype=object) + + def find(self, sub, start=0, end=None): + return self._find(sub, start, end, side="left") + + def rfind(self, sub, start=0, end=None): + return self._find(sub, start, end, side="right") + + def _find(self, sub, start, end, side): + if side == "left": + method = "find" + elif side == "right": + method = "rfind" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) else: - return lib.map_infer(arr, f) - - def cat(self, others=None, sep=None, na_rep=None, join="left"): - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) + f = lambda x: getattr(x, method)(sub, start, end) + return self._map(f, dtype="int64") - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." 
- ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) + def findall(self, pat, flags=0): + regex = re.compile(pat, flags=flags) + return self._map(regex.findall, dtype="object") + + def get(self, i): + def f(x): + if isinstance(x, dict): + return x.get(i) + elif len(x) > i >= -len(x): + return x[i] + return np.nan + + return self._map(f) + + def index(self, sub, start=0, end=None): + if end: + f = lambda x: x.index(sub, start, end) else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. 
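# The find/rfind pair funnels into one _find helper that picks the str method
# by name; the same getattr trick also backs index/rindex (sketch):
#   method = "find" if side == "left" else "rfind"
#   getattr("abcabc", "find")("b", 1)    # 1
#   getattr("abcabc", "rfind")("b", 1)   # 4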
- dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) + f = lambda x: x.index(sub, start, end) + return self._map(f, dtype="int64") + + def rindex(self, sub, start=0, end=None): + if end: + f = lambda x: x.rindex(sub, start, end) + else: + f = lambda x: x.rindex(sub, start, end) + return self._map(f, dtype="int64") + + def join(self, sep): + return self._map(sep.join) + + def partition(self, sep, expand): + result = self._map(lambda x: x.partition(sep), dtype="object") return result + def rpartition(self, sep, expand): + return self._map(lambda x: x.rpartition(sep), dtype="object") + + def len(self): + return self._map(len, dtype="int64") + + def slice(self, start=None, stop=None, step=None): + obj = slice(start, stop, step) + return self._map(lambda x: x[obj]) + + def slice_replace(self, start=None, stop=None, repl=None): + if repl is None: + repl = "" + + def f(x): + if x[start:stop] == "": + local_stop = start + else: + local_stop = stop + y = "" + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return self._map(f) + + def split(self, pat=None, n=-1, expand=False): + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if len(pat) == 1: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if n is None or n == -1: + n = 0 + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + return self._map(f, dtype=object) + + def rsplit(self, pat=None, n=-1): + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + return self._map(f, dtype="object") + + def translate(self, table): + return self._map(lambda x: x.translate(table)) + + def wrap(self, width, **kwargs): + kwargs["width"] = width + tw = textwrap.TextWrapper(**kwargs) + return self._map(lambda s: "\n".join(tw.wrap(s))) + + def get_dummies(self, sep="|"): + from pandas import Series + + arr = Series(self._array).fillna("") + try: + arr = sep + arr + sep + except TypeError: + arr = sep + arr.astype(str) + sep + + tags = set() + for ts in Series(arr).str.split(sep): + tags.update(ts) + tags = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + + for i, t in enumerate(tags): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) + return dummies, tags + + def upper(self): + return self._map(lambda x: x.upper()) + + def isalnum(self): + return self._map(str.isalnum, dtype="bool") + + def isalpha(self): + return self._map(str.isalpha, dtype="bool") + + def isdecimal(self): + return self._map(str.isdecimal, dtype="bool") + + def isdigit(self): + return self._map(str.isdigit, dtype="bool") + + def islower(self): + return self._map(str.islower, dtype="bool") + + def isnumeric(self): + return self._map(str.isnumeric, dtype="bool") + + def isspace(self): + return self._map(str.isspace, dtype="bool") + + def istitle(self): + return self._map(str.istitle, dtype="bool") + + def isupper(self): + return self._map(str.isupper, dtype="bool") + + def capitalize(self): + return self._map(str.capitalize) + + def casefold(self): + return self._map(str.casefold) + + def title(self): + return self._map(str.title) + + def swapcase(self): + return self._map(str.swapcase) + + def lower(self): + return self._map(str.lower) + + def normalize(self, form): + f = lambda x: unicodedata.normalize(form, x) + return self._map(f) + + def strip(self, to_strip=None): + return 
self._map(lambda x: x.strip(to_strip)) + + def lstrip(self, to_strip=None): + return self._map(lambda x: x.lstrip(to_strip)) + + def rstrip(self, to_strip=None): + return self._map(lambda x: x.rstrip(to_strip)) + class ObjectProxy(PandasArray): _str = CachedAccessor("str", ObjectArrayMethods) diff --git a/pandas/core/strings_.py b/pandas/core/strings_.py deleted file mode 100644 index 09ed48e59a489..0000000000000 --- a/pandas/core/strings_.py +++ /dev/null @@ -1,775 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import unicodedata -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.accessor import CachedAccessor -from pandas.core.algorithms import take_1d -from pandas.core.arrays.numpy_ import PandasArray - -if TYPE_CHECKING: - from pandas.arrays import StringArray - -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - -_shared_docs: Dict[str, str] = dict() - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). 
Offending values found in " - f"column {dtype}" - ) from None - return result - - -def str_count(arr, pat, flags=0): - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - f = lambda x: x.endswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - - is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) - else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - - return _na_map(f, arr, dtype=str) - - -def str_repeat(arr, repeats): - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return _na_map(scalar_rep, arr, dtype=str) - else: - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(arr), repeats, rep) - return result - - -def str_match( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_fullmatch( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _groups_or_na_fun(regex): - """Used in 
both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. - from pandas.core.arrays.string_ import StringDtype - - if isinstance(arr.dtype, StringDtype): - return arr.dtype.name - else: - return object - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.empty: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - if expand: - return _str_extract_frame(arr._orig, pat, flags=flags) - else: - result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) - - -def str_extractall(arr, pat, flags=0): - regex = re.compile(pat, flags=flags) - # the regex must contain capture groups. 
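For orientation, the helper being removed here backs `Series.str.extractall`, which pulls every regex match out of each string and stacks the results under an extra "match" index level. A minimal sketch of that user-facing behaviour, using only public pandas API (the sample data is illustrative):

    import pandas as pd

    s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    # two named capture groups -> two result columns; every match becomes a row
    result = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
    print(result)
    #          letter digit
    #   match
    # A 0          a     1
    #   1          a     2
    # B 0          b     1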
- if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result - - -def str_get_dummies(arr, sep="|"): - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - 
if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. - However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. 
- - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - - return do_copy diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py deleted file mode 100644 index 40e3f21670ea0..0000000000000 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ /dev/null @@ -1,26 +0,0 @@ -import pytest - -import pandas as pd -import pandas.testing as tm - - -def 
test_eq_all_na(): - a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) - result = a == a - expected = pd.array([pd.NA, pd.NA], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_config(): - # python by default - assert pd.StringDtype().storage == "python" - arr = pd.array(["a", "b"]) - assert arr.dtype.storage == "python" - - with pd.option_context("mode.string_storage", "pyarrow"): - assert pd.StringDtype().storage == "pyarrow" - arr = pd.array(["a", "b"]) - assert arr.dtype.storage == "pyarrow" - - with pytest.raises(ValueError): - pd.options.mode.string_storage = "foo" diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index f32f1e415ddc7..abd5c1f386dc5 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,9 +4,10 @@ pytest.importorskip("pyarrow", minversion="0.13.0") +from .arrays import ArrowStringDtype # isort:skip + def test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) - assert isinstance(result.dtype, pd.StringDtype) - assert result.dtype.storage == "pyarrow" + result = pd.Series(["E"], dtype=ArrowStringDtype()) + assert isinstance(result.dtype, ArrowStringDtype) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py deleted file mode 100644 index 848e8a435b530..0000000000000 --- a/pandas/tests/extension/test_string_arrow.py +++ /dev/null @@ -1,150 +0,0 @@ -import string - -import numpy as np -import pytest - -import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.tests.extension import base - - -@pytest.fixture -def dtype(): - return pd.StringDtype(storage="pyarrow") - - -@pytest.fixture -def data(): - strings = np.random.choice(list(string.ascii_letters), size=100) - while strings[0] == strings[1]: - strings = np.random.choice(list(string.ascii_letters), size=100) - - return ArrowStringArray._from_sequence(strings) - - -@pytest.fixture -def data_missing(): - """Length 2 array with [NA, Valid]""" - return ArrowStringArray._from_sequence([pd.NA, "A"]) - - -@pytest.fixture -def data_for_sorting(): - return ArrowStringArray._from_sequence(["B", "C", "A"]) - - -@pytest.fixture -def data_missing_for_sorting(): - return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) - - -@pytest.fixture -def na_value(): - return pd.NA - - -@pytest.fixture -def data_for_grouping(): - return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - @pytest.mark.xfail(reason="Fails until implement, remove before merge") - def test_view(self, data): - base.BaseInterfaceTests.test_view(self, data) - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - @pytest.mark.xfail( - reason="pyarrow.lib.ArrowNotImplementedError: Function " - "fill_null has no kernel matching input types " - "(array[string], scalar[string])" - ) - def test_take_non_na_fill_value(self, data_missing): - super().test_take_non_na_fill_value(data_missing) - - @pytest.mark.xfail( - reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " - "kernel matching input types (array[string], scalar[string])" - ) - def test_reindex_non_na_fill_value(self, data_missing): - 
super().test_reindex_non_na_fill_value(self, data_missing) - - -class TestSetitem(base.BaseSetitemTests): - @pytest.mark.xfail(reason="TODO") - def test_setitem_preserves_views(self, data): - # Unclear where the issue is (pyarrow getitem, our getitem, our slice) - # and what to do here. - super().test_setitem_preserves_views(data) - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestNoReduce(base.BaseNoReduceTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - op_name = all_numeric_reductions - - if op_name in ["min", "max"]: - return None - - s = pd.Series(data) - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - - -class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="returns nullable") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): - if op_name not in {"__eq__", "__ne__"}: - pytest.skip(f"{op_name} is not implemented.") - result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other).astype("boolean") - self.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - self._compare_other(s, data, op_name, "abc") - - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - other = pd.Series([data[0]] * len(data), dtype=data.dtype) - self._compare_other(s, data, op_name, other) - - -class TestParsing(base.BaseParsingTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestGroupBy(base.BaseGroupbyTests): - pass diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d9396d70f9112..7eee4e5deb2d1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -724,10 +724,6 @@ def test_count(self): ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ ) - result = strings.str_count(values, "f[o]+") - exp = np.array([1, 2, np.nan, 4]) - tm.assert_numpy_array_equal(result, exp) - result = Series(values).str.count("f[o]+") exp = Series([1, 2, np.nan, 4]) assert isinstance(result, Series) @@ -738,10 +734,6 @@ def test_count(self): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) - rs = strings.str_count(mixed, "a") - xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.count("a") xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) @@ -751,46 +743,55 @@ def test_contains(self): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) + values = Series(values) pat = "mmm[_]+" - result = strings.str_contains(values, pat) - expected = np.array([False, np.nan, True, True, False], dtype=np.object_) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_) + ) + tm.assert_series_equal(result, expected) - result = strings.str_contains(values, pat, regex=False) - expected = np.array([False, np.nan, False, False, True], dtype=np.object_) - tm.assert_numpy_array_equal(result, 
expected) + result = values.str.contains(pat, regex=False) + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_) + ) + tm.assert_series_equal(result, expected) - values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) - result = strings.str_contains(values, pat) - expected = np.array([False, False, True, True]) + values = Series(np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)) + result = values.str.contains(pat) + expected = Series(np.array([False, False, True, True])) assert result.dtype == np.bool_ - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) # case insensitive using regex - values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) - result = strings.str_contains(values, "FOO|mmm", case=False) - expected = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(result, expected) + values = Series(np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)) + result = values.str.contains("FOO|mmm", case=False) + expected = Series(np.array([True, False, True, True])) + tm.assert_series_equal(result, expected) # case insensitive without regex - result = strings.str_contains(values, "foo", regex=False, case=False) - expected = np.array([True, False, True, False]) - tm.assert_numpy_array_equal(result, expected) + result = Series(values).str.contains("foo", regex=False, case=False) + expected = Series(np.array([True, False, True, False])) + tm.assert_series_equal(result, expected) # mixed - mixed = np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=object, + mixed = Series( + np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) ) - rs = strings.str_contains(mixed, "o") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], - dtype=np.object_, + rs = mixed.str.contains("o") + xp = Series( + np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) ) - tm.assert_numpy_array_equal(rs, xp) + tm.assert_series_equal(rs, xp) - rs = Series(mixed).str.contains("o") + rs = mixed.str.contains("o") xp = Series( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] ) @@ -798,22 +799,26 @@ def test_contains(self): tm.assert_series_equal(rs, xp) # unicode - values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) + values = Series( + np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) + ) pat = "mmm[_]+" - result = strings.str_contains(values, pat) - expected = np.array([False, np.nan, True, True], dtype=np.object_) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat) + expected = Series(np.array([False, np.nan, True, True], dtype=np.object_)) + tm.assert_series_equal(result, expected) - result = strings.str_contains(values, pat, na=False) - expected = np.array([False, False, True, True]) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat, na=False) + expected = Series(np.array([False, False, True, True])) + tm.assert_series_equal(result, expected) - values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) - result = strings.str_contains(values, pat) - expected = np.array([False, False, True, True]) + values = Series( + np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) + ) + result = values.str.contains(pat) + expected = Series(np.array([False, False, True, True])) 
assert result.dtype == np.bool_ - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) def test_contains_for_object_category(self): # gh 22158 @@ -853,15 +858,7 @@ def test_startswith(self): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=np.object_, ) - rs = strings.str_startswith(mixed, "f") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], - dtype=np.object_, - ) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.startswith("f") - assert isinstance(rs, Series) xp = Series( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] ) @@ -882,18 +879,10 @@ def test_endswith(self): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) - rs = strings.str_endswith(mixed, "f") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], - dtype=np.object_, - ) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.endswith("f") xp = Series( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] ) - assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_title(self): @@ -1193,6 +1182,11 @@ def test_match(self): exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) + values = Series(["ab", "AB", "abc", "ABC"]) + result = values.str.match("ab", case=False) + expected = Series([True, True, True, True]) + tm.assert_series_equal(result, expected) + def test_fullmatch(self): # GH 32806 values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) @@ -1209,6 +1203,11 @@ def test_fullmatch(self): string_exp = Series([True, False, np.nan, False], dtype="boolean") tm.assert_series_equal(result, string_exp) + values = Series(["ab", "AB", "abc", "ABC"]) + result = values.str.fullmatch("ab", case=False) + expected = Series([True, True, False, False]) + tm.assert_series_equal(result, expected) + def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): @@ -2232,6 +2231,9 @@ def _check(result, expected): with pytest.raises(TypeError, match=msg): result = s.str.index(0) + with pytest.raises(TypeError, match=msg): + result = s.str.rindex(0) + # test with nan s = Series(["abcb", "ab", "bcbe", np.nan]) result = s.str.index("b") From fabc01e2a81767a84d7108fca228e942658e2322 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Sep 2020 07:41:07 -0500 Subject: [PATCH 05/24] wip --- pandas/core/arrays/base.py | 9 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/string_.py | 65 +-- pandas/core/arrays/string_arrow.py | 450 ------------------ pandas/core/config_init.py | 13 - pandas/core/strings/__init__.py | 3 + pandas/core/strings/accessor.py | 23 +- pandas/core/strings/base.py | 230 ++++++++- ...{categorical_strings.py => categorical.py} | 0 pandas/core/strings/object_array.py | 6 +- pandas/core/strings/string_array.py | 70 +++ 11 files changed, 322 insertions(+), 549 deletions(-) delete mode 100644 pandas/core/arrays/string_arrow.py rename pandas/core/strings/{categorical_strings.py => categorical.py} (100%) create mode 100644 pandas/core/strings/string_array.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index de22af70643c2..e93cdb608dffb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -354,8 +354,6 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). 
""" - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented return ~(self == other) def to_numpy( @@ -459,7 +457,6 @@ def astype(self, dtype, copy=True): from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) - # FIXME: Really hard-code here? if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -925,9 +922,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: ``-1`` in `indices` indicate missing values. - These values are set to `fill_value`. Any other other negative - value raise a ``ValueError``. + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f2a27da79ddbb..12d6d21f19e04 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -52,7 +52,7 @@ from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort -from pandas.core.strings.categorical_strings import CategoricalStringMethods +from pandas.core.strings.categorical import CategoricalStringMethods from pandas.io.formats import console diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7cc70e3660f81..46d0fedbe8f39 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -6,14 +6,7 @@ from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype, - pandas_dtype, -) +from pandas.core.dtypes.common import is_array_like, pandas_dtype from pandas import compat from pandas.core import ops @@ -23,7 +16,7 @@ from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -from pandas.core.strings.object_array import ObjectArrayMethods +from pandas.core.strings.string_array import StringArrayMethods if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -68,7 +61,7 @@ def type(self) -> Type[str]: return str @classmethod - def construct_array_type(self) -> Type["StringArray"]: + def construct_array_type(cls) -> Type["StringArray"]: """ Return the array type associated with this dtype. 
@@ -79,7 +72,7 @@ def construct_array_type(self) -> Type["StringArray"]: return StringArray def __repr__(self) -> str: - return self.name + return "StringDtype" def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] @@ -104,56 +97,6 @@ def __from_arrow__( return StringArray._concat_same_type(results) -class StringArrayMethods(ObjectArrayMethods): - def _map(self, f, na_value=libmissing.NA, dtype=StringDtype()): - from pandas.arrays import BooleanArray, IntegerArray, StringArray - - arr = self._array - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - if na_value is None: - na_value = libmissing.NA - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, f, mask.view("uint8")) - - class StringArray(PandasArray): """ Extension array for string data. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py deleted file mode 100644 index 65aa38db4f6f6..0000000000000 --- a/pandas/core/arrays/string_arrow.py +++ /dev/null @@ -1,450 +0,0 @@ -from collections.abc import Iterable -from typing import Any, Optional, Sequence, Tuple, Union - -import numpy as np -import pyarrow as pa -import pyarrow.compute as pc - -from pandas._libs import missing as libmissing -from pandas._typing import ArrayLike - -from pandas.core.dtypes.missing import isna - -from pandas.api.types import ( - is_array_like, - is_bool_dtype, - is_int64_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) -from pandas.core.accessor import CachedAccessor -from pandas.core.algorithms import factorize -from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.string_ import StringDtype -from pandas.core.indexers import check_array_indexer - - -def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: - scalar = arrow_scalar.as_py() - if scalar is None: - return libmissing.NA - else: - return scalar - - -class ArrowStringMethods: - def __init__(self, arr): - self._data = arr - - def upper(self): - import pyarrow.compute as pc - - result = pc.utf8_upper(self._data.data) - return ArrowStringArray(result) - - -class ArrowStringArray(ExtensionArray): - """ - Extension array for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.1.0 - - .. warning:: - - ArrowStringArray is considered experimental. The implementation and - parts of the API may change without warning. - - Parameters - ---------- - values : pyarrow.Array or pyarrow.ChunkedArray - The array of data. 
- - Attributes - ---------- - None - - Methods - ------- - None - - See Also - -------- - array - The recommended function for creating a ArrowStringArray. - Series.str - The string methods are available on Series backed by - a ArrowStringArray. - - Notes - ----- - ArrowStringArray returns a BooleanArray for comparison methods. - - Examples - -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") - - ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string - """ - - def __init__(self, values): - if isinstance(values, pa.Array): - self.data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self.data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - self._dtype = StringDtype(storage="pyarrow") - - @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if isna(x) else x for x in scalars] - return cls(pa.array(scalars_corrected, type=pa.string())) - - @property - def dtype(self) -> StringDtype: - """ - An instance of 'StringDtype'. - """ - return self._dtype - - def __array__(self, *args, **kwargs) -> "np.ndarray": - """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.data.__array__(*args, **kwargs) - - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" - return self.data - - @property - def size(self) -> int: - """ - Return the number of elements in this array. - - Returns - ------- - size : int - """ - return len(self.data) - - @property - def shape(self) -> Tuple[int]: - """Return the shape of the data.""" - # This may be patched by pandas to support pseudo-2D operations. - return (len(self.data),) - - @property - def ndim(self) -> int: - """Return the number of dimensions of the underlying data.""" - return 1 - - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self.data) - - @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - - def __getitem__(self, item): - # type (Any) -> Any - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, Iterable): - if not is_array_like(item): - item = np.array(item) - if len(item) == 0: - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item): - return self.take(item) - elif is_bool_dtype(item): - return type(self)(self.data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." 
- ) - elif is_integer(item): - if item < 0: - item += len(self) - if item >= len(self): - raise IndexError("index out of bounds") - - value = self.data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return _as_pandas_scalar(value) - - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self.data.nbytes - - def isna(self) -> np.ndarray: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. - """ - # TODO: Implement .to_numpy for ChunkedArray - return self.data.is_null().to_pandas().values - - def copy(self) -> ExtensionArray: - """ - Return a copy of the array. - - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. - - Returns - ------- - ExtensionArray - """ - return type(self)(self.data) - - def __eq__(self, other: Any) -> ArrayLike: - """ - Return for `self == other` (element-wise equality). - """ - from pandas import array, Series, DataFrame, Index - - if isinstance(other, (Series, DataFrame, Index)): - return NotImplemented - if isinstance(other, ArrowStringArray): - result = pc.equal(self.data, other.data) - elif is_scalar(other): - result = pc.equal(self.data, pa.scalar(other)) - else: - raise NotImplementedError("Neither scalar nor ArrowStringArray") - - # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return array(result.to_pandas().values, dtype="boolean") - - def __setitem__(self, key, value): - # type: (Union[int, np.ndarray], Any) -> None - """Set one or more values inplace. - - Parameters - ---------- - key : int, ndarray, or slice - When called from, e.g. ``Series.__setitem__``, ``key`` will be - one of - - * scalar int - * ndarray of integers. - * boolean ndarray - * slice object - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Returns - ------- - None - """ - key = check_array_indexer(self, key) - - if is_integer(key): - if not is_scalar(value): - raise ValueError("Must pass scalars with scalar indexer") - elif isna(value): - value = None - elif not isinstance(value, str): - raise ValueError("Scalar must be NA or str") - - # Slice data and insert inbetween - new_data = [ - *self.data[0:key].chunks, - pa.array([value], type=pa.string()), - *self.data[(key + 1) :].chunks, - ] - self.data = pa.chunked_array(new_data) - else: - # Convert to integer indices and iteratively assign. - # TODO: Make a faster variant of this in Arrow upstream. - # This is probably extremely slow. - - # Convert all possible input key types to an array of integers - if is_bool_dtype(key): - # TODO(ARROW-9430): Directly support setitem(booleans) - key_array = np.argwhere(key).flatten() - elif isinstance(key, slice): - key_array = np.array(range(len(self))[key]) - else: - # TODO(ARROW-9431): Directly support setitem(integers) - key_array = np.asanyarray(key) - - if is_scalar(value): - value = np.broadcast_to(value, len(key_array)) - else: - value = np.asarray(value) - - if len(key_array) != len(value): - raise ValueError("Length of indexer and values mismatch") - - for k, v in zip(key_array, value): - self[k] = v - - def take( - self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> "ExtensionArray": - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int - Indices to be taken. 
- allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. `fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - indices_array = indices - - if len(self.data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if len(indices_array) > 0 and indices_array.max() >= len(self.data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - if (indices_array < 0).any(): - if indices_array.min() < -1: - raise ValueError( - "'indicies' contains negative values other " - "-1 with 'allow_fill=True." - ) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=indices_array < 0) - result = self.data.take(indices_array) - if isna(fill_value): - return type(self)(result) - return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self.data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self.data) - return type(self)(self.data.take(indices_array)) - - def value_counts(self, dropna=True): - from pandas import Series - - if dropna: - na = self.isna() - self = self[~na] - counts = self.data.value_counts() - return Series(counts.field(1), counts.field(0)) - - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: - # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py - # doesn't handle dictionary types. 
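The factorize implementation that follows leans on Arrow's dictionary encoding. A short sketch of just the pyarrow part, to show where the codes and uniques come from (plain pyarrow API, independent of this patch):

    import pyarrow as pa

    chunk = pa.array(["a", "b", "a", None])
    encoded = chunk.dictionary_encode()
    # indices are the factorize codes, with null for missing values
    # (mapped to na_sentinel by the method below)
    print(encoded.indices)     # [0, 1, 0, null]
    print(encoded.dictionary)  # ["a", "b"]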
- if self.data.num_chunks == 1: - encoded = self.data.chunk(0).dictionary_encode() - indices = encoded.indices.to_pandas() - if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(int) - if not is_int64_dtype(indices): - indices = indices.astype(np.int64) - return indices.values, type(self)(encoded.dictionary) - else: - np_array = self.data.to_pandas().values - return factorize(np_array, na_sentinel=na_sentinel) - - @classmethod - def _concat_same_type( - cls, to_concat: Sequence["ArrowStringArray"] - ) -> "ArrowStringArray": - return cls( - pa.chunked_array( - [array for ea in to_concat for array in ea.data.iterchunks()] - ) - ) - - str = CachedAccessor("str", ArrowStringMethods) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index c7e0e7ef19010..bfe20551cbcfc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,19 +504,6 @@ def use_inf_as_na_cb(key): ) -string_storage_doc = """ -: string - The default storage for StringDtype. -""" - -with cf.config_prefix("mode"): - cf.register_option( - "string_storage", - "python", - string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), - ) - # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index c36a96edf7125..bf0ac8ef872ca 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -1 +1,4 @@ from .accessor import StringMethods +from .base import BaseStringArrayMethods + +__all__ = ["StringMethods", "BaseStringArrayMethods"] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 334769468e671..f5c91b692d0e7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -143,6 +143,12 @@ class StringMethods(NoNewAttributesMixin): dtype: object """ + # TODO: Dispatch all the methods + # Currently the following are not dispatched to the array + # * cat + # * extract + # * extractall + def __init__(self, data): from pandas.core.arrays.string_ import StringDtype @@ -249,7 +255,6 @@ def _wrap_result( # case we'll want to return the same dtype as the input. # Or we can be wrapping a numeric output, in which case we don't want # to return a StringArray. - # XXX: see if this can be removed. # Ideally the array method returns the right array type. if expand is None: # infer from ndim if expand is not specified @@ -513,7 +518,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): For more examples, see :ref:`here `. """ - # XXX: not dispatched yet. + # TODO: dispatch from pandas import Index, Series, concat if isinstance(others, str): @@ -968,7 +973,6 @@ def join(self, sep): 4 NaN dtype: object """ - # XXX: Does this use the Series? If so then we can't dispatch. result = self._array._str.join(sep) return self._wrap_result(result) @@ -1414,7 +1418,6 @@ def pad(self, width, side="left", fillchar=" "): ------- filled : Series/Index of objects. """ - # XXX: Do we need to dispatch to center, etc, or is it equivalent? @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) @forbid_nonstring_types(["bytes"]) @@ -1885,9 +1888,7 @@ def get_dummies(self, sep="|"): """ # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
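As a reminder of the user-facing contract now being dispatched to the array, `get_dummies` splits on the separator and one-hot encodes the resulting tags (standard pandas API; data illustrative):

    import pandas as pd

    s = pd.Series(["a|b", "a", "a|c"])
    print(s.str.get_dummies(sep="|"))
    #    a  b  c
    # 0  1  1  0
    # 1  1  0  0
    # 2  1  0  1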
-        # XXX: data = self._orig.astype(str) if self._is_categorical else self._parent
         result, name = self._array._str.get_dummies(sep)
-        # result, name = str_get_dummies(data, sep)
         return self._wrap_result(result, name=name, expand=True, returns_string=False,)
 
     @forbid_nonstring_types(["bytes"])
@@ -2032,7 +2033,6 @@ def startswith(self, pat, na=None):
         3    False
         dtype: bool
         """
-        # XXX: changed default na to None
         result = self._array._str.startswith(pat, na=na)
         return self._wrap_result(result, returns_string=False)
 
@@ -2088,7 +2088,6 @@ def endswith(self, pat, na=None):
         3    False
         dtype: bool
         """
-        # XXX: changed default na to None
         result = self._array._str.endswith(pat, na=na)
         return self._wrap_result(result, returns_string=False)
 
@@ -2267,7 +2266,7 @@ def extract(self, pat, flags=0, expand=True):
         2        NaN
         dtype: object
         """
-        # XXX: not dispatched
+        # TODO: dispatch
         return str_extract(self, pat, flags, expand=expand)
 
     @forbid_nonstring_types(["bytes"])
@@ -2345,7 +2344,7 @@ def extractall(self, pat, flags=0):
         B 0          b     1
         C 0        NaN     1
         """
-        # XXX: not dispatched
+        # TODO: dispatch
         return str_extractall(self._orig, pat, flags)
 
 _shared_docs[
@@ -3022,10 +3021,6 @@ def str_extract(arr, pat, flags=0, expand=True):
     if expand:
         return _str_extract_frame(arr._orig, pat, flags=flags)
     else:
-        # XXX: some dead code now.
-        # arr = arr._array
-        # if isinstance(arr, ObjectProxy):
-        #     arr = arr._ndarray
         result, name = _str_extract_noexpand(arr._orig, pat, flags=flags)
         return arr._wrap_result(result, name=name, expand=expand)
 
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index 9d484f449b14b..f9e70f8559fd9 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -1,5 +1,229 @@
-class BaseStringArrayMethods:
-    """Base class for array _str accessor."""
+import abc
+from typing import Pattern, Union
+
+import numpy as np
+
+from pandas._typing import Scalar
+
+from pandas.core.arrays.base import ExtensionArray
+
+
+class BaseStringArrayMethods(abc.ABC):
+    """
+    Base class for array _str accessor.
+
+    This is where ExtensionArrays can override the implementation of
+    Series.str.<method>. The rough layout is
+
+    * User calls Series.str.<method>
+    * pandas extracts the extension array from the Series
+    * pandas calls ``extension_array._str.<method>(*args, **kwargs)``
+    * pandas wraps the result, to return to the user.
+
+    See :ref:`Series.str` for the docstring of each method.
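Concretely, the dispatch described above looks roughly like the following. Note that `upper` stands in for any string method, and the `_str` accessor spelling is the one introduced on this branch, so this is a sketch of the intended flow rather than released pandas API:

    import pandas as pd

    s = pd.Series(["a", None, "c"], dtype="string")
    arr = s.array              # 1. pandas extracts the extension array
    result = arr._str.upper()  # 2. the array-level accessor does the work
    print(pd.Series(result))   # 3. pandas wraps the result for the user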
+ """ + + def __init__(self, array: ExtensionArray): self._array = array + + def __getitem__(self, key): + if isinstance(key, slice): + return self.slice(start=key.start, stop=key.stop, step=key.step) + else: + return self.get(key) + + @abc.abstractmethod + def count(self, pat, flags=0): + pass + + @abc.abstractmethod + def pad(self, width, side="left", fillchar=" "): + pass + + @abc.abstractmethod + def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + pass + + @abc.abstractmethod + def startswith(self, pat, na=np.nan): + pass + + @abc.abstractmethod + def endswith(self, pat, na=np.nan): + pass + + @abc.abstractmethod + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + pass + + @abc.abstractmethod + def repeat(self, repeats): + pass + + @abc.abstractmethod + def match( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + pass + + @abc.abstractmethod + def fullmatch( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + pass + + @abc.abstractmethod + def encode(self, encoding, errors="strict"): + pass + + @abc.abstractmethod + def find(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def rfind(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def findall(self, pat, flags=0): + pass + + @abc.abstractmethod + def get(self, i): + pass + + @abc.abstractmethod + def index(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def rindex(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def join(self, sep): + pass + + @abc.abstractmethod + def partition(self, sep, expand): + pass + + @abc.abstractmethod + def rpartition(self, sep, expand): + pass + + @abc.abstractmethod + def len(self): + pass + + @abc.abstractmethod + def slice(self, start=None, stop=None, step=None): + pass + + @abc.abstractmethod + def slice_replace(self, start=None, stop=None, repl=None): + pass + + @abc.abstractmethod + def translate(self, table): + pass + + @abc.abstractmethod + def wrap(self, width, **kwargs): + pass + + @abc.abstractmethod + def get_dummies(self, sep="|"): + pass + + @abc.abstractmethod + def isalnum(self): + pass + + @abc.abstractmethod + def isalpha(self): + pass + + @abc.abstractmethod + def isdecimal(self): + pass + + @abc.abstractmethod + def isdigit(self): + pass + + @abc.abstractmethod + def islower(self): + pass + + @abc.abstractmethod + def isnumeric(self): + pass + + @abc.abstractmethod + def isspace(self): + pass + + @abc.abstractmethod + def istitle(self): + pass + + @abc.abstractmethod + def isupper(self): + pass + + @abc.abstractmethod + def capitalize(self): + pass + + @abc.abstractmethod + def casefold(self): + pass + + @abc.abstractmethod + def title(self): + pass + + @abc.abstractmethod + def swapcase(self): + pass + + @abc.abstractmethod + def lower(self): + pass + + @abc.abstractmethod + def upper(self): + pass + + @abc.abstractmethod + def normalize(self, form): + pass + + @abc.abstractmethod + def strip(self, to_strip=None): + pass + + @abc.abstractmethod + def lstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def rstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def split(self, pat=None, n=-1, expand=False): + pass + + @abc.abstractmethod + def rsplit(self, pat=None, n=-1): + pass diff --git a/pandas/core/strings/categorical_strings.py b/pandas/core/strings/categorical.py similarity index 100% rename from pandas/core/strings/categorical_strings.py rename to 
pandas/core/strings/categorical.py
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 116b53778ac9d..dc6144f0320cf 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -20,8 +20,12 @@
 
 
 class ObjectArrayMethods(BaseStringArrayMethods):
-    def _map(self, f, na_value=np.nan, dtype=np.dtype(object)):
+    def _map(self, f, na_value=None, dtype=None):
         arr = self._array  # object-dtype ndarray.
+        if dtype is None:
+            dtype = np.dtype("object")
+        if na_value is None:
+            na_value = np.nan
 
         if not len(arr):
             return np.ndarray(0, dtype=dtype)
diff --git a/pandas/core/strings/string_array.py b/pandas/core/strings/string_array.py
new file mode 100644
index 0000000000000..cfc7cb5db9c64
--- /dev/null
+++ b/pandas/core/strings/string_array.py
@@ -0,0 +1,68 @@
+from typing import Type, Union
+
+import numpy as np
+
+from pandas._libs import lib, missing as libmissing
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+    is_string_dtype,
+)
+
+from pandas.core.missing import isna
+from pandas.core.strings.object_array import ObjectArrayMethods
+
+
+class StringArrayMethods(ObjectArrayMethods):
+    def _map(self, f, na_value=None, dtype=None):
+        from pandas.arrays import BooleanArray, IntegerArray, StringArray
+        from pandas.core.arrays.string_ import StringDtype
+
+        if dtype is None:
+            dtype = StringDtype()
+        if na_value is None:
+            na_value = libmissing.NA
+
+        arr = self._array
+        mask = isna(arr)
+
+        arr = np.asarray(arr)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            constructor: Union[Type[IntegerArray], Type[BooleanArray]]
+            if is_integer_dtype(dtype):
+                constructor = IntegerArray
+            else:
+                constructor = BooleanArray
+
+            na_value_is_na = isna(na_value)
+            if na_value_is_na:
+                na_value = 1
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                dtype=np.dtype(dtype),
+            )
+
+            if not na_value_is_na:
+                mask[:] = False
+
+            return constructor(result, mask)
+
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
+            )
+            return StringArray(result)
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            # or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) From a4d4ad5b7e68c4b06fa257cd4abc8dbfb8441591 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Sep 2020 08:18:21 -0500 Subject: [PATCH 06/24] remove old --- pandas/core/strings.py | 3650 ---------------------------------------- 1 file changed, 3650 deletions(-) delete mode 100644 pandas/core/strings.py diff --git a/pandas/core/strings.py b/pandas/core/strings.py deleted file mode 100644 index ab6c9cfb51414..0000000000000 --- a/pandas/core/strings.py +++ /dev/null @@ -1,3650 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.algorithms import take_1d -from pandas.core.base import NoNewAttributesMixin -from pandas.core.construction import extract_array - -if TYPE_CHECKING: - from pandas.arrays import StringArray - -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - -_shared_docs: Dict[str, str] = dict() - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. 
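For reference, the interleave-and-sum trick in `cat_core` above concatenates columns element-wise by alternating the data arrays with the separator. An equivalent, slightly more explicit sketch of the same idea (using `functools.reduce` in place of `np.sum` over an interleaved object array):

    from functools import reduce

    import numpy as np

    cols = [np.array(["x", "y"], dtype=object), np.array(["1", "2"], dtype=object)]
    sep = "-"
    parts = [sep] * (2 * len(cols) - 1)
    parts[::2] = cols                        # [col0, "-", col1]
    out = reduce(lambda a, b: a + b, parts)  # elementwise string concatenation
    print(out)                               # ['x-1' 'y-2']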
- """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). Offending values found in " - f"column {dtype}" - ) from None - return result - - -def _na_map(f, arr, na_result=None, dtype=np.dtype(object)): - if is_extension_array_dtype(arr.dtype): - if na_result is None: - na_result = libmissing.NA - # just StringDtype - arr = extract_array(arr) - return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) - if na_result is None: - na_result = np.nan - return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) - - -def _map_stringarray( - func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype -) -> ArrayLike: - """ - Map a callable over valid elements of a StringArray. - - Parameters - ---------- - func : Callable[[str], Any] - Apply to each valid element. - arr : StringArray - na_value : Any - The value to use for missing values. By default, this is - the original value (NA). - dtype : Dtype - The result dtype to use. Specifying this avoids an intermediate - object-dtype allocation. - - Returns - ------- - ArrayLike - An ExtensionArray for integer or string dtypes, otherwise - an ndarray. - - """ - from pandas.arrays import BooleanArray, IntegerArray, StringArray - - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) - - -def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=np.dtype(object)): - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" 
- ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return _map_object(g, arr, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - else: - return lib.map_infer(arr, f) - - -def str_count(arr, pat, flags=0): - """ - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. - flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. - - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - """ - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. - - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. 
- - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. - - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. - - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. - - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. - - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. 
- - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) - >>> s - 0 bat - 1 bear - 2 caT - 3 NaN - dtype: object - - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.endswith('t', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.endswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - r""" - Replace each occurrence of pattern/regex in the Series/Index. - - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. - - Parameters - ---------- - pat : str or compiled regex - String can be a character sequence or regular expression. - repl : str or callable - Replacement string or a callable. The callable is passed the regex - match object and must return a replacement string to be used. - See :func:`re.sub`. - n : int, default -1 (all) - Number of replacements to make from start. - case : bool, default None - Determines if replace is case sensitive: - - - If True, case sensitive (the default if `pat` is a string) - - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex. - - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled - regex. - regex : bool, default True - Determines if assumes the passed-in pattern is a regular expression: - - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. - - Returns - ------- - Series or Index of object - A copy of the object with all matching occurrences of `pat` replaced by - `repl`. - - Raises - ------ - ValueError - * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex - * if `pat` is a compiled regex and `case` or `flags` is set - - Notes - ----- - When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled - regex will raise an error. - - Examples - -------- - When `pat` is a string and `regex` is True (the default), the given `pat` - is compiled as a regex. When `repl` is a string, it replaces matching - regex patterns as with :meth:`re.sub`. 
NaN value(s) in the Series are
-    left as is:
-
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
-    0    bao
-    1    baz
-    2    NaN
-    dtype: object
-
-    When `pat` is a string and `regex` is False, every `pat` is replaced with
-    `repl` as with :meth:`str.replace`:
-
-    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
-    0    bao
-    1    fuz
-    2    NaN
-    dtype: object
-
-    When `repl` is a callable, it is called on every `pat` using
-    :func:`re.sub`. The callable should expect one positional argument
-    (a regex object) and return a string.
-
-    To get the idea:
-
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
-    0    <re.Match object; span=(0, 1), match='f'>oo
-    1    <re.Match object; span=(0, 1), match='f'>uz
-    2                                              NaN
-    dtype: object
-
-    Reverse every lowercase alphabetic word:
-
-    >>> repl = lambda m: m.group(0)[::-1]
-    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
-    0    oof 123
-    1    rab zab
-    2        NaN
-    dtype: object
-
-    Using regex groups (extract second group and swap case):
-
-    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
-    >>> repl = lambda m: m.group('two').swapcase()
-    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
-    0    tWO
-    1    bAR
-    dtype: object
-
-    Using a compiled regex with flags
-
-    >>> import re
-    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
-    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
-    0    foo
-    1    bar
-    2    NaN
-    dtype: object
-    """
-    # Check whether repl is valid (GH 13438, GH 15055)
-    if not (isinstance(repl, str) or callable(repl)):
-        raise TypeError("repl must be a string or callable")
-
-    is_compiled_re = is_re(pat)
-    if regex:
-        if is_compiled_re:
-            if (case is not None) or (flags != 0):
-                raise ValueError(
-                    "case and flags cannot be set when pat is a compiled regex"
-                )
-        else:
-            # not a compiled regex
-            # set default case
-            if case is None:
-                case = True
-
-            # add case flag, if provided
-            if case is False:
-                flags |= re.IGNORECASE
-        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
-            n = n if n >= 0 else 0
-            compiled = re.compile(pat, flags=flags)
-            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
-        else:
-            f = lambda x: x.replace(pat, repl, n)
-    else:
-        if is_compiled_re:
-            raise ValueError(
-                "Cannot use a compiled regex as replacement pattern with regex=False"
-            )
-        if callable(repl):
-            raise ValueError("Cannot use a callable replacement when regex=False")
-        f = lambda x: x.replace(pat, repl, n)
-
-    return _na_map(f, arr, dtype=str)
-
-
-def str_repeat(arr, repeats):
-    """
-    Duplicate each string in the Series or Index.
-
-    Parameters
-    ----------
-    repeats : int or sequence of int
-        Same value for all (int) or different value per (sequence).
-
-    Returns
-    -------
-    Series or Index of object
-        Series or Index of repeated string objects specified by
-        input parameter repeats.
- - Examples - -------- - >>> s = pd.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return _na_map(scalar_rep, arr, dtype=str) - else: - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(arr), repeats, rep) - return result - - -def str_match( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string starts with a match of a regular expression. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - fullmatch : Stricter matching that requires the entire string to match. - contains : Analogous, but less strict, relying on re.search instead of - re.match. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_fullmatch( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string entirely matches a regular expression. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - match : Similar, but also returns `True` when only a *prefix* of the string - matches the regular expression. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _groups_or_na_fun(regex): - """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. 
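
(A quick illustration of the helper's contract, assuming it is in scope in a
REPL; extract results keep the nullable string dtype only when the input
already has it:

>>> _result_dtype(pd.Series(["a1"], dtype="string"))
'string'
>>> _result_dtype(pd.Series(["a1"]))
<class 'object'>
)
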
- if arr.dtype.name == "string": - return "string" - else: - return object - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.empty: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the - first match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that - modify regular expression matching for things like case, - spaces, etc. For more details, see :mod:`re`. - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group - or DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series or Index - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. The dtype of each result - column is always object, even when no match is found. If - ``expand=False`` and pat has only one capture group, then - return a Series (if subject is a Series) or Index (if subject - is an Index). - - See Also - -------- - extractall : Returns all matches (not just the first match). - - Examples - -------- - A pattern with two groups will return a DataFrame with two columns. - Non-matches will be NaN. - - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern may contain optional groups. 
-
-    >>> s.str.extract(r'([ab])?(\d)')
-         0  1
-    0    a  1
-    1    b  2
-    2  NaN  3
-
-    Named groups will become column names in the result.
-
-    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
-      letter digit
-    0      a     1
-    1      b     2
-    2    NaN   NaN
-
-    A pattern with one group will return a DataFrame with one column
-    if expand=True.
-
-    >>> s.str.extract(r'[ab](\d)', expand=True)
-         0
-    0    1
-    1    2
-    2  NaN
-
-    A pattern with one group will return a Series if expand=False.
-
-    >>> s.str.extract(r'[ab](\d)', expand=False)
-    0      1
-    1      2
-    2    NaN
-    dtype: object
-    """
-    if not isinstance(expand, bool):
-        raise ValueError("expand must be True or False")
-    if expand:
-        return _str_extract_frame(arr._orig, pat, flags=flags)
-    else:
-        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
-        return arr._wrap_result(result, name=name, expand=expand)
-
-
-def str_extractall(arr, pat, flags=0):
-    r"""
-    Extract capture groups in the regex `pat` as columns in DataFrame.
-
-    For each subject string in the Series, extract groups from all
-    matches of regular expression pat. When each subject string in the
-    Series has exactly one match, extractall(pat).xs(0, level='match')
-    is the same as extract(pat).
-
-    Parameters
-    ----------
-    pat : str
-        Regular expression pattern with capturing groups.
-    flags : int, default 0 (no flags)
-        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
-        to modify regular expression matching for things like case, spaces,
-        etc. Multiple flags can be combined with the bitwise OR operator,
-        for example ``re.IGNORECASE | re.MULTILINE``.
-
-    Returns
-    -------
-    DataFrame
-        A ``DataFrame`` with one row for each match, and one column for each
-        group. Its rows have a ``MultiIndex`` with first levels that come from
-        the subject ``Series``. The last level is named 'match' and indexes the
-        matches in each item of the ``Series``. Any capture group names in
-        regular expression pat will be used for column names; otherwise capture
-        group numbers will be used.
-
-    See Also
-    --------
-    extract : Returns first match only (not all matches).
-
-    Examples
-    --------
-    A pattern with one group will return a DataFrame with one column.
-    Indices with no matches will not appear in the result.
-
-    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
-    >>> s.str.extractall(r"[ab](\d)")
-             0
-      match
-    A 0      1
-      1      2
-    B 0      1
-
-    Capture group names are used for column names of the result.
-
-    >>> s.str.extractall(r"[ab](?P<digit>\d)")
-            digit
-      match
-    A 0         1
-      1         2
-    B 0         1
-
-    A pattern with two groups will return a DataFrame with two columns.
-
-    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
-            letter digit
-      match
-    A 0          a     1
-      1          a     2
-    B 0          b     1
-
-    Optional groups that do not match are NaN in the result.
-
-    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
-            letter digit
-      match
-    A 0          a     1
-      1          a     2
-    B 0          b     1
-    C 0        NaN     1
-    """
-    regex = re.compile(pat, flags=flags)
-    # the regex must contain capture groups.
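
(The guard below fails fast on group-less patterns — a behavior sketch,
assuming a plain Series:

>>> pd.Series(["a1"]).str.extractall(r"[ab]")
Traceback (most recent call last):
  ...
ValueError: pattern contains no capture groups
)
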
- if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result - - -def str_get_dummies(arr, sep="|"): - """ - Return DataFrame of dummy/indicator variables for Series. - - Each string in Series is split by sep and returned as a DataFrame - of dummy/indicator variables. - - Parameters - ---------- - sep : str, default "|" - String to split on. - - Returns - ------- - DataFrame - Dummy variables corresponding to values of the Series. - - See Also - -------- - get_dummies : Convert categorical variable into dummy/indicator - variables. - - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. - Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. 
The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. - - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - """ - Return indexes in each strings in the Series/Index where the - substring is fully contained between [start:end]. Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind``. - - Returns - ------- - Series or Index - Indexes where substring is found. 
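
(The ``side`` switch maps to :meth:`str.find` vs :meth:`str.rfind` — a small
doctest, assuming a plain object-dtype Series:

>>> s = pd.Series(["abcab"])
>>> s.str.find("ab")
0    0
dtype: int64
>>> s.str.rfind("ab")
0    3
dtype: int64
)
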
- """ - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. - Series.str.center : Fills boths sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. 
- - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. - - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. - stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. 
- - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. - - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. - expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). - - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. 
To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.encode`. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. - However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. 
This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string, fill_value=na) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - 
- return do_copy - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - def __init__(self, data): - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). - - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - - from pandas import Index, MultiIndex, Series - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... 
- if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. - if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import DataFrame, Series - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
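
(Concretely: a bare list must match the caller's length and inherits its
index, while an indexed object keeps its own and is aligned later by ``cat``'s
join step — a small doctest, assuming the default ``join='left'``:

>>> s = pd.Series(["a", "b", "c"])
>>> s.str.cat(pd.Series(["x", "y", "z"], index=[2, 1, 0]))
0    az
1    by
2    cx
dtype: object
)
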
- if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... - if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. - - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. 
- - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. - - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." 
- ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. - dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... 
) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - f = lambda x: x.partition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - f = lambda x: x.rpartition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @copy(str_get) - def get(self, i): - result = str_get(self._parent, i) - return self._wrap_result(result) - - @copy(str_join) - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - result = str_join(self._parent, sep) - return self._wrap_result(result) - - @copy(str_contains) - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains( - self._parent, pat, case=case, flags=flags, na=na, regex=regex - ) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_match) - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): - result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_fullmatch) - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): - result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_replace) - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace( - self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @copy(str_repeat) - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - result = str_repeat(self._parent, repeats) - return 
self._wrap_result(result) - - @copy(str_pad) - @forbid_nonstring_types(["bytes"]) - def pad(self, width, side="left", fillchar=" "): - result = str_pad(self._parent, width, side=side, fillchar=fillchar) - return self._wrap_result(result) - - _shared_docs[ - "str_pad" - ] = """ - Pad %(side)s side of strings in the Series/Index. - - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with ``fillchar``. - fillchar : str - Additional character for filling, default is whitespace. - - Returns - ------- - filled : Series/Index of objects. - """ - - @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) - @forbid_nonstring_types(["bytes"]) - def center(self, width, fillchar=" "): - return self.pad(width, side="both", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) - @forbid_nonstring_types(["bytes"]) - def ljust(self, width, fillchar=" "): - return self.pad(width, side="right", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) - @forbid_nonstring_types(["bytes"]) - def rjust(self, width, fillchar=" "): - return self.pad(width, side="left", fillchar=fillchar) - - @forbid_nonstring_types(["bytes"]) - def zfill(self, width): - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters on the - left of the string to reach a total string length `width`. Strings - in the Series/Index with length greater or equal to `width` are - unchanged. - - Parameters - ---------- - width : int - Minimum length of resulting string; strings with length less - than `width` be prepended with '0' characters. - - Returns - ------- - Series/Index of objects. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. - Series.str.pad : Fills the specified sides of strings with an arbitrary - character. - Series.str.center : Fills boths sides of strings with an arbitrary - character. - - Notes - ----- - Differs from :meth:`str.zfill` which has special handling - for '+'/'-' in the string. - - Examples - -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 10 - 4 NaN - dtype: object - - Note that ``10`` and ``NaN`` are not strings, therefore they are - converted to ``NaN``. The minus sign in ``'-1'`` is treated as a - regular character and the zero is added to the left of it - (:meth:`str.zfill` would have moved it to the left). ``1000`` - remains unchanged as it is longer than `width`. - - >>> s.str.zfill(3) - 0 0-1 - 1 001 - 2 1000 - 3 NaN - 4 NaN - dtype: object - """ - result = str_pad(self._parent, width, side="left", fillchar="0") - return self._wrap_result(result) - - @copy(str_slice) - def slice(self, start=None, stop=None, step=None): - result = str_slice(self._parent, start, stop, step) - return self._wrap_result(result) - - @copy(str_slice_replace) - @forbid_nonstring_types(["bytes"]) - def slice_replace(self, start=None, stop=None, repl=None): - result = str_slice_replace(self._parent, start, stop, repl) - return self._wrap_result(result) - - @copy(str_decode) - def decode(self, encoding, errors="strict"): - # need to allow bytes here - result = str_decode(self._parent, encoding, errors) - # TODO: Not sure how to handle this. 
- return self._wrap_result(result, returns_string=False) - - @copy(str_encode) - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % dict( - side="left and right sides", method="strip", position="leading and trailing" - ) - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="both") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="left side", method="lstrip", position="leading") - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="left") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="right side", method="rstrip", position="trailing") - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="right") - return self._wrap_result(result) - - @copy(str_wrap) - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - result = str_wrap(self._parent, width, **kwargs) - return self._wrap_result(result) - - @copy(str_get_dummies) - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - # we need to cast to Series of strings as only that has all - # methods available for making the dummies... 
- data = self._orig.astype(str) if self._is_categorical else self._parent - result, name = str_get_dummies(data, sep) - return self._wrap_result( - result, - use_codes=(not self._is_categorical), - name=name, - expand=True, - returns_string=False, - ) - - @copy(str_translate) - @forbid_nonstring_types(["bytes"]) - def translate(self, table): - result = str_translate(self._parent, table) - return self._wrap_result(result) - - count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) - startswith = _pat_wrapper( - str_startswith, na=True, name="startswith", returns_string=False - ) - endswith = _pat_wrapper( - str_endswith, na=True, name="endswith", returns_string=False - ) - findall = _pat_wrapper( - str_findall, flags=True, name="findall", returns_string=False - ) - - @copy(str_extract) - @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): - return str_extract(self, pat, flags=flags, expand=expand) - - @copy(str_extractall) - @forbid_nonstring_types(["bytes"]) - def extractall(self, pat, flags=0): - return str_extractall(self._orig, pat, flags=flags) - - _shared_docs[ - "find" - ] = """ - Return %(side)s indexes in each strings in the Series/Index. - - Each of returned indexes corresponds to the position where the - substring is fully contained between [start:end]. Return -1 on - failure. Equivalent to standard :meth:`str.%(method)s`. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of int. - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["find"] - % dict( - side="lowest", - method="find", - also="rfind : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["find"] - % dict( - side="highest", - method="rfind", - also="find : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def normalize(self, form): - """ - Return the Unicode normal form for the strings in the Series/Index. - - For more information on the forms, see the - :func:`unicodedata.normalize`. - - Parameters - ---------- - form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form. - - Returns - ------- - normalized : Series/Index of objects - """ - import unicodedata - - f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent, dtype=str) - return self._wrap_result(result) - - _shared_docs[ - "index" - ] = """ - Return %(side)s indexes in each string in Series/Index. - - Each of the returned indexes corresponds to the position where the - substring is fully contained between [start:end]. This is the same - as ``str.%(similar)s`` except instead of returning -1, it raises a - ValueError when the substring is not found. Equivalent to standard - ``str.%(method)s``. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. 
- - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % dict( - side="lowest", - similar="find", - method="index", - also="rindex : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % dict( - side="highest", - similar="rfind", - method="rindex", - also="index : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "len" - ] = """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. - - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - len = _noarg_wrapper( - len, - docstring=_shared_docs["len"], - forbidden_types=None, - dtype=np.dtype("int64"), - returns_string=False, - ) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. 
- - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = dict(type="lowercase", method="lower", version="") - _doc_args["upper"] = dict(type="uppercase", method="upper", version="") - _doc_args["title"] = dict(type="titlecase", method="title", version="") - _doc_args["capitalize"] = dict( - type="be capitalized", method="capitalize", version="" - ) - _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") - _doc_args["casefold"] = dict( - type="be casefolded", - method="casefold", - version="\n .. versionadded:: 0.25.0\n", - ) - lower = _noarg_wrapper( - lambda x: x.lower(), - name="lower", - docstring=_shared_docs["casemethods"] % _doc_args["lower"], - dtype=str, - ) - upper = _noarg_wrapper( - lambda x: x.upper(), - name="upper", - docstring=_shared_docs["casemethods"] % _doc_args["upper"], - dtype=str, - ) - title = _noarg_wrapper( - lambda x: x.title(), - name="title", - docstring=_shared_docs["casemethods"] % _doc_args["title"], - dtype=str, - ) - capitalize = _noarg_wrapper( - lambda x: x.capitalize(), - name="capitalize", - docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], - dtype=str, - ) - swapcase = _noarg_wrapper( - lambda x: x.swapcase(), - name="swapcase", - docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], - dtype=str, - ) - casefold = _noarg_wrapper( - lambda x: x.casefold(), - name="casefold", - docstring=_shared_docs["casemethods"] % _doc_args["casefold"], - dtype=str, - ) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. - Series.str.istitle : Check whether all characters are titlecase. 
- - Examples - -------- - **Checks for Alphabetic and Numeric Characters** - - >>> s1 = pd.Series(['one', 'one1', '1', '']) - - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with any additional punctuation - or whitespace will evaluate to false for an alphanumeric check. - - >>> s2 = pd.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. - - >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. - - >>> s3.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - **Checks for Whitespace** - - >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) - >>> s4.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - - **Checks for Character Case** - - >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - - >>> s5.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s5.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s5.str.istitle`` method checks for whether all words are in title - case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters separated by - whitespace characters. 
- - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") - _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") - _doc_args["isdigit"] = dict(type="digits", method="isdigit") - _doc_args["isspace"] = dict(type="whitespace", method="isspace") - _doc_args["islower"] = dict(type="lowercase", method="islower") - _doc_args["isupper"] = dict(type="uppercase", method="isupper") - _doc_args["istitle"] = dict(type="titlecase", method="istitle") - _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") - _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - isalnum = _noarg_wrapper( - lambda x: x.isalnum(), - name="isalnum", - docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], - returns_string=False, - dtype=np.dtype(bool), - ) - isalpha = _noarg_wrapper( - lambda x: x.isalpha(), - name="isalpha", - docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdigit = _noarg_wrapper( - lambda x: x.isdigit(), - name="isdigit", - docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], - returns_string=False, - dtype=np.dtype(bool), - ) - isspace = _noarg_wrapper( - lambda x: x.isspace(), - name="isspace", - docstring=_shared_docs["ismethods"] % _doc_args["isspace"], - returns_string=False, - dtype=np.dtype(bool), - ) - islower = _noarg_wrapper( - lambda x: x.islower(), - name="islower", - docstring=_shared_docs["ismethods"] % _doc_args["islower"], - returns_string=False, - dtype=np.dtype(bool), - ) - isupper = _noarg_wrapper( - lambda x: x.isupper(), - name="isupper", - docstring=_shared_docs["ismethods"] % _doc_args["isupper"], - returns_string=False, - dtype=np.dtype(bool), - ) - istitle = _noarg_wrapper( - lambda x: x.istitle(), - name="istitle", - docstring=_shared_docs["ismethods"] % _doc_args["istitle"], - returns_string=False, - dtype=np.dtype(bool), - ) - isnumeric = _noarg_wrapper( - lambda x: x.isnumeric(), - name="isnumeric", - docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdecimal = _noarg_wrapper( - lambda x: x.isdecimal(), - name="isdecimal", - docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], - returns_string=False, - dtype=np.dtype(bool), - ) - - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) From e76a3c1e9dd88bd75da767dbe3f0d698eb22f4ed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Sep 2020 15:21:57 -0500 Subject: [PATCH 07/24] fixup --- pandas/core/strings/accessor.py | 32 +++++++++++++++++++---------- pandas/core/strings/base.py | 6 +++--- pandas/core/strings/object_array.py | 16 ++++++++------- pandas/core/strings/string_array.py | 6 +++--- pandas/tests/test_strings.py | 7 +++++++ 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f5c91b692d0e7..66bedd15b91b4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -977,7 +977,7 @@ def join(self, sep): return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def contains(self, pat, case=True, flags=0, na=None, regex=True): """ Test if pattern or regex is contained within a string of a Series or Index. 
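The hunks below document the new dtype-dependent ``na`` default for these
methods; roughly, assuming pandas 1.x semantics for object and ``string``
dtypes, the intended behaviour is:

    >>> s = pd.Series(["apple", None, "banana"])
    >>> s.str.contains("an")                   # object dtype: missing -> numpy.nan
    0    False
    1      NaN
    2     True
    dtype: object
    >>> s.astype("string").str.contains("an")  # StringDtype: missing -> pandas.NA
    0    False
    1     <NA>
    2     True
    dtype: boolean
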
@@ -992,8 +992,10 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): If True, case sensitive. flags : int, default 0 (no flags) Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. + na : scalar, optional. + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. regex : bool, default True If True, assumes the pat is a regular expression. @@ -1103,7 +1105,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): + def match(self, pat, case=True, flags=0, na=None): """ Determine if each string starts with a match of a regular expression. @@ -1115,8 +1117,10 @@ def match(self, pat, case=True, flags=0, na=np.nan): If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. + na : scalar, optional. + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. Returns ------- @@ -1133,7 +1137,7 @@ def match(self, pat, case=True, flags=0, na=np.nan): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): + def fullmatch(self, pat, case=True, flags=0, na=None): """ Determine if each string entirely matches a regular expression. @@ -1147,8 +1151,10 @@ def fullmatch(self, pat, case=True, flags=0, na=np.nan): If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. + na : scalar, optional. + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. Returns ------- @@ -1993,7 +1999,9 @@ def startswith(self, pat, na=None): pat : str Character sequence. Regular expressions are not accepted. na : object, default NaN - Object shown if element tested is not a string. + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. Returns ------- @@ -2048,7 +2056,9 @@ def endswith(self, pat, na=None): pat : str Character sequence. Regular expressions are not accepted. na : object, default NaN - Object shown if element tested is not a string. + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. 
Returns ------- diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index f9e70f8559fd9..d21e8f78ce9ba 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -41,15 +41,15 @@ def pad(self, width, side="left", fillchar=" "): pass @abc.abstractmethod - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def contains(self, pat, case=True, flags=0, na=None, regex=True): pass @abc.abstractmethod - def startswith(self, pat, na=np.nan): + def startswith(self, pat, na=None): pass @abc.abstractmethod - def endswith(self, pat, na=np.nan): + def endswith(self, pat, na=None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index dc6144f0320cf..cf6922880ddd2 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -20,17 +20,19 @@ class ObjectArrayMethods(BaseStringArrayMethods): + _default_na_value = np.nan + def _map(self, f, na_value=None, dtype=None): arr = self._array # object-dtype ndarray. if dtype is None: dtype = np.dtype("object") if na_value is None: - na_value = np.nan + na_value = self._default_na_value if not len(arr): return np.ndarray(0, dtype=dtype) if na_value is None: - na_value = np.nan + na_value = self._default_na_value if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -111,11 +113,11 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): f = lambda x: upper_pat in x.upper() return self._map(f, na, dtype=np.dtype("bool")) - def startswith(self, pat, na=np.nan): + def startswith(self, pat, na=None): f = lambda x: x.startswith(pat) return self._map(f, na_value=na, dtype=np.dtype(bool)) - def endswith(self, pat, na=np.nan): + def endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._map(f, na_value=na, dtype=np.dtype(bool)) @@ -191,7 +193,7 @@ def match( pat: Union[str, Pattern], case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar = None, ): if not case: flags |= re.IGNORECASE @@ -206,7 +208,7 @@ def fullmatch( pat: Union[str, Pattern], case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar = None, ): if not case: flags |= re.IGNORECASE @@ -250,7 +252,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return np.nan + return self._default_na_value return self._map(f) diff --git a/pandas/core/strings/string_array.py b/pandas/core/strings/string_array.py index cfc7cb5db9c64..269e85b71161a 100644 --- a/pandas/core/strings/string_array.py +++ b/pandas/core/strings/string_array.py @@ -16,6 +16,8 @@ class StringArrayMethods(ObjectArrayMethods): + _default_na_value = libmissing.NA + def _map(self, f, na_value=None, dtype=None): from pandas.arrays import BooleanArray, IntegerArray, StringArray from pandas.core.arrays.string_ import StringDtype @@ -23,14 +25,12 @@ def _map(self, f, na_value=None, dtype=None): if dtype is None: dtype = StringDtype() if na_value is None: - na_value = libmissing.NA + na_value = self._default_na_value arr = self._array mask = isna(arr) arr = np.asarray(arr) - if na_value is None: - na_value = libmissing.NA if is_integer_dtype(dtype) or is_bool_dtype(dtype): constructor: Union[Type[IntegerArray], Type[BooleanArray]] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f3983a58719aa..df0e775e8750b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3643,3 +3643,10 @@ def test_cat_different_classes(klass): result = s.str.cat(klass(["x", 
"y", "z"])) expected = pd.Series(["ax", "by", "cz"]) tm.assert_series_equal(result, expected) + + +def test_str_get_stringarray_multiple_nans(): + s = pd.Series(pd.array(["a", "ab", pd.NA, "abc"])) + result = s.str.get(2) + expected = pd.Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) + tm.assert_series_equal(result, expected) From 75831b3e112ea9b24ca98b4e31a08abc77115b0f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 06:57:57 -0500 Subject: [PATCH 08/24] fixup --- pandas/core/strings/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 66bedd15b91b4..432f6dcd7fb9d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1203,7 +1203,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - If True, assumes the passed-in pattern is a regular expression. - If False, treats the pattern as a literal string - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. + a callable. .. versionadded:: 0.23.0 @@ -1217,7 +1217,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): ------ ValueError * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex + regex * if `pat` is a compiled regex and `case` or `flags` is set Notes From 1cf54cc513e53f2ff629e86da4c6bba61520a073 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 08:26:31 -0500 Subject: [PATCH 09/24] doctest --- ci/code_checks.sh | 2 +- pandas/core/strings/accessor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 54aa830379c07..b8f6bd53d4a59 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -335,7 +335,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests strings.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/strings.py + pytest -q --doctest-modules pandas/core/strings/ RET=$(($RET + $?)) ; echo $MSG "DONE" # Directories diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 432f6dcd7fb9d..ae05f878aee84 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -978,7 +978,7 @@ def join(self, sep): @forbid_nonstring_types(["bytes"]) def contains(self, pat, case=True, flags=0, na=None, regex=True): - """ + r""" Test if pattern or regex is contained within a string of a Series or Index. Return boolean Series or Index based on whether a given pattern or regex is @@ -1921,7 +1921,7 @@ def translate(self, table): @forbid_nonstring_types(["bytes"]) def count(self, pat, flags=0): - """ + r""" Count occurrences of pattern in each string of the Series/Index. This function is used to count the number of times a particular regex From fc81ebe4503bfcbc02f1e12a01cfe80891dad903 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 08:27:27 -0500 Subject: [PATCH 10/24] docstrings --- pandas/core/strings/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index ae05f878aee84..18d0a8ef41e36 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -992,7 +992,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): If True, case sensitive. flags : int, default 0 (no flags) Flags to pass through to the re module, e.g. re.IGNORECASE. - na : scalar, optional. 
+ na : scalar, optional Fill value for missing values. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, ``pandas.NA`` is used. @@ -1117,7 +1117,7 @@ def match(self, pat, case=True, flags=0, na=None): If True, case sensitive. flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. - na : scalar, optional. + na : scalar, optional Fill value for missing values. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, ``pandas.NA`` is used. From 6be1af6b02cbfa5953a0e486426290ccbd49ca25 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 08:40:23 -0500 Subject: [PATCH 11/24] typing --- pandas/core/strings/accessor.py | 5 +++-- pandas/core/strings/object_array.py | 13 +++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 18d0a8ef41e36..2c436740624b5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2,7 +2,7 @@ from functools import wraps import operator import re -from typing import Dict, List +from typing import Dict, List, Optional import warnings import numpy as np @@ -312,6 +312,7 @@ def cons_row(x): else: index = self._orig.index # This is a mess. + dtype: Optional[str] if self._is_string and returns_string: dtype = "string" else: @@ -369,7 +370,7 @@ def _get_series_list(self, others): or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): - los = [] + los: List[Series] = [] while others: # iterate through list and append each element los = los + self._get_series_list(others.pop(0)) return los diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index cf6922880ddd2..ec07c0b1dc534 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,6 +1,6 @@ import re import textwrap -from typing import Pattern, Union +from typing import Pattern, Set, Union import unicodedata import warnings @@ -341,22 +341,23 @@ def get_dummies(self, sep="|"): from pandas import Series arr = Series(self._array).fillna("") + assert isinstance(arr, Series) # fillna returns Optional[Series] try: arr = sep + arr + sep except TypeError: arr = sep + arr.astype(str) + sep - tags = set() + tags: Set[str] = set() for ts in Series(arr).str.split(sep): tags.update(ts) - tags = sorted(tags - {""}) + tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) - for i, t in enumerate(tags): + for i, t in enumerate(tags2): pat = sep + t + sep dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags + return dummies, tags2 def upper(self): return self._map(lambda x: x.upper()) From 95b33101827a692faa0c3e09fd586ec51adc8c04 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 17 Sep 2020 13:40:28 -0500 Subject: [PATCH 12/24] typing --- pandas/core/strings/categorical.py | 7 ++++++- pandas/core/strings/object_array.py | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/categorical.py b/pandas/core/strings/categorical.py index 3be3825032988..f7d7e564ca43e 100644 --- a/pandas/core/strings/categorical.py +++ b/pandas/core/strings/categorical.py @@ -1,3 +1,5 @@ +from typing import cast + import numpy as np from pandas.core.algorithms import take_1d @@ -6,7 +8,10 @@ class CategoricalStringMethods(ObjectArrayMethods): def _map(self, 
f, na_value=np.nan, dtype=np.dtype(object)): - arr = self._array # Categorical + from pandas import Categorical + + arr = cast(Categorical, self._array) + categories = arr.categories codes = arr.codes result = ObjectArrayMethods(categories)._map(f, na_value, dtype) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index ec07c0b1dc534..2219a4af3f51b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,6 +1,6 @@ import re import textwrap -from typing import Pattern, Set, Union +from typing import Pattern, Set, Union, cast import unicodedata import warnings @@ -341,11 +341,12 @@ def get_dummies(self, sep="|"): from pandas import Series arr = Series(self._array).fillna("") - assert isinstance(arr, Series) # fillna returns Optional[Series] try: arr = sep + arr + sep except TypeError: + arr = cast(Series, arr) arr = sep + arr.astype(str) + sep + arr = cast(Series, arr) tags: Set[str] = set() for ts in Series(arr).str.split(sep): From 20a870500a52902c6e30ed516e2556c941129305 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 21 Sep 2020 06:35:34 -0500 Subject: [PATCH 13/24] wip --- pandas/core/arrays/string_.py | 5 +- pandas/core/strings/base.py | 99 ++++++++++++------------ pandas/core/strings/object_array.py | 114 ++++++++++++++-------------- 3 files changed, 106 insertions(+), 112 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 46d0fedbe8f39..72d5e8cc862af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -16,7 +16,7 @@ from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -from pandas.core.strings.string_array import StringArrayMethods +from pandas.core.strings.object_array import ObjectStringArray if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -97,6 +97,9 @@ def __from_arrow__( return StringArray._concat_same_type(results) +# Uhmmm, this is going to have to dispatch on the dtype if we want +# to share StringArray between python / arrow. + class StringArray(PandasArray): """ Extension array for string data. diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index d21e8f78ce9ba..d16bec1053f95 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -5,8 +5,6 @@ from pandas._typing import Scalar -from pandas.core.arrays.base import ExtensionArray - class BaseStringArrayMethods(abc.ABC): """ @@ -23,45 +21,42 @@ class BaseStringArrayMethods(abc.ABC): See :ref:`Series.str` for the docstring of each method. 
""" - def __init__(self, array: ExtensionArray): - self._array = array - - def __getitem__(self, key): + def _str_getitem(self, key): if isinstance(key, slice): return self.slice(start=key.start, stop=key.stop, step=key.step) else: return self.get(key) @abc.abstractmethod - def count(self, pat, flags=0): + def _str_count(self, pat, flags=0): pass @abc.abstractmethod - def pad(self, width, side="left", fillchar=" "): + def _str_pad(self, width, side="left", fillchar=" "): pass @abc.abstractmethod - def contains(self, pat, case=True, flags=0, na=None, regex=True): + def _str_contains(self, pat, case=True, flags=0, na=None, regex=True): pass @abc.abstractmethod - def startswith(self, pat, na=None): + def _str_startswith(self, pat, na=None): pass @abc.abstractmethod - def endswith(self, pat, na=None): + def _str_endswith(self, pat, na=None): pass @abc.abstractmethod - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): pass @abc.abstractmethod - def repeat(self, repeats): + def _str_repeat(self, repeats): pass @abc.abstractmethod - def match( + def _str_match( self, pat: Union[str, Pattern], case: bool = True, @@ -71,7 +66,7 @@ def match( pass @abc.abstractmethod - def fullmatch( + def _str_fullmatch( self, pat: Union[str, Pattern], case: bool = True, @@ -81,149 +76,149 @@ def fullmatch( pass @abc.abstractmethod - def encode(self, encoding, errors="strict"): + def _str_encode(self, encoding, errors="strict"): pass @abc.abstractmethod - def find(self, sub, start=0, end=None): + def _str_find(self, sub, start=0, end=None): pass @abc.abstractmethod - def rfind(self, sub, start=0, end=None): + def _str_rfind(self, sub, start=0, end=None): pass @abc.abstractmethod - def findall(self, pat, flags=0): + def _str_findall(self, pat, flags=0): pass @abc.abstractmethod - def get(self, i): + def _str_get(self, i): pass @abc.abstractmethod - def index(self, sub, start=0, end=None): + def _str_index(self, sub, start=0, end=None): pass @abc.abstractmethod - def rindex(self, sub, start=0, end=None): + def _str_rindex(self, sub, start=0, end=None): pass @abc.abstractmethod - def join(self, sep): + def _str_join(self, sep): pass @abc.abstractmethod - def partition(self, sep, expand): + def _str_partition(self, sep, expand): pass @abc.abstractmethod - def rpartition(self, sep, expand): + def _str_rpartition(self, sep, expand): pass @abc.abstractmethod - def len(self): + def _str_len(self): pass @abc.abstractmethod - def slice(self, start=None, stop=None, step=None): + def _str_slice(self, start=None, stop=None, step=None): pass @abc.abstractmethod - def slice_replace(self, start=None, stop=None, repl=None): + def _str_slice_replace(self, start=None, stop=None, repl=None): pass @abc.abstractmethod - def translate(self, table): + def _str_translate(self, table): pass @abc.abstractmethod - def wrap(self, width, **kwargs): + def _str_wrap(self, width, **kwargs): pass @abc.abstractmethod - def get_dummies(self, sep="|"): + def _str_get_dummies(self, sep="|"): pass @abc.abstractmethod - def isalnum(self): + def _str_isalnum(self): pass @abc.abstractmethod - def isalpha(self): + def _str_isalpha(self): pass @abc.abstractmethod - def isdecimal(self): + def _str_isdecimal(self): pass @abc.abstractmethod - def isdigit(self): + def _str_isdigit(self): pass @abc.abstractmethod - def islower(self): + def _str_islower(self): pass @abc.abstractmethod - def isnumeric(self): + def _str_isnumeric(self): pass @abc.abstractmethod - def 
isspace(self): + def _str_isspace(self): pass @abc.abstractmethod - def istitle(self): + def _str_istitle(self): pass @abc.abstractmethod - def isupper(self): + def _str_isupper(self): pass @abc.abstractmethod - def capitalize(self): + def _str_capitalize(self): pass @abc.abstractmethod - def casefold(self): + def _str_casefold(self): pass @abc.abstractmethod - def title(self): + def _str_title(self): pass @abc.abstractmethod - def swapcase(self): + def _str_swapcase(self): pass @abc.abstractmethod - def lower(self): + def _str_lower(self): pass @abc.abstractmethod - def upper(self): + def _str_upper(self): pass @abc.abstractmethod - def normalize(self, form): + def _str_normalize(self, form): pass @abc.abstractmethod - def strip(self, to_strip=None): + def _str_strip(self, to_strip=None): pass @abc.abstractmethod - def lstrip(self, to_strip=None): + def _str_lstrip(self, to_strip=None): pass @abc.abstractmethod - def rstrip(self, to_strip=None): + def _str_rstrip(self, to_strip=None): pass @abc.abstractmethod - def split(self, pat=None, n=-1, expand=False): + def _str_split(self, pat=None, n=-1, expand=False): pass @abc.abstractmethod - def rsplit(self, pat=None, n=-1): + def _str_rsplit(self, pat=None, n=-1): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 2219a4af3f51b..c97943da41cbf 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -19,10 +19,10 @@ from pandas.core.strings.base import BaseStringArrayMethods -class ObjectArrayMethods(BaseStringArrayMethods): +class ObjectStringArray(PandasArray): _default_na_value = np.nan - def _map(self, f, na_value=None, dtype=None): + def _str_map(self, f, na_value=None, dtype=None): arr = self._array # object-dtype ndarray. if dtype is None: dtype = np.dtype("object") @@ -52,7 +52,7 @@ def _map(self, f, na_value=None, dtype=None): # FIXME: this should be totally avoidable raise e - def g(x): + def _str_g(x): # This type of fallback behavior can be removed once # we remove object-dtype .str accessor. 
try: @@ -67,18 +67,18 @@ def g(x): result = lib.maybe_convert_objects(result) return result - def __getitem__(self, key): + def _str_getitem(self, key): if isinstance(key, slice): return self.slice(start=key.start, stop=key.stop, step=key.step) else: return self.get(key) - def count(self, pat, flags=0): + def _str_count(self, pat, flags=0): regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) return self._map(f, dtype="int64") - def pad(self, width, side="left", fillchar=" "): + def _str_pad(self, width, side="left", fillchar=" "): if side == "left": f = lambda x: x.rjust(width, fillchar) elif side == "right": @@ -89,7 +89,7 @@ def pad(self, width, side="left", fillchar=" "): raise ValueError("Invalid side") return self._map(f) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): if regex: if not case: flags |= re.IGNORECASE @@ -113,15 +113,15 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): f = lambda x: upper_pat in x.upper() return self._map(f, na, dtype=np.dtype("bool")) - def startswith(self, pat, na=None): + def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) return self._map(f, na_value=na, dtype=np.dtype(bool)) - def endswith(self, pat, na=None): + def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._map(f, na_value=na, dtype=np.dtype(bool)) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") @@ -160,10 +160,10 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): return self._map(f, dtype=str) - def repeat(self, repeats): + def _str_repeat(self, repeats): if is_scalar(repeats): - def scalar_rep(x): + def _str_scalar_rep(x): try: return bytes.__mul__(x, repeats) except TypeError: @@ -173,7 +173,7 @@ def scalar_rep(x): else: from pandas.core.arrays.string_ import StringArray - def rep(x, r): + def _str_rep(x, r): if x is libmissing.NA: return x try: @@ -188,7 +188,7 @@ def rep(x, r): result = StringArray._from_sequence(result) return result - def match( + def _str_match( self, pat: Union[str, Pattern], case: bool = True, @@ -203,7 +203,7 @@ def match( f = lambda x: regex.match(x) is not None return self._map(f, na_value=na, dtype=np.dtype(bool)) - def fullmatch( + def _str_fullmatch( self, pat: Union[str, Pattern], case: bool = True, @@ -218,17 +218,17 @@ def fullmatch( f = lambda x: regex.fullmatch(x) is not None return self._map(f, na_value=na, dtype=np.dtype(bool)) - def encode(self, encoding, errors="strict"): + def _str_encode(self, encoding, errors="strict"): f = lambda x: x.encode(encoding, errors=errors) return self._map(f, dtype=object) - def find(self, sub, start=0, end=None): + def _str_find(self, sub, start=0, end=None): return self._find(sub, start, end, side="left") - def rfind(self, sub, start=0, end=None): + def _str_rfind(self, sub, start=0, end=None): return self._find(sub, start, end, side="right") - def _find(self, sub, start, end, side): + def _str__find(self, sub, start, end, side): if side == "left": method = "find" elif side == "right": @@ -242,12 +242,12 @@ def _find(self, sub, start, end, side): f = lambda x: getattr(x, method)(sub, start, end) return self._map(f, dtype="int64") - def findall(self, pat, flags=0): + 
def _str_findall(self, pat, flags=0): regex = re.compile(pat, flags=flags) return self._map(regex.findall, dtype="object") - def get(self, i): - def f(x): + def _str_get(self, i): + def _str_f(x): if isinstance(x, dict): return x.get(i) elif len(x) > i >= -len(x): @@ -256,42 +256,42 @@ def f(x): return self._map(f) - def index(self, sub, start=0, end=None): + def _str_index(self, sub, start=0, end=None): if end: f = lambda x: x.index(sub, start, end) else: f = lambda x: x.index(sub, start, end) return self._map(f, dtype="int64") - def rindex(self, sub, start=0, end=None): + def _str_rindex(self, sub, start=0, end=None): if end: f = lambda x: x.rindex(sub, start, end) else: f = lambda x: x.rindex(sub, start, end) return self._map(f, dtype="int64") - def join(self, sep): + def _str_join(self, sep): return self._map(sep.join) - def partition(self, sep, expand): + def _str_partition(self, sep, expand): result = self._map(lambda x: x.partition(sep), dtype="object") return result - def rpartition(self, sep, expand): + def _str_rpartition(self, sep, expand): return self._map(lambda x: x.rpartition(sep), dtype="object") - def len(self): + def _str_len(self): return self._map(len, dtype="int64") - def slice(self, start=None, stop=None, step=None): + def _str_slice(self, start=None, stop=None, step=None): obj = slice(start, stop, step) return self._map(lambda x: x[obj]) - def slice_replace(self, start=None, stop=None, repl=None): + def _str_slice_replace(self, start=None, stop=None, repl=None): if repl is None: repl = "" - def f(x): + def _str_f(x): if x[start:stop] == "": local_stop = start else: @@ -306,7 +306,7 @@ def f(x): return self._map(f) - def split(self, pat=None, n=-1, expand=False): + def _str_split(self, pat=None, n=-1, expand=False): if pat is None: if n is None or n == 0: n = -1 @@ -323,21 +323,21 @@ def split(self, pat=None, n=-1, expand=False): f = lambda x: regex.split(x, maxsplit=n) return self._map(f, dtype=object) - def rsplit(self, pat=None, n=-1): + def _str_rsplit(self, pat=None, n=-1): if n is None or n == 0: n = -1 f = lambda x: x.rsplit(pat, n) return self._map(f, dtype="object") - def translate(self, table): + def _str_translate(self, table): return self._map(lambda x: x.translate(table)) - def wrap(self, width, **kwargs): + def _str_wrap(self, width, **kwargs): kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) return self._map(lambda s: "\n".join(tw.wrap(s))) - def get_dummies(self, sep="|"): + def _str_get_dummies(self, sep="|"): from pandas import Series arr = Series(self._array).fillna("") @@ -360,64 +360,60 @@ def get_dummies(self, sep="|"): dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) return dummies, tags2 - def upper(self): + def _str_upper(self): return self._map(lambda x: x.upper()) - def isalnum(self): + def _str_isalnum(self): return self._map(str.isalnum, dtype="bool") - def isalpha(self): + def _str_isalpha(self): return self._map(str.isalpha, dtype="bool") - def isdecimal(self): + def _str_isdecimal(self): return self._map(str.isdecimal, dtype="bool") - def isdigit(self): + def _str_isdigit(self): return self._map(str.isdigit, dtype="bool") - def islower(self): + def _str_islower(self): return self._map(str.islower, dtype="bool") - def isnumeric(self): + def _str_isnumeric(self): return self._map(str.isnumeric, dtype="bool") - def isspace(self): + def _str_isspace(self): return self._map(str.isspace, dtype="bool") - def istitle(self): + def _str_istitle(self): return self._map(str.istitle, dtype="bool") - def isupper(self): + 
def _str_isupper(self): return self._map(str.isupper, dtype="bool") - def capitalize(self): + def _str_capitalize(self): return self._map(str.capitalize) - def casefold(self): + def _str_casefold(self): return self._map(str.casefold) - def title(self): + def _str_title(self): return self._map(str.title) - def swapcase(self): + def _str_swapcase(self): return self._map(str.swapcase) - def lower(self): + def _str_lower(self): return self._map(str.lower) - def normalize(self, form): + def _str_normalize(self, form): f = lambda x: unicodedata.normalize(form, x) return self._map(f) - def strip(self, to_strip=None): + def _str_strip(self, to_strip=None): return self._map(lambda x: x.strip(to_strip)) - def lstrip(self, to_strip=None): + def _str_lstrip(self, to_strip=None): return self._map(lambda x: x.lstrip(to_strip)) - def rstrip(self, to_strip=None): + def _str_rstrip(self, to_strip=None): return self._map(lambda x: x.rstrip(to_strip)) - - -class ObjectProxy(PandasArray): - _str = CachedAccessor("str", ObjectArrayMethods) From 38c16111a71e74a0557e5fc80a13652cfbfc2010 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 06:44:28 -0500 Subject: [PATCH 14/24] wip --- pandas/core/arrays/categorical.py | 5 +- pandas/core/arrays/string_.py | 4 +- pandas/core/strings/accessor.py | 90 +++++++++---------- pandas/core/strings/categorical.py | 19 +++-- pandas/core/strings/object_array.py | 128 +++++++++++++++------------- 5 files changed, 129 insertions(+), 117 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e94e91cd240c9..ef69d6565cfeb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -41,7 +41,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops -from pandas.core.accessor import CachedAccessor, PandasDelegate, delegate_names +from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -52,7 +52,6 @@ from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort -from pandas.core.strings.categorical import CategoricalStringMethods from pandas.io.formats import console @@ -2336,8 +2335,6 @@ def replace(self, to_replace, value, inplace: bool = False): if not inplace: return cat - _str = CachedAccessor("_str", CategoricalStringMethods) - # The Series.cat accessor diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c407cb764f23f..5bf9838f6b964 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -10,13 +10,12 @@ from pandas import compat from pandas.core import ops -from pandas.core.accessor import CachedAccessor from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -from pandas.core.strings.object_array import ObjectStringArray +# from pandas.core.strings.object_array import ObjectStringArray if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -351,7 +350,6 @@ def _add_arithmetic_ops(cls): cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) _create_comparison_method = _create_arithmetic_method - _str = 
CachedAccessor("_str", StringArrayMethods) StringArray._add_arithmetic_ops() diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 2c436740624b5..97de112114520 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,6 +1,5 @@ import codecs from functools import wraps -import operator import re from typing import Dict, List, Optional import warnings @@ -27,7 +26,7 @@ from pandas.core.arrays.numpy_ import PandasArray from pandas.core.base import NoNewAttributesMixin -from pandas.core.strings.object_array import ObjectProxy +from pandas.core.strings.object_array import ObjectStringArray _shared_docs: Dict[str, str] = dict() _cpython_optimized_encoders = ( @@ -112,7 +111,7 @@ def wrapper(self, *args, **kwargs): def _map_and_wrap(name, docstring): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): - result = operator.methodcaller(name)(self._array._str) + result = getattr(self._array, f"_str_{name}")() return self._wrap_result(result) wrapper.__doc__ = docstring @@ -151,6 +150,8 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays import Categorical + from pandas.core.strings.categorical import CategoricalStringMethods self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) @@ -159,8 +160,9 @@ def __init__(self, data): if type(array) is PandasArray: # wrap in an object proxy to get the str methods. - # Alternatively, just add _str to PandasArray. - array = ObjectProxy(array._ndarray) + array = ObjectStringArray(array._ndarray) + elif isinstance(array, Categorical): + array = CategoricalStringMethods(array) self._array = array if isinstance(data, ABCSeries): @@ -226,7 +228,7 @@ def _validate(data): return inferred_dtype def __getitem__(self, key): - result = self._array._str[key] + result = self._array._str_getitem(key) return self._wrap_result(result) def __iter__(self): @@ -738,13 +740,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): - result = self._array._str.split(pat, n, expand) + result = self._array._str_split(pat, n, expand) return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): - result = self._array._str.rsplit(pat, n=n) + result = self._array._str_rsplit(pat, n=n) return self._wrap_result(result, expand=expand, returns_string=expand) _shared_docs[ @@ -840,7 +842,7 @@ def rsplit(self, pat=None, n=-1, expand=False): ) @forbid_nonstring_types(["bytes"]) def partition(self, sep=" ", expand=True): - result = self._array._str.partition(sep, expand) + result = self._array._str_partition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) @Appender( @@ -854,7 +856,7 @@ def partition(self, sep=" ", expand=True): ) @forbid_nonstring_types(["bytes"]) def rpartition(self, sep=" ", expand=True): - result = self._array._str.rpartition(sep, expand) + result = self._array._str_rpartition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) def get(self, i): @@ -908,7 +910,7 @@ def get(self, i): 5 None dtype: object """ - result = self._array._str.get(i) + result = 
self._array._str_get(i) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -974,7 +976,7 @@ def join(self, sep): 4 NaN dtype: object """ - result = self._array._str.join(sep) + result = self._array._str_join(sep) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1102,7 +1104,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): 4 False dtype: bool """ - result = self._array._str.contains(pat, case, flags, na, regex) + result = self._array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1134,7 +1136,7 @@ def match(self, pat, case=True, flags=0, na=None): re.match. extract : Extract matched groups. """ - result = self._array._str.match(pat, case=case, flags=flags, na=na) + result = self._array._str_match(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1167,7 +1169,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None): matches the regular expression. extract : Extract matched groups. """ - result = self._array._str.fullmatch(pat, case=case, flags=flags, na=na) + result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1289,7 +1291,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): 2 NaN dtype: object """ - result = self._array._str.replace( + result = self._array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) return self._wrap_result(result) @@ -1335,7 +1337,7 @@ def repeat(self, repeats): 2 ccc dtype: object """ - result = self._array._str.repeat(repeats) + result = self._array._str_repeat(repeats) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1403,7 +1405,7 @@ def pad(self, width, side="left", fillchar=" "): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - result = self._array._str.pad(width, side=side, fillchar=fillchar) + result = self._array._str_pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) _shared_docs[ @@ -1577,7 +1579,7 @@ def slice(self, start=None, stop=None, step=None): 2 cm dtype: object """ - result = self._array._str.slice(start, stop, step) + result = self._array._str_slice(start, stop, step) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1653,7 +1655,7 @@ def slice_replace(self, start=None, stop=None, repl=None): 4 aXde dtype: object """ - result = self._array._str.slice_replace(start, stop, repl) + result = self._array._str_slice_replace(start, stop, repl) return self._wrap_result(result) def decode(self, encoding, errors="strict"): @@ -1681,7 +1683,7 @@ def decode(self, encoding, errors="strict"): f = lambda x: decoder(x, errors)[0] arr = self._array # assert isinstance(arr, (StringArray,)) - result = arr._str._map(f) + result = arr._str_map(f) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1700,7 +1702,7 @@ def encode(self, encoding, errors="strict"): ------- encoded : Series/Index of objects """ - result = self._array._str.encode(encoding, errors) + result = self._array._str_encode(encoding, errors) return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -1776,7 +1778,7 @@ def encode(self, encoding, errors="strict"): ) @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - 
result = self._array._str.strip(to_strip) + result = self._array._str_strip(to_strip) return self._wrap_result(result) @Appender( @@ -1785,7 +1787,7 @@ def strip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): - result = self._array._str.lstrip(to_strip) + result = self._array._str_lstrip(to_strip) return self._wrap_result(result) @Appender( @@ -1794,7 +1796,7 @@ def lstrip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = self._array._str.rstrip(to_strip) + result = self._array._str_rstrip(to_strip) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1853,7 +1855,7 @@ def wrap(self, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - result = self._array._str.wrap(width, **kwargs) + result = self._array._str_wrap(width, **kwargs) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1895,7 +1897,7 @@ def get_dummies(self, sep="|"): """ # we need to cast to Series of strings as only that has all # methods available for making the dummies... - result, name = self._array._str.get_dummies(sep) + result, name = self._array._str_get_dummies(sep) return self._wrap_result(result, name=name, expand=True, returns_string=False,) @forbid_nonstring_types(["bytes"]) @@ -1917,7 +1919,7 @@ def translate(self, table): ------- Series or Index """ - result = self._array._str.translate(table) + result = self._array._str_translate(table) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1985,7 +1987,7 @@ def count(self, pat, flags=0): >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ - result = self._array._str.count(pat, flags) + result = self._array._str_count(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2042,7 +2044,7 @@ def startswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str.startswith(pat, na=na) + result = self._array._str_startswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2099,7 +2101,7 @@ def endswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str.endswith(pat, na=na) + result = self._array._str_endswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2192,7 +2194,7 @@ def findall(self, pat, flags=0): 2 [b, b] dtype: object """ - result = self._array._str.findall(pat, flags) + result = self._array._str_findall(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2399,7 +2401,7 @@ def find(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str.find(sub, start, end) + result = self._array._str_find(sub, start, end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2416,7 +2418,7 @@ def rfind(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str.rfind(sub, start=start, end=end) + result = self._array._str_rfind(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2436,7 +2438,7 @@ def normalize(self, form): ------- normalized : Series/Index of objects """ - result = self._array._str.normalize(form) + result = self._array._str_normalize(form) 
return self._wrap_result(result) _shared_docs[ @@ -2483,7 +2485,7 @@ def index(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str.index(sub, start=start, end=end) + result = self._array._str_index(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2501,7 +2503,7 @@ def rindex(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str.rindex(sub, start=start, end=end) + result = self._array._str_rindex(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) def len(self): @@ -2550,7 +2552,7 @@ def len(self): 5 3.0 dtype: float64 """ - result = self._array._str.len() + result = self._array._str_len() return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -2644,37 +2646,37 @@ def len(self): @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) @forbid_nonstring_types(["bytes"]) def lower(self): - result = self._array._str.lower() + result = self._array._str_lower() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) @forbid_nonstring_types(["bytes"]) def upper(self): - result = self._array._str.upper() + result = self._array._str_upper() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["title"]) @forbid_nonstring_types(["bytes"]) def title(self): - result = self._array._str.title() + result = self._array._str_title() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) @forbid_nonstring_types(["bytes"]) def capitalize(self): - result = self._array._str.capitalize() + result = self._array._str_capitalize() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) @forbid_nonstring_types(["bytes"]) def swapcase(self): - result = self._array._str.swapcase() + result = self._array._str_swapcase() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) @forbid_nonstring_types(["bytes"]) def casefold(self): - result = self._array._str.casefold() + result = self._array._str_casefold() return self._wrap_result(result) _shared_docs[ diff --git a/pandas/core/strings/categorical.py b/pandas/core/strings/categorical.py index f7d7e564ca43e..e2a2cc731c0c3 100644 --- a/pandas/core/strings/categorical.py +++ b/pandas/core/strings/categorical.py @@ -3,10 +3,19 @@ import numpy as np from pandas.core.algorithms import take_1d -from pandas.core.strings.object_array import ObjectArrayMethods +from pandas.core.strings.object_array import ObjectStringArray +from pandas.core.arrays import Categorical -class CategoricalStringMethods(ObjectArrayMethods): +class CategoricalStringMethods(Categorical, ObjectStringArray): + """ + Extension array implementing _str methods for Categorical. + + We implement this just to avoid mucking up the inheritance chain + for Categorical. This inherits from ObjectStringArray just for + convenience. + """ + # Probably lots of room for improvement here. 
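A rough standalone sketch of the optimization ``_map`` performs below: apply the string function once per unique category, then rebuild the full-length result from the codes, filling missing entries (code ``-1``) with the NA value, the way ``take_1d`` does (plain NumPy stand-ins; the names are illustrative):

import numpy as np
import pandas as pd

cat = pd.Categorical(["low", "high", "low", None, "high"])

# Map over the (few) categories instead of every element.
mapped = np.array([s.upper() for s in cat.categories], dtype=object)

# Rebuild by taking with the codes; -1 marks a missing entry.
codes = cat.codes
result = np.empty(len(codes), dtype=object)
mask = codes == -1
result[~mask] = mapped[codes[~mask]]
result[mask] = np.nan
# result: ['LOW', 'HIGH', 'LOW', nan, 'HIGH']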
def _map(self, f, na_value=np.nan, dtype=np.dtype(object)): from pandas import Categorical @@ -14,9 +23,9 @@ def _map(self, f, na_value=np.nan, dtype=np.dtype(object)): categories = arr.categories codes = arr.codes - result = ObjectArrayMethods(categories)._map(f, na_value, dtype) + result = ObjectStringArray(categories)._map(f, na_value, dtype) return take_1d(result, codes, fill_value=na_value) - def get_dummies(self, sep="|"): + def _str_get_dummies(self, sep="|"): # sep may not be in categories. Just bail on this. - return ObjectArrayMethods(self._array.astype(str)).get_dummies(sep) + return ObjectStringArray(self.astype(str))._str_get_dummies(sep) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c97943da41cbf..c1851993e24a7 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,16 +14,22 @@ from pandas.core.dtypes.common import is_re, is_scalar from pandas.core.dtypes.missing import isna -from pandas.core.accessor import CachedAccessor from pandas.core.arrays.numpy_ import PandasArray from pandas.core.strings.base import BaseStringArrayMethods -class ObjectStringArray(PandasArray): +class ObjectStringArray(PandasArray, BaseStringArrayMethods): + """ + PandasArray subclass with _str methods. + + We don't want to put the _str methods on all PandasArrays + so we use this subclass with the BaseStringArrayMethods + mixin. + """ _default_na_value = np.nan def _str_map(self, f, na_value=None, dtype=None): - arr = self._array # object-dtype ndarray. + arr = self if dtype is None: dtype = np.dtype("object") if na_value is None: @@ -52,7 +58,7 @@ def _str_map(self, f, na_value=None, dtype=None): # FIXME: this should be totally avoidable raise e - def _str_g(x): + def g(x): # This type of fallback behavior can be removed once # we remove object-dtype .str accessor. 
try: @@ -60,7 +66,7 @@ def _str_g(x): except (TypeError, AttributeError): return na_value - return self._map(g, na_value=na_value, dtype=dtype) + return self._str_map(g, na_value=na_value, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: @@ -69,14 +75,14 @@ def _str_g(x): def _str_getitem(self, key): if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) + return self._str_slice(start=key.start, stop=key.stop, step=key.step) else: - return self.get(key) + return self._str_get(key) def _str_count(self, pat, flags=0): regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) - return self._map(f, dtype="int64") + return self._str_map(f, dtype="int64") def _str_pad(self, width, side="left", fillchar=" "): if side == "left": @@ -87,7 +93,7 @@ def _str_pad(self, width, side="left", fillchar=" "): f = lambda x: x.center(width, fillchar) else: # pragma: no cover raise ValueError("Invalid side") - return self._map(f) + return self._str_map(f) def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): if regex: @@ -111,15 +117,15 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): else: upper_pat = pat.upper() f = lambda x: upper_pat in x.upper() - return self._map(f, na, dtype=np.dtype("bool")) + return self._str_map(f, na, dtype=np.dtype("bool")) def _str_startswith(self, pat, na=None): f = lambda x: x.startswith(pat) - return self._map(f, na_value=na, dtype=np.dtype(bool)) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) - return self._map(f, na_value=na, dtype=np.dtype(bool)) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): # Check whether repl is valid (GH 13438, GH 15055) @@ -158,22 +164,22 @@ def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) - return self._map(f, dtype=str) + return self._str_map(f, dtype=str) def _str_repeat(self, repeats): if is_scalar(repeats): - def _str_scalar_rep(x): + def scalar_rep(x): try: return bytes.__mul__(x, repeats) except TypeError: return str.__mul__(x, repeats) - return self._map(scalar_rep, dtype=str) + return self._str_map(scalar_rep, dtype=str) else: from pandas.core.arrays.string_ import StringArray - def _str_rep(x, r): + def rep(x, r): if x is libmissing.NA: return x try: @@ -182,8 +188,8 @@ def _str_rep(x, r): return str.__mul__(x, r) repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(self._array), repeats, rep) - if isinstance(self._array, StringArray): + result = libops.vec_binop(np.asarray(self), repeats, rep) + if isinstance(self, StringArray): # Not going through map, so we have to do this here. 
result = StringArray._from_sequence(result) return result @@ -201,7 +207,7 @@ def _str_match( regex = re.compile(pat, flags=flags) f = lambda x: regex.match(x) is not None - return self._map(f, na_value=na, dtype=np.dtype(bool)) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_fullmatch( self, @@ -216,19 +222,19 @@ def _str_fullmatch( regex = re.compile(pat, flags=flags) f = lambda x: regex.fullmatch(x) is not None - return self._map(f, na_value=na, dtype=np.dtype(bool)) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_encode(self, encoding, errors="strict"): f = lambda x: x.encode(encoding, errors=errors) - return self._map(f, dtype=object) + return self._str_map(f, dtype=object) def _str_find(self, sub, start=0, end=None): - return self._find(sub, start, end, side="left") + return self._str_find_(sub, start, end, side="left") def _str_rfind(self, sub, start=0, end=None): - return self._find(sub, start, end, side="right") + return self._str_find_(sub, start, end, side="right") - def _str__find(self, sub, start, end, side): + def _str_find_(self, sub, start, end, side): if side == "left": method = "find" elif side == "right": @@ -240,58 +246,58 @@ def _str__find(self, sub, start, end, side): f = lambda x: getattr(x, method)(sub, start) else: f = lambda x: getattr(x, method)(sub, start, end) - return self._map(f, dtype="int64") + return self._str_map(f, dtype="int64") def _str_findall(self, pat, flags=0): regex = re.compile(pat, flags=flags) - return self._map(regex.findall, dtype="object") + return self._str_map(regex.findall, dtype="object") def _str_get(self, i): - def _str_f(x): + def f(x): if isinstance(x, dict): return x.get(i) elif len(x) > i >= -len(x): return x[i] return self._default_na_value - return self._map(f) + return self._str_map(f) def _str_index(self, sub, start=0, end=None): if end: f = lambda x: x.index(sub, start, end) else: f = lambda x: x.index(sub, start, end) - return self._map(f, dtype="int64") + return self._str_map(f, dtype="int64") def _str_rindex(self, sub, start=0, end=None): if end: f = lambda x: x.rindex(sub, start, end) else: f = lambda x: x.rindex(sub, start, end) - return self._map(f, dtype="int64") + return self._str_map(f, dtype="int64") def _str_join(self, sep): - return self._map(sep.join) + return self._str_map(sep.join) def _str_partition(self, sep, expand): - result = self._map(lambda x: x.partition(sep), dtype="object") + result = self._str_map(lambda x: x.partition(sep), dtype="object") return result def _str_rpartition(self, sep, expand): - return self._map(lambda x: x.rpartition(sep), dtype="object") + return self._str_map(lambda x: x.rpartition(sep), dtype="object") def _str_len(self): - return self._map(len, dtype="int64") + return self._str_map(len, dtype="int64") def _str_slice(self, start=None, stop=None, step=None): obj = slice(start, stop, step) - return self._map(lambda x: x[obj]) + return self._str_map(lambda x: x[obj]) def _str_slice_replace(self, start=None, stop=None, repl=None): if repl is None: repl = "" - def _str_f(x): + def f(x): if x[start:stop] == "": local_stop = start else: @@ -304,7 +310,7 @@ def _str_f(x): y += x[local_stop:] return y - return self._map(f) + return self._str_map(f) def _str_split(self, pat=None, n=-1, expand=False): if pat is None: @@ -321,26 +327,26 @@ def _str_split(self, pat=None, n=-1, expand=False): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - return self._map(f, dtype=object) + return self._str_map(f, dtype=object) def 
_str_rsplit(self, pat=None, n=-1): if n is None or n == 0: n = -1 f = lambda x: x.rsplit(pat, n) - return self._map(f, dtype="object") + return self._str_map(f, dtype="object") def _str_translate(self, table): - return self._map(lambda x: x.translate(table)) + return self._str_map(lambda x: x.translate(table)) def _str_wrap(self, width, **kwargs): kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) - return self._map(lambda s: "\n".join(tw.wrap(s))) + return self._str_map(lambda s: "\n".join(tw.wrap(s))) def _str_get_dummies(self, sep="|"): from pandas import Series - arr = Series(self._array).fillna("") + arr = Series(self).fillna("") try: arr = sep + arr + sep except TypeError: @@ -361,59 +367,59 @@ def _str_get_dummies(self, sep="|"): return dummies, tags2 def _str_upper(self): - return self._map(lambda x: x.upper()) + return self._str_map(lambda x: x.upper()) def _str_isalnum(self): - return self._map(str.isalnum, dtype="bool") + return self._str_map(str.isalnum, dtype="bool") def _str_isalpha(self): - return self._map(str.isalpha, dtype="bool") + return self._str_map(str.isalpha, dtype="bool") def _str_isdecimal(self): - return self._map(str.isdecimal, dtype="bool") + return self._str_map(str.isdecimal, dtype="bool") def _str_isdigit(self): - return self._map(str.isdigit, dtype="bool") + return self._str_map(str.isdigit, dtype="bool") def _str_islower(self): - return self._map(str.islower, dtype="bool") + return self._str_map(str.islower, dtype="bool") def _str_isnumeric(self): - return self._map(str.isnumeric, dtype="bool") + return self._str_map(str.isnumeric, dtype="bool") def _str_isspace(self): - return self._map(str.isspace, dtype="bool") + return self._str_map(str.isspace, dtype="bool") def _str_istitle(self): - return self._map(str.istitle, dtype="bool") + return self._str_map(str.istitle, dtype="bool") def _str_isupper(self): - return self._map(str.isupper, dtype="bool") + return self._str_map(str.isupper, dtype="bool") def _str_capitalize(self): - return self._map(str.capitalize) + return self._str_map(str.capitalize) def _str_casefold(self): - return self._map(str.casefold) + return self._str_map(str.casefold) def _str_title(self): - return self._map(str.title) + return self._str_map(str.title) def _str_swapcase(self): - return self._map(str.swapcase) + return self._str_map(str.swapcase) def _str_lower(self): - return self._map(str.lower) + return self._str_map(str.lower) def _str_normalize(self, form): f = lambda x: unicodedata.normalize(form, x) - return self._map(f) + return self._str_map(f) def _str_strip(self, to_strip=None): - return self._map(lambda x: x.strip(to_strip)) + return self._str_map(lambda x: x.strip(to_strip)) def _str_lstrip(self, to_strip=None): - return self._map(lambda x: x.lstrip(to_strip)) + return self._str_map(lambda x: x.lstrip(to_strip)) def _str_rstrip(self, to_strip=None): - return self._map(lambda x: x.rstrip(to_strip)) + return self._str_map(lambda x: x.rstrip(to_strip)) From 8d3aecd996636f6a947547fdd400fd5337505650 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 09:38:15 -0500 Subject: [PATCH 15/24] Move to arrays --- pandas/core/arrays/categorical.py | 19 +++++++- pandas/core/arrays/numpy_.py | 10 ++++- pandas/core/arrays/string_.py | 69 +++++++++++++++++++++++++--- pandas/core/strings/accessor.py | 10 ----- pandas/core/strings/categorical.py | 31 ------------- pandas/core/strings/object_array.py | 18 +++----- pandas/core/strings/string_array.py | 70 ----------------------------- 7 files changed, 97 
insertions(+), 130 deletions(-) delete mode 100644 pandas/core/strings/categorical.py delete mode 100644 pandas/core/strings/string_array.py diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ef69d6565cfeb..9210bd88362de 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -52,6 +52,7 @@ from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort +from pandas.core.strings.object_array import ObjectStringArrayMixin from pandas.io.formats import console @@ -177,7 +178,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject): +class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): """ Represent a categorical variable in classic R / S-plus fashion. @@ -2335,6 +2336,22 @@ def replace(self, to_replace, value, inplace: bool = False): if not inplace: return cat + # ------------------------------------------------------------------------ + # String methods interface + def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): + from pandas.core.arrays import PandasArray + + categories = self.categories + codes = self.codes + result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) + return take_1d(result, codes, fill_value=na_value) + + def _str_get_dummies(self, sep="|"): + # sep may not be in categories. Just bail on this. + from pandas.core.arrays import PandasArray + + return PandasArray(self.astype(str))._str_get_dummies(sep) + # The Series.cat accessor diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 61076132b24cd..0a299b3473ef0 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,6 +17,7 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin from pandas.core.construction import extract_array +from pandas.core.strings.object_array import ObjectStringArrayMixin class PandasDtype(ExtensionDtype): @@ -115,7 +116,10 @@ def itemsize(self) -> int: class PandasArray( - NDArrayBackedExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin + NDArrayBackedExtensionArray, + ExtensionOpsMixin, + NDArrayOperatorsMixin, + ObjectStringArrayMixin, ): """ A pandas ExtensionArray for NumPy data. 
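With ``ObjectStringArrayMixin`` mixed into ``PandasArray``, the accessor can dispatch straight to the array. A quick sketch of the pattern this series sets up (``_str_*`` is the private interface introduced here, so this assumes a build that includes these patches):

import pandas as pd

s = pd.Series(["a", "b", None])  # object dtype -> PandasArray

s.str.upper()          # accessor path, returns a Series
s.array._str_upper()   # what it dispatches to: same values, unwrapped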
@@ -398,6 +402,10 @@ def arithmetic_method(self, other): _create_comparison_method = _create_arithmetic_method + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + PandasArray._add_arithmetic_ops() PandasArray._add_comparison_ops() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5bf9838f6b964..6f3eed9c93301 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -6,7 +6,14 @@ from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import is_array_like, pandas_dtype +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) from pandas import compat from pandas.core import ops @@ -15,7 +22,7 @@ from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -# from pandas.core.strings.object_array import ObjectStringArray +from pandas.core.strings.object_array import ObjectStringArrayMixin if TYPE_CHECKING: import pyarrow # noqa: F401 @@ -96,10 +103,7 @@ def __from_arrow__( return StringArray._concat_same_type(results) -# Uhmmm, this is going to have to dispatch on the dtype if we want -# to share StringArray between python / arrow. - -class StringArray(PandasArray): +class StringArray(PandasArray, ObjectStringArrayMixin): """ Extension array for string data. @@ -350,6 +354,59 @@ def _add_arithmetic_ops(cls): cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) _create_comparison_method = _create_arithmetic_method + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = StringDtype.na_value + + def _str_map(self, f, na_value=None, dtype=None): + from pandas.arrays import BooleanArray, IntegerArray, StringArray + from pandas.core.arrays.string_ import StringDtype + + if dtype is None: + dtype = StringDtype() + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = self + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
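On a build with this ``_str_map``, the requested dtype decides the wrapping: integer and boolean results come back as the mask-aware ``IntegerArray``/``BooleanArray``, string results as ``StringArray``, so ``pd.NA`` survives the round trip. For example:

import pandas as pd

s = pd.Series(["a", pd.NA, "abc"], dtype="string")

s.str.len()      # Int64 (IntegerArray), pd.NA preserved
s.str.isalpha()  # boolean (BooleanArray), pd.NA preserved
s.str.upper()    # string (StringArray)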
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) StringArray._add_arithmetic_ops() diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 97de112114520..6578ab99ee885 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -24,9 +24,7 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core.arrays.numpy_ import PandasArray from pandas.core.base import NoNewAttributesMixin -from pandas.core.strings.object_array import ObjectStringArray _shared_docs: Dict[str, str] = dict() _cpython_optimized_encoders = ( @@ -150,19 +148,11 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays import Categorical - from pandas.core.strings.categorical import CategoricalStringMethods self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) self._is_string = isinstance(data.dtype, StringDtype) array = data.array - - if type(array) is PandasArray: - # wrap in an object proxy to get the str methods. - array = ObjectStringArray(array._ndarray) - elif isinstance(array, Categorical): - array = CategoricalStringMethods(array) self._array = array if isinstance(data, ABCSeries): diff --git a/pandas/core/strings/categorical.py b/pandas/core/strings/categorical.py deleted file mode 100644 index e2a2cc731c0c3..0000000000000 --- a/pandas/core/strings/categorical.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import cast - -import numpy as np - -from pandas.core.algorithms import take_1d -from pandas.core.strings.object_array import ObjectStringArray -from pandas.core.arrays import Categorical - - -class CategoricalStringMethods(Categorical, ObjectStringArray): - """ - Extension array implementing _str methods for Categorical. - - We implement this just to avoid mucking up the inheritance chain - for Categorical. This inherits from ObjectStringArray just for - convenience. - """ - # Probably lots of room for improvement here. - def _map(self, f, na_value=np.nan, dtype=np.dtype(object)): - from pandas import Categorical - - arr = cast(Categorical, self._array) - - categories = arr.categories - codes = arr.codes - result = ObjectStringArray(categories)._map(f, na_value, dtype) - return take_1d(result, codes, fill_value=na_value) - - def _str_get_dummies(self, sep="|"): - # sep may not be in categories. Just bail on this. - return ObjectStringArray(self.astype(str))._str_get_dummies(sep) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c1851993e24a7..d47513b18d6b9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -14,31 +14,27 @@ from pandas.core.dtypes.common import is_re, is_scalar from pandas.core.dtypes.missing import isna -from pandas.core.arrays.numpy_ import PandasArray from pandas.core.strings.base import BaseStringArrayMethods -class ObjectStringArray(PandasArray, BaseStringArrayMethods): +class ObjectStringArrayMixin(BaseStringArrayMethods): """ - PandasArray subclass with _str methods. - - We don't want to put the _str methods on all PandasArrays - so we use this subclass with the BaseStringArrayMethods - mixin. + String Methods operating on object-dtype ndarrays. 
""" - _default_na_value = np.nan + + _str_na_value = np.nan def _str_map(self, f, na_value=None, dtype=None): arr = self if dtype is None: dtype = np.dtype("object") if na_value is None: - na_value = self._default_na_value + na_value = self._str_na_value if not len(arr): return np.ndarray(0, dtype=dtype) if na_value is None: - na_value = self._default_na_value + na_value = self._str_na_value if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -258,7 +254,7 @@ def f(x): return x.get(i) elif len(x) > i >= -len(x): return x[i] - return self._default_na_value + return self._str_na_value return self._str_map(f) diff --git a/pandas/core/strings/string_array.py b/pandas/core/strings/string_array.py deleted file mode 100644 index 269e85b71161a..0000000000000 --- a/pandas/core/strings/string_array.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Type, Union - -import numpy as np - -from pandas._libs import lib, missing as libmissing - -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype, -) - -from pandas.core.missing import isna -from pandas.core.strings.object_array import ObjectArrayMethods - - -class StringArrayMethods(ObjectArrayMethods): - _default_na_value = libmissing.NA - - def _map(self, f, na_value=None, dtype=None): - from pandas.arrays import BooleanArray, IntegerArray, StringArray - from pandas.core.arrays.string_ import StringDtype - - if dtype is None: - dtype = StringDtype() - if na_value is None: - na_value = self._default_na_value - - arr = self._array - mask = isna(arr) - - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
- return lib.map_infer_mask(arr, f, mask.view("uint8")) From d11c2baabc6c23ec85b6f3b0cdd3ad5df578cac8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 09:43:20 -0500 Subject: [PATCH 16/24] Fixup types --- pandas/core/strings/base.py | 4 ++-- pandas/core/strings/object_array.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index d16bec1053f95..89de0fd531e89 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -23,9 +23,9 @@ class BaseStringArrayMethods(abc.ABC): def _str_getitem(self, key): if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) + return self._str_slice(start=key.start, stop=key.stop, step=key.step) else: - return self.get(key) + return self._str_get(key) @abc.abstractmethod def _str_count(self, pat, flags=0): diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index d47513b18d6b9..b9d9a4c8e3828 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -24,6 +24,10 @@ class ObjectStringArrayMixin(BaseStringArrayMethods): _str_na_value = np.nan + def __len__(self): + # For typing, _str_map relies on the object being sized. + raise NotImplementedError + def _str_map(self, f, na_value=None, dtype=None): arr = self if dtype is None: From 349e281884f99740c555d9b80cd9af8e1df5ddc1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 09:51:43 -0500 Subject: [PATCH 17/24] test coverage --- pandas/core/strings/object_array.py | 8 -------- pandas/tests/test_strings.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index b9d9a4c8e3828..af097a283818e 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -37,8 +37,6 @@ def _str_map(self, f, na_value=None, dtype=None): if not len(arr): return np.ndarray(0, dtype=dtype) - if na_value is None: - na_value = self._str_na_value if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -73,12 +71,6 @@ def g(x): result = lib.maybe_convert_objects(result) return result - def _str_getitem(self, key): - if isinstance(key, slice): - return self._str_slice(start=key.start, stop=key.stop, step=key.step) - else: - return self._str_get(key) - def _str_count(self, pat, flags=0): regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index df0e775e8750b..6ad55639ae5d8 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2541,6 +2541,18 @@ def test_split(self): exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) + @pytest.mark.parametrize("dtype", [object, "string"]) + @pytest.mark.parametrize("method", ["split", "rsplit"]) + def test_split_n(self, dtype, method): + s = pd.Series(["a b", pd.NA, "b c"], dtype=dtype) + expected = pd.Series([["a", "b"], pd.NA, ["b", "c"]]) + + result = getattr(s.str, method)(" ", n=None) + tm.assert_series_equal(result, expected) + + result = getattr(s.str, method)(" ", n=0) + tm.assert_series_equal(result, expected) + def test_rsplit(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_") From b7ab130c0ac041aaeb6be826b31b4b585e492082 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 
11:21:10 -0500 Subject: [PATCH 18/24] fixup --- pandas/core/strings/accessor.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6578ab99ee885..a921250bf3eb6 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -235,7 +235,12 @@ def __iter__(self): g = self.get(i) def _wrap_result( - self, result, name=None, expand=None, fill_value=np.nan, returns_string=True, + self, + result, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, ): from pandas import Index, MultiIndex @@ -1888,7 +1893,12 @@ def get_dummies(self, sep="|"): # we need to cast to Series of strings as only that has all # methods available for making the dummies... result, name = self._array._str_get_dummies(sep) - return self._wrap_result(result, name=name, expand=True, returns_string=False,) + return self._wrap_result( + result, + name=name, + expand=True, + returns_string=False, + ) @forbid_nonstring_types(["bytes"]) def translate(self, table): From 6dcd44e4fc5be091fca56d5cd41bc3e32ecb0043 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 23 Sep 2020 08:13:37 -0500 Subject: [PATCH 19/24] update docstring --- pandas/core/strings/base.py | 9 +++++---- pandas/core/strings/object_array.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 89de0fd531e89..3d0e6bf69d6f9 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -8,14 +8,15 @@ class BaseStringArrayMethods(abc.ABC): """ - Base class for array _str accessor. + Base class for extension arrays implementing string methods. - This is where ExtensionArrays can override the implementation of - Series.str.. The rough layout is + This is our ExtensionArrays can override the implementation of + Series.str.. We don't currenlty expect this to work with + 3rd-party extension arrays. * User calls Series.str. * pandas extracts the extension array from the Series - * pandas calls ``extension_array._str.(*args, **kwargs)`` + * pandas calls ``extension_array._str_(*args, **kwargs)`` * pandas wraps the result, to return to the user. See :ref:`Series.str` for the docstring of each method. diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index af097a283818e..a29d84edd3a77 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -29,6 +29,21 @@ def __len__(self): raise NotImplementedError def _str_map(self, f, na_value=None, dtype=None): + """ + Map a callable over valid element of the array. + + Parameters + ---------- + f : Callable + A function to call on each non-NA element. + na_value : Scalar, optional + The value to set for NA values. Might also be used for the + fill value if the callable `f` raises an exception. + This defaults to ``self._str_na_value`` which is ``np.nan`` + for object-dtype and Categorical and ``pd.NA`` for StringArray. + dtype : Dtype, optional + The dtype of the result array. 
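To make the ``na_value`` parameter concrete: methods like ``_str_startswith`` pass the user's ``na`` through, which on object-dtype data also controls the result dtype. For example:

import pandas as pd

s = pd.Series(["apple", None, "avocado"])

s.str.startswith("a")            # NA stays NaN -> object result
s.str.startswith("a", na=False)  # na_value=False -> plain bool dtype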
+ """ arr = self if dtype is None: dtype = np.dtype("object") From efb3e3df28b6320593d9c22937af0678ad4921fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Sep 2020 09:43:15 -0500 Subject: [PATCH 20/24] document current implementation --- pandas/core/arrays/categorical.py | 3 +++ pandas/core/strings/__init__.py | 28 ++++++++++++++++++++++++++++ pandas/core/strings/accessor.py | 2 ++ 3 files changed, 33 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c76b5314ee937..0930183f0bf2f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2316,6 +2316,9 @@ def replace(self, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): + # Optimization to apply the callable `f` to the categories once + # and rebuild the result by `take`ing from the result with the codes. + # Returns the same type as the object-dtype impelmentation though. from pandas.core.arrays import PandasArray categories = self.categories diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index bf0ac8ef872ca..ec44c9d7053e7 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -1,3 +1,31 @@ +""" +Implementation of pandas.Series.str and its interface. + +* strings.accessor.StringMethods : Accessor for Series.str +* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods + +Most methods on the StringMethods accessor follow the pattern: + + 1. extract the array from the series (or index) + 2. Call that array's impelmentation of the string method + 3. Wrap the result (in a Series, index, or DataFrame) + +Pandas extension arrays implementing string methods should inherit from +pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining +the various string methods. To avoid namespace clashes and pollution, +these are prefixed with `_str_`. So ``Series.str.upper()`` calls +``Series.array._str_upper()``. The interface isn't currently public +to other string extension arrays. +""" +# Pandas current implementation is in ObjectStringArrayMixin. This is designed +# to work on object-dtype ndarrays. +# +# BaseStringArrayMethods +# - ObjectStringArrayMixin +# - StringArray +# - PandasArray +# - Categorical + from .accessor import StringMethods from .base import BaseStringArrayMethods diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a921250bf3eb6..2aec354223da0 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -140,6 +140,8 @@ class StringMethods(NoNewAttributesMixin): dtype: object """ + # Note: see the docstring in pandas.core.strings.__init__ + # for an explanation of the implementation. # TODO: Dispatch all the methods # Currently the following are not dispatched to the array # * cat From 0da70311b4741a5588213aeec510d3a50ec179ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Sep 2020 09:44:06 -0500 Subject: [PATCH 21/24] typo --- pandas/core/strings/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 3d0e6bf69d6f9..08064244a2ff9 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -10,8 +10,8 @@ class BaseStringArrayMethods(abc.ABC): """ Base class for extension arrays implementing string methods. 
- This is our ExtensionArrays can override the implementation of - Series.str.. We don't currenlty expect this to work with + This is where our ExtensionArrays can override the implementation of + Series.str.. We don't expect this to work with 3rd-party extension arrays. * User calls Series.str. From d681f99859a17a2ab214a4c47143cdd9241faac1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 25 Sep 2020 15:38:19 -0500 Subject: [PATCH 22/24] fixup --- pandas/core/arrays/categorical.py | 2 +- pandas/core/strings/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f36de9cb972ee..1e06bf1b6ce96 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2312,7 +2312,7 @@ def replace(self, to_replace, value, inplace: bool = False): def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. - # Returns the same type as the object-dtype impelmentation though. + # Returns the same type as the object-dtype implementation though. from pandas.core.arrays import PandasArray categories = self.categories diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index ec44c9d7053e7..243250f0360a0 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -7,7 +7,7 @@ Most methods on the StringMethods accessor follow the pattern: 1. extract the array from the series (or index) - 2. Call that array's impelmentation of the string method + 2. Call that array's implementation of the string method 3. Wrap the result (in a Series, index, or DataFrame) Pandas extension arrays implementing string methods should inherit from From 457c1122429af282983b302a62be4822b5eaef6b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Sep 2020 08:20:36 -0500 Subject: [PATCH 23/24] fixup --- pandas/core/arrays/string_.py | 1 - pandas/core/strings.py | 3650 --------------------------------- 2 files changed, 3651 deletions(-) delete mode 100644 pandas/core/strings.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7ad97c546f720..0db5fadce614e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -368,7 +368,6 @@ def _str_map(self, f, na_value=None, dtype=None): na_value = self.dtype.na_value mask = isna(self) - arr = self arr = np.asarray(self) if is_integer_dtype(dtype) or is_bool_dtype(dtype): diff --git a/pandas/core/strings.py b/pandas/core/strings.py deleted file mode 100644 index 4467c96041dc7..0000000000000 --- a/pandas/core/strings.py +++ /dev/null @@ -1,3650 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from 
-
-from pandas.core.algorithms import take_1d
-from pandas.core.base import NoNewAttributesMixin
-from pandas.core.construction import extract_array
-
-if TYPE_CHECKING:
-    from pandas.arrays import StringArray
-
-_cpython_optimized_encoders = (
-    "utf-8",
-    "utf8",
-    "latin-1",
-    "latin1",
-    "iso-8859-1",
-    "mbcs",
-    "ascii",
-)
-_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
-
-_shared_docs: Dict[str, str] = dict()
-
-
-def cat_core(list_of_columns: List, sep: str):
-    """
-    Auxiliary function for :meth:`str.cat`
-
-    Parameters
-    ----------
-    list_of_columns : list of numpy arrays
-        List of arrays to be concatenated with sep;
-        these arrays may not contain NaNs!
-    sep : string
-        The separator string for concatenating the columns.
-
-    Returns
-    -------
-    nd.array
-        The concatenation of list_of_columns with sep.
-    """
-    if sep == "":
-        # no need to interleave sep if it is empty
-        arr_of_cols = np.asarray(list_of_columns, dtype=object)
-        return np.sum(arr_of_cols, axis=0)
-    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
-    list_with_sep[::2] = list_of_columns
-    arr_with_sep = np.asarray(list_with_sep, dtype=object)
-    return np.sum(arr_with_sep, axis=0)
-
-
-def cat_safe(list_of_columns: List, sep: str):
-    """
-    Auxiliary function for :meth:`str.cat`.
-
-    Same signature as cat_core, but handles TypeErrors in concatenation, which
-    happen if the arrays in list_of columns have the wrong dtypes or content.
-
-    Parameters
-    ----------
-    list_of_columns : list of numpy arrays
-        List of arrays to be concatenated with sep;
-        these arrays may not contain NaNs!
-    sep : string
-        The separator string for concatenating the columns.
-
-    Returns
-    -------
-    nd.array
-        The concatenation of list_of_columns with sep.
-    """
-    try:
-        result = cat_core(list_of_columns, sep)
-    except TypeError:
-        # if there are any non-string values (wrong dtype or hidden behind
-        # object dtype), np.sum will fail; catch and return with better message
-        for column in list_of_columns:
-            dtype = lib.infer_dtype(column, skipna=True)
-            if dtype not in ["string", "empty"]:
-                raise TypeError(
-                    "Concatenation requires list-likes containing only "
-                    "strings (or missing values). Offending values found in "
-                    f"column {dtype}"
-                ) from None
-    return result
-
-
-def _na_map(f, arr, na_result=None, dtype=np.dtype(object)):
-    if is_extension_array_dtype(arr.dtype):
-        if na_result is None:
-            na_result = libmissing.NA
-        # just StringDtype
-        arr = extract_array(arr)
-        return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
-    if na_result is None:
-        na_result = np.nan
-    return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
-
-
-def _map_stringarray(
-    func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype
-) -> ArrayLike:
-    """
-    Map a callable over valid elements of a StringArray.
-
-    Parameters
-    ----------
-    func : Callable[[str], Any]
-        Apply to each valid element.
-    arr : StringArray
-    na_value : Any
-        The value to use for missing values. By default, this is
-        the original value (NA).
-    dtype : Dtype
-        The result dtype to use. Specifying this avoids an intermediate
-        object-dtype allocation.
-
-    Returns
-    -------
-    ArrayLike
-        An ExtensionArray for integer or string dtypes, otherwise
-        an ndarray.
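The interleaving trick in ``cat_core`` above leans on ``np.sum`` over an object-dtype array, where ``+`` is elementwise string concatenation. The same idea on hypothetical toy data:

    import numpy as np

    cols = [np.array(["a", "b"], dtype=object), np.array(["1", "2"], dtype=object)]
    sep = "-"

    # Interleave the separator between the columns: [col0, sep, col1].
    with_sep = [sep] * (2 * len(cols) - 1)
    with_sep[::2] = cols

    # Object-dtype np.sum reduces with elementwise +, i.e. concatenation.
    print(np.sum(np.asarray(with_sep, dtype=object), axis=0))  # ['a-1' 'b-2']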
- - """ - from pandas.arrays import BooleanArray, IntegerArray, StringArray - - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) - - -def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=np.dtype(object)): - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" - ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return _map_object(g, arr, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - else: - return lib.map_infer(arr, f) - - -def str_count(arr, pat, flags=0): - """ - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. - flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. - - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. 
- - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - """ - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. - - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. - - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. - - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. - - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. - - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. 
- - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) - >>> s - 0 bat - 1 bear - 2 caT - 3 NaN - dtype: object - - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.endswith('t', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.endswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - r""" - Replace each occurrence of pattern/regex in the Series/Index. - - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. - - Parameters - ---------- - pat : str or compiled regex - String can be a character sequence or regular expression. - repl : str or callable - Replacement string or a callable. The callable is passed the regex - match object and must return a replacement string to be used. 
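The case-insensitive literal branch of ``str_contains`` above avoids compiling a regex by uppercasing both the pattern and the values first; roughly equivalent, via the public API:

    import pandas as pd

    s = pd.Series(["Mouse", "dog", None])
    pat = "ous"

    # Same result as s.str.contains(pat, case=False, regex=False): a plain
    # substring test on uppercased values instead of the regex engine.
    print(s.str.upper().str.contains(pat.upper(), regex=False))
    # 0 True, 1 False, 2 NaN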
- See :func:`re.sub`. - n : int, default -1 (all) - Number of replacements to make from start. - case : bool, default None - Determines if replace is case sensitive: - - - If True, case sensitive (the default if `pat` is a string) - - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex. - - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled - regex. - regex : bool, default True - Determines if assumes the passed-in pattern is a regular expression: - - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. - - Returns - ------- - Series or Index of object - A copy of the object with all matching occurrences of `pat` replaced by - `repl`. - - Raises - ------ - ValueError - * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex - * if `pat` is a compiled regex and `case` or `flags` is set - - Notes - ----- - When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled - regex will raise an error. - - Examples - -------- - When `pat` is a string and `regex` is True (the default), the given `pat` - is compiled as a regex. When `repl` is a string, it replaces matching - regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are - left as is: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) - 0 bao - 1 baz - 2 NaN - dtype: object - - When `pat` is a string and `regex` is False, every `pat` is replaced with - `repl` as with :meth:`str.replace`: - - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) - 0 bao - 1 fuz - 2 NaN - dtype: object - - When `repl` is a callable, it is called on every `pat` using - :func:`re.sub`. The callable should expect one positional argument - (a regex object) and return a string. 
- - To get the idea: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) - 0 oo - 1 uz - 2 NaN - dtype: object - - Reverse every lowercase alphabetic word: - - >>> repl = lambda m: m.group(0)[::-1] - >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) - 0 oof 123 - 1 rab zab - 2 NaN - dtype: object - - Using regex groups (extract second group and swap case): - - >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) - 0 tWO - 1 bAR - dtype: object - - Using a compiled regex with flags - - >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') - 0 foo - 1 bar - 2 NaN - dtype: object - """ - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - - is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) - else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - - return _na_map(f, arr, dtype=str) - - -def str_repeat(arr, repeats): - """ - Duplicate each string in the Series or Index. - - Parameters - ---------- - repeats : int or sequence of int - Same value for all (int) or different value per (sequence). - - Returns - ------- - Series or Index of object - Series or Index of repeated string objects specified by - input parameter repeats. - - Examples - -------- - >>> s = pd.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return _na_map(scalar_rep, arr, dtype=str) - else: - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(arr), repeats, rep) - return result - - -def str_match( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string starts with a match of a regular expression. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. 
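Note the branch in ``str_replace`` above: only a compiled pattern, a multi-character pattern, flags, or a callable replacement force the regex engine, while a single-character literal goes through plain ``str.replace``. The two branches side by side:

    import re

    pat, repl, n = "f.", "ba", -1

    # Regex branch: compile once, then re.sub per element (count=0 means all).
    compiled = re.compile(pat)
    print(compiled.sub(repl, "foo", count=n if n >= 0 else 0))  # 'bao'

    # Literal fast path for a single character and a string replacement.
    print("foo".replace("f", "b", n))  # 'boo'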
- - Returns - ------- - Series/array of boolean values - - See Also - -------- - fullmatch : Stricter matching that requires the entire string to match. - contains : Analogous, but less strict, relying on re.search instead of - re.match. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_fullmatch( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string entirely matches a regular expression. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - match : Similar, but also returns `True` when only a *prefix* of the string - matches the regular expression. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _groups_or_na_fun(regex): - """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. - if arr.dtype.name == "string": - return "string" - else: - return object - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.empty: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. 
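The contrast drawn between ``str_match`` and ``str_fullmatch`` above comes straight from the ``re`` module: ``re.match`` anchors only at the start, while ``re.fullmatch`` requires the entire string to match.

    import re

    regex = re.compile(r"ab")

    print(regex.match("abc") is not None)      # True  (a prefix suffices)
    print(regex.fullmatch("abc") is not None)  # False (whole string required)
    print(regex.fullmatch("ab") is not None)   # True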
- - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the - first match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that - modify regular expression matching for things like case, - spaces, etc. For more details, see :mod:`re`. - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group - or DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series or Index - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. The dtype of each result - column is always object, even when no match is found. If - ``expand=False`` and pat has only one capture group, then - return a Series (if subject is a Series) or Index (if subject - is an Index). - - See Also - -------- - extractall : Returns all matches (not just the first match). - - Examples - -------- - A pattern with two groups will return a DataFrame with two columns. - Non-matches will be NaN. - - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern may contain optional groups. - - >>> s.str.extract(r'([ab])?(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN 3 - - Named groups will become column names in the result. - - >>> s.str.extract(r'(?P[ab])(?P\d)') - letter digit - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern with one group will return a DataFrame with one column - if expand=True. - - >>> s.str.extract(r'[ab](\d)', expand=True) - 0 - 0 1 - 1 2 - 2 NaN - - A pattern with one group will return a Series if expand=False. - - >>> s.str.extract(r'[ab](\d)', expand=False) - 0 1 - 1 2 - 2 NaN - dtype: object - """ - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - if expand: - return _str_extract_frame(arr._orig, pat, flags=flags) - else: - result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) - - -def str_extractall(arr, pat, flags=0): - r""" - Extract capture groups in the regex `pat` as columns in DataFrame. - - For each subject string in the Series, extract groups from all - matches of regular expression pat. When each subject string in the - Series has exactly one match, extractall(pat).xs(0, level='match') - is the same as extract(pat). - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - A ``re`` module flag, for example ``re.IGNORECASE``. 
These allow - to modify regular expression matching for things like case, spaces, - etc. Multiple flags can be combined with the bitwise OR operator, - for example ``re.IGNORECASE | re.MULTILINE``. - - Returns - ------- - DataFrame - A ``DataFrame`` with one row for each match, and one column for each - group. Its rows have a ``MultiIndex`` with first levels that come from - the subject ``Series``. The last level is named 'match' and indexes the - matches in each item of the ``Series``. Any capture group names in - regular expression pat will be used for column names; otherwise capture - group numbers will be used. - - See Also - -------- - extract : Returns first match only (not all matches). - - Examples - -------- - A pattern with one group will return a DataFrame with one column. - Indices with no matches will not appear in the result. - - >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) - >>> s.str.extractall(r"[ab](\d)") - 0 - match - A 0 1 - 1 2 - B 0 1 - - Capture group names are used for column names of the result. - - >>> s.str.extractall(r"[ab](?P\d)") - digit - match - A 0 1 - 1 2 - B 0 1 - - A pattern with two groups will return a DataFrame with two columns. - - >>> s.str.extractall(r"(?P[ab])(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - - Optional groups that do not match are NaN in the result. - - >>> s.str.extractall(r"(?P[ab])?(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - C 0 NaN 1 - """ - regex = re.compile(pat, flags=flags) - # the regex must contain capture groups. - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result - - -def str_get_dummies(arr, sep="|"): - """ - Return DataFrame of dummy/indicator variables for Series. - - Each string in Series is split by sep and returned as a DataFrame - of dummy/indicator variables. - - Parameters - ---------- - sep : str, default "|" - String to split on. - - Returns - ------- - DataFrame - Dummy variables corresponding to values of the Series. - - See Also - -------- - get_dummies : Convert categorical variable into dummy/indicator - variables. 
- - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. - Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. 
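The separator-wrapping in ``str_get_dummies`` above is what keeps a plain substring test exact: ``'|a|'`` can only match the complete tag ``'a'``, never a partial one. The core idea in isolation:

    values = ["a|b", "a", "a|c"]
    sep = "|"

    wrapped = [sep + v + sep for v in values]
    tags = sorted({t for v in values for t in v.split(sep)} - {""})

    # Membership of '|tag|' in the wrapped string marks the dummy column.
    print(tags)  # ['a', 'b', 'c']
    print([[int(sep + t + sep in w) for t in tags] for w in wrapped])
    # [[1, 1, 0], [1, 0, 0], [1, 0, 1]]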
- - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - """ - Return indexes in each strings in the Series/Index where the - substring is fully contained between [start:end]. Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind``. - - Returns - ------- - Series or Index - Indexes where substring is found. - """ - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. 
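``str_find`` and ``str_index`` above each fold their left/right variants into one body by looking the string method up by name; the pattern by itself:

    def find_one(x: str, sub: str, side: str = "left") -> int:
        # str.find and str.rfind share a signature, so getattr can
        # select the side at call time.
        method = "find" if side == "left" else "rfind"
        return getattr(x, method)(sub)

    print(find_one("abcabc", "b"))                # 1
    print(find_one("abcabc", "b", side="right"))  # 4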
- Series.str.center : Fills both sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. - - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. - - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. 
- stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. - - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. - - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. - expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). 
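The ``local_stop`` adjustment in ``str_slice_replace`` above handles slices that select nothing (for example, ``start`` beyond the end of a short string): splicing at ``start`` instead of ``stop`` keeps characters from being dropped. The per-element helper, extracted:

    def slice_replace(x: str, start=None, stop=None, repl=""):
        # An empty slice means start and stop collapse to the same spot;
        # resume copying at start so no text is swallowed.
        local_stop = start if x[start:stop] == "" else stop
        y = ""
        if start is not None:
            y += x[:start]
        y += repl
        if stop is not None:
            y += x[local_stop:]
        return y

    print(slice_replace("a", start=1, stop=3, repl="X"))      # 'aX'
    print(slice_replace("abcde", start=1, stop=3, repl="X"))  # 'aXde'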
- - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.encode`. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. 
- However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." 
- ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string, fill_value=na) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - - return do_copy - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - def __init__(self, data): - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). 
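A stripped-down model of the gate that ``forbid_nonstring_types`` above installs on each accessor method (toy class and names are hypothetical):

    from functools import wraps

    def forbid(forbidden, name=None):
        allowed = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
            forbidden or []
        )

        def deco(func):
            func_name = name or func.__name__

            @wraps(func)
            def wrapper(self, *args, **kwargs):
                # Reject inferred dtypes this particular method is not meant for.
                if self._inferred_dtype not in allowed:
                    raise TypeError(
                        f"Cannot use .str.{func_name} with values of "
                        f"inferred dtype '{self._inferred_dtype}'."
                    )
                return func(self, *args, **kwargs)

            return wrapper

        return deco

    class Toy:
        def __init__(self, inferred_dtype):
            self._inferred_dtype = inferred_dtype

        @forbid(["bytes"])
        def upper(self):
            return "ok"

    print(Toy("string").upper())  # 'ok'; Toy("bytes").upper() raises TypeError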
- - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - - from pandas import Index, MultiIndex, Series - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... - if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. 
- if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import DataFrame, Series - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. - if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... 
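The padding step in ``_wrap_result`` above (GH 18450) is what lets rows of unequal length line up as DataFrame columns when ``expand=True``; the list comprehension in isolation:

    import numpy as np

    rows = [["a", "b"], [np.nan]]

    # All-NaN (or empty) rows are repeated out to the longest row's length;
    # real rows pass through unchanged.
    max_len = max(len(x) for x in rows)
    print([x * max_len if len(x) == 0 or x[0] is np.nan else x for x in rows])
    # [['a', 'b'], [nan, nan]]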
- if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. - - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. - - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. 
- - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. - - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." - ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. 
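-                # (Illustrative: concatenation may produce values, e.g. "aa",
-                # that are not among the original categories, so dtype=None
-                # lets the Series constructor infer them.)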
- dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... ) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. 
- - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - f = lambda x: x.partition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - f = lambda x: x.rpartition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @copy(str_get) - def get(self, i): - result = str_get(self._parent, i) - return self._wrap_result(result) - - @copy(str_join) - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - result = str_join(self._parent, sep) - return self._wrap_result(result) - - @copy(str_contains) - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains( - self._parent, pat, case=case, flags=flags, na=na, regex=regex - ) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_match) - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): - result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_fullmatch) - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): - result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_replace) - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace( - self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @copy(str_repeat) - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - result = str_repeat(self._parent, repeats) - return 
self._wrap_result(result)
-
-    @copy(str_pad)
-    @forbid_nonstring_types(["bytes"])
-    def pad(self, width, side="left", fillchar=" "):
-        result = str_pad(self._parent, width, side=side, fillchar=fillchar)
-        return self._wrap_result(result)
-
-    _shared_docs[
-        "str_pad"
-    ] = """
-    Pad %(side)s side of strings in the Series/Index.
-
-    Equivalent to :meth:`str.%(method)s`.
-
-    Parameters
-    ----------
-    width : int
-        Minimum width of resulting string; additional characters will be filled
-        with ``fillchar``.
-    fillchar : str
-        Additional character for filling, default is whitespace.
-
-    Returns
-    -------
-    filled : Series/Index of objects.
-    """
-
-    @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
-    @forbid_nonstring_types(["bytes"])
-    def center(self, width, fillchar=" "):
-        return self.pad(width, side="both", fillchar=fillchar)
-
-    @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
-    @forbid_nonstring_types(["bytes"])
-    def ljust(self, width, fillchar=" "):
-        return self.pad(width, side="right", fillchar=fillchar)
-
-    @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
-    @forbid_nonstring_types(["bytes"])
-    def rjust(self, width, fillchar=" "):
-        return self.pad(width, side="left", fillchar=fillchar)
-
-    @forbid_nonstring_types(["bytes"])
-    def zfill(self, width):
-        """
-        Pad strings in the Series/Index by prepending '0' characters.
-
-        Strings in the Series/Index are padded with '0' characters on the
-        left of the string to reach a total string length `width`. Strings
-        in the Series/Index with length greater than or equal to `width` are
-        unchanged.
-
-        Parameters
-        ----------
-        width : int
-            Minimum length of resulting string; strings with length less
-            than `width` will be prepended with '0' characters.
-
-        Returns
-        -------
-        Series/Index of objects.
-
-        See Also
-        --------
-        Series.str.rjust : Fills the left side of strings with an arbitrary
-            character.
-        Series.str.ljust : Fills the right side of strings with an arbitrary
-            character.
-        Series.str.pad : Fills the specified sides of strings with an arbitrary
-            character.
-        Series.str.center : Fills both sides of strings with an arbitrary
-            character.
-
-        Notes
-        -----
-        Differs from :meth:`str.zfill` which has special handling
-        for '+'/'-' in the string.
-
-        Examples
-        --------
-        >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
-        >>> s
-        0      -1
-        1       1
-        2    1000
-        3      10
-        4     NaN
-        dtype: object
-
-        Note that ``10`` and ``NaN`` are not strings, therefore they are
-        converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
-        regular character and the zero is added to the left of it
-        (:meth:`str.zfill` would have moved it to the left). ``1000``
-        remains unchanged as it is longer than `width`.
-
-        >>> s.str.zfill(3)
-        0     0-1
-        1     001
-        2    1000
-        3     NaN
-        4     NaN
-        dtype: object
-        """
-        result = str_pad(self._parent, width, side="left", fillchar="0")
-        return self._wrap_result(result)
-
-    @copy(str_slice)
-    def slice(self, start=None, stop=None, step=None):
-        result = str_slice(self._parent, start, stop, step)
-        return self._wrap_result(result)
-
-    @copy(str_slice_replace)
-    @forbid_nonstring_types(["bytes"])
-    def slice_replace(self, start=None, stop=None, repl=None):
-        result = str_slice_replace(self._parent, start, stop, repl)
-        return self._wrap_result(result)
-
-    @copy(str_decode)
-    def decode(self, encoding, errors="strict"):
-        # need to allow bytes here
-        result = str_decode(self._parent, encoding, errors)
-        # TODO: Not sure how to handle this.
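-        # (Presumably: wrapping with returns_string=False below keeps the
-        # decoded result as object dtype rather than casting it to "string".)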
- return self._wrap_result(result, returns_string=False) - - @copy(str_encode) - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % dict( - side="left and right sides", method="strip", position="leading and trailing" - ) - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="both") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="left side", method="lstrip", position="leading") - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="left") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="right side", method="rstrip", position="trailing") - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="right") - return self._wrap_result(result) - - @copy(str_wrap) - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - result = str_wrap(self._parent, width, **kwargs) - return self._wrap_result(result) - - @copy(str_get_dummies) - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - # we need to cast to Series of strings as only that has all - # methods available for making the dummies... 
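-        # Illustrative example: pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
-        # would produce integer indicator columns 'a', 'b' and 'c', with the
-        # missing row encoded as all zeros.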
-        data = self._orig.astype(str) if self._is_categorical else self._parent
-        result, name = str_get_dummies(data, sep)
-        return self._wrap_result(
-            result,
-            use_codes=(not self._is_categorical),
-            name=name,
-            expand=True,
-            returns_string=False,
-        )
-
-    @copy(str_translate)
-    @forbid_nonstring_types(["bytes"])
-    def translate(self, table):
-        result = str_translate(self._parent, table)
-        return self._wrap_result(result)
-
-    count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False)
-    startswith = _pat_wrapper(
-        str_startswith, na=True, name="startswith", returns_string=False
-    )
-    endswith = _pat_wrapper(
-        str_endswith, na=True, name="endswith", returns_string=False
-    )
-    findall = _pat_wrapper(
-        str_findall, flags=True, name="findall", returns_string=False
-    )
-
-    @copy(str_extract)
-    @forbid_nonstring_types(["bytes"])
-    def extract(self, pat, flags=0, expand=True):
-        return str_extract(self, pat, flags=flags, expand=expand)
-
-    @copy(str_extractall)
-    @forbid_nonstring_types(["bytes"])
-    def extractall(self, pat, flags=0):
-        return str_extractall(self._orig, pat, flags=flags)
-
-    _shared_docs[
-        "find"
-    ] = """
-    Return %(side)s indexes in each string in the Series/Index.
-
-    Each of the returned indexes corresponds to the position where the
-    substring is fully contained between [start:end]. Return -1 on
-    failure. Equivalent to standard :meth:`str.%(method)s`.
-
-    Parameters
-    ----------
-    sub : str
-        Substring being searched.
-    start : int
-        Left edge index.
-    end : int
-        Right edge index.
-
-    Returns
-    -------
-    Series or Index of int.
-
-    See Also
-    --------
-    %(also)s
-    """
-
-    @Appender(
-        _shared_docs["find"]
-        % dict(
-            side="lowest",
-            method="find",
-            also="rfind : Return highest indexes in each string.",
-        )
-    )
-    @forbid_nonstring_types(["bytes"])
-    def find(self, sub, start=0, end=None):
-        result = str_find(self._parent, sub, start=start, end=end, side="left")
-        return self._wrap_result(result, returns_string=False)
-
-    @Appender(
-        _shared_docs["find"]
-        % dict(
-            side="highest",
-            method="rfind",
-            also="find : Return lowest indexes in each string.",
-        )
-    )
-    @forbid_nonstring_types(["bytes"])
-    def rfind(self, sub, start=0, end=None):
-        result = str_find(self._parent, sub, start=start, end=end, side="right")
-        return self._wrap_result(result, returns_string=False)
-
-    @forbid_nonstring_types(["bytes"])
-    def normalize(self, form):
-        """
-        Return the Unicode normal form for the strings in the Series/Index.
-
-        For more information on the forms, see
-        :func:`unicodedata.normalize`.
-
-        Parameters
-        ----------
-        form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
-            Unicode form.
-
-        Returns
-        -------
-        normalized : Series/Index of objects
-        """
-        import unicodedata
-
-        f = lambda x: unicodedata.normalize(form, x)
-        result = _na_map(f, self._parent, dtype=str)
-        return self._wrap_result(result)
-
-    _shared_docs[
-        "index"
-    ] = """
-    Return %(side)s indexes in each string in Series/Index.
-
-    Each of the returned indexes corresponds to the position where the
-    substring is fully contained between [start:end]. This is the same
-    as ``str.%(similar)s`` except instead of returning -1, it raises a
-    ValueError when the substring is not found. Equivalent to standard
-    ``str.%(method)s``.
-
-    Parameters
-    ----------
-    sub : str
-        Substring being searched.
-    start : int
-        Left edge index.
-    end : int
-        Right edge index.
- - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % dict( - side="lowest", - similar="find", - method="index", - also="rindex : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % dict( - side="highest", - similar="rfind", - method="rindex", - also="index : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "len" - ] = """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. - - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - len = _noarg_wrapper( - len, - docstring=_shared_docs["len"], - forbidden_types=None, - dtype=np.dtype("int64"), - returns_string=False, - ) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. 
- - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = dict(type="lowercase", method="lower", version="") - _doc_args["upper"] = dict(type="uppercase", method="upper", version="") - _doc_args["title"] = dict(type="titlecase", method="title", version="") - _doc_args["capitalize"] = dict( - type="be capitalized", method="capitalize", version="" - ) - _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") - _doc_args["casefold"] = dict( - type="be casefolded", - method="casefold", - version="\n .. versionadded:: 0.25.0\n", - ) - lower = _noarg_wrapper( - lambda x: x.lower(), - name="lower", - docstring=_shared_docs["casemethods"] % _doc_args["lower"], - dtype=str, - ) - upper = _noarg_wrapper( - lambda x: x.upper(), - name="upper", - docstring=_shared_docs["casemethods"] % _doc_args["upper"], - dtype=str, - ) - title = _noarg_wrapper( - lambda x: x.title(), - name="title", - docstring=_shared_docs["casemethods"] % _doc_args["title"], - dtype=str, - ) - capitalize = _noarg_wrapper( - lambda x: x.capitalize(), - name="capitalize", - docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], - dtype=str, - ) - swapcase = _noarg_wrapper( - lambda x: x.swapcase(), - name="swapcase", - docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], - dtype=str, - ) - casefold = _noarg_wrapper( - lambda x: x.casefold(), - name="casefold", - docstring=_shared_docs["casemethods"] % _doc_args["casefold"], - dtype=str, - ) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. - Series.str.istitle : Check whether all characters are titlecase. 
-
-    Examples
-    --------
-    **Checks for Alphabetic and Numeric Characters**
-
-    >>> s1 = pd.Series(['one', 'one1', '1', ''])
-
-    >>> s1.str.isalpha()
-    0     True
-    1    False
-    2    False
-    3    False
-    dtype: bool
-
-    >>> s1.str.isnumeric()
-    0    False
-    1    False
-    2     True
-    3    False
-    dtype: bool
-
-    >>> s1.str.isalnum()
-    0     True
-    1     True
-    2     True
-    3    False
-    dtype: bool
-
-    Note that checks against characters mixed with any additional punctuation
-    or whitespace will evaluate to ``False`` for an alphanumeric check.
-
-    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
-    >>> s2.str.isalnum()
-    0    False
-    1    False
-    2    False
-    dtype: bool
-
-    **More Detailed Checks for Numeric Characters**
-
-    There are several different but overlapping sets of numeric characters that
-    can be checked for.
-
-    >>> s3 = pd.Series(['23', '³', '⅕', ''])
-
-    The ``s3.str.isdecimal`` method checks for characters used to form numbers
-    in base 10.
-
-    >>> s3.str.isdecimal()
-    0     True
-    1    False
-    2    False
-    3    False
-    dtype: bool
-
-    The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
-    includes special digits, like superscripted and subscripted digits in
-    unicode.
-
-    >>> s3.str.isdigit()
-    0     True
-    1     True
-    2    False
-    3    False
-    dtype: bool
-
-    The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
-    includes other characters that can represent quantities such as unicode
-    fractions.
-
-    >>> s3.str.isnumeric()
-    0     True
-    1     True
-    2     True
-    3    False
-    dtype: bool
-
-    **Checks for Whitespace**
-
-    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
-    >>> s4.str.isspace()
-    0     True
-    1     True
-    2    False
-    dtype: bool
-
-    **Checks for Character Case**
-
-    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
-
-    >>> s5.str.islower()
-    0     True
-    1    False
-    2    False
-    3    False
-    dtype: bool
-
-    >>> s5.str.isupper()
-    0    False
-    1    False
-    2     True
-    3    False
-    dtype: bool
-
-    The ``s5.str.istitle`` method checks for whether all words are in title
-    case (whether only the first letter of each word is capitalized). Words are
-    assumed to be any sequence of non-numeric characters separated by
-    whitespace characters.
- - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") - _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") - _doc_args["isdigit"] = dict(type="digits", method="isdigit") - _doc_args["isspace"] = dict(type="whitespace", method="isspace") - _doc_args["islower"] = dict(type="lowercase", method="islower") - _doc_args["isupper"] = dict(type="uppercase", method="isupper") - _doc_args["istitle"] = dict(type="titlecase", method="istitle") - _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") - _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - isalnum = _noarg_wrapper( - lambda x: x.isalnum(), - name="isalnum", - docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], - returns_string=False, - dtype=np.dtype(bool), - ) - isalpha = _noarg_wrapper( - lambda x: x.isalpha(), - name="isalpha", - docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdigit = _noarg_wrapper( - lambda x: x.isdigit(), - name="isdigit", - docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], - returns_string=False, - dtype=np.dtype(bool), - ) - isspace = _noarg_wrapper( - lambda x: x.isspace(), - name="isspace", - docstring=_shared_docs["ismethods"] % _doc_args["isspace"], - returns_string=False, - dtype=np.dtype(bool), - ) - islower = _noarg_wrapper( - lambda x: x.islower(), - name="islower", - docstring=_shared_docs["ismethods"] % _doc_args["islower"], - returns_string=False, - dtype=np.dtype(bool), - ) - isupper = _noarg_wrapper( - lambda x: x.isupper(), - name="isupper", - docstring=_shared_docs["ismethods"] % _doc_args["isupper"], - returns_string=False, - dtype=np.dtype(bool), - ) - istitle = _noarg_wrapper( - lambda x: x.istitle(), - name="istitle", - docstring=_shared_docs["ismethods"] % _doc_args["istitle"], - returns_string=False, - dtype=np.dtype(bool), - ) - isnumeric = _noarg_wrapper( - lambda x: x.isnumeric(), - name="isnumeric", - docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdecimal = _noarg_wrapper( - lambda x: x.isdecimal(), - name="isdecimal", - docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], - returns_string=False, - dtype=np.dtype(bool), - ) - - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) From cb2fb24439c69a040d2f1eec144c514cdf27d1f2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Sep 2020 15:44:49 -0500 Subject: [PATCH 24/24] simplify inheritance --- pandas/core/arrays/string_.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0db5fadce614e..fb126b3725237 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -22,7 +22,6 @@ from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna -from pandas.core.strings.object_array import ObjectStringArrayMixin if TYPE_CHECKING: import pyarrow @@ -103,7 +102,7 @@ def __from_arrow__( return StringArray._concat_same_type(results) -class StringArray(PandasArray, ObjectStringArrayMixin): +class StringArray(PandasArray): """ Extension array for string data.