Skip to content

REF: Dispatch string methods to ExtensionArray #36357

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Sep 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
9e90d4e
Implement BaseDtypeTests for ArrowStringDtype
xhochy Jul 10, 2020
92f1d26
Refactor to use parametrized StringDtype
TomAugspurger Sep 3, 2020
00096f0
wip
TomAugspurger Sep 8, 2020
5a89dbf
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
TomAugspurger Sep 11, 2020
89f8e6a
annoyed
TomAugspurger Sep 11, 2020
3f82225
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 14, 2020
fabc01e
wip
TomAugspurger Sep 14, 2020
a4d4ad5
remove old
TomAugspurger Sep 14, 2020
e76a3c1
fixup
TomAugspurger Sep 14, 2020
49dff8a
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 17, 2020
75831b3
fixup
TomAugspurger Sep 17, 2020
1cf54cc
doctest
TomAugspurger Sep 17, 2020
fc81ebe
docstrings
TomAugspurger Sep 17, 2020
6be1af6
typing
TomAugspurger Sep 17, 2020
95b3310
typing
TomAugspurger Sep 17, 2020
20a8705
wip
TomAugspurger Sep 21, 2020
136831a
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 21, 2020
38c1611
wip
TomAugspurger Sep 22, 2020
ea27e57
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 22, 2020
8d3aecd
Move to arrays
TomAugspurger Sep 22, 2020
d11c2ba
Fixup types
TomAugspurger Sep 22, 2020
349e281
test coverage
TomAugspurger Sep 22, 2020
c6b99cb
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 22, 2020
b7ab130
fixup
TomAugspurger Sep 22, 2020
3b837d1
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 22, 2020
28cf7e6
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 23, 2020
6dcd44e
update docstring
TomAugspurger Sep 23, 2020
efb3e3d
document current implementation
TomAugspurger Sep 24, 2020
0da7031
typo
TomAugspurger Sep 24, 2020
35a97ab
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 25, 2020
d681f99
fixup
TomAugspurger Sep 25, 2020
cc5ceed
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 29, 2020
457c112
fixup
TomAugspurger Sep 29, 2020
58e1bb9
Merge remote-tracking branch 'upstream/master' into dispatch-string-m…
TomAugspurger Sep 29, 2020
cb2fb24
simplify inheritance
TomAugspurger Sep 29, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests strings.py' ; echo $MSG
pytest -q --doctest-modules pandas/core/strings.py
pytest -q --doctest-modules pandas/core/strings/
RET=$(($RET + $?)) ; echo $MSG "DONE"

# Directories
Expand Down
22 changes: 21 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
from pandas.core.strings.object_array import ObjectStringArrayMixin

from pandas.io.formats import console

Expand Down Expand Up @@ -176,7 +177,7 @@ def contains(cat, key, container):
return any(loc_ in container for loc_ in loc)


class Categorical(NDArrayBackedExtensionArray, PandasObject):
class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
"""
Represent a categorical variable in classic R / S-plus fashion.

Expand Down Expand Up @@ -2305,6 +2306,25 @@ def replace(self, to_replace, value, inplace: bool = False):
if not inplace:
return cat

# ------------------------------------------------------------------------
# String methods interface
def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)):
    """
    Apply ``f`` to the categories once and expand the result via the codes.

    This is an optimization over mapping every element: the callable only
    runs over ``self.categories``, and the per-element result is rebuilt by
    taking from the mapped categories with ``self.codes``. The return type
    is the same as the object-dtype implementation's.

    Parameters
    ----------
    f : callable
        Forwarded to ``PandasArray._str_map``.
    na_value : scalar, default np.nan
        Forwarded to ``PandasArray._str_map`` and used as ``fill_value``
        for the final ``take_1d``.
    dtype : dtype, default object
        Forwarded to ``PandasArray._str_map``.
    """
    from pandas.core.arrays import PandasArray

    # Map over the (unique) categories only, not over every element.
    mapped = PandasArray(self.categories.to_numpy())._str_map(f, na_value, dtype)
    # The codes index into the mapped categories.
    return take_1d(mapped, self.codes, fill_value=na_value)

def _str_get_dummies(self, sep="|"):
    """
    Compute dummies by delegating to the ``PandasArray`` implementation.

    ``sep`` may not be in the categories, so no category-level shortcut is
    attempted here: the values are cast with ``astype(str)``, wrapped in a
    ``PandasArray``, and its ``_str_get_dummies`` is called with ``sep``.
    """
    from pandas.core.arrays import PandasArray

    as_strings = PandasArray(self.astype(str))
    return as_strings._str_get_dummies(sep)


# The Series.cat accessor

Expand Down
10 changes: 9 additions & 1 deletion pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.base import ExtensionOpsMixin
from pandas.core.strings.object_array import ObjectStringArrayMixin


class PandasDtype(ExtensionDtype):
Expand Down Expand Up @@ -114,7 +115,10 @@ def itemsize(self) -> int:


class PandasArray(
NDArrayBackedExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin
NDArrayBackedExtensionArray,
ExtensionOpsMixin,
NDArrayOperatorsMixin,
ObjectStringArrayMixin,
):
"""
A pandas ExtensionArray for NumPy data.
Expand Down Expand Up @@ -376,6 +380,10 @@ def arithmetic_method(self, other):

_create_comparison_method = _create_arithmetic_method

# ------------------------------------------------------------------------
# String methods interface
_str_na_value = np.nan


PandasArray._add_arithmetic_ops()
PandasArray._add_comparison_ops()
62 changes: 60 additions & 2 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@
from pandas._libs import lib, missing as libmissing

from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
)

from pandas import compat
from pandas.core import ops
Expand Down Expand Up @@ -347,6 +353,58 @@ def _add_arithmetic_ops(cls):
cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)

_create_comparison_method = _create_arithmetic_method
# ------------------------------------------------------------------------
# String methods interface
_str_na_value = StringDtype.na_value

def _str_map(self, f, na_value=None, dtype=None):
    """
    Map ``f`` over the array, picking the result container from ``dtype``.

    Parameters
    ----------
    f : callable
        Passed to ``lib.map_infer_mask`` along with the missing-value mask.
    na_value : scalar, optional
        When None, ``self.dtype.na_value`` is used.
    dtype : dtype, optional
        When None, ``StringDtype()`` is used.

    Returns
    -------
    IntegerArray or BooleanArray
        When ``dtype`` is an integer or bool dtype.
    StringArray
        When ``dtype`` is a string dtype that is not object dtype.
    object
        Otherwise, whatever ``lib.map_infer_mask`` returns.
    """
    from pandas.arrays import BooleanArray, IntegerArray, StringArray
    from pandas.core.arrays.string_ import StringDtype

    dtype = StringDtype() if dtype is None else dtype
    na_value = self.dtype.na_value if na_value is None else na_value

    mask = isna(self)
    values = np.asarray(self)

    wants_integer = is_integer_dtype(dtype)
    if wants_integer or is_bool_dtype(dtype):
        constructor: Union[Type[IntegerArray], Type[BooleanArray]]
        constructor = IntegerArray if wants_integer else BooleanArray

        fill_is_missing = isna(na_value)
        # A missing fill value is swapped for the placeholder 1 in the data;
        # those positions stay flagged in ``mask``.
        fill = 1 if fill_is_missing else na_value
        result = lib.map_infer_mask(
            values,
            f,
            mask.view("uint8"),
            convert=False,
            na_value=fill,
            dtype=np.dtype(dtype),
        )

        if not fill_is_missing:
            # The caller's fill is a real value, so nothing is masked anymore.
            mask[:] = False

        return constructor(result, mask)

    if is_string_dtype(dtype) and not is_object_dtype(dtype):
        # i.e. StringDtype
        result = lib.map_infer_mask(
            values, f, mask.view("uint8"), convert=False, na_value=na_value
        )
        return StringArray(result)

    # This is when the result type is object. We reach this when
    # -> We know the result type is truly object (e.g. .encode returns bytes
    #    or .findall returns a list).
    # -> We don't know the result type. E.g. `.get` can return anything.
    return lib.map_infer_mask(values, f, mask.view("uint8"))


StringArray._add_arithmetic_ops()
Expand Down
Loading