From 9b6c0548da182faede95845691c0b46080c541e7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 10:44:47 +0100 Subject: [PATCH 01/11] ArrowStringArrayMixin --- pandas/_libs/lib.pyx | 1 + pandas/conftest.py | 20 ++++++ pandas/core/arrays/string_arrow.py | 4 +- pandas/core/strings/__init__.py | 2 + pandas/core/strings/arrow_array.py | 83 +++++++++++++++++++++++ pandas/tests/strings/test_string_array.py | 20 +++--- 6 files changed, 119 insertions(+), 11 deletions(-) create mode 100644 pandas/core/strings/arrow_array.py diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 94a4d586b4f13..8f496713aced4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1110,6 +1110,7 @@ _TYPE_MAP = { "complex128": "complex", "c": "complex", "string": "string", + "arrow_string": "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/conftest.py b/pandas/conftest.py index f3356d2998ff8..0a64273321d29 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1131,6 +1131,26 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def nullable_string_dtype(request): + """ + Parametrized fixture for string dtypes. 
+ + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa + + return request.param + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7251faee333bb..28a23c47b7877 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -43,6 +43,7 @@ check_array_indexer, validate_indices, ) +from pandas.core.strings.arrow_array import ArrowStringArrayMixin try: import pyarrow as pa @@ -153,7 +154,7 @@ def __eq__(self, other) -> bool: return False -class ArrowStringArray(OpsMixin, ExtensionArray): +class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -198,6 +199,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray): """ _dtype = ArrowStringDtype() + _str_na_value = ArrowStringDtype.na_value def __init__(self, values): self._chk_pyarrow_available() diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index 943686fc85a05..bd27c8c504a47 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -25,6 +25,8 @@ # - StringArray # - PandasArray # - Categorical +# - ArrowStringArrayMixin +# - ArrowStringArray from pandas.core.strings.accessor import StringMethods from pandas.core.strings.base import BaseStringArrayMethods diff --git a/pandas/core/strings/arrow_array.py b/pandas/core/strings/arrow_array.py new file mode 100644 index 0000000000000..c50dcc198031f --- /dev/null +++ b/pandas/core/strings/arrow_array.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import numpy as np + +from pandas._libs import lib +from pandas._typing import Dtype + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, +) + +from pandas.core.missing import isna +from pandas.core.strings.object_array 
import ObjectStringArrayMixin + + +class ArrowStringArrayMixin(ObjectStringArrayMixin): + """ + String Methods operating on string type PyArrow arrays. + """ + + # ------------------------------------------------------------------------ + # String methods interface + + def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + from pandas.arrays import ( + BooleanArray, + IntegerArray, + StringArray, + ) + from pandas.core.arrays.string_ import StringDtype + + if dtype is None: + dtype = StringDtype() + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray] | type[BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Value of type variable "_DTypeScalar" of "dtype" cannot be + # "object" + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(dtype), # type: ignore[type-var,arg-type] + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index b51132caf7573..a74692f3ae13b 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -11,14 +11,14 @@ ) -def test_string_array(any_string_method): +def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method if method_name == "decode": pytest.skip("decode requires bytes.") data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) - b = Series(data, dtype="string") + b = Series(data, dtype=nullable_string_dtype) expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) @@ -60,8 +60,8 @@ def test_string_array(any_string_method): ("rindex", [2, None]), ], ) -def test_string_array_numeric_integer_array(method, expected): - s = Series(["aba", None], dtype="string") +def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected): + s = Series(["aba", None], dtype=nullable_string_dtype) result = getattr(s.str, method)("a") expected = Series(expected, dtype="Int64") tm.assert_series_equal(result, expected) @@ -76,17 +76,17 @@ def test_string_array_numeric_integer_array(method, expected): ("isdigit", [False, None, True]), ], ) -def test_string_array_boolean_array(method, expected): - s = Series(["a", None, "1"], dtype="string") +def test_string_array_boolean_array(nullable_string_dtype, method, expected): + s = Series(["a", None, "1"], dtype=nullable_string_dtype) result = getattr(s.str, method)() expected = Series(expected, dtype="boolean") tm.assert_series_equal(result, expected) -def test_string_array_extract(): +def test_string_array_extract(nullable_string_dtype): # https://github.com/pandas-dev/pandas/issues/30969 # Only expand=False & multiple groups was failing - a = Series(["a1", "b2", "cc"], dtype="string") + a = Series(["a1", 
"b2", "cc"], dtype=nullable_string_dtype) b = Series(["a1", "b2", "cc"], dtype="object") pat = r"(\w)(\d)" @@ -98,8 +98,8 @@ def test_string_array_extract(): tm.assert_equal(result, expected) -def test_str_get_stringarray_multiple_nans(): - s = Series(pd.array(["a", "ab", pd.NA, "abc"])) +def test_str_get_stringarray_multiple_nans(nullable_string_dtype): + s = Series(pd.array(["a", "ab", pd.NA, "abc"], dtype=nullable_string_dtype)) result = s.str.get(2) expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) tm.assert_series_equal(result, expected) From c0fedcd83296e00092daf240761febbd119676cb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 11:05:51 +0100 Subject: [PATCH 02/11] move _str_map to ArrowStringArray --- pandas/core/arrays/string_arrow.py | 72 ++++++++++++++++++++++++-- pandas/core/strings/__init__.py | 3 +- pandas/core/strings/arrow_array.py | 83 ------------------------------ 3 files changed, 70 insertions(+), 88 deletions(-) delete mode 100644 pandas/core/strings/arrow_array.py diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 28a23c47b7877..043fbba2fd6f9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -26,6 +26,10 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna @@ -43,7 +47,7 @@ check_array_indexer, validate_indices, ) -from pandas.core.strings.arrow_array import ArrowStringArrayMixin +from pandas.core.strings.object_array import ObjectStringArrayMixin try: import pyarrow as pa @@ -154,7 +158,7 @@ def __eq__(self, other) -> bool: return False -class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin): +class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): """ 
Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -199,7 +203,6 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ArrowStringArrayMixin): """ _dtype = ArrowStringDtype() - _str_na_value = ArrowStringDtype.na_value def __init__(self, values): self._chk_pyarrow_available() @@ -682,3 +685,66 @@ def value_counts(self, dropna: bool = True) -> Series: raise NotImplementedError("yo") return Series(counts, index=index).astype("Int64") + + # ------------------------------------------------------------------------ + # String methods interface + + _str_na_value = ArrowStringDtype.na_value + + def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + from pandas.arrays import ( + BooleanArray, + IntegerArray, + StringArray, + ) + from pandas.core.arrays.string_ import StringDtype + + if dtype is None: + dtype = StringDtype() + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: type[IntegerArray] | type[BooleanArray] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + # error: Value of type variable "_DTypeScalar" of "dtype" cannot be + # "object" + # error: Argument 1 to "dtype" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected + # "Type[object]" + dtype=np.dtype(dtype), # type: ignore[type-var,arg-type] + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. 
We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index bd27c8c504a47..28aba7c9ce0b3 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -25,8 +25,7 @@ # - StringArray # - PandasArray # - Categorical -# - ArrowStringArrayMixin -# - ArrowStringArray +# - ArrowStringArray from pandas.core.strings.accessor import StringMethods from pandas.core.strings.base import BaseStringArrayMethods diff --git a/pandas/core/strings/arrow_array.py b/pandas/core/strings/arrow_array.py deleted file mode 100644 index c50dcc198031f..0000000000000 --- a/pandas/core/strings/arrow_array.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import annotations - -import numpy as np - -from pandas._libs import lib -from pandas._typing import Dtype - -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype, -) - -from pandas.core.missing import isna -from pandas.core.strings.object_array import ObjectStringArrayMixin - - -class ArrowStringArrayMixin(ObjectStringArrayMixin): - """ - String Methods operating on string type PyArrow arrays. 
- """ - - # ------------------------------------------------------------------------ - # String methods interface - - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): - from pandas.arrays import ( - BooleanArray, - IntegerArray, - StringArray, - ) - from pandas.core.arrays.string_ import StringDtype - - if dtype is None: - dtype = StringDtype() - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - # error: Value of type variable "_DTypeScalar" of "dtype" cannot be - # "object" - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected - # "Type[object]" - dtype=np.dtype(dtype), # type: ignore[type-var,arg-type] - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
- return lib.map_infer_mask(arr, f, mask.view("uint8")) From e6033216fcf76f2e6d73d7f096e38620576b69aa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 11:35:12 +0100 Subject: [PATCH 03/11] test_str_get_stringarray_multiple_nans --- pandas/core/arrays/string_arrow.py | 6 ++---- pandas/tests/strings/test_string_array.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 043fbba2fd6f9..4dbbb58581179 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -695,12 +695,10 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): from pandas.arrays import ( BooleanArray, IntegerArray, - StringArray, ) - from pandas.core.arrays.string_ import StringDtype if dtype is None: - dtype = StringDtype() + dtype = self.dtype if na_value is None: na_value = self.dtype.na_value @@ -741,7 +739,7 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - return StringArray(result) + return self._from_sequence(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. 
.encode returns bytes diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index a74692f3ae13b..5408c89b8ddcb 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -101,5 +101,5 @@ def test_string_array_extract(nullable_string_dtype): def test_str_get_stringarray_multiple_nans(nullable_string_dtype): s = Series(pd.array(["a", "ab", pd.NA, "abc"], dtype=nullable_string_dtype)) result = s.str.get(2) - expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) + expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"], dtype=nullable_string_dtype)) tm.assert_series_equal(result, expected) From 37035b43621b5f311e540d36ed016942427ad892 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 11:38:46 +0100 Subject: [PATCH 04/11] flake code --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0a64273321d29..03a6b610acdb8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1146,7 +1146,7 @@ def nullable_string_dtype(request): * 'string' * 'arrow_string' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 return request.param From c823953694aac48fc3839ce61bd6db985bf26e61 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 14:07:47 +0100 Subject: [PATCH 05/11] test assertions --- pandas/tests/strings/test_string_array.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 5408c89b8ddcb..dabd3798317c8 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -27,7 +27,7 @@ def test_string_array(nullable_string_dtype, any_string_method): if expected.dtype == "object" and lib.is_string_array( 
expected.dropna().values, ): - assert result.dtype == "string" + assert result.dtype == nullable_string_dtype result = result.astype(object) elif expected.dtype == "object" and lib.is_bool_array( @@ -46,7 +46,7 @@ def test_string_array(nullable_string_dtype, any_string_method): elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns - assert all(result[columns].dtypes == "string") + assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) @@ -73,7 +73,6 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), ("isalnum", [True, None, True]), - ("isdigit", [False, None, True]), ], ) def test_string_array_boolean_array(nullable_string_dtype, method, expected): @@ -92,7 +91,7 @@ def test_string_array_extract(nullable_string_dtype): result = a.str.extract(pat, expand=False) expected = b.str.extract(pat, expand=False) - assert all(result.dtypes == "string") + assert all(result.dtypes == nullable_string_dtype) result = result.astype(object) tm.assert_equal(result, expected) From 76292ecf6047dc19ce23800e11975af2c9035d36 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 16:33:26 +0100 Subject: [PATCH 06/11] remove xfail from test_string_methods --- pandas/tests/arrays/string_/test_string.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8b84a510c01e6..749f3d0aee8a5 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -91,17 +91,11 @@ def test_setitem_with_scalar_string(dtype): @pytest.mark.parametrize( "input, method", [ - (["a", "b", "c"], operator.methodcaller("capitalize")), (["a", "b", "c"], operator.methodcaller("capitalize")), (["a b", "a bc. 
de"], operator.methodcaller("capitalize")), ], ) -def test_string_methods(input, method, dtype, request): - if dtype == "arrow_string": - reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - +def test_string_methods(input, method, dtype): a = pd.Series(input, dtype=dtype) b = pd.Series(input, dtype="object") result = method(a.str) From 50db8765df093a59e644ac3d283ecc00a7f6f52d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 16:54:33 +0100 Subject: [PATCH 07/11] xfail extract/extractall tests - out of scope for this PR --- pandas/tests/strings/test_string_array.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index dabd3798317c8..e7cf5085d2532 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -11,11 +11,19 @@ ) -def test_string_array(nullable_string_dtype, any_string_method): +def test_string_array(nullable_string_dtype, any_string_method, request): method_name, args, kwargs = any_string_method if method_name == "decode": pytest.skip("decode requires bytes.") + if nullable_string_dtype == "arrow_string" and method_name in { + "extract", + "extractall", + }: + reason = "extract/extractall does not yet dispatch to array" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype=nullable_string_dtype) @@ -82,9 +90,15 @@ def test_string_array_boolean_array(nullable_string_dtype, method, expected): tm.assert_series_equal(result, expected) -def test_string_array_extract(nullable_string_dtype): +def test_string_array_extract(nullable_string_dtype, request): # https://github.com/pandas-dev/pandas/issues/30969 # Only expand=False & multiple groups was failing + + if 
nullable_string_dtype == "arrow_string": + reason = "extract does not yet dispatch to array" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype) b = Series(["a1", "b2", "cc"], dtype="object") pat = r"(\w)(\d)" From 374924b1a3c354f28c908c2da4fa688e58446cf2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 31 Mar 2021 18:44:19 +0100 Subject: [PATCH 08/11] special case in _wrap_result --- pandas/core/strings/accessor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1eda06dbbb1c4..6cace87924d40 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -154,10 +154,11 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, StringDtype) + self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype)) self._data = data self._index = self._name = None @@ -316,7 +317,7 @@ def cons_row(x): # This is a mess. 
dtype: Optional[str] if self._is_string and returns_string: - dtype = "string" + dtype = self._orig.dtype else: dtype = None From 4b86f67cff53f97ded913c29df089312a979ea49 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Apr 2021 16:39:28 +0100 Subject: [PATCH 09/11] add isnumeric to test_string_array_boolean_array --- pandas/tests/strings/test_string_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index e7cf5085d2532..23c9b14c5a36a 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -81,6 +81,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), ("isalnum", [True, None, True]), + ("isnumeric", [False, None, True]), ], ) def test_string_array_boolean_array(nullable_string_dtype, method, expected): From aaf54ca8d69187c0230060e7602b32c2f84cdee5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 9 Apr 2021 12:12:12 +0100 Subject: [PATCH 10/11] mypy fixup --- pandas/core/arrays/string_arrow.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bfc980efaeb06..3f24aabb81dd2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -728,7 +728,11 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): if not na_value_is_na: mask[:] = False - return constructor(result, mask) + # error: Argument 1 to "IntegerArray" has incompatible type + # "Union[ExtensionArray, ndarray]"; expected "ndarray" + # error: Argument 1 to "BooleanArray" has incompatible type + # "Union[ExtensionArray, ndarray]"; expected "ndarray" + return constructor(result, mask) # type: ignore[arg-type] elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. 
StringDtype From d1c8a3e0fbe1fa71bee93eab1a001ff6a047f427 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Apr 2021 08:55:30 +0100 Subject: [PATCH 11/11] add comments --- pandas/core/arrays/string_arrow.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3f24aabb81dd2..fd47597b2191f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -154,6 +154,11 @@ def __eq__(self, other) -> bool: return False +# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from +# ObjectStringArrayMixin because we want to have the object-dtype based methods as +# fallback for the ones that pyarrow doesn't yet support + + class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -688,6 +693,9 @@ def value_counts(self, dropna: bool = True) -> Series: _str_na_value = ArrowStringDtype.na_value def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + # TODO: de-duplicate with StringArray method. This method is more or less + # copy and paste. + from pandas.arrays import ( BooleanArray, IntegerArray,