From 7cf3a5491ee27ee2ceeb0785fb629ab211ef9809 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Apr 2021 13:22:43 +0100 Subject: [PATCH 1/4] [ArrowStringArray] API: StringArray -> PythonStringArray --- pandas/arrays/__init__.py | 4 +- pandas/core/arrays/__init__.py | 4 +- pandas/core/arrays/string_.py | 62 ++++++++++++------- pandas/core/arrays/string_arrow.py | 3 +- pandas/core/construction.py | 8 +-- pandas/core/strings/object_array.py | 6 +- .../arrays/categorical/test_constructors.py | 2 +- pandas/tests/arrays/string_/test_string.py | 16 ++--- pandas/tests/arrays/test_array.py | 10 +-- pandas/tests/arrays/test_datetimelike.py | 2 +- 10 files changed, 66 insertions(+), 51 deletions(-) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 0fa070b6e4fc4..49a5b33b15f00 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -12,8 +12,8 @@ IntervalArray, PandasArray, PeriodArray, + PythonStringArray, SparseArray, - StringArray, TimedeltaArray, ) @@ -27,6 +27,6 @@ "PandasArray", "PeriodArray", "SparseArray", - "StringArray", + "PythonStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 22f15ca9650db..ae93f4c762ba1 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -16,7 +16,7 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import PythonStringArray from pandas.core.arrays.timedeltas import TimedeltaArray __all__ = [ @@ -34,6 +34,6 @@ "PeriodArray", "period_array", "SparseArray", - "StringArray", + "PythonStringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 600aacec9c87a..f2b140cd0fc42 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing 
import ( + TYPE_CHECKING, + TypeVar, +) import numpy as np @@ -36,6 +39,7 @@ IntegerArray, PandasArray, ) +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -85,7 +89,7 @@ def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[StringArray]: + def construct_array_type(cls) -> type_t[PythonStringArray]: """ Return the array type associated with this dtype. @@ -93,16 +97,16 @@ def construct_array_type(cls) -> type_t[StringArray]: ------- type """ - return StringArray + return PythonStringArray def __repr__(self) -> str: return "StringDtype" def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> StringArray: + ) -> PythonStringArray: """ - Construct StringArray from pyarrow Array/ChunkedArray. + Construct PythonStringArray from pyarrow Array/ChunkedArray. """ import pyarrow @@ -115,13 +119,20 @@ def __from_arrow__( results = [] for arr in chunks: # using _from_sequence to ensure None is converted to NA - str_arr = StringArray._from_sequence(np.array(arr)) + str_arr = PythonStringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return PythonStringArray._concat_same_type(results) + + +StringArrayT = TypeVar("StringArrayT", bound="StringArray") + +class StringArray(ExtensionArray): + pass -class StringArray(PandasArray): + +class PythonStringArray(StringArray, PandasArray): """ Extension array for string data. @@ -129,7 +140,7 @@ class StringArray(PandasArray): .. warning:: - StringArray is considered experimental. The implementation and + PythonStringArray is considered experimental. The implementation and parts of the API may change without warning. Parameters @@ -143,7 +154,7 @@ class StringArray(PandasArray): where the elements are Python strings or :attr:`pandas.NA`. 
This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of - creating a `StringArray` from any sequence. + creating a `PythonStringArray` from any sequence. copy : bool, default False Whether to copy the array of data. @@ -159,23 +170,23 @@ class StringArray(PandasArray): See Also -------- array - The recommended function for creating a StringArray. + The recommended function for creating a PythonStringArray. Series.str The string methods are available on Series backed by - a StringArray. + a PythonStringArray. Notes ----- - StringArray returns a BooleanArray for comparison methods. + PythonStringArray returns a BooleanArray for comparison methods. Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - + ['This is', 'some text', , 'data.'] Length: 4, dtype: string - Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` + Unlike arrays instantiated with ``dtype="object"``, ``PythonStringArray`` will convert the values to strings. >>> pd.array(['1', 1], dtype="object") @@ -187,9 +198,10 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string - However, instantiating StringArrays directly with non-strings will raise an error. + However, instantiating PythonStringArrays directly with non-strings will raise an + error. 
- For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: + For comparison methods, `PythonStringArray` returns a :class:`pandas.BooleanArray`: >>> pd.array(["a", None, "c"], dtype="string") == "a" @@ -213,10 +225,12 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") + raise ValueError( + "PythonStringArray requires a sequence of strings or pandas.NA" + ) if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires a sequence of strings or pandas.NA. Got " + "PythonStringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) @@ -255,7 +269,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod - def _empty(cls, shape, dtype) -> StringArray: + def _empty(cls, shape, dtype) -> PythonStringArray: values = np.empty(shape, dtype=object) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -297,7 +311,7 @@ def __setitem__(self, key, value): value = StringDtype.na_value elif not isinstance(value, str): raise ValueError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Cannot set non-string value '{value}' into a PythonStringArray." 
) else: if not is_array_like(value): @@ -376,7 +390,7 @@ def memory_usage(self, deep: bool = False) -> int: def _cmp_method(self, other, op): from pandas.arrays import BooleanArray - if isinstance(other, StringArray): + if isinstance(other, PythonStringArray): other = other._ndarray mask = isna(self) | isna(other) @@ -396,7 +410,7 @@ def _cmp_method(self, other, op): result = np.empty_like(self._ndarray, dtype="object") result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return PythonStringArray(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -456,7 +470,7 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - return StringArray(result) + return PythonStringArray(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fd47597b2191f..27ae62f39711a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -41,6 +41,7 @@ from pandas.core import missing from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringArray from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -159,7 +160,7 @@ def __eq__(self, other) -> bool: # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): +class ArrowStringArray(OpsMixin, StringArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f3133480108a6..e0f65a85ec48b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -122,7 +122,7 @@ def array( :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` - :class:`str` :class:`pandas.arrays.StringArray` + :class:`str` :class:`pandas.arrays.PythonStringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -236,7 +236,7 @@ def array( Length: 2, dtype: Float64 >>> pd.array(["a", None, "c"]) - + ['a', , 'c'] Length: 3, dtype: string @@ -292,7 +292,7 @@ def array( IntegerArray, IntervalArray, PandasArray, - StringArray, + PythonStringArray, TimedeltaArray, period_array, ) @@ -345,7 +345,7 @@ def array( return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": - return StringArray._from_sequence(data, copy=copy) + return PythonStringArray._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index f2027f2707a8b..0ecabae8f92ed 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -198,7 +198,7 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: - from pandas.core.arrays.string_ import StringArray + from pandas.core.arrays.string_ import PythonStringArray def rep(x, r): if x is libmissing.NA: @@ -210,9 +210,9 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, StringArray): + if isinstance(self, PythonStringArray): # Not going through map, so we have to do this here. 
- result = StringArray._from_sequence(result) + result = PythonStringArray._from_sequence(result) return result def _str_match( diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 93ba16c5fda22..cd3436344abc4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -725,7 +725,7 @@ def test_interval(self): def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: - arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2) + arr = pd.arrays.PythonStringArray._from_sequence([nulls_fixture] * 2) result = Categorical(arr) expected = Categorical(Series([pd.NA, pd.NA], dtype="object")) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 749f3d0aee8a5..1afb10c82ffa7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -34,7 +34,7 @@ def dtype_object(dtype): @pytest.fixture( params=[ - pd.arrays.StringArray, + pd.arrays.PythonStringArray, pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), ] ) @@ -50,7 +50,7 @@ def test_repr(dtype): expected = f"0 a\n1 \n2 b\nName: A, dtype: {dtype}" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray" + arr_name = "ArrowStringArray" if dtype == "arrow_string" else "PythonStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: {dtype}" assert repr(df.A.array) == expected @@ -64,14 +64,14 @@ def test_none_to_nan(cls): def test_setitem_validates(cls): arr = cls._from_sequence(["a", "b"]) - if cls is pd.arrays.StringArray: - msg = "Cannot set non-string value '10' into a StringArray." + if cls is pd.arrays.PythonStringArray: + msg = "Cannot set non-string value '10' into a PythonStringArray." 
else: msg = "Scalar must be NA or str" with pytest.raises(ValueError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if cls is pd.arrays.PythonStringArray: msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -299,8 +299,8 @@ def test_comparison_methods_array(all_compare_operators, dtype, request): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: - msg = "StringArray requires a sequence of strings or pandas.NA" + if cls is pd.arrays.PythonStringArray: + msg = "PythonStringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowStringArray" @@ -455,7 +455,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - msg = "Cannot set non-string value '1' into a StringArray." + msg = "Cannot set non-string value '1' into a PythonStringArray." with pytest.raises(ValueError, match=msg): arr.fillna(value=1) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index bfe588883d9f3..78145f9e37713 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -17,8 +17,8 @@ FloatingArray, IntegerArray, IntervalArray, + PythonStringArray, SparseArray, - StringArray, TimedeltaArray, ) from pandas.core.arrays import ( @@ -132,8 +132,8 @@ ([1, None], "Int16", pd.array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), + (["a", None], "string", PythonStringArray._from_sequence(["a", None])), + (["a", None], pd.StringDtype(), PythonStringArray._from_sequence(["a", None])), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), @@ -253,8 +253,8 @@ def 
test_array_copy(): ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string - (["a", "b"], StringArray._from_sequence(["a", "b"])), - (["a", None], StringArray._from_sequence(["a", None])), + (["a", "b"], PythonStringArray._from_sequence(["a", "b"])), + (["a", None], PythonStringArray._from_sequence(["a", None])), # Boolean ([True, False], BooleanArray._from_sequence([True, False])), ([True, None], BooleanArray._from_sequence([True, None])), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 62d368264752b..e1d0849154562 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -343,7 +343,7 @@ def test_searchsorted_castable_strings(self, arr1d, box, request): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got 'StringArray' instead." + "or array of those. Got 'PythonStringArray' instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) From 836adaab00dce9eb794b5a744e2898c0e60afe1d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 12:09:38 +0100 Subject: [PATCH 2/4] post merge fixup --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index a80af62418d2f..cca9b5c5e4b8e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -289,8 +289,8 @@ def array( IntegerArray, IntervalArray, PandasArray, - PythonStringArray, PeriodArray, + PythonStringArray, TimedeltaArray, ) From e7f2d17e0b2262d949fde5df8447f69d71d35d26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Jun 2021 13:24:42 +0100 Subject: [PATCH 3/4] revert all changes --- pandas/arrays/__init__.py | 4 +- pandas/core/arrays/__init__.py | 4 +- pandas/core/arrays/string_.py | 64 ++++++++----------- pandas/core/arrays/string_arrow.py | 7 +- pandas/core/construction.py | 8 +-- pandas/core/strings/object_array.py | 4 +- .../arrays/categorical/test_constructors.py | 2 +- pandas/tests/arrays/string_/test_string.py | 16 ++--- pandas/tests/arrays/test_array.py | 10 +-- pandas/tests/arrays/test_datetimelike.py | 2 +- 10 files changed, 52 insertions(+), 69 deletions(-) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 49a5b33b15f00..0fa070b6e4fc4 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -12,8 +12,8 @@ IntervalArray, PandasArray, PeriodArray, - PythonStringArray, SparseArray, + StringArray, TimedeltaArray, ) @@ -27,6 +27,6 @@ "PandasArray", "PeriodArray", "SparseArray", - "PythonStringArray", + "StringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ae93f4c762ba1..22f15ca9650db 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -16,7 +16,7 @@ period_array, ) from pandas.core.arrays.sparse import SparseArray -from 
pandas.core.arrays.string_ import PythonStringArray +from pandas.core.arrays.string_ import StringArray from pandas.core.arrays.timedeltas import TimedeltaArray __all__ = [ @@ -34,6 +34,6 @@ "PeriodArray", "period_array", "SparseArray", - "PythonStringArray", + "StringArray", "TimedeltaArray", ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b291b70aa94bb..ab1dadf4d2dfa 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,6 @@ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - TypeVar, -) +from typing import TYPE_CHECKING import numpy as np @@ -40,7 +37,6 @@ IntegerArray, PandasArray, ) -from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -90,7 +86,7 @@ def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[PythonStringArray]: + def construct_array_type(cls) -> type_t[StringArray]: """ Return the array type associated with this dtype. @@ -98,16 +94,16 @@ def construct_array_type(cls) -> type_t[PythonStringArray]: ------- type """ - return PythonStringArray + return StringArray def __repr__(self) -> str: return "StringDtype" def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> PythonStringArray: + ) -> StringArray: """ - Construct PythonStringArray from pyarrow Array/ChunkedArray. + Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow @@ -120,23 +116,16 @@ def __from_arrow__( results = [] for arr in chunks: # using _from_sequence to ensure None is converted to NA - str_arr = PythonStringArray._from_sequence(np.array(arr)) + str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) if results: - return PythonStringArray._concat_same_type(results) + return StringArray._concat_same_type(results) else: - return PythonStringArray(np.array([], dtype="object")) - - -StringArrayT = TypeVar("StringArrayT", bound="StringArray") - + return StringArray(np.array([], dtype="object")) -class StringArray(ExtensionArray): - pass - -class PythonStringArray(StringArray, PandasArray): +class StringArray(PandasArray): """ Extension array for string data. @@ -144,7 +133,7 @@ class PythonStringArray(StringArray, PandasArray): .. warning:: - PythonStringArray is considered experimental. The implementation and + StringArray is considered experimental. The implementation and parts of the API may change without warning. Parameters @@ -158,7 +147,7 @@ class PythonStringArray(StringArray, PandasArray): where the elements are Python strings or :attr:`pandas.NA`. This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of - creating a `PythonStringArray` from any sequence. + creating a `StringArray` from any sequence. copy : bool, default False Whether to copy the array of data. @@ -174,23 +163,23 @@ class PythonStringArray(StringArray, PandasArray): See Also -------- array - The recommended function for creating a PythonStringArray. + The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by - a PythonStringArray. + a StringArray. Notes ----- - PythonStringArray returns a BooleanArray for comparison methods. + StringArray returns a BooleanArray for comparison methods. 
Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - + ['This is', 'some text', , 'data.'] Length: 4, dtype: string - Unlike arrays instantiated with ``dtype="object"``, ``PythonStringArray`` + Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` will convert the values to strings. >>> pd.array(['1', 1], dtype="object") @@ -202,10 +191,9 @@ class PythonStringArray(StringArray, PandasArray): ['1', '1'] Length: 2, dtype: string - However, instantiating PythonStringArrays directly with non-strings will raise an - error. + However, instantiating StringArrays directly with non-strings will raise an error. - For comparison methods, `PythonStringArray` returns a :class:`pandas.BooleanArray`: + For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: >>> pd.array(["a", None, "c"], dtype="string") == "a" @@ -229,12 +217,10 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "PythonStringArray requires a sequence of strings or pandas.NA" - ) + raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( - "PythonStringArray requires a sequence of strings or pandas.NA. Got " + "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." 
) @@ -272,7 +258,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod - def _empty(cls, shape, dtype) -> PythonStringArray: + def _empty(cls, shape, dtype) -> StringArray: values = np.empty(shape, dtype=object) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -314,7 +300,7 @@ def __setitem__(self, key, value): value = StringDtype.na_value elif not isinstance(value, str): raise ValueError( - f"Cannot set non-string value '{value}' into a PythonStringArray." + f"Cannot set non-string value '{value}' into a StringArray." ) else: if not is_array_like(value): @@ -391,7 +377,7 @@ def memory_usage(self, deep: bool = False) -> int: def _cmp_method(self, other, op): from pandas.arrays import BooleanArray - if isinstance(other, PythonStringArray): + if isinstance(other, StringArray): other = other._ndarray mask = isna(self) | isna(other) @@ -411,7 +397,7 @@ def _cmp_method(self, other, op): result = np.empty_like(self._ndarray, dtype="object") result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) - return PythonStringArray(result) + return StringArray(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -471,7 +457,7 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - return PythonStringArray(result) + return StringArray(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. 
.encode returns bytes diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3946b20fd8b4d..3cf471e381da9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -52,10 +52,7 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype -from pandas.core.arrays.string_ import ( - StringArray, - StringDtype, -) +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -181,7 +178,7 @@ def __eq__(self, other) -> bool: # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, StringArray, ObjectStringArrayMixin): +class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dd768c87bd9e7..330902b402324 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -118,7 +118,7 @@ def array( :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` - :class:`str` :class:`pandas.arrays.PythonStringArray` + :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -232,7 +232,7 @@ def array( Length: 2, dtype: Float64 >>> pd.array(["a", None, "c"]) - + ['a', , 'c'] Length: 3, dtype: string @@ -289,7 +289,7 @@ def array( IntervalArray, PandasArray, PeriodArray, - PythonStringArray, + StringArray, TimedeltaArray, ) @@ -332,7 +332,7 @@ def array( return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": - return PythonStringArray._from_sequence(data, copy=copy) + return 
StringArray._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index b74a8e5d2295d..7ce4abe904f3b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -173,7 +173,7 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: - from pandas.core.arrays.string_ import PythonStringArray + from pandas.core.arrays.string_ import StringArray from pandas.core.arrays.string_arrow import ArrowStringArray def rep(x, r): @@ -186,7 +186,7 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, (PythonStringArray, ArrowStringArray)): + if isinstance(self, (StringArray, ArrowStringArray)): # Not going through map, so we have to do this here. result = type(self)._from_sequence(result) return result diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cd3436344abc4..93ba16c5fda22 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -725,7 +725,7 @@ def test_interval(self): def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: - arr = pd.arrays.PythonStringArray._from_sequence([nulls_fixture] * 2) + arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2) result = Categorical(arr) expected = Categorical(Series([pd.NA, pd.NA], dtype="object")) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 13b1da5ffdb2d..c9533e239abe0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -39,7 +39,7 @@ def dtype_object(dtype): @pytest.fixture( params=[ - pd.arrays.PythonStringArray, 
+ pd.arrays.StringArray, pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), ] ) @@ -55,7 +55,7 @@ def test_repr(dtype): expected = f"0 a\n1 \n2 b\nName: A, dtype: {dtype}" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype == "arrow_string" else "PythonStringArray" + arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: {dtype}" assert repr(df.A.array) == expected @@ -69,14 +69,14 @@ def test_none_to_nan(cls): def test_setitem_validates(cls): arr = cls._from_sequence(["a", "b"]) - if cls is pd.arrays.PythonStringArray: - msg = "Cannot set non-string value '10' into a PythonStringArray." + if cls is pd.arrays.StringArray: + msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(ValueError, match=msg): arr[0] = 10 - if cls is pd.arrays.PythonStringArray: + if cls is pd.arrays.StringArray: msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -280,8 +280,8 @@ def test_comparison_methods_array(all_compare_operators, dtype, request): def test_constructor_raises(cls): - if cls is pd.arrays.PythonStringArray: - msg = "PythonStringArray requires a sequence of strings or pandas.NA" + if cls is pd.arrays.StringArray: + msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowStringArray" @@ -431,7 +431,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - msg = "Cannot set non-string value '1' into a PythonStringArray." + msg = "Cannot set non-string value '1' into a StringArray." 
with pytest.raises(ValueError, match=msg): arr.fillna(value=1) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 78145f9e37713..bfe588883d9f3 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -17,8 +17,8 @@ FloatingArray, IntegerArray, IntervalArray, - PythonStringArray, SparseArray, + StringArray, TimedeltaArray, ) from pandas.core.arrays import ( @@ -132,8 +132,8 @@ ([1, None], "Int16", pd.array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", PythonStringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), PythonStringArray._from_sequence(["a", None])), + (["a", None], "string", StringArray._from_sequence(["a", None])), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), @@ -253,8 +253,8 @@ def test_array_copy(): ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string - (["a", "b"], PythonStringArray._from_sequence(["a", "b"])), - (["a", None], PythonStringArray._from_sequence(["a", None])), + (["a", "b"], StringArray._from_sequence(["a", "b"])), + (["a", None], StringArray._from_sequence(["a", None])), # Boolean ([True, False], BooleanArray._from_sequence([True, False])), ([True, None], BooleanArray._from_sequence([True, None])), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index d9db713be29e3..c6f8efe7b939e 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -345,7 +345,7 @@ def test_searchsorted_castable_strings(self, arr1d, box, request): TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 
'NaT', " - "or array of those. Got 'PythonStringArray' instead." + "or array of those. Got 'StringArray' instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) From 594e3832b645a0b59b499d8580a22f6214f88645 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 6 Jun 2021 20:40:09 +0100 Subject: [PATCH 4/4] wip --- pandas/conftest.py | 53 +- pandas/core/arrays/string_.py | 681 +++++++++++++++++- pandas/core/arrays/string_arrow.py | 115 +-- pandas/core/config_init.py | 12 + pandas/core/dtypes/cast.py | 6 +- pandas/core/strings/__init__.py | 1 - pandas/core/strings/object_array.py | 3 +- pandas/tests/arrays/string_/test_string.py | 331 +++++---- .../tests/arrays/string_/test_string_arrow.py | 18 +- pandas/tests/extension/base/casting.py | 15 +- pandas/tests/extension/test_string.py | 62 +- pandas/tests/frame/methods/test_astype.py | 15 +- pandas/tests/io/test_parquet.py | 7 +- pandas/tests/series/methods/test_astype.py | 16 +- pandas/tests/series/methods/test_update.py | 22 +- pandas/tests/strings/test_api.py | 6 +- pandas/tests/strings/test_strings.py | 2 +- 17 files changed, 986 insertions(+), 379 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 329023ed7ba6a..cf8787e9e97aa 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1120,22 +1120,30 @@ def string_dtype(request): @pytest.fixture( params=[ - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), ] ) -def nullable_string_dtype(request): +def string_storage(request): + return request.param + + +# Aliases so we can test with cartesian product of string_storage +string_storage2 = string_storage +string_storage3 = string_storage + + +@pytest.fixture +def nullable_string_dtype(string_storage): """ - Parametrized fixture for string dtypes. + Parametrized fixture for StringDtype with string_storage. 
- * 'string' - * 'arrow_string' + * 'string' (python storage) + * 'string` (pyarrow storage) """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + with pd.option_context("string_storage", string_storage): - return request.param + yield "string" @pytest.fixture(params=tm.BYTES_DTYPES) @@ -1163,22 +1171,27 @@ def object_dtype(request): @pytest.fixture( params=[ "object", - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), ] ) -def any_string_dtype(request): +def any_string_dtype_param(request): + return request.param + + +@pytest.fixture +def any_string_dtype(any_string_dtype_param): """ Parametrized fixture for string dtypes. * 'object' - * 'string' - * 'arrow_string' + * 'string' (python storage) + * 'string` (pyarrow storage) """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param + if any_string_dtype_param == "object": + yield "object" + else: + with pd.option_context("string_storage", any_string_dtype_param): + yield "string" @pytest.fixture(params=tm.DATETIME64_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ab1dadf4d2dfa..4c487fa6af7e4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,17 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + Sequence, + TypeVar, + cast, +) import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, missing as libmissing, @@ -11,9 +19,12 @@ from pandas._libs.arrays import NDArrayBacked from pandas._typing import ( Dtype, + NpDtype, + PositionalIndexer, Scalar, type_t, ) +from pandas.compat import pa_version_under1p0 from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ( @@ -32,20 +43,45 @@ from pandas.core 
import ops from pandas.core.array_algos import masked_reductions +from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( FloatingArray, IntegerArray, PandasArray, ) +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.core.strings.object_array import ObjectStringArrayMixin if TYPE_CHECKING: + from typing import Literal + import pyarrow + from pandas.core.arrays.string_arrow import ArrowStringArray + + StringStorage = Literal["python", "pyarrow"] + + +def _validate_string_storage(storage: StringStorage) -> None: + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + if storage == "pyarrow" and pa_version_under1p0: + raise ImportError("pyarrow>=1.0.0 is required for PyArrow backed StringArray.") + + +def _get_string_storage(storage: StringStorage | None) -> StringStorage: + if storage is None: + storage = get_option("mode.string_storage") + _validate_string_storage(storage) + return storage + @register_extension_dtype class StringDtype(ExtensionDtype): @@ -125,7 +161,10 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class StringArray(PandasArray): +StringArrayT = TypeVar("StringArrayT", bound="StringArray") + + +class StringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): """ Extension array for string data. @@ -152,6 +191,9 @@ class StringArray(PandasArray): copy : bool, default False Whether to copy the array of data. + storage : {"python", "pyarrow"}, optional + If not given, the value of ``pd.options.mode.string_storage``. 
+ Attributes ---------- None @@ -201,6 +243,633 @@ class StringArray(PandasArray): Length: 3, dtype: boolean """ + _dtype = StringDtype() + _array: ObjectStringArray | ArrowStringArray + _storage: StringStorage + + @property + def storage(self) -> str: + return self._storage + + # ------------------------------------------------------------------------ + # Constructors + # ------------------------------------------------------------------------ + + def __init__(self, values, copy=False, *, storage: StringStorage | None = None): + from pandas.core.arrays.string_arrow import ArrowStringArray + + storage = _get_string_storage(storage) + self._storage = storage + klass = ObjectStringArray if storage == "python" else ArrowStringArray + # error: Incompatible types in assignment (expression has type + # "ObjectStringArrayMixin", variable has type "Union[ObjectStringArray, + # ArrowStringArray]") + self._array = klass(values, copy=copy) # type: ignore[assignment] + + def _from_array(self, array): + klass = type(self) + new_string_array = klass.__new__(klass) + new_string_array._storage = self._storage + new_string_array._array = array + return new_string_array + + def _maybe_wrap_result(self, result): + if isinstance(result, type(self._array)): + return self._from_array(result) + return result + + @classmethod + def _from_sequence( + cls, + scalars, + *, + dtype: Dtype | None = None, + copy=False, + storage: StringStorage | None = None, + ): + from pandas.core.arrays.string_arrow import ArrowStringArray + + if dtype: + assert dtype == "string" + + new_string_array = cls.__new__(cls) + storage = _get_string_storage(storage) + new_string_array._storage = storage + klass = ObjectStringArray if storage == "python" else ArrowStringArray + # error: "Type[ObjectStringArrayMixin]" has no attribute "_from_sequence" + new_string_array._array = klass._from_sequence( # type: ignore[attr-defined] + scalars, dtype=dtype, copy=copy + ) + return new_string_array + + @classmethod + def 
_from_sequence_of_strings( + cls, + strings, + *, + dtype: Dtype | None = None, + copy=False, + storage: StringStorage | None = None, + ): + from pandas.core.arrays.string_arrow import ArrowStringArray + + if dtype: + assert dtype == "string" + + new_string_array = cls.__new__(cls) + storage = _get_string_storage(storage) + new_string_array._storage = storage + klass = ObjectStringArray if storage == "python" else ArrowStringArray + # error: "Type[ObjectStringArrayMixin]" has no attribute + # "_from_sequence_of_strings" + tmp = klass._from_sequence_of_strings # type: ignore[attr-defined] + new_string_array._array = tmp(strings, dtype=dtype, copy=copy) + return new_string_array + + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + + def __getitem__(self, item: PositionalIndexer) -> Any: + result = self._array[item] + return self._maybe_wrap_result(result) + + def __setitem__(self, key, value) -> None: + if isinstance(value, type(self)): + value = value._array + self._array[key] = value + + def __len__(self) -> int: + return len(self._array) + + def to_numpy( + self, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + return self._array.to_numpy(dtype=dtype, copy=copy, na_value=na_value) + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + + @property + def dtype(self) -> StringDtype: + return self._dtype + + @property + def nbytes(self) -> int: + return self._array.nbytes + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if is_dtype_equal(dtype, self.dtype): + if copy: + return 
self.copy() + return self + + return self._array.astype(dtype, copy=copy) + + def isna(self) -> np.ndarray: + return self._array.isna() + + # def _values_for_argsort(self) -> np.ndarray: + # return self._array._values_for_argsort() + + # def argsort( + # self, + # ascending: bool = True, + # kind: str = "quicksort", + # na_position: str = "last", + # *args, + # **kwargs, + # ) -> np.ndarray: + + # def argmin(self, skipna: bool = True) -> int: + + # def argmax(self, skipna: bool = True) -> int: + + # def fillna( + # self, + # value: object | ArrayLike | None = None, + # method: FillnaOptions | None = None, + # limit: int | None = None, + # ): + + # def dropna(self): + # """ + # Return ExtensionArray without NA values. + + # Returns + # ------- + # valid : ExtensionArray + # """ + # # error: Unsupported operand type for ~ ("ExtensionArray") + # return self[~self.isna()] # type: ignore[operator] + + # def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: + # """ + # Shift values by desired number. + + # Newly introduced missing values are filled with + # ``self.dtype.na_value``. + + # .. versionadded:: 0.24.0 + + # Parameters + # ---------- + # periods : int, default 1 + # The number of periods to shift. Negative values are allowed + # for shifting backwards. + + # fill_value : object, optional + # The scalar value to use for newly introduced missing values. + # The default is ``self.dtype.na_value``. + + # .. versionadded:: 0.24.0 + + # Returns + # ------- + # ExtensionArray + # Shifted. + + # Notes + # ----- + # If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is + # returned. + + # If ``periods > len(self)``, then an array of size + # len(self) is returned, with all values filled with + # ``self.dtype.na_value``. + # """ + # # Note: this implementation assumes that `self.dtype.na_value` can be + # # stored in an instance of your ExtensionArray with `self.dtype`. 
+ # if not len(self) or periods == 0: + # return self.copy() + + # if isna(fill_value): + # fill_value = self.dtype.na_value + + # empty = self._from_sequence( + # [fill_value] * min(abs(periods), len(self)), dtype=self.dtype + # ) + # if periods > 0: + # a = empty + # b = self[:-periods] + # else: + # a = self[abs(periods) :] + # b = empty + # return self._concat_same_type([a, b]) + + # def unique(self: ExtensionArrayT) -> ExtensionArrayT: + # """ + # Compute the ExtensionArray of unique values. + + # Returns + # ------- + # uniques : ExtensionArray + # """ + # uniques = unique(self.astype(object)) + # return self._from_sequence(uniques, dtype=self.dtype) + + # def searchsorted(self, value, side="left", sorter=None): + # """ + # Find indices where elements should be inserted to maintain order. + + # .. versionadded:: 0.24.0 + + # Find the indices into a sorted array `self` (a) such that, if the + # corresponding elements in `value` were inserted before the indices, + # the order of `self` would be preserved. + + # Assuming that `self` is sorted: + + # ====== ================================ + # `side` returned index `i` satisfies + # ====== ================================ + # left ``self[i-1] < value <= self[i]`` + # right ``self[i-1] <= value < self[i]`` + # ====== ================================ + + # Parameters + # ---------- + # value : array_like + # Values to insert into `self`. + # side : {'left', 'right'}, optional + # If 'left', the index of the first suitable location found is given. + # If 'right', return the last such index. If there is no suitable + # index, return either 0 or N (where N is the length of `self`). + # sorter : 1-D array_like, optional + # Optional array of integer indices that sort array a into ascending + # order. They are typically the result of argsort. + + # Returns + # ------- + # array of ints + # Array of insertion points with the same shape as `value`. 
+ + # See Also + # -------- + # numpy.searchsorted : Similar method from NumPy. + # """ + # # Note: the base tests provided by pandas only test the basics. + # # We do not test + # # 1. Values outside the range of the `data_for_sorting` fixture + # # 2. Values between the values in the `data_for_sorting` fixture + # # 3. Missing values. + # arr = self.astype(object) + # return arr.searchsorted(value, side=side, sorter=sorter) + + def equals(self, other: object) -> bool: + # TODO: allow ObjectStringArray and ArrowStringArray to compare equal + if isinstance(other, type(self)): + other = other._array + return self._array.equals(other) + + # def isin(self, values) -> np.ndarray: + # """ + # Pointwise comparison for set containment in the given values. + + # Roughly equivalent to `np.array([x in values for x in self])` + + # Parameters + # ---------- + # values : Sequence + + # Returns + # ------- + # np.ndarray[bool] + # """ + # return isin(np.asarray(self), values) + + # def _values_for_factorize(self) -> tuple[np.ndarray, Any]: + # """ + # Return an array and missing value suitable for factorization. + + # Returns + # ------- + # values : ndarray + + # An array suitable for factorization. This should maintain order + # and be a supported dtype (Float64, Int64, UInt64, String, Object). + # By default, the extension array is cast to object dtype. + # na_value : object + # The value in `values` to consider missing. This will be treated + # as NA in the factorization routines, so it will be coded as + # `na_sentinel` and not included in `uniques`. By default, + # ``np.nan`` is used. + + # Notes + # ----- + # The values returned by this method are also used in + # :func:`pandas.util.hash_pandas_object`. 
+ # """ + # return self.astype(object), np.nan + + def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + return self._array.factorize(na_sentinel=na_sentinel) + + # def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + # """ + # Encode the extension array as an enumerated type. + + # Parameters + # ---------- + # na_sentinel : int, default -1 + # Value to use in the `codes` array to indicate missing values. + + # Returns + # ------- + # codes : ndarray + # An integer NumPy array that's an indexer into the original + # ExtensionArray. + # uniques : ExtensionArray + # An ExtensionArray containing the unique values of `self`. + + # .. note:: + + # uniques will *not* contain an entry for the NA value of + # the ExtensionArray if there are any missing values present + # in `self`. + + # See Also + # -------- + # factorize : Top-level factorize method that dispatches here. + + # Notes + # ----- + # :meth:`pandas.factorize` offers a `sort` keyword as well. + # """ + # # Implementer note: There are two ways to override the behavior of + # # pandas.factorize + # # 1. _values_for_factorize and _from_factorize. + # # Specify the values passed to pandas' internal factorization + # # routines, and how to convert from those values back to the + # # original ExtensionArray. + # # 2. ExtensionArray.factorize. + # # Complete control over factorization. + # arr, na_value = self._values_for_factorize() + + # codes, uniques = factorize_array( + # arr, na_sentinel=na_sentinel, na_value=na_value + # ) + + # uniques = self._from_factorized(uniques, self) + # # error: Incompatible return value type (got "Tuple[ndarray, ndarray]", + # # expected "Tuple[ndarray, ExtensionArray]") + # return codes, uniques # type: ignore[return-value] + + # _extension_array_shared_docs[ + # "repeat" + # ] = """ + # Repeat elements of a %(klass)s. 
+ + # Returns a new %(klass)s where each element of the current %(klass)s + # is repeated consecutively a given number of times. + + # Parameters + # ---------- + # repeats : int or array of ints + # The number of repetitions for each element. This should be a + # non-negative integer. Repeating 0 times will return an empty + # %(klass)s. + # axis : None + # Must be ``None``. Has no effect but is accepted for compatibility + # with numpy. + + # Returns + # ------- + # repeated_array : %(klass)s + # Newly created %(klass)s with repeated elements. + + # See Also + # -------- + # Series.repeat : Equivalent function for Series. + # Index.repeat : Equivalent function for Index. + # numpy.repeat : Similar method for :class:`numpy.ndarray`. + # ExtensionArray.take : Take arbitrary positions. + + # Examples + # -------- + # >>> cat = pd.Categorical(['a', 'b', 'c']) + # >>> cat + # ['a', 'b', 'c'] + # Categories (3, object): ['a', 'b', 'c'] + # >>> cat.repeat(2) + # ['a', 'a', 'b', 'b', 'c', 'c'] + # Categories (3, object): ['a', 'b', 'c'] + # >>> cat.repeat([1, 2, 3]) + # ['a', 'b', 'b', 'c', 'c', 'c'] + # Categories (3, object): ['a', 'b', 'c'] + # """ + + # @Substitution(klass="ExtensionArray") + # @Appender(_extension_array_shared_docs["repeat"]) + # def repeat(self, repeats: int | Sequence[int], axis: int | None = None): + # nv.validate_repeat((), {"axis": axis}) + # ind = np.arange(len(self)).repeat(repeats) + # return self.take(ind) + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ): + result = self._array.take(indices, allow_fill=allow_fill, fill_value=fill_value) + return self._from_array(result) + + def copy(self) -> StringArray: + result = self._array.copy() + return self._from_array(result) + + # def view(self, dtype: Dtype | None = None) -> 
ArrayLike: + # """ + # Return a view on the array. + + # Parameters + # ---------- + # dtype : str, np.dtype, or ExtensionDtype, optional + # Default None. + + # Returns + # ------- + # ExtensionArray or np.ndarray + # A view on the :class:`ExtensionArray`'s data. + # """ + # # NB: + # # - This must return a *new* object referencing the same data, not self. + # # - The only case that *must* be implemented is with dtype=None, + # # giving a view with the same dtype as self. + # if dtype is not None: + # raise NotImplementedError(dtype) + # return self[:] + + # ------------------------------------------------------------------------ + # Printing + # ------------------------------------------------------------------------ + + # def __repr__(self) -> str: + # from pandas.io.formats.printing import format_object_summary + + # # the short repr has no trailing newline, while the truncated + # # repr does. So we include a newline in our template, and strip + # # any trailing newlines from format_object_summary + # data = format_object_summary( + # self, self._formatter(), indent_for_name=False + # ).rstrip(", \n") + # class_name = f"<{type(self).__name__}>\n" + # return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" + + # def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + # """ + # Formatting function for scalar values. + + # This is used in the default '__repr__'. The returned formatting + # function receives instances of your scalar type. + + # Parameters + # ---------- + # boxed : bool, default False + # An indicator for whether or not your array is being printed + # within a Series, DataFrame, or Index (True), or just by + # itself (False). This may be useful if you want scalar values + # to appear differently within a Series versus on its own (e.g. + # quoted or not). + + # Returns + # ------- + # Callable[[Any], str] + # A callable that gets instances of the scalar type and + # returns a string.
By default, :func:`repr` is used + # when ``boxed=False`` and :func:`str` is used when + # ``boxed=True``. + # """ + # if boxed: + # return str + # return repr + + # ------------------------------------------------------------------------ + # Reshaping + # ------------------------------------------------------------------------ + + @classmethod + def _concat_same_type( + cls: type[StringArrayT], to_concat: Sequence[StringArrayT] + ) -> StringArrayT: + from pandas.core.arrays.string_arrow import ArrowStringArray + + result: ObjectStringArray | ArrowStringArray + if all(arr.storage == "python" for arr in to_concat): + to_concat_object = cast( + Sequence[ObjectStringArray], [arr._array for arr in to_concat] + ) + result = ObjectStringArray._concat_same_type(to_concat_object) + storage = "python" + elif all(arr.storage == "pyarrow" for arr in to_concat): + to_concat_arrow = [arr._array for arr in to_concat] + result = ArrowStringArray._concat_same_type(to_concat_arrow) + storage = "pyarrow" + else: + raise NotImplementedError + + new_string_array = cls.__new__(cls) + new_string_array._storage = storage + new_string_array._array = result + return new_string_array + + # The _can_hold_na attribute is set to True so that pandas internals + # will use the ExtensionDtype.na_value as the NA value in operations + # such as take(), reindex(), shift(), etc. 
In addition, those results + # will then be of the ExtensionArray subclass rather than an array + # of objects + _can_hold_na = True + + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + return self._array._reduce(name, skipna=skipna, **kwargs) + + # def __hash__(self) -> int: + # raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + + # ------------------------------------------------------------------------ + # Other + # ------------------------------------------------------------------------ + + # @classmethod + # def _empty(cls, shape, dtype) -> StringArray: + # values = np.empty(shape, dtype=object) + # values[:] = libmissing.NA + # return cls(values).astype(dtype, copy=False) + + # def _values_for_factorize(self): + # arr = self._ndarray.copy() + # mask = self.isna() + # arr[mask] = -1 + # return arr, -1 + + # ------------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. 
+ # ------------------------------------------------------------------------ + + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + return np.asarray(self._array, dtype=dtype) + + def __arrow_array__(self, type=None): + return self._array.__arrow_array__(type) + + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + return self._array.min(axis=axis, skipna=skipna, **kwargs) + + def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + return self._array.max(axis=axis, skipna=skipna, **kwargs) + + def value_counts(self, dropna: bool = True): + return self._array.value_counts(dropna=dropna) + + def memory_usage(self, deep: bool = False) -> int: + return self._array.memory_usage(deep=deep) + + # ------------------------------------------------------------------------ + # OpsMixin interface + # ------------------------------------------------------------------------ + + def _cmp_method(self, other, op): + return self._array._cmp_method(other, op) + + def _logical_method(self, other, op): + return self._array._logical_method(other, op) + + def _arith_method(self, other, op): + return self._array._arith_method(other, op) + + # ------------------------------------------------------------------------ + # String methods interface + # ------------------------------------------------------------------------ + + _str_na_value = StringDtype.na_value + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + result = self._array._str_map( + f, na_value=na_value, dtype=dtype, convert=convert + ) + return self._maybe_wrap_result(result) + + # TODO: dispatch all str accessor methods to array instead of wrapping result of + # object fallback (_str_map) + + +class ObjectStringArray(PandasArray): # undo the PandasArray hack _typ = "extension" @@ -258,7 +927,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod - def _empty(cls, shape, dtype) -> 
StringArray: + def _empty(cls, shape, dtype) -> ObjectStringArray: values = np.empty(shape, dtype=object) values[:] = libmissing.NA return cls(values).astype(dtype, copy=False) @@ -377,7 +1046,7 @@ def memory_usage(self, deep: bool = False) -> int: def _cmp_method(self, other, op): from pandas.arrays import BooleanArray - if isinstance(other, StringArray): + if isinstance(other, ObjectStringArray): other = other._ndarray mask = isna(self) | isna(other) @@ -397,7 +1066,7 @@ def _cmp_method(self, other, op): result = np.empty_like(self._ndarray, dtype="object") result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return type(self)(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -457,7 +1126,7 @@ def _str_map( result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value ) - return StringArray(result) + return type(self)(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3cf471e381da9..fde2ab7066a60 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -11,16 +11,12 @@ import numpy as np -from pandas._libs import ( - lib, - missing as libmissing, -) +from pandas._libs import lib from pandas._typing import ( Dtype, NpDtype, PositionalIndexer, Scalar, - type_t, ) from pandas.compat import ( pa_version_under1p0, @@ -43,7 +39,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna from pandas.core import missing @@ -86,93 +81,6 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) -@register_extension_dtype -class ArrowStringDtype(StringDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. 
versionadded:: 1.2.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> from pandas.core.arrays.string_arrow import ArrowStringDtype - >>> ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - def __init__(self): - _chk_pyarrow_available() - - @property - def type(self) -> type[str]: - return str - - @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: # type: ignore[override] - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( # type: ignore[override] - self, array: pa.Array | pa.ChunkedArray - ) -> ArrowStringArray: - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - # TODO: Inherit directly from BaseStringArrayMethods. 
Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -222,8 +130,10 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Length: 4, dtype: arrow_string """ - def __init__(self, values): - self._dtype = ArrowStringDtype() + _dtype = StringDtype() + + def __init__(self, values, copy: bool = False): + # copy is ignored, for compatibility with ObjectStringArray if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -261,9 +171,9 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. + An instance of 'StringDtype'. """ return self._dtype @@ -465,6 +375,12 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + raise NotImplementedError + + def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: + raise NotImplementedError + @property def nbytes(self) -> int: """ @@ -472,6 +388,9 @@ def nbytes(self) -> int: """ return self._data.nbytes + def memory_usage(self, deep: bool = False) -> int: + return self.nbytes + def isna(self) -> np.ndarray: """ Boolean NumPy array indicating if each value is missing. 
@@ -761,7 +680,7 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------------ # String methods interface - _str_na_value = ArrowStringDtype.na_value + _str_na_value = StringDtype.na_value def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0db0c5a57207d..57d09cd8d78f5 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -525,6 +525,18 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory([None, "warn", "raise"]), ) +string_storage_doc = """ +: string + The default storage for StringArray. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) # Set up the io.excel specific reader configuration. reader_engine_doc = """ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4abb5d98202f6..0e40b8c4ff14e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -420,17 +420,13 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": return obj try: diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py index 28aba7c9ce0b3..943686fc85a05 100644 --- a/pandas/core/strings/__init__.py +++ b/pandas/core/strings/__init__.py @@ -25,7 +25,6 @@ # - StringArray # - 
PandasArray # - Categorical -# - ArrowStringArray from pandas.core.strings.accessor import StringMethods from pandas.core.strings.base import BaseStringArrayMethods diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 7ce4abe904f3b..8fa1459b36df4 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -174,7 +174,6 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray def rep(x, r): if x is libmissing.NA: @@ -186,7 +185,7 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, (StringArray, ArrowStringArray)): + if isinstance(self, StringArray): # Not going through map, so we have to do this here. result = type(self)._from_sequence(result) return result diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c9533e239abe0..929e77c2e5609 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -1,5 +1,5 @@ """ -This module tests the functionality of StringArray and ArrowStringArray. +This module tests the functionality of StringArray. 
Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ @@ -14,69 +14,46 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, ) skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -@pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] -) -def dtype(request): - return request.param - - -@pytest.fixture -def dtype_object(dtype): - if dtype == "string": - return pd.StringDtype - else: - return ArrowStringDtype - - -@pytest.fixture( - params=[ - pd.arrays.StringArray, - pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), - ] -) -def cls(request): - return request.param - - -def test_repr(dtype): - df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) +def test_repr(string_storage): + with pd.option_context("string_storage", string_storage): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = f"0 a\n1 \n2 b\nName: A, dtype: {dtype}" + expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype == "arrow_string" else "StringArray" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: {dtype}" + expected = "\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected -def test_none_to_nan(cls): - a = cls._from_sequence(["a", None, "b"]) +def test_none_to_nan(string_storage): + with pd.option_context("string_storage", string_storage): + a = StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None assert a[1] is pd.NA -def test_setitem_validates(cls): - arr = cls._from_sequence(["a", "b"]) +def test_setitem_validates(string_storage): + with pd.option_context("string_storage", string_storage): + arr = StringArray._from_sequence(["a", "b"]) 
- if cls is pd.arrays.StringArray: + if string_storage == "python": msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(ValueError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if string_storage == "python": msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -84,17 +61,18 @@ def test_setitem_validates(cls): arr[:] = np.array([1, 2]) -def test_setitem_with_scalar_string(dtype): +def test_setitem_with_scalar_string(string_storage): # is_float_dtype considers some strings, like 'd', to be floats # which can cause issues. - arr = pd.array(["a", "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", "c"], dtype="string") arr[0] = "d" - expected = pd.array(["d", "c"], dtype=dtype) + expected = pd.array(["d", "c"], dtype="string") tm.assert_extension_array_equal(arr, expected) -def test_astype_roundtrip(dtype, request): - if dtype == "arrow_string": +def test_astype_roundtrip(string_storage, request): + if string_storage == "pyarrow": reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) request.node.add_marker(mark) @@ -107,15 +85,16 @@ def test_astype_roundtrip(dtype, request): ser = pd.Series(pd.date_range("2000", periods=12)) ser[0] = None - casted = ser.astype(dtype) - assert is_dtype_equal(casted.dtype, dtype) + with pd.option_context("string_storage", string_storage): + casted = ser.astype("string") + assert is_dtype_equal(casted.dtype, "string") result = casted.astype("datetime64[ns]") tm.assert_series_equal(result, ser) -def test_add(dtype, request): - if dtype == "arrow_string": +def test_add(string_storage, string_storage2, string_storage3, request): + if string_storage == "pyarrow" or string_storage2 == "pyarrow": reason = ( "unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -123,32 +102,39 @@ def 
test_add(dtype, request): mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) - a = pd.Series(["a", "b", "c", None, None], dtype=dtype) - b = pd.Series(["x", "y", None, "z", None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.Series(["a", "b", "c", None, None], dtype="string") + + with pd.option_context("string_storage", string_storage2): + b = pd.Series(["x", "y", None, "z", None], dtype="string") result = a + b - expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) + with pd.option_context("string_storage", string_storage3): + expected = pd.Series(["ax", "by", None, None, None], dtype="string") tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) + with pd.option_context("string_storage", string_storage3): + expected = pd.Series(["xa", "yb", None, None, None], dtype="string") tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) + with pd.option_context("string_storage", string_storage3): + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") tm.assert_series_equal(result, expected) -def test_add_2d(dtype, request): - if dtype == "arrow_string": +def test_add_2d(string_storage, request): + if string_storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) - a = pd.array(["a", "b", "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", "b", "c"], dtype="string") b = np.array([["a", "b", "c"]], dtype=object) with pytest.raises(ValueError, match="3 != 1"): a + b @@ -158,33 +144,39 @@ def test_add_2d(dtype, request): s + b -def test_add_sequence(dtype, request): - if dtype == "arrow_string": +def 
test_add_sequence(string_storage, string_storage2, request): + if string_storage == "pyarrow": reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) - a = pd.array(["a", "b", None, None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", "b", None, None], dtype="string") other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype=dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.array(["ax", None, None, None], dtype="string") tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype=dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.array(["xa", None, None, None], dtype="string") tm.assert_extension_array_equal(result, expected) -def test_mul(dtype, request): - if dtype == "arrow_string": +def test_mul(string_storage, string_storage2, request): + if string_storage == "pyarrow": reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) - a = pd.array(["a", "b", None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", "b", None], dtype="string") result = a * 2 - expected = pd.array(["aa", "bb", None], dtype=dtype) + + with pd.option_context("string_storage", string_storage2): + expected = pd.array(["aa", "bb", None], dtype="string") tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -192,39 +184,46 @@ def test_mul(dtype, request): @pytest.mark.xfail(reason="GH-28527") -def test_add_strings(dtype): - arr = pd.array(["a", "b", "c", "d"], dtype=dtype) +def test_add_strings(string_storage, string_storage2): + with pd.option_context("string_storage", string_storage): + arr = 
pd.array(["a", "b", "c", "d"], dtype="string") df = pd.DataFrame([["t", "u", "v", "w"]]) assert arr.__add__(df) is NotImplemented result = arr + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype(dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") tm.assert_frame_equal(result, expected) result = df + arr - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype(dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(dtype): - arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) +def test_add_frame(string_storage, string_storage2): + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", "b", np.nan, np.nan], dtype="string") df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert arr.__add__(df) is NotImplemented result = arr + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") tm.assert_frame_equal(result, expected) result = df + arr - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators, dtype): +def test_comparison_methods_scalar(all_compare_operators, string_storage): op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", None, "c"], dtype="string") other = "a" result = getattr(a, op_name)(other) expected = np.array([getattr(item, 
op_name)(other) for item in a], dtype=object) @@ -232,22 +231,26 @@ def test_comparison_methods_scalar(all_compare_operators, dtype): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_scalar_pd_na(all_compare_operators, dtype): +def test_comparison_methods_scalar_pd_na(all_compare_operators, string_storage): op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", None, "c"], dtype="string") result = getattr(a, op_name)(pd.NA) expected = pd.array([None, None, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, request): +def test_comparison_methods_scalar_not_string( + all_compare_operators, string_storage, request +): if all_compare_operators not in ["__eq__", "__ne__"]: reason = "comparison op not supported between instances of 'str' and 'int'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) request.node.add_marker(mark) op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", None, "c"], dtype="string") other = 42 result = getattr(a, op_name)(other) expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ @@ -257,8 +260,8 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(all_compare_operators, dtype, request): - if dtype == "arrow_string": +def test_comparison_methods_array(all_compare_operators, string_storage, request): + if string_storage == "pyarrow": mark = pytest.mark.xfail( raises=AssertionError, reason="left is not an ExtensionArray" ) @@ -266,7 +269,8 @@ def test_comparison_methods_array(all_compare_operators, dtype, request): op_name = all_compare_operators - a = pd.array(["a", 
None, "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + a = pd.array(["a", None, "c"], dtype="string") other = [None, None, "c"] result = getattr(a, op_name)(other) expected = np.empty_like(a, dtype="object") @@ -279,31 +283,31 @@ def test_comparison_methods_array(all_compare_operators, dtype, request): tm.assert_extension_array_equal(result, expected) -def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: +def test_constructor_raises(string_storage): + if string_storage == "python": msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowStringArray" with pytest.raises(ValueError, match=msg): - cls(np.array(["a", "b"], dtype="S1")) + StringArray(np.array(["a", "b"], dtype="S1"), storage=string_storage) with pytest.raises(ValueError, match=msg): - cls(np.array([])) + StringArray(np.array([]), storage=string_storage) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) + StringArray(np.array(["a", np.nan], dtype=object), storage=string_storage) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) + StringArray(np.array(["a", None], dtype=object), storage=string_storage) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) + StringArray(np.array(["a", pd.NaT], dtype=object), storage=string_storage) @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequence_no_mutate(copy, cls, request): - if cls is ArrowStringArray and copy is False: +def test_from_sequence_no_mutate(copy, string_storage, request): + if string_storage == "pyarrow" and copy is False: mark = pytest.mark.xfail( raises=AssertionError, reason="numpy array are different" ) @@ -312,14 +316,17 @@ def test_from_sequence_no_mutate(copy, cls, request): nan_arr = np.array(["a", np.nan], dtype=object) na_arr = np.array(["a", pd.NA], dtype=object) - result = cls._from_sequence(nan_arr, copy=copy) + with 
pd.option_context("string_storage", string_storage): + result = StringArray._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if string_storage == "pyarrow": import pyarrow as pa - expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + expected = StringArray( + pa.array(na_arr, type=pa.string(), from_pandas=True), storage="pyarrow" + ) else: - expected = cls(na_arr) + expected = StringArray(na_arr, storage="python") tm.assert_extension_array_equal(result, expected) @@ -327,29 +334,33 @@ def test_from_sequence_no_mutate(copy, cls, request): tm.assert_numpy_array_equal(nan_arr, expected) -def test_astype_int(dtype): - arr = pd.array(["1", "2", "3"], dtype=dtype) +def test_astype_int(string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.array(["1", "2", "3"], dtype="string") result = arr.astype("int64") expected = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(result, expected) - arr = pd.array(["1", pd.NA, "3"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = pd.array(["1", pd.NA, "3"], dtype="string") msg = re.escape("int() argument must be a string, a bytes-like object or a number") with pytest.raises(TypeError, match=msg): arr.astype("int64") -def test_astype_nullable_int(dtype): - arr = pd.array(["1", pd.NA, "3"], dtype=dtype) +def test_astype_nullable_int(string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.array(["1", pd.NA, "3"], dtype="string") result = arr.astype("Int64") expected = pd.array([1, pd.NA, 3], dtype="Int64") tm.assert_extension_array_equal(result, expected) -def test_astype_float(dtype, any_float_allowed_nullable_dtype): +def test_astype_float(string_storage, any_float_allowed_nullable_dtype): # Don't compare arrays (37974) - ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string") 
result = ser.astype(any_float_allowed_nullable_dtype) expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype) tm.assert_series_equal(result, expected) @@ -357,21 +368,22 @@ def test_astype_float(dtype, any_float_allowed_nullable_dtype): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce(skipna, dtype): - arr = pd.Series(["a", "b", "c"], dtype=dtype) +def test_reduce(skipna, string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.Series(["a", "b", "c"], dtype="string") result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_min_max(method, skipna, dtype, request): - if dtype == "arrow_string": - reason = "'ArrowStringArray' object has no attribute 'max'" - mark = pytest.mark.xfail(raises=AttributeError, reason=reason) +def test_min_max(method, skipna, string_storage, request): + if string_storage == "pyarrow": + mark = pytest.mark.xfail(raises=NotImplementedError, reason="not implemented") request.node.add_marker(mark) - arr = pd.Series(["a", "b", "c", None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = pd.Series(["a", "b", "c", None], dtype="string") result = getattr(arr, method)(skipna=skipna) if skipna: expected = "a" if method == "min" else "c" @@ -382,18 +394,13 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) -def test_min_max_numpy(method, box, dtype, request): - if dtype == "arrow_string": - if box is pd.array: - raises = TypeError - reason = "'<=' not supported between instances of 'str' and 'NoneType'" - else: - raises = AttributeError - reason = "'ArrowStringArray' object has no attribute 'max'" - mark = pytest.mark.xfail(raises=raises, reason=reason) +def 
test_min_max_numpy(method, box, string_storage, request): + if string_storage == "pyarrow": + mark = pytest.mark.xfail(raises=NotImplementedError, reason="not implemented") request.node.add_marker(mark) - arr = box(["a", "b", "c", None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = box(["a", "b", "c", None], dtype="string") result = getattr(np, method)(arr) expected = "a" if method == "min" else "c" assert result == expected @@ -401,8 +408,9 @@ def test_min_max_numpy(method, box, dtype, request): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce_missing(skipna, dtype): - arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) +def test_reduce_missing(skipna, string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") result = arr.sum(skipna=skipna) if skipna: assert result == "abc" @@ -410,10 +418,10 @@ def test_reduce_missing(skipna, dtype): assert pd.isna(result) -def test_fillna_args(dtype, request): +def test_fillna_args(string_storage, string_storage2, request): # GH 37987 - if dtype == "arrow_string": + if string_storage == "pyarrow": reason = ( "Regex pattern \"Cannot set non-string value '1' into " "a StringArray.\" does not match 'Scalar must be NA or str'" @@ -421,14 +429,17 @@ def test_fillna_args(dtype, request): mark = pytest.mark.xfail(raises=AssertionError, reason=reason) request.node.add_marker(mark) - arr = pd.array(["a", pd.NA], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", pd.NA], dtype="string") res = arr.fillna(value="b") - expected = pd.array(["a", "b"], dtype=dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.array(["a", "b"], dtype="string") tm.assert_extension_array_equal(res, expected) res = arr.fillna(value=np.str_("b")) - expected = pd.array(["a", 
"b"], dtype=dtype) + with pd.option_context("string_storage", string_storage2): + expected = pd.array(["a", "b"], dtype="string") tm.assert_extension_array_equal(res, expected) msg = "Cannot set non-string value '1' into a StringArray." @@ -437,53 +448,57 @@ def test_fillna_args(dtype, request): @td.skip_if_no("pyarrow") -def test_arrow_array(dtype): +def test_arrow_array(string_storage): # protocol added in 0.15.0 import pyarrow as pa - data = pd.array(["a", "b", "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + data = pd.array(["a", "b", "c"], dtype="string") arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype == "arrow_string": + if string_storage == "pyarrow": expected = pa.chunked_array(expected) assert arr.equals(expected) @td.skip_if_no("pyarrow") -def test_arrow_roundtrip(dtype, dtype_object): +def test_arrow_roundtrip(string_storage): # roundtrip possible from arrow 1.0.0 import pyarrow as pa - data = pd.array(["a", "b", None], dtype=dtype) + with pd.option_context("string_storage", string_storage): + data = pd.array(["a", "b", None], dtype="string") df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) + assert isinstance(result["a"].dtype, StringDtype) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA @td.skip_if_no("pyarrow") -def test_arrow_load_from_zero_chunks(dtype, dtype_object): +def test_arrow_load_from_zero_chunks(string_storage): # GH-41040 import pyarrow as pa - data = pd.array([], dtype=dtype) + with pd.option_context("string_storage", string_storage): + data = pd.array([], dtype="string") df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" # Instantiate the same table with no chunks at all table = 
pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) + assert isinstance(result["a"].dtype, StringDtype) tm.assert_frame_equal(result, df) -def test_value_counts_na(dtype): - arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) +def test_value_counts_na(string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", "b", "a", pd.NA], dtype="string") result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") tm.assert_series_equal(result, expected) @@ -493,8 +508,9 @@ def test_value_counts_na(dtype): tm.assert_series_equal(result, expected) -def test_value_counts_with_normalize(dtype): - s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) +def test_value_counts_with_normalize(string_storage): + with pd.option_context("string_storage", string_storage): + s = pd.Series(["a", "b", "a", pd.NA], dtype="string") result = s.value_counts(normalize=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3 tm.assert_series_equal(result, expected) @@ -507,9 +523,10 @@ def test_value_counts_with_normalize(dtype): (["a", "b", None], np.array([False, False, True])), ], ) -def test_use_inf_as_na(values, expected, dtype): +def test_use_inf_as_na(values, expected, string_storage): # https://github.com/pandas-dev/pandas/issues/33655 - values = pd.array(values, dtype=dtype) + with pd.option_context("string_storage", string_storage): + values = pd.array(values, dtype="string") with pd.option_context("mode.use_inf_as_na", True): result = values.isna() tm.assert_numpy_array_equal(result, expected) @@ -523,43 +540,49 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype, request): +def test_memory_usage(string_storage, request): # GH 33963 - if dtype == "arrow_string": + if string_storage == "pyarrow": pytest.skip("not 
applicable") - series = pd.Series(["a", "b", "c"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + series = pd.Series(["a", "b", "c"], dtype="string") assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) -def test_astype_from_float_dtype(float_dtype, dtype): +def test_astype_from_float_dtype(float_dtype, string_storage, string_storage2): # https://github.com/pandas-dev/pandas/issues/36451 s = pd.Series([0.1], dtype=float_dtype) - result = s.astype(dtype) - expected = pd.Series(["0.1"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + result = s.astype("string") + with pd.option_context("string_storage", string_storage2): + expected = pd.Series(["0.1"], dtype="string") tm.assert_series_equal(result, expected) -def test_to_numpy_returns_pdna_default(dtype): - arr = pd.array(["a", pd.NA, "b"], dtype=dtype) +def test_to_numpy_returns_pdna_default(string_storage): + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", pd.NA, "b"], dtype="string") result = np.array(arr) expected = np.array(["a", pd.NA, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_na_value(dtype, nulls_fixture): +def test_to_numpy_na_value(string_storage, nulls_fixture): na_value = nulls_fixture - arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + with pd.option_context("string_storage", string_storage): + arr = pd.array(["a", pd.NA, "b"], dtype="string") result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) -def test_isin(dtype, request): - s = pd.Series(["a", "b", None], dtype=dtype) +def test_isin(string_storage, request): + with pd.option_context("string_storage", string_storage): + s = pd.Series(["a", "b", None], dtype="string") result = s.isin(["a", "c"]) expected = pd.Series([True, False, 
False]) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 3db8333798e36..137572d28606b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -5,10 +5,8 @@ from pandas.compat import pa_version_under1p0 -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, -) +import pandas as pd +from pandas.core.arrays.string_ import StringArray @pytest.mark.skipif( @@ -34,7 +32,7 @@ def test_constructor_not_string_type_raises(array, chunked): "ArrowStringArray requires a PyArrow (chunked) array of string type" ) with pytest.raises(ValueError, match=msg): - ArrowStringArray(arr) + StringArray(arr, storage="pyarrow") @pytest.mark.skipif( @@ -45,10 +43,8 @@ def test_pyarrow_not_installed_raises(): msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray") with pytest.raises(ImportError, match=msg): - ArrowStringDtype() - - with pytest.raises(ImportError, match=msg): - ArrowStringArray([]) + StringArray([], storage="pyarrow") - with pytest.raises(ImportError, match=msg): - ArrowStringArray._from_sequence(["a", None, "b"]) + with pd.option_context("string_storage", "pyarrow"): + with pytest.raises(ImportError, match=msg): + StringArray._from_sequence(["a", None, "b"]) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 99a5666926e10..0d8675e3a85a1 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -46,20 +46,19 @@ def test_astype_str(self, data): self.assert_series_equal(result, expected) @pytest.mark.parametrize( - "nullable_string_dtype", + "string_storage", [ - "string", + "python", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ], ) - def test_astype_string(self, data, nullable_string_dtype): + def 
test_astype_string(self, data, string_storage): # GH-33465 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) + with pd.option_context("string_storage", string_storage): + result = pd.Series(data[:5]).astype("string") + expected = pd.Series([str(x) for x in data[:5]], dtype="string") self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49aee76e10f6a..1e60af52435f5 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,28 +18,25 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd -from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ArrowStringDtype +from pandas.core.arrays.string_ import StringArray from pandas.tests.extension import base def split_array(arr): - if not isinstance(arr.dtype, ArrowStringDtype): + if arr.storage == "python": pytest.skip("chunked array n/a") def _split_array(arr): import pyarrow as pa - arrow_array = arr._data + arrow_array = arr._array._data split = len(arrow_array) // 2 arrow_array = pa.chunked_array( [*arrow_array[:split].chunks, *arrow_array[split:].chunks] ) assert arrow_array.num_chunks == 2 - return type(arr)(arrow_array) + return type(arr)(arrow_array, storage="pyarrow") return _split_array(arr) @@ -49,44 +46,43 @@ def chunked(request): return request.param -@pytest.fixture( - params=[ - StringDtype, - pytest.param( - ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def dtype(request): - return request.param() +@pytest.fixture(autouse=True) +def string_storage_setting(string_storage): + with pd.option_context("string_storage", string_storage): + yield + + +@pytest.fixture +def 
dtype(): + return pd.StringDtype() @pytest.fixture -def data(dtype, chunked): +def data(chunked): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - arr = dtype.construct_array_type()._from_sequence(strings) + arr = StringArray._from_sequence(strings) return split_array(arr) if chunked else arr @pytest.fixture -def data_missing(dtype, chunked): +def data_missing(chunked): """Length 2 array with [NA, Valid]""" - arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"]) + arr = StringArray._from_sequence([pd.NA, "A"]) return split_array(arr) if chunked else arr @pytest.fixture -def data_for_sorting(dtype, chunked): - arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"]) +def data_for_sorting(chunked): + arr = StringArray._from_sequence(["B", "C", "A"]) return split_array(arr) if chunked else arr @pytest.fixture -def data_missing_for_sorting(dtype, chunked): - arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(chunked): + arr = StringArray._from_sequence(["B", pd.NA, "A"]) return split_array(arr) if chunked else arr @@ -96,10 +92,8 @@ def na_value(): @pytest.fixture -def data_for_grouping(dtype, chunked): - arr = dtype.construct_array_type()._from_sequence( - ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] - ) +def data_for_grouping(chunked): + arr = StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) return split_array(arr) if chunked else arr @@ -109,7 +103,7 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if isinstance(data.dtype, ArrowStringDtype): + if data.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) @@ -120,8 +114,8 @@ class TestConstructors(base.BaseConstructorsTests): class 
TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_transpose(self, data, request): + if data.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -132,8 +126,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_setitem_preserves_views(self, data, request): + if data.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 1583b3f91bea2..c477ce117bf3f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( Categorical, @@ -585,22 +583,19 @@ def test_astype_empty_dtype_dict(self): "data, dtype", [ (["x", "y", "z"], "string"), - pytest.param( - ["x", "y", "z"], - "arrow_string", - marks=td.skip_if_no("pyarrow", min_version="1.0.0"), - ), (["x", "y", "z"], "category"), (3 * [Timestamp("2020-01-01", tz="UTC")], None), (3 * [Interval(0, 1)], None), ], ) @pytest.mark.parametrize("errors", ["raise", "ignore"]) - def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): + def test_astype_ignores_errors_for_extension_dtypes( + self, data, dtype, errors, string_storage + ): # https://github.com/pandas-dev/pandas/issues/35471 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - df = DataFrame(Series(data, dtype=dtype)) + with option_context("string_storage", string_storage): + df = DataFrame(Series(data, dtype=dtype)) if errors == "ignore": 
expected = df result = df.astype(float, errors=errors) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ae6425cd93ac5..11e68cab058d8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -811,10 +811,11 @@ def test_additional_extension_arrays(self, pa): @td.skip_if_no("pyarrow", min_version="1.0.0") def test_pyarrow_backed_string_array(self, pa): - # test ArrowStringArray supported through the __arrow_array__ protocol - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + # test StringArray(..., storage="pyarrow") supported through the __arrow_array__ + # protocol - df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")}) + with pd.option_context("string_storage", "pyarrow"): + df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string")}) check_round_trip(df, pa, expected=df) @td.skip_if_no("pyarrow") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ffaecf1576364..3e439144cf3f5 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -10,8 +10,8 @@ import pytest from pandas._libs.tslibs import iNaT -import pandas.util._test_decorators as td +import pandas as pd from pandas import ( NA, Categorical, @@ -250,23 +250,19 @@ def test_td64_series_astype_object(self): "data, dtype", [ (["x", "y", "z"], "string"), - pytest.param( - ["x", "y", "z"], - "arrow_string", - marks=td.skip_if_no("pyarrow", min_version="1.0.0"), - ), (["x", "y", "z"], "category"), (3 * [Timestamp("2020-01-01", tz="UTC")], None), (3 * [Interval(0, 1)], None), ], ) @pytest.mark.parametrize("errors", ["raise", "ignore"]) - def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): + def test_astype_ignores_errors_for_extension_dtypes( + self, data, dtype, errors, string_storage + ): # https://github.com/pandas-dev/pandas/issues/35471 - from 
pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - ser = Series(data, dtype=dtype) + with pd.option_context("string_storage", string_storage): + ser = Series(data, dtype=dtype) if errors == "ignore": expected = ser result = ser.astype(float, errors="ignore") diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 9a64877cb92ff..7cb4a129d8917 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -1,8 +1,7 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - +import pandas as pd from pandas import ( CategoricalDtype, DataFrame, @@ -11,7 +10,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 class TestUpdate: @@ -88,13 +86,6 @@ def test_update_from_non_series(self, series, other, expected): "data, other, expected, dtype", [ (["a", None], [None, "b"], ["a", "b"], "string"), - pytest.param( - ["a", None], - [None, "b"], - ["a", "b"], - "arrow_string", - marks=td.skip_if_no("pyarrow", min_version="1.0.0"), - ), ([1, None], [None, 2], [1, 2], "Int64"), ([True, None], [None, False], [True, False], "boolean"), ( @@ -111,10 +102,13 @@ def test_update_from_non_series(self, series, other, expected): ), ], ) - def test_update_extension_array_series(self, data, other, expected, dtype): - result = Series(data, dtype=dtype) - other = Series(other, dtype=dtype) - expected = Series(expected, dtype=dtype) + def test_update_extension_array_series( + self, data, other, expected, dtype, string_storage + ): + with pd.option_context("string_storage", string_storage): + result = Series(data, dtype=dtype) + other = Series(other, dtype=dtype) + expected = Series(expected, dtype=dtype) result.update(other) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ec8b5bfa11ad5..e9e89ea6442b2 100644 --- 
a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -125,10 +125,12 @@ def test_api_per_method( method(*args, **kwargs) -def test_api_for_categorical(any_string_method, any_string_dtype, request): +def test_api_for_categorical( + any_string_method, any_string_dtype, any_string_dtype_param, request +): # https://github.com/pandas-dev/pandas/issues/10661 - if any_string_dtype == "arrow_string": + if any_string_dtype_param == "pyarrow": # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented") request.node.add_marker(mark) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 98f3fc859976e..ec5ae7c67875e 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -366,7 +366,7 @@ def test_len_mixed(): ) def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected): if index_or_series is Index and not any_string_dtype == "object": - pytest.skip("Index cannot yet be backed by a StringArray/ArrowStringArray") + pytest.skip("Index cannot yet be backed by a StringArray") obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype