diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f6d2d6e63340f..1ca18bae4e2c4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -636,7 +636,7 @@ cpdef ndarray[object] ensure_string_array( ---------- arr : array-like The values to be converted to str, if needed. - na_value : Any + na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0968545a6b8a4..6884d03f9c5aa 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -468,6 +468,7 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): @@ -475,7 +476,11 @@ def astype(self, dtype, copy=True): return self else: return self.copy() - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + + # FIXME: Really hard-code here? 
# Experimental pyarrow-backed string dtype (``ArrowStringDtype``) and array
# (``ArrowStringArray``).  The data lives in a ``pyarrow.ChunkedArray`` of
# string type; missing values are exposed to users as ``pandas.NA``.
from typing import TYPE_CHECKING, Any, Sequence, Type, Union

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.api.types import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_scalar,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import ExtensionArray
from pandas.core.indexers import check_array_indexer, validate_indices
from pandas.core.missing import get_fill_func

# Maps an operator's ``__name__`` ("eq", "lt", ...) to the pyarrow compute
# kernel implementing it.  Defined unconditionally so the name always exists;
# it is only populated when ``pyarrow.compute`` can be imported.
ARROW_CMP_FUNCS = {}

try:
    import pyarrow as pa
except ImportError:
    pa = None
else:
    # our min supported version of pyarrow, 0.15.1, does not have a compute
    # module
    try:
        import pyarrow.compute as pc
    except ImportError:
        pass
    else:
        ARROW_CMP_FUNCS = {
            "eq": pc.equal,
            "ne": pc.not_equal,
            "lt": pc.less,
            "gt": pc.greater,
            "le": pc.less_equal,
            "ge": pc.greater_equal,
        }


if TYPE_CHECKING:
    from pandas import Series

# Lowest pyarrow major version ArrowStringArray works with (pyarrow>=1.0.0).
_MIN_PYARROW_MAJOR = 1


@register_extension_dtype
class ArrowStringDtype(ExtensionDtype):
    """
    Extension dtype for string data in a ``pyarrow.ChunkedArray``.

    .. versionadded:: 1.2.0

    .. warning::

       ArrowStringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.core.arrays.string_arrow import ArrowStringDtype
    >>> ArrowStringDtype()
    ArrowStringDtype
    """

    name = "arrow_string"

    #: Missing values are represented by ``pandas.NA``.
    na_value = libmissing.NA

    @property
    def type(self) -> Type[str]:
        """The scalar type of the array's elements (``str``)."""
        return str

    @classmethod
    def construct_array_type(cls) -> Type["ArrowStringArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ArrowStringArray

    def __hash__(self) -> int:
        # The dtype carries no parameters, so all instances hash alike;
        # this keeps __hash__ consistent with __eq__ below.
        return hash("ArrowStringDtype")

    def __repr__(self) -> str:
        return "ArrowStringDtype"

    def __from_arrow__(
        self, array: Union["pa.Array", "pa.ChunkedArray"]
    ) -> "ArrowStringArray":
        """
        Construct ArrowStringArray from pyarrow Array/ChunkedArray.
        """
        return ArrowStringArray(array)

    def __eq__(self, other) -> bool:
        """
        Check whether 'other' is equal to self.

        'other' is considered equal if

        * it's an instance of this type, or
        * it's the string ``"arrow_string"``.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, ArrowStringDtype):
            return True
        return isinstance(other, str) and other == "arrow_string"


class ArrowStringArray(OpsMixin, ExtensionArray):
    """
    Extension array for string data in a ``pyarrow.ChunkedArray``.

    .. versionadded:: 1.2.0

    .. warning::

       ArrowStringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.

    Notes
    -----
    ArrowStringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string")
    <ArrowStringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: arrow_string
    """

    _dtype = ArrowStringDtype()

    def __init__(self, values):
        self._chk_pyarrow_available()
        if isinstance(values, pa.Array):
            # Normalise to a ChunkedArray so every method sees one layout.
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")

        if not pa.types.is_string(self._data.type):
            raise ValueError(
                "ArrowStringArray requires a PyArrow (chunked) array of string type"
            )

    @classmethod
    def _chk_pyarrow_available(cls) -> None:
        """
        Raise ImportError unless pyarrow>=1.0.0 is importable.

        Only the major version component is inspected, which is sufficient
        for a ``>=1.0.0`` requirement and avoids the removed ``distutils``
        version helpers.
        """
        # TODO: maybe update import_optional_dependency to allow a minimum
        # version to be specified rather than use the global minimum
        major = None
        if pa is not None:
            leading = str(pa.__version__).split(".", 1)[0]
            if leading.isdigit():
                major = int(leading)
        if major is None or major < _MIN_PYARROW_MAJOR:
            msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
            raise ImportError(msg)

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Construct a new ArrowStringArray from a sequence of scalars.

        Non-NA values are converted to ``str``; NA-likes become nulls.
        ``dtype`` and ``copy`` are accepted for interface compatibility;
        the resulting Arrow data never shares memory with ``scalars``.
        """
        cls._chk_pyarrow_available()
        # copy=True: with copy=False the conversion may write the converted
        # values back into an object ndarray passed by the caller.  Arrow
        # copies the data into its own buffers anyway, so never touch the
        # input.
        scalars = lib.ensure_string_array(scalars, copy=True)
        return cls(pa.array(scalars, type=pa.string(), from_pandas=True))

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Construct a new ArrowStringArray from a sequence of strings."""
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @property
    def dtype(self) -> ArrowStringDtype:
        """
        An instance of 'ArrowStringDtype'.
        """
        return self._dtype

    def __array__(self, dtype=None) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow Array or ChunkedArray."""
        return self._data

    def to_numpy(
        self, dtype=None, copy: bool = False, na_value=lib.no_default
    ) -> np.ndarray:
        """
        Convert to a NumPy ndarray.

        Parameters
        ----------
        dtype : dtype, optional
            Passed through to the underlying Arrow-to-NumPy conversion.
        copy : bool, default False
            Currently ignored.
        na_value : Any, optional
            Value written at the positions of missing entries. Defaults to
            ``self.dtype.na_value``.

        Returns
        -------
        numpy.ndarray
        """
        # TODO: copy argument is ignored

        if na_value is lib.no_default:
            na_value = self._dtype.na_value
        result = self._data.__array__(dtype=dtype)
        result[isna(result)] = na_value
        return result

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    @classmethod
    def _from_factorized(cls, values, original):
        """Reconstruct an ArrowStringArray after factorization."""
        return cls._from_sequence(values)

    @classmethod
    def _concat_same_type(cls, to_concat) -> "ArrowStringArray":
        """
        Concatenate multiple ArrowStringArray.

        Parameters
        ----------
        to_concat : sequence of ArrowStringArray

        Returns
        -------
        ArrowStringArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        # An explicit type is required when there are no chunks to infer
        # it from (e.g. concatenating only zero-chunk arrays).
        return cls(pa.chunked_array(chunks, type=pa.string()))

    def __getitem__(self, item: Any) -> Any:
        """
        Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                return type(self)(pa.chunked_array([], type=pa.string()))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )

        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            return self._as_pandas_scalar(value)

    def _as_pandas_scalar(self, arrow_scalar: "pa.Scalar"):
        """Unbox a pyarrow scalar, mapping a null to ``self.dtype.na_value``."""
        scalar = arrow_scalar.as_py()
        if scalar is None:
            return self._dtype.na_value
        else:
            return scalar

    def fillna(self, value=None, method=None, limit=None):
        """
        Fill NA/NaN values using the specified method.

        Parameters
        ----------
        value : scalar, array-like
            If a scalar value is passed it is used to fill all missing values.
            Alternatively, an array-like 'value' can be given. It's expected
            that the array-like have the same length as 'self'.
        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
            Method to use for filling holes in reindexed Series
            pad / ffill: propagate last valid observation forward to next valid
            backfill / bfill: use NEXT valid observation to fill gap.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill. In other words, if there is
            a gap with more than this number of consecutive NaNs, it will only
            be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled.

        Returns
        -------
        ExtensionArray
            With NA/NaN filled.

        Raises
        ------
        ValueError
            If an array-like ``value`` does not have the same length as self.
        """
        value, method = validate_fillna_kwargs(value, method)

        mask = self.isna()

        if is_array_like(value):
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f" expected {len(self)}"
                )
            # Keep only the replacements that line up with missing slots.
            value = value[mask]

        if mask.any():
            if method is not None:
                func = get_fill_func(method)
                new_values = func(self.to_numpy(object), limit=limit, mask=mask)
                new_values = self._from_sequence(new_values)
            else:
                # fill with value
                new_values = self.copy()
                new_values[mask] = value
        else:
            new_values = self.copy()
        return new_values

    def _reduce(self, name, skipna=True, **kwargs):
        """
        Perform the reduction ``name``; only 'min' and 'max' are supported.

        Raises
        ------
        TypeError
            For any other reduction.
        """
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def _min_max(self, func, skipna: bool):
        """Apply ``func`` (builtin min or max) to the non-missing values."""
        values = self._data.to_pylist()
        present = [val for val in values if val is not None]
        has_missing = len(present) != len(values)
        # Nothing to reduce, or a missing value that must propagate.
        if not present or (has_missing and not skipna):
            return self._dtype.na_value
        return func(present)

    def min(self, axis=None, skipna: bool = True, **kwargs):
        """
        Return the smallest string.

        Parameters
        ----------
        axis : None
            Accepted for compatibility with ``numpy.min``; ignored.
        skipna : bool, default True
            If False and the array holds a missing value, the result is
            ``self.dtype.na_value``.
        **kwargs
            Accepted for compatibility with ``numpy.min``; ignored.

        Returns
        -------
        str or NA
        """
        # ``min`` resolves to the builtin here: class scope is not an
        # enclosing scope for method bodies.
        return self._min_max(min, skipna)

    def max(self, axis=None, skipna: bool = True, **kwargs):
        """
        Return the largest string.

        Parameters
        ----------
        axis : None
            Accepted for compatibility with ``numpy.max``; ignored.
        skipna : bool, default True
            If False and the array holds a missing value, the result is
            ``self.dtype.na_value``.
        **kwargs
            Accepted for compatibility with ``numpy.max``; ignored.

        Returns
        -------
        str or NA
        """
        return self._min_max(max, skipna)

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def isna(self) -> np.ndarray:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        # TODO: Implement .to_numpy for ChunkedArray
        return self._data.is_null().to_pandas().values

    def copy(self) -> "ArrowStringArray":
        """
        Return a shallow copy of the array.

        The copy wraps the same ``pyarrow.ChunkedArray``; ``__setitem__``
        rebinds ``_data`` to a new ChunkedArray instead of modifying it, so
        writes to one array are not visible through the other.

        Returns
        -------
        ArrowStringArray
        """
        return type(self)(self._data)

    def _cmp_method(self, other, op):
        """
        Compare element-wise with ``other`` using ``op``.

        Returns a BooleanArray, or NotImplemented when ``other`` is neither
        an ArrowStringArray, a NumPy ndarray nor a scalar.
        """
        from pandas.arrays import BooleanArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowStringArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, np.ndarray):
            result = pc_func(self._data, other)
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                # Arrow has no kernel for this scalar type (e.g. str vs int):
                # fall back to comparing the non-missing values in NumPy.
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                return BooleanArray(result, mask)
        else:
            return NotImplemented

        # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
        return BooleanArray._from_sequence(result.to_pandas().values)

    def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
        """
        Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If a scalar value is neither NA nor ``str``, if a non-scalar is
            given with a scalar key, or if indexer and values differ in
            length.
        IndexError
            If an integer key is out of bounds.
        """
        key = check_array_indexer(self, key)

        if is_integer(key):
            if not is_scalar(value):
                raise ValueError("Must pass scalars with scalar indexer")
            elif isna(value):
                value = None
            elif not isinstance(value, str):
                raise ValueError("Scalar must be NA or str")

            # Resolve negative positions and bounds-check up front: the
            # slices below would otherwise wrap around (e.g. ``key == -1``
            # makes ``[(key + 1):]`` the whole array) and corrupt the length.
            size = len(self)
            position = int(key)
            if position < 0:
                position += size
            if not 0 <= position < size:
                raise IndexError(
                    f"index {key} is out of bounds for axis 0 with size {size}"
                )

            # Slice data and insert in between
            new_data = [
                *self._data[0:position].chunks,
                pa.array([value], type=pa.string()),
                *self._data[(position + 1) :].chunks,
            ]
            self._data = pa.chunked_array(new_data)
        else:
            # Convert to integer indices and iteratively assign.
            # TODO: Make a faster variant of this in Arrow upstream.
            #       This is probably extremely slow.

            # Convert all possible input key types to an array of integers
            if is_bool_dtype(key):
                # TODO(ARROW-9430): Directly support setitem(booleans)
                key_array = np.argwhere(key).flatten()
            elif isinstance(key, slice):
                key_array = np.array(range(len(self))[key])
            else:
                # TODO(ARROW-9431): Directly support setitem(integers)
                key_array = np.asanyarray(key)

            if is_scalar(value):
                value = np.broadcast_to(value, len(key_array))
            else:
                value = np.asarray(value)

            if len(key_array) != len(value):
                raise ValueError("Length of indexer and values mismatch")

            for k, v in zip(key_array, value):
                self[k] = v

    def take(
        self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None
    ) -> "ExtensionArray":
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any
              other negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            indices_array = indices

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
            else:
                # Nothing to fill; use the normalised array so list and
                # tuple inputs behave the same as in the other branches.
                return type(self)(self._data.take(indices_array))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))

    def value_counts(self, dropna: bool = True) -> "Series":
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import Index, Series

        vc = self._data.value_counts()
        values = vc.field(0)
        counts = vc.field(1)

        if dropna and self._data.null_count > 0:
            # Arrow counts nulls as a value of their own; drop that entry.
            keep = values.is_valid()
            values = values.filter(keep)
            counts = counts.filter(keep)

        # Index cannot hold ExtensionArrays yet
        index = Index(type(self)(values).astype(object))
        # No missings, so we can adhere to the interface and return a numpy array.
        counts = np.array(counts)

        return Series(counts, index=index).astype("Int64")
type "ParameterSet"; expected + # "Sequence[Collection[object]]" [list-item] + "string", + pytest.param( + "arrow_string", marks=skip_if_no_pyarrow + ), # type:ignore[list-item] + ] +) +def dtype(request): + return request.param + + +@pytest.fixture +def dtype_object(dtype): + if dtype == "string": + return pd.StringDtype + else: + return ArrowStringDtype + + +@pytest.fixture( + params=[ + pd.arrays.StringArray, + pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), + ] +) +def cls(request): + return request.param + + +def test_repr(dtype, request): + if dtype == "arrow_string": + reason = ( + "AssertionError: assert ' A\n0 a\n1 None\n2 b' " + "== ' A\n0 a\n1 \n2 b'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected @@ -21,27 +65,36 @@ def test_repr(): assert repr(df.A.array) == expected -def test_none_to_nan(): - a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) +def test_none_to_nan(cls): + a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None assert a[1] is pd.NA -def test_setitem_validates(): - a = pd.arrays.StringArray._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="10"): - a[0] = 10 +def test_setitem_validates(cls): + arr = cls._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="strings"): - a[:] = np.array([1, 2]) + if cls is pd.arrays.StringArray: + msg = "Cannot set non-string value '10' into a StringArray." + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[0] = 10 + if cls is pd.arrays.StringArray: + msg = "Must provide strings." 
+ else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[:] = np.array([1, 2]) -def test_setitem_with_scalar_string(): + +def test_setitem_with_scalar_string(dtype): # is_float_dtype considers some strings, like 'd', to be floats # which can cause issues. - arr = pd.array(["a", "c"], dtype="string") + arr = pd.array(["a", "c"], dtype=dtype) arr[0] = "d" - expected = pd.array(["d", "c"], dtype="string") + expected = pd.array(["d", "c"], dtype=dtype) tm.assert_extension_array_equal(arr, expected) @@ -53,46 +106,69 @@ def test_setitem_with_scalar_string(): (["a b", "a bc. de"], operator.methodcaller("capitalize")), ], ) -def test_string_methods(input, method): - a = pd.Series(input, dtype="string") +def test_string_methods(input, method, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(input, dtype=dtype) b = pd.Series(input, dtype="object") result = method(a.str) expected = method(b.str) - assert result.dtype.name == "string" + assert result.dtype.name == dtype tm.assert_series_equal(result.astype(object), expected) -def test_astype_roundtrip(): +def test_astype_roundtrip(dtype, request): + if dtype == "arrow_string": + reason = "ValueError: Could not convert object to NumPy datetime" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None - result = s.astype("string").astype("datetime64[ns]") + result = s.astype(dtype).astype("datetime64[ns]") tm.assert_series_equal(result, s) -def test_add(): - a = pd.Series(["a", "b", "c", None, None], dtype="string") - b = pd.Series(["x", "y", None, "z", None], dtype="string") +def test_add(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " + "'ArrowStringArray'" + ) 
+ mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(["a", "b", "c", None, None], dtype=dtype) + b = pd.Series(["x", "y", None, "z", None], dtype=dtype) result = a + b - expected = pd.Series(["ax", "by", None, None, None], dtype="string") + expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype="string") + expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) tm.assert_series_equal(result, expected) -def test_add_2d(): - a = pd.array(["a", "b", "c"], dtype="string") +def test_add_2d(dtype, request): + if dtype == "arrow_string": + reason = "Failed: DID NOT RAISE " + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", "c"], dtype=dtype) b = np.array([["a", "b", "c"]], dtype=object) with pytest.raises(ValueError, match="3 != 1"): a + b @@ -102,23 +178,38 @@ def test_add_2d(): s + b -def test_add_sequence(): - a = pd.array(["a", "b", None, None], dtype="string") +def test_add_sequence(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " + "and 'list'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None, None], dtype=dtype) other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype="string") + expected = pd.array(["ax", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype="string") + expected = 
pd.array(["xa", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) -def test_mul(): - a = pd.array(["a", "b", None], dtype="string") +def test_mul(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None], dtype=dtype) result = a * 2 - expected = pd.array(["aa", "bb", None], dtype="string") + expected = pd.array(["aa", "bb", None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -126,55 +217,83 @@ def test_mul(): @pytest.mark.xfail(reason="GH-28527") -def test_add_strings(): - array = pd.array(["a", "b", "c", "d"], dtype="string") +def test_add_strings(dtype): + array = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "u", "v", "w"]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype(dtype) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(): - array = pd.array(["a", "b", np.nan, np.nan], dtype="string") +def test_add_frame(dtype): + array = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") + expected = 
pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators): +def test_comparison_methods_scalar(all_compare_operators, dtype): op_name = all_compare_operators - - a = pd.array(["a", None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) expected = pd.array(expected, dtype="boolean") tm.assert_extension_array_equal(result, expected) + +def test_comparison_methods_scalar_pd_na(all_compare_operators, dtype): + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) expected = pd.array([None, None, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(all_compare_operators): +def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, request): + if all_compare_operators not in ["__eq__", "__ne__"]: + reason = "comparison op not supported between instances of 'str' and 'int'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) + other = 42 + result = getattr(a, op_name)(other) + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected = pd.array(expected_data, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_array(all_compare_operators, dtype, request): + if dtype == "arrow_string": + if all_compare_operators in ["__eq__", "__ne__"]: + reason = "NotImplementedError: Neither scalar nor ArrowStringArray" + else: + reason = "AssertionError: left is not an ExtensionArray" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + op_name = all_compare_operators - a = pd.array(["a", 
None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) expected = np.empty_like(a, dtype="object") @@ -187,30 +306,46 @@ def test_comparison_methods_array(all_compare_operators): tm.assert_extension_array_equal(result, expected) -def test_constructor_raises(): - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) +def test_constructor_raises(cls): + if cls is pd.arrays.StringArray: + msg = "StringArray requires a sequence of strings or pandas.NA" + else: + msg = "Unsupported type '' for ArrowStringArray" + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", "b"], dtype="S1")) - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match=msg): + cls(np.array([])) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.nan], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", None], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequence_no_mutate(copy): +def test_from_sequence_no_mutate(copy, cls, request): + if cls is ArrowStringArray and copy is False: + reason = "AssertionError: numpy array are different" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + nan_arr = np.array(["a", np.nan], dtype=object) na_arr = np.array(["a", pd.NA], dtype=object) - result = 
pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) - expected = pd.arrays.StringArray(na_arr) + result = cls._from_sequence(nan_arr, copy=copy) + + if cls is ArrowStringArray: + import pyarrow as pa + + expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + else: + expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) @@ -218,8 +353,13 @@ def test_from_sequence_no_mutate(copy): tm.assert_numpy_array_equal(nan_arr, expected) -def test_astype_int(): - arr = pd.array(["1", pd.NA, "3"], dtype="string") +def test_astype_int(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) result = arr.astype("Int64") expected = pd.array([1, pd.NA, 3], dtype="Int64") @@ -228,16 +368,21 @@ def test_astype_int(): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce(skipna): - arr = pd.Series(["a", "b", "c"], dtype="string") +def test_reduce(skipna, dtype): + arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_min_max(method, skipna): - arr = pd.Series(["a", "b", "c", None], dtype="string") +def test_min_max(method, skipna, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.Series(["a", "b", "c", None], dtype=dtype) result = getattr(arr, method)(skipna=skipna) if skipna: expected = "a" if method == "min" else "c" @@ -247,14 +392,20 @@ def test_min_max(method, skipna): @pytest.mark.parametrize("method", ["min", "max"]) -@pytest.mark.parametrize( - "arr", - [ - 
pd.Series(["a", "b", "c", None], dtype="string"), - pd.array(["a", "b", "c", None], dtype="string"), - ], -) -def test_min_max_numpy(method, arr): +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_min_max_numpy(method, box, dtype, request): + if dtype == "arrow_string": + if box is pd.array: + reason = ( + "TypeError: '<=' not supported between instances of 'str' and " + "'NoneType'" + ) + else: + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = box(["a", "b", "c", None], dtype=dtype) result = getattr(np, method)(arr) expected = "a" if method == "min" else "c" assert result == expected @@ -262,8 +413,8 @@ def test_min_max_numpy(method, arr): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce_missing(skipna): - arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") +def test_reduce_missing(skipna, dtype): + arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) if skipna: assert result == "abc" @@ -272,34 +423,42 @@ def test_reduce_missing(skipna): @td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(): +def test_arrow_array(dtype): # protocol added in 0.15.0 import pyarrow as pa - data = pd.array(["a", "b", "c"], dtype="string") + data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) + if dtype == "arrow_string": + expected = pa.chunked_array(expected) + assert arr.equals(expected) @td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(): +def test_arrow_roundtrip(dtype, dtype_object): # roundtrip possible from arrow 1.0.0 import pyarrow as pa - data = pd.array(["a", "b", None], dtype="string") + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) 
assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) + assert isinstance(result["a"].dtype, dtype_object) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA -def test_value_counts_na(): - arr = pd.array(["a", "b", "a", pd.NA], dtype="string") +def test_value_counts_na(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: boolean value of NA is ambiguous" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") tm.assert_series_equal(result, expected) @@ -312,12 +471,13 @@ def test_value_counts_na(): @pytest.mark.parametrize( "values, expected", [ - (pd.array(["a", "b", "c"]), np.array([False, False, False])), - (pd.array(["a", "b", None]), np.array([False, False, True])), + (["a", "b", "c"], np.array([False, False, False])), + (["a", "b", None], np.array([False, False, True])), ], ) -def test_use_inf_as_na(values, expected): +def test_use_inf_as_na(values, expected, dtype): # https://github.com/pandas-dev/pandas/issues/33655 + values = pd.array(values, dtype=dtype) with pd.option_context("mode.use_inf_as_na", True): result = values.isna() tm.assert_numpy_array_equal(result, expected) @@ -331,17 +491,36 @@ def test_use_inf_as_na(values, expected): tm.assert_frame_equal(result, expected) -def test_memory_usage(): +def test_memory_usage(dtype, request): # GH 33963 - series = pd.Series(["a", "b", "c"], dtype="string") + + if dtype == "arrow_string": + pytest.skip("not applicable") + + series = pd.Series(["a", "b", "c"], dtype=dtype) assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) -@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) -def 
test_astype_from_float_dtype(dtype): +@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(float_dtype, dtype): # https://github.com/pandas-dev/pandas/issues/36451 - s = pd.Series([0.1], dtype=dtype) - result = s.astype("string") - expected = pd.Series(["0.1"], dtype="string") + s = pd.Series([0.1], dtype=float_dtype) + result = s.astype(dtype) + expected = pd.Series(["0.1"], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_to_numpy_returns_pdna_default(dtype): + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = np.array(arr) + expected = np.array(["a", pd.NA, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value(dtype, nulls_fixture): + na_value = nulls_fixture + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) + expected = np.array(["a", na_value, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..ec7f57940a67f --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,26 @@ +import re + +import numpy as np +import pytest + +from pandas.core.arrays.string_arrow import ArrowStringArray + +pa = pytest.importorskip("pyarrow", minversion="1.0.0") + + +@pytest.mark.parametrize("chunked", [True, False]) +@pytest.mark.parametrize("array", [np, pa]) +def test_constructor_not_string_type_raises(array, chunked): + arr = array.array([1, 2, 3]) + if chunked: + if array is np: + pytest.skip("chunked not applicable to numpy array") + arr = pa.chunked_array(arr) + if array is np: + msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray" + else: + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) diff --git 
a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27a157d2127f6..db1940226e04e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -3,39 +3,49 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd -from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base -@pytest.fixture -def dtype(): - return StringDtype() +@pytest.fixture( + params=[ + StringDtype, + pytest.param( + ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def dtype(request): + return request.param() @pytest.fixture -def data(): +def data(dtype): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return StringArray._from_sequence(strings) + return dtype.construct_array_type()._from_sequence(strings) @pytest.fixture -def data_missing(): +def data_missing(dtype): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([pd.NA, "A"]) + return dtype.construct_array_type()._from_sequence([pd.NA, "A"]) @pytest.fixture -def data_for_sorting(): - return StringArray._from_sequence(["B", "C", "A"]) +def data_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", "C", "A"]) @pytest.fixture -def data_missing_for_sorting(): - return StringArray._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) @pytest.fixture @@ -44,8 +54,10 @@ def na_value(): @pytest.fixture -def data_for_grouping(): - return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) +def data_for_grouping(dtype): + return dtype.construct_array_type()._from_sequence( + ["B", 
"B", pd.NA, pd.NA, "A", "A", "B", "C"] + ) class TestDtype(base.BaseDtypeTests): @@ -53,7 +65,11 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - pass + def test_view(self, data, request): + if isinstance(data.dtype, ArrowStringDtype): + mark = pytest.mark.xfail(reason="not implemented") + request.node.add_marker(mark) + super().test_view(data) class TestConstructors(base.BaseConstructorsTests): @@ -61,7 +77,11 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_transpose(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + mark = pytest.mark.xfail(reason="not implemented") + request.node.add_marker(mark) + super().test_transpose(data) class TestGetitem(base.BaseGetitemTests): @@ -69,7 +89,11 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - pass + def test_setitem_preserves_views(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + mark = pytest.mark.xfail(reason="not implemented") + request.node.add_marker(mark) + super().test_setitem_preserves_views(data) class TestMissing(base.BaseMissingTests):