From 4cb60e6b586338c468e04a4274a05c06811adeb7 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 01/46] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 484 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 125 +++++ setup.py | 2 +- 4 files changed, 615 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..736d95b4b64b6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -457,9 +457,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? 
+ if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..8248a3e91c0fe --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,484 @@ +from collections.abc import Iterable +from typing import Any, Optional, Sequence, Tuple, Type, Union + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +import pandas as pd +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer + + +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. 
+ + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. 
+ + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + raise IndexError("index out of bounds") + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return _as_pandas_scalar(value) + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self.data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values + + def copy(self) -> ExtensionArray: + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = pc.equal(self.data, other.data) + elif is_scalar(other): + result = pc.equal(self.data, pa.scalar(other)) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. 
+ # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. 
+ ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self.data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.max() >= len(self.data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=indices_array < 0) + result = self.data.take(indices_array) + if pd.isna(fill_value): + return type(self)(result) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self.data) + return type(self)(self.data.take(indices_array)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..437d51060fb7f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,125 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +# class 
TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index f6f0cd9aabc0e..4033ea2935de5 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From d242f2d0bc2d0eae9481ce2fa09969d9eb20113c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Sep 2020 15:32:45 -0500 Subject: [PATCH 02/46] Refactor to use parametrized StringDtype --- pandas/core/arrays/base.py | 13 +- pandas/core/arrays/string_.py | 90 +++++++++- pandas/core/arrays/string_arrow.py | 166 +++++++----------- 
pandas/core/config_init.py | 13 ++ pandas/core/strings.py | 10 +- .../tests/arrays/string_/test_string_arrow.py | 26 +++ pandas/tests/extension/arrow/test_string.py | 7 +- pandas/tests/extension/test_string_arrow.py | 103 +++++++---- setup.py | 2 +- 9 files changed, 261 insertions(+), 169 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 736d95b4b64b6..9b1b2c0d74e3f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). """ + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented return ~(self == other) def to_numpy( @@ -457,13 +459,10 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -928,9 +927,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + * True: ``-1`` in `indices` indicate missing values. + These values are set to `fill_value`. Any other other negative + value raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..0e7c5a8036bcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,8 +1,10 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Any, Type, Union import numpy as np +from pandas._config import get_option + from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype @@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype): StringDtype """ - name = "string" - #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + self.storage = storage + + @property + def name(self): + return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_array_type(cls) -> Type["StringArray"]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. + Valid options and their storage types are + + ========================== ============== + string result storage + ========================== ============== + ``'string'`` global default + ``'string[python]'`` python + ``'StringDtype[python]'`` python + ``'string[pyarrow]'`` pyarrow + ``'StringDtype[pyarrow]'`` pyarrow + ========================== ============= + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. 
+ + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + # TODO: use global default + return cls() + elif string in {"string[python]", "StringDtype[python]"}: + return cls(storage="python") + elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # XXX: this is a classmethod, but we need to know the storage type. + def construct_array_type(self) -> Type["StringArray"]: """ Return the array type associated with this dtype. @@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]: ------- type """ - return StringArray + from .string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray - def __repr__(self) -> str: - return "StringDtype" + def __repr__(self): + return self.name def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] @@ -80,6 +153,7 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -93,7 +167,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return ArrowStringArray._concat_same_type(results) class StringArray(PandasArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..c0831a65b3644 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa @@ -8,18 +8,19 @@ from pandas._libs import missing as libmissing from pandas._typing import ArrayLike -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_scalar, ) +from pandas.core.algorithms import factorize from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import check_array_indexer @@ -31,89 +32,6 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: return scalar -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.1.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. 
- - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> pd.ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -165,19 +83,20 @@ def __init__(self, values): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + self._dtype = StringDtype(storage="pyarrow") @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if pd.isna(x) else x for x in scalars] + scalars_corrected = [None if isna(x) else x for x in scalars] return cls(pa.array(scalars_corrected, type=pa.string())) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. 
+ An instance of 'StringDtype'. """ - return ArrowStringDtype() + return self._dtype def __array__(self, *args, **kwargs) -> "np.ndarray": """Correctly construct numpy arrays when passed to `np.asarray()`.""" @@ -276,15 +195,6 @@ def __getitem__(self, item): else: return _as_pandas_scalar(value) - def fillna(self, value=None, method=None, limit=None): - raise NotImplementedError("fillna") - - def _reduce(self, name, skipna=True, **kwargs): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - @property def nbytes(self) -> int: """ @@ -320,7 +230,9 @@ def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). """ - if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + from pandas import array, Series, DataFrame, Index + + if isinstance(other, (Series, DataFrame, Index)): return NotImplemented if isinstance(other, ArrowStringArray): result = pc.equal(self.data, other.data) @@ -330,7 +242,7 @@ def __eq__(self, other: Any) -> ArrayLike: raise NotImplementedError("Neither scalar nor ArrowStringArray") # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return pd.array(result.to_pandas().values) + return array(result.to_pandas().values, dtype="boolean") def __setitem__(self, key, value): # type: (Union[int, np.ndarray], Any) -> None @@ -357,9 +269,9 @@ def __setitem__(self, key, value): key = check_array_indexer(self, key) if is_integer(key): - if not pd.api.types.is_scalar(value): + if not is_scalar(value): raise ValueError("Must pass scalars with scalar indexer") - elif pd.isna(value): + elif isna(value): value = None elif not isinstance(value, str): raise ValueError("Scalar must be NA or str") @@ -386,7 +298,7 @@ def __setitem__(self, key, value): # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) - if pd.api.types.is_scalar(value): + if is_scalar(value): value = np.broadcast_to(value, 
len(key_array)) else: value = np.asarray(value) @@ -461,15 +373,20 @@ def take( if len(self.data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.max() >= len(self.data): + if len(indices_array) > 0 and indices_array.max() >= len(self.data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: if (indices_array < 0).any(): + if indices_array.min() < -1: + raise ValueError( + "'indicies' contains negative values other " + "-1 with 'allow_fill=True." + ) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -482,3 +399,38 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna=True): + from pandas import Series + + if dropna: + na = self.isna() + self = self[~na] + counts = self.data.value_counts() + return Series(counts.field(1), counts.field(0)) + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py + # doesn't handle dictionary types. 
+ if self.data.num_chunks == 1: + encoded = self.data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self.data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence["ArrowStringArray"] + ) -> "ArrowStringArray": + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..a58e6eccf7644 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,6 +504,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..59aa8fc5cfa0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -901,8 +901,10 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. 
- if arr.dtype.name == "string": - return "string" + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype.name, StringDtype): + return arr.dtype.name else: return object @@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" + self._is_string = isinstance(data.dtype, StringDtype) # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..40e3f21670ea0 --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,26 @@ +import pytest + +import pandas as pd +import pandas.testing as tm + + +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(): + # python by default + assert pd.StringDtype().storage == "python" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "python" + + with pd.option_context("mode.string_storage", "pyarrow"): + assert pd.StringDtype().storage == "pyarrow" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "pyarrow" + + with pytest.raises(ValueError): + pd.options.mode.string_storage = "foo" diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index abd5c1f386dc5..f32f1e415ddc7 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,10 +4,9 @@ pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowStringDtype # isort:skip - def 
test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 437d51060fb7f..848e8a435b530 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -4,13 +4,13 @@ import pytest import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.tests.extension import base @pytest.fixture def dtype(): - return ArrowStringDtype() + return pd.StringDtype(storage="pyarrow") @pytest.fixture @@ -62,64 +62,89 @@ class TestConstructors(base.BaseConstructorsTests): pass -# class TestReshaping(base.BaseReshapingTests): -# pass +class TestReshaping(base.BaseReshapingTests): + pass class TestGetitem(base.BaseGetitemTests): - pass + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function " + "fill_null has no kernel matching input types " + "(array[string], scalar[string])" + ) + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " + "kernel matching input types (array[string], scalar[string])" + ) + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(self, data_missing) class TestSetitem(base.BaseSetitemTests): + @pytest.mark.xfail(reason="TODO") + def test_setitem_preserves_views(self, data): + # Unclear where the issue is (pyarrow getitem, our getitem, our slice) + # and what to do here. 
+ super().test_setitem_preserves_views(data) + + +class TestMissing(base.BaseMissingTests): pass -# class TestMissing(base.BaseMissingTests): -# pass +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + if op_name in ["min", "max"]: + return None -# class TestNoReduce(base.BaseNoReduceTests): -# @pytest.mark.parametrize("skipna", [True, False]) -# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): -# op_name = all_numeric_reductions -# -# if op_name in ["min", "max"]: -# return None -# -# s = pd.Series(data) -# with pytest.raises(TypeError): -# getattr(s, op_name)(skipna=skipna) + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) -# class TestMethods(base.BaseMethodsTests): -# @pytest.mark.skip(reason="returns nullable") -# def test_value_counts(self, all_data, dropna): -# return super().test_value_counts(all_data, dropna) +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) -# class TestCasting(base.BaseCastingTests): -# pass +class TestCasting(base.BaseCastingTests): + pass -# class TestComparisonOps(base.BaseComparisonOpsTests): -# def _compare_other(self, s, data, op_name, other): -# result = getattr(s, op_name)(other) -# expected = getattr(s.astype(object), op_name)(other).astype("boolean") -# self.assert_series_equal(result, expected) +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + if op_name not in {"__eq__", "__ne__"}: + pytest.skip(f"{op_name} is not implemented.") + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) -# def 
test_compare_scalar(self, data, all_compare_operators): -# op_name = all_compare_operators -# s = pd.Series(data) -# self._compare_other(s, data, op_name, "abc") + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + other = pd.Series([data[0]] * len(data), dtype=data.dtype) + self._compare_other(s, data, op_name, other) -# class TestParsing(base.BaseParsingTests): -# pass + +class TestParsing(base.BaseParsingTests): + pass -# class TestPrinting(base.BasePrintingTests): -# pass +class TestPrinting(base.BasePrintingTests): + pass -# class TestGroupBy(base.BaseGroupbyTests): -# pass +class TestGroupBy(base.BaseGroupbyTests): + pass diff --git a/setup.py b/setup.py index 4033ea2935de5..f6f0cd9aabc0e 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From 236781065c7ea739a05fc108994c6e02244d13b7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 18 Feb 2021 16:53:37 +0000 Subject: [PATCH 03/46] abs-imports --- pandas/core/arrays/string_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6b724cc147a7b..2e40eab2d528e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -166,7 +166,7 @@ def construct_array_type(self) -> Type[StringArray]: ------- type """ - from .string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ArrowStringArray if self.storage == "python": return StringArray @@ -184,7 +184,7 @@ def __from_arrow__( """ import pyarrow - from .string_arrow 
import ArrowStringArray + from pandas.core.arrays.string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] From 9166d3b431ebda5781c3d4a8dc0cac2225696dd5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 19 Feb 2021 13:45:26 +0000 Subject: [PATCH 04/46] post merge fixup --- pandas/core/arrays/string_.py | 12 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 59 +++---- .../tests/arrays/string_/test_string_arrow.py | 3 +- pandas/tests/extension/test_string.py | 19 +-- pandas/tests/extension/test_string_arrow.py | 150 ------------------ 6 files changed, 43 insertions(+), 202 deletions(-) delete mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2e40eab2d528e..7aaa3c32c84dc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -52,6 +52,8 @@ if TYPE_CHECKING: import pyarrow + from pandas.core.arrays.string_arrow import ArrowStringArray + @register_extension_dtype class StringDtype(ExtensionDtype): @@ -157,8 +159,12 @@ def __hash__(self) -> int: # custom __eq__ so have to override __hash__ return super().__hash__() - # XXX: this is a classmethod, but we need to know the storage type. - def construct_array_type(self) -> Type[StringArray]: + # TODO: this is a classmethod, but we need to know the storage type. + # error: Signature of "construct_array_type" incompatible with supertype + # "ExtensionDtype" + def construct_array_type( # type: ignore[override] + self, + ) -> Type[StringArray | ArrowStringArray]: """ Return the array type associated with this dtype. @@ -178,7 +184,7 @@ def __repr__(self): def __from_arrow__( self, array: Union[pyarrow.Array, pyarrow.ChunkedArray] - ) -> StringArray: + ) -> ArrowStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. 
""" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 27fc52418e246..db2bfa8c5771e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -136,7 +136,7 @@ def _from_sequence_of_strings( @property def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. + An instance of 'StringDtype[pyarrow]'. """ return self._dtype diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index d5254adc1ee24..27325c6b4b7ea 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,27 +9,14 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, - ArrowStringDtype, -) +from pandas.core.arrays.string_arrow import ArrowStringArray skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -@pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] -) +@pytest.fixture(params=["python", pytest.param("pyarrow", marks=skip_if_no_pyarrow)]) def dtype(request): - return request.param - - -@pytest.fixture -def dtype_object(dtype): - if dtype == "string": - return pd.StringDtype - else: - return ArrowStringDtype + return pd.StringDtype(storage=request.param) @pytest.fixture( @@ -43,7 +30,7 @@ def cls(request): def test_repr(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "AssertionError: assert ' A\n0 a\n1 None\n2 b' " "== ' A\n0 a\n1 \n2 b'" @@ -55,10 +42,10 @@ def test_repr(dtype, request): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = "0 a\n1 \n2 b\nName: A, dtype: string" + expected = "0 a\n1 \n2 b\nName: A, dtype: StringDtype[python]" assert repr(df.A) == expected - expected = "\n['a', , 'b']\nLength: 3, dtype: string" + expected = "\n['a', , 'b']\nLength: 3, dtype: StringDtype[python]" assert repr(df.A.array) == expected @@ -104,7 
+91,7 @@ def test_setitem_with_scalar_string(dtype): ], ) def test_string_methods(input, method, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -119,7 +106,7 @@ def test_string_methods(input, method, dtype, request): def test_astype_roundtrip(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) request.node.add_marker(mark) @@ -140,7 +127,7 @@ def test_astype_roundtrip(dtype, request): def test_add(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -168,7 +155,7 @@ def test_add(dtype, request): def test_add_2d(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -184,7 +171,7 @@ def test_add_2d(dtype, request): def test_add_sequence(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " "and 'list'" @@ -205,7 +192,7 @@ def test_add_sequence(dtype, request): def test_mul(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" ) @@ -288,7 +275,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ def test_comparison_methods_array(all_compare_operators, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": if all_compare_operators in ["__eq__", "__ne__"]: reason = "NotImplementedError: Neither scalar nor 
ArrowStringArray" else: @@ -359,7 +346,7 @@ def test_from_sequence_no_mutate(copy, cls, request): def test_astype_int(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -391,7 +378,7 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -408,7 +395,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": if box is pd.array: reason = ( "TypeError: '<=' not supported between instances of 'str' and " @@ -462,14 +449,14 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype == "arrow_string": + if dtype.storage == "pyarrow": expected = pa.chunked_array(expected) assert arr.equals(expected) @td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(dtype, dtype_object): +def test_arrow_roundtrip(dtype): # roundtrip possible from arrow 1.0.0 import pyarrow as pa @@ -478,14 +465,14 @@ def test_arrow_roundtrip(dtype, dtype_object): table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, dtype_object) + assert isinstance(result["a"].dtype, type(dtype)) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not 
np.nan or None assert result.loc[2, "a"] is pd.NA def test_value_counts_na(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -501,7 +488,7 @@ def test_value_counts_na(dtype, request): def test_value_counts_with_normalize(dtype, request): - if dtype == "arrow_string": + if dtype.storage == "pyarrow": reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -535,10 +522,10 @@ def test_use_inf_as_na(values, expected, dtype): tm.assert_frame_equal(result, expected) -def test_memory_usage(dtype, request): +def test_memory_usage(dtype): # GH 33963 - if dtype == "arrow_string": + if dtype.storage == "pyarrow": pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index afe2394484fda..b5d0627c8583c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,7 +29,8 @@ def test_config(): arr = pd.array(["a", "b"]) assert arr.dtype.storage == "pyarrow" - with pytest.raises(ValueError): + msg = re.escape("Value must be one of python|pyarrow") + with pytest.raises(ValueError, match=msg): pd.options.mode.string_storage = "foo" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d0a3ef17afdbc..a14e9af1b6abf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -22,20 +22,17 @@ import pandas as pd from pandas.core.arrays.string_ import StringDtype -from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base @pytest.fixture( params=[ - StringDtype, - pytest.param( - ArrowStringDtype, marks=td.skip_if_no("pyarrow", 
min_version="1.0.0") - ), + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), ] ) def dtype(request): - return request.param() + return StringDtype(storage=request.param) @pytest.fixture @@ -81,7 +78,7 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if isinstance(data.dtype, ArrowStringDtype): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) @@ -92,8 +89,8 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_transpose(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_transpose(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -104,8 +101,8 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - def test_setitem_preserves_views(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_setitem_preserves_views(self, data, request): + if data.dtype.storage == "pyarrow": mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py deleted file mode 100644 index 848e8a435b530..0000000000000 --- a/pandas/tests/extension/test_string_arrow.py +++ /dev/null @@ -1,150 +0,0 @@ -import string - -import numpy as np -import pytest - -import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.tests.extension import base - - -@pytest.fixture -def dtype(): - return pd.StringDtype(storage="pyarrow") - - -@pytest.fixture -def data(): - strings = np.random.choice(list(string.ascii_letters), size=100) - while 
strings[0] == strings[1]: - strings = np.random.choice(list(string.ascii_letters), size=100) - - return ArrowStringArray._from_sequence(strings) - - -@pytest.fixture -def data_missing(): - """Length 2 array with [NA, Valid]""" - return ArrowStringArray._from_sequence([pd.NA, "A"]) - - -@pytest.fixture -def data_for_sorting(): - return ArrowStringArray._from_sequence(["B", "C", "A"]) - - -@pytest.fixture -def data_missing_for_sorting(): - return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) - - -@pytest.fixture -def na_value(): - return pd.NA - - -@pytest.fixture -def data_for_grouping(): - return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - @pytest.mark.xfail(reason="Fails until implement, remove before merge") - def test_view(self, data): - base.BaseInterfaceTests.test_view(self, data) - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - @pytest.mark.xfail( - reason="pyarrow.lib.ArrowNotImplementedError: Function " - "fill_null has no kernel matching input types " - "(array[string], scalar[string])" - ) - def test_take_non_na_fill_value(self, data_missing): - super().test_take_non_na_fill_value(data_missing) - - @pytest.mark.xfail( - reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " - "kernel matching input types (array[string], scalar[string])" - ) - def test_reindex_non_na_fill_value(self, data_missing): - super().test_reindex_non_na_fill_value(self, data_missing) - - -class TestSetitem(base.BaseSetitemTests): - @pytest.mark.xfail(reason="TODO") - def test_setitem_preserves_views(self, data): - # Unclear where the issue is (pyarrow getitem, our getitem, our slice) - # and what to do here. 
- super().test_setitem_preserves_views(data) - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestNoReduce(base.BaseNoReduceTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - op_name = all_numeric_reductions - - if op_name in ["min", "max"]: - return None - - s = pd.Series(data) - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - - -class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="returns nullable") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): - if op_name not in {"__eq__", "__ne__"}: - pytest.skip(f"{op_name} is not implemented.") - result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other).astype("boolean") - self.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - self._compare_other(s, data, op_name, "abc") - - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - other = pd.Series([data[0]] * len(data), dtype=data.dtype) - self._compare_other(s, data, op_name, other) - - -class TestParsing(base.BaseParsingTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestGroupBy(base.BaseGroupbyTests): - pass From 8760705eef02a1122a44927b0d12b0c83c141010 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 19 Feb 2021 17:07:02 +0000 Subject: [PATCH 05/46] StringDtype[python] -> string[python] --- pandas/core/arrays/string_.py | 4 +--- pandas/core/construction.py | 2 +- pandas/tests/arrays/string_/test_string.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff 
--git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7aaa3c32c84dc..fa7d1ec5bf417 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -99,7 +99,7 @@ def __init__(self, storage=None): @property def name(self): - return f"StringDtype[{self.storage}]" + return f"string[{self.storage}]" @property def type(self) -> Type[str]: @@ -121,9 +121,7 @@ def construct_from_string(cls, string): ========================== ============== ``'string'`` global default ``'string[python]'`` python - ``'StringDtype[python]'`` python ``'string[pyarrow]'`` pyarrow - ``'StringDtype[pyarrow]'`` pyarrow ========================== ============= Returns diff --git a/pandas/core/construction.py b/pandas/core/construction.py index dd75473da6d78..189a6ccc0f884 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -239,7 +239,7 @@ def array( >>> pd.array(["a", None, "c"]) ['a', , 'c'] - Length: 3, dtype: string + Length: 3, dtype: string[python] >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 27325c6b4b7ea..1a3064c02810a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -42,10 +42,10 @@ def test_repr(dtype, request): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = "0 a\n1 \n2 b\nName: A, dtype: StringDtype[python]" + expected = "0 a\n1 \n2 b\nName: A, dtype: string[python]" assert repr(df.A) == expected - expected = "\n['a', , 'b']\nLength: 3, dtype: StringDtype[python]" + expected = "\n['a', , 'b']\nLength: 3, dtype: string[python]" assert repr(df.A.array) == expected From 2c657df7ff233256c271b7ff79325bacbd5a2d57 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Mar 2021 16:06:12 +0000 Subject: [PATCH 06/46] pre-commit fix for inconsistent use of pandas namespace --- 
pandas/tests/arrays/string_/test_string_arrow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b5d0627c8583c..7565e1aa0488b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -35,14 +35,14 @@ def test_config(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("array", [np, pa]) -def test_constructor_not_string_type_raises(array, chunked): - arr = array.array([1, 2, 3]) +@pytest.mark.parametrize("np_or_pa", [np, pa]) +def test_constructor_not_string_type_raises(np_or_pa, chunked): + arr = np_or_pa.array([1, 2, 3]) if chunked: - if array is np: + if np_or_pa is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if array is np: + if np_or_pa is np: msg = "Unsupported type '' for ArrowStringArray" else: msg = re.escape( From 647a6c2e0699475ab282aad993d19d4ca5c5da2f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Mar 2021 16:12:06 +0000 Subject: [PATCH 07/46] fix typo --- pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6607d1aeac3b7..ae06004dfb485 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1059,8 +1059,8 @@ def take( :func:`numpy.take`. * True: ``-1`` in `indices` indicate missing values. - These values are set to `fill_value`. Any other other negative - value raise a ``ValueError``. + These values are set to `fill_value`. Any other negative + value raises a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
From 0596fd7ec634b9af983a11b2b3ea28a1724cbc57 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 22 Mar 2021 16:23:57 +0000 Subject: [PATCH 08/46] pre-commit fixup - undefined name 'ArrowStringDtype' --- pandas/tests/extension/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e74938b544240..269ede7ca93ae 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,7 +26,7 @@ def split_array(arr): - if not isinstance(arr.dtype, ArrowStringDtype): + if arr.dtype.storage != "pyarrow": pytest.skip("chunked array n/a") def _split_array(arr): From 69a6cc1becd3f9198ad1b2967e1a5df0617051f9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 11:42:49 +0100 Subject: [PATCH 09/46] "StringDtype[storage]" -> "string[storage]" misc --- pandas/core/arrays/string_.py | 4 ++-- pandas/core/arrays/string_arrow.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bcb8be5657a73..eadb3ef9e982c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -141,9 +141,9 @@ def construct_from_string(cls, string): if string == "string": # TODO: use global default return cls() - elif string in {"string[python]", "StringDtype[python]"}: + elif string == "string[python]": return cls(storage="python") - elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: + elif string == "string[pyarrow]": return cls(storage="pyarrow") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6d088d957b7d0..4bed6a067cea3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -139,7 +139,7 @@ def _from_sequence_of_strings( @property def dtype(self) -> StringDtype: """ - An 
instance of 'StringDtype[pyarrow]'. + An instance of 'string[pyarrow]'. """ return self._dtype From bd147ba28c92656bdaeb3f3c5f106101b0fec154 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 12:31:06 +0100 Subject: [PATCH 10/46] __from_arrow__ --- pandas/core/arrays/string_.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index eadb3ef9e982c..bdbb2827cd82d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -182,27 +182,31 @@ def __repr__(self): def __from_arrow__( self, array: Union[pyarrow.Array, pyarrow.ChunkedArray] - ) -> ArrowStringArray: + ) -> StringArray | ArrowStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """ - import pyarrow + if self.storage == "pyarrow": + from pandas.core.arrays.string_arrow import ArrowStringArray - from pandas.core.arrays.string_arrow import ArrowStringArray - - if isinstance(array, pyarrow.Array): - chunks = [array] + return ArrowStringArray(array) else: - # pyarrow.ChunkedArray - chunks = array.chunks - results = [] - for arr in chunks: - # using _from_sequence to ensure None is converted to NA - str_arr = StringArray._from_sequence(np.array(arr)) - results.append(str_arr) + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is converted to NA + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) - return ArrowStringArray._concat_same_type(results) + return StringArray._concat_same_type(results) class StringArray(PandasArray): From 830275f1b8b6aefd2c8d0d13610ed58d0975249e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 14:52:13 +0100 Subject: [PATCH 11/46] more testing (wip) --- pandas/conftest.py | 20 +++++ pandas/core/arrays/string_arrow.py 
| 2 +- pandas/tests/arrays/string_/test_string.py | 89 ++++++++++++++----- pandas/tests/dtypes/test_common.py | 5 +- pandas/tests/dtypes/test_inference.py | 4 +- pandas/tests/extension/base/casting.py | 6 +- pandas/tests/frame/methods/test_astype.py | 7 ++ .../tests/frame/methods/test_combine_first.py | 10 ++- pandas/tests/frame/test_constructors.py | 6 +- pandas/tests/tools/test_to_numeric.py | 4 +- 10 files changed, 113 insertions(+), 40 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f3356d2998ff8..403b41a1c86b7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1131,6 +1131,26 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + "string", + "string[python]", + pytest.param( + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def nullable_string_dtype(request): + """ + Parametrized fixture for string dtypes. + + * 'string' + * 'string[python]' + * 'string[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4bed6a067cea3..b68ae3dbe4218 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -96,7 +96,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray): >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[arrow]") ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string + Length: 4, dtype: string[pyarrow] """ _dtype = StringDtype(storage="pyarrow") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b34c8d9722515..e224233b3bd13 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,9 +14,30 @@ skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -@pytest.fixture(params=["python", pytest.param("pyarrow", marks=skip_if_no_pyarrow)]) 
+def _is_pyarrow_dtype(dtype): + if isinstance(dtype, str): + if dtype == "string[pyarrow]": + return True + else: + if dtype.storage == "pyarrow": + return True + return False + + +@pytest.fixture( + params=[ + "string", + "string[python]", + pytest.param("string[pyarrow]", marks=skip_if_no_pyarrow), + pd.StringDtype(storage="python"), + pytest.param( + pd.StringDtype(storage="pyarrow"), + marks=skip_if_no_pyarrow, + ), + ] +) def dtype(request): - return pd.StringDtype(storage=request.param) + return request.param @pytest.fixture( @@ -30,7 +51,7 @@ def cls(request): def test_repr(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = ( "AssertionError: assert ' A\n0 a\n1 None\n2 b' " "== ' A\n0 a\n1 \n2 b'" @@ -91,7 +112,7 @@ def test_setitem_with_scalar_string(dtype): ], ) def test_string_methods(input, method, dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -106,7 +127,7 @@ def test_string_methods(input, method, dtype, request): def test_astype_roundtrip(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) request.node.add_marker(mark) @@ -127,7 +148,7 @@ def test_astype_roundtrip(dtype, request): def test_add(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -155,7 +176,7 @@ def test_add(dtype, request): def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -171,7 +192,7 @@ def test_add_2d(dtype, request): def 
test_add_sequence(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " "and 'list'" @@ -192,7 +213,7 @@ def test_add_sequence(dtype, request): def test_mul(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = ( "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" ) @@ -275,7 +296,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ def test_comparison_methods_array(all_compare_operators, dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): if all_compare_operators in ["__eq__", "__ne__"]: reason = "NotImplementedError: Neither scalar nor ArrowStringArray" else: @@ -346,7 +367,7 @@ def test_from_sequence_no_mutate(copy, cls, request): def test_astype_int(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -358,9 +379,20 @@ def test_astype_int(dtype, request): tm.assert_extension_array_equal(result, expected) -def test_astype_float(any_float_allowed_nullable_dtype): +def test_astype_float(dtype, any_float_allowed_nullable_dtype, request): # Don't compare arrays (37974) - ser = pd.Series(["1.1", pd.NA, "3.3"], dtype="string") + + if _is_pyarrow_dtype(dtype): + if any_float_allowed_nullable_dtype in {"Float32", "Float64"}: + reason = "TypeError: Cannot interpret 'Float32Dtype()' as a data type" + else: + reason = ( + "TypeError: float() argument must be a string or a number, not 'NAType'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) result = ser.astype(any_float_allowed_nullable_dtype) expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype) @@ -378,7 +410,7 @@ def 
test_reduce(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -395,7 +427,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): if box is pd.array: reason = ( "TypeError: '<=' not supported between instances of 'str' and " @@ -423,17 +455,25 @@ def test_reduce_missing(skipna, dtype): assert pd.isna(result) -def test_fillna_args(): +def test_fillna_args(dtype, request): # GH 37987 - arr = pd.array(["a", pd.NA], dtype="string") + if _is_pyarrow_dtype(dtype): + reason = ( + "AssertionError: Regex pattern \"Cannot set non-string value '1' into " + "a StringArray.\" does not match 'Scalar must be NA or str'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["a", pd.NA], dtype=dtype) res = arr.fillna(value="b") - expected = pd.array(["a", "b"], dtype="string") + expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) res = arr.fillna(value=np.str_("b")) - expected = pd.array(["a", "b"], dtype="string") + expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) msg = "Cannot set non-string value '1' into a StringArray." 
@@ -449,7 +489,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -465,14 +505,15 @@ def test_arrow_roundtrip(dtype): table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, type(dtype)) + if not isinstance(dtype, str): + assert isinstance(result["a"].dtype, type(dtype)) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA def test_value_counts_na(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -488,7 +529,7 @@ def test_value_counts_na(dtype, request): def test_value_counts_with_normalize(dtype, request): - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -525,7 +566,7 @@ def test_use_inf_as_na(values, expected, dtype): def test_memory_usage(dtype): # GH 33963 - if dtype.storage == "pyarrow": + if _is_pyarrow_dtype(dtype): pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 406aec9d4c16e..616f46624bfd7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -281,7 +281,10 @@ def test_is_string_dtype(): assert com.is_string_dtype(object) assert com.is_string_dtype(np.array(["a", "b"])) assert com.is_string_dtype(pd.StringDtype()) - assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) + + +def 
test_is_string_dtype_nullable(nullable_string_dtype): + assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) integer_dtypes: List = [] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b3c6015475674..907991b97ead1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass): + def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): # StringArray - val = klass(data, dtype="string") + val = klass(data, dtype=nullable_string_dtype) inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7c5ef5b3b27d3..47f4f7585243d 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -43,10 +43,10 @@ def test_astype_str(self, data): expected = pd.Series([str(x) for x in data[:5]], dtype=str) self.assert_series_equal(result, expected) - def test_astype_string(self, data): + def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - result = pd.Series(data[:5]).astype("string") - expected = pd.Series([str(x) for x in data[:5]], dtype="string") + result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 161fe7990a327..c0b6e18e23847 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np 
import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -567,6 +569,11 @@ def test_astype_empty_dtype_dict(self): "df", [ DataFrame(Series(["x", "y", "z"], dtype="string")), + DataFrame(Series(["x", "y", "z"], dtype="string[python]")), + pytest.param( + DataFrame(Series(["x", "y", "z"], dtype="string[pyarrow]")), + marks=td.skip_if_no("pyarrow", min_version="1.0.0"), + ), DataFrame(Series(["x", "y", "z"], dtype="category")), DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), DataFrame(Series(3 * [Interval(0, 1)])), diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b4d8a53e4b23f..dd91b32c8eb8c 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -381,15 +381,17 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) - def test_combine_first_string_dtype_only_na(self): + def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): # GH: 37519 - df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") + df = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + ) + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) df.set_index(["a", "b"], inplace=True) df2.set_index(["a", "b"], inplace=True) result = df.combine_first(df2) expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype ).set_index(["a", "b"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b76a44b3c86be..a62f2b0426911 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1649,10 +1649,10 @@ def 
test_constructor_empty_with_string_dtype(self): df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) - def test_constructor_empty_with_string_extension(self): + def test_constructor_empty_with_string_extension(self, nullable_string_dtype): # GH 34915 - expected = DataFrame(index=[], columns=["c1"], dtype="string") - df = DataFrame(columns=["c1"], dtype="string") + expected = DataFrame(index=[], columns=["c1"], dtype=nullable_string_dtype) + df = DataFrame(columns=["c1"], dtype=nullable_string_dtype) tm.assert_frame_equal(df, expected) def test_constructor_single_value(self): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 65aa189a3e965..30d6436c7e250 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -725,9 +725,9 @@ def test_precision_float_conversion(strrep): (["1", "2", "3.5"], Series([1, 2, 3.5])), ], ) -def test_to_numeric_from_nullable_string(values, expected): +def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected): # https://github.com/pandas-dev/pandas/issues/37262 - s = Series(values, dtype="string") + s = Series(values, dtype=nullable_string_dtype) result = to_numeric(s) tm.assert_series_equal(result, expected) From 214e524c3ed801450d926222db30afe5543675ef Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 28 Mar 2021 15:22:53 +0100 Subject: [PATCH 12/46] fix inference --- pandas/_libs/lib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 94a4d586b4f13..c0979d165ba3c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1110,6 +1110,8 @@ _TYPE_MAP = { "complex128": "complex", "c": "complex", "string": "string", + "string[python]": "string", + "string[pyarrow]": "string", "S": "bytes", "U": "string", "bool": "boolean", From 5cfa97ac464f24c843d0aaf4d4fca6d950ea5776 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Apr 
2021 16:48:36 +0100 Subject: [PATCH 13/46] post-merge fixup --- pandas/tests/extension/json/array.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a63c849d25a9f..6c1161294dd17 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -39,7 +39,6 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype -from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -195,7 +194,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) From 74dbf96c9d2d077176a5f35620c654ee9bd19903 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Apr 2021 17:48:06 +0100 Subject: [PATCH 14/46] remove changes to test_string_dtype - broken off in #40725 --- pandas/_libs/lib.pyx | 2 -- pandas/tests/dtypes/test_inference.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b922489fedddc..646b5a05afcad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1111,8 +1111,6 @@ _TYPE_MAP = { "complex128": "complex", "c": "complex", "string": "string", - "string[python]": "string", - "string[pyarrow]": "string", "S": "bytes", "U": "string", "bool": "boolean", diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..b3c6015475674 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1267,9 +1267,9 @@ def test_interval(self): @pytest.mark.parametrize("klass", [pd.array, Series]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", 
"b", pd.NA]]) - def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): + def test_string_dtype(self, data, skipna, klass): # StringArray - val = klass(data, dtype=nullable_string_dtype) + val = klass(data, dtype="string") inferred = lib.infer_dtype(val, skipna=skipna) assert inferred == "string" From 3bda421aa3a26159ea799d25183d4d557226fdc4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Apr 2021 10:57:28 +0100 Subject: [PATCH 15/46] post merge fix-up --- pandas/conftest.py | 2 -- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/strings/accessor.py | 3 +-- pandas/tests/io/test_parquet.py | 4 +--- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5ab52f4b1e7f5..25e001859c96a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1146,8 +1146,6 @@ def nullable_string_dtype(request): * 'string[python]' * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 26b7be9174d33..21ff40cb021aa 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -134,7 +134,7 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): cls._chk_pyarrow_available() - # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value + # convert non-na-likes to str, and nan-likes to StringDtype.na_value scalars = lib.ensure_string_array(scalars, copy=False) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 0b5613e302175..9a100aa4231b6 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -154,11 +154,10 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from 
pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype)) + self._is_string = isinstance(data.dtype, StringDtype) self._data = data self._index = self._name = None diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 21ea2bd560060..631cc8bfc8ff7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -839,9 +839,7 @@ def test_additional_extension_arrays(self, pa): @td.skip_if_no("pyarrow", min_version="1.0.0") def test_pyarrow_backed_string_array(self, pa): # test ArrowStringArray supported through the __arrow_array__ protocol - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")}) + df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) check_round_trip(df, pa, expected=df) @td.skip_if_no("pyarrow", min_version="0.16.0") From 523e24c0bcc70c202fd9cec139fcb0df0d62caed Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Apr 2021 11:16:45 +0100 Subject: [PATCH 16/46] post merge fix-up --- pandas/tests/frame/methods/test_astype.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 544960113fafc..322252d70a45e 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -569,9 +569,10 @@ def test_astype_empty_dtype_dict(self): "data, dtype", [ (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -582,8 +583,6 @@ def 
test_astype_empty_dtype_dict(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - df = DataFrame(Series(data, dtype=dtype)) if errors == "ignore": expected = df From 279624cd1d489bf86653d3b588ad6bb66219050d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 15 Apr 2021 15:19:40 +0100 Subject: [PATCH 17/46] revert some changes made for pre-commit checks. --- pandas/tests/arrays/string_/test_string_arrow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 7565e1aa0488b..b5d0627c8583c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -35,14 +35,14 @@ def test_config(): @pytest.mark.parametrize("chunked", [True, False]) -@pytest.mark.parametrize("np_or_pa", [np, pa]) -def test_constructor_not_string_type_raises(np_or_pa, chunked): - arr = np_or_pa.array([1, 2, 3]) +@pytest.mark.parametrize("array", [np, pa]) +def test_constructor_not_string_type_raises(array, chunked): + arr = array.array([1, 2, 3]) if chunked: - if np_or_pa is np: + if array is np: pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - if np_or_pa is np: + if array is np: msg = "Unsupported type '' for ArrowStringArray" else: msg = re.escape( From c5ced5a1736cf34fb2814278ae65ad93c705c973 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 16 Apr 2021 13:05:23 +0100 Subject: [PATCH 18/46] post merge fix-up --- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- pandas/tests/series/methods/test_astype.py | 6 ++---- pandas/tests/series/methods/test_update.py | 4 ++-- pandas/tests/strings/test_string_array.py | 6 +++++- 
pandas/tests/strings/test_strings.py | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b5d0627c8583c..be89db9f25d20 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.testing as tm +import pandas._testing as tm pa = pytest.importorskip("pyarrow", minversion="1.0.0") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index bebe6948cff9c..157b76c630ef9 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -250,9 +250,10 @@ def test_td64_series_astype_object(self): "data, dtype", [ (["x", "y", "z"], "string"), + (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), (["x", "y", "z"], "category"), @@ -263,9 +264,6 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize("errors", ["raise", "ignore"]) def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): # https://github.com/pandas-dev/pandas/issues/35471 - - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - ser = Series(data, dtype=dtype) if errors == "ignore": expected = ser diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 9a64877cb92ff..98cfb4cd6414d 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -11,7 +11,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 class TestUpdate: @@ -88,11 +87,12 @@ def test_update_from_non_series(self, series, other, expected): "data, other, expected, dtype", [ (["a", None], 
[None, "b"], ["a", "b"], "string"), + (["a", None], [None, "b"], ["a", "b"], "string[python]"), pytest.param( ["a", None], [None, "b"], ["a", "b"], - "arrow_string", + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0"), ), ([1, None], [None, 2], [1, 2], "Int64"), diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 02ccb3a930557..4bad3eb4fba47 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -134,5 +134,9 @@ def test_capitalize(input, method, nullable_string_dtype): result = method(a.str) expected = method(b.str) - assert result.dtype.name == nullable_string_dtype + if nullable_string_dtype == "string": + assert result.dtype.name == "string[python]" + else: + assert result.dtype.name == nullable_string_dtype + tm.assert_series_equal(result.astype(object), expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index a809446f0bc06..8b231c35282cd 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -139,7 +139,7 @@ def test_repeat(): def test_repeat_with_null(nullable_string_dtype, request): # GH: 31632 - if nullable_string_dtype == "arrow_string": + if nullable_string_dtype == "string[pyarrow]": reason = 'Attribute "dtype" are different' mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) From 459812c335e5c34a461114c95a1e222eb260ad12 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 16 Apr 2021 13:26:09 +0100 Subject: [PATCH 19/46] undo unrelated changes --- pandas/core/arrays/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c9dafb3714f7d..354e4cd765509 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -420,8 +420,6 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] """ Return for `self != 
other` (element-wise in-equality). """ - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): - return NotImplemented return ~(self == other) def to_numpy( @@ -1052,9 +1050,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: ``-1`` in `indices` indicate missing values. - These values are set to `fill_value`. Any other negative - value raises a ``ValueError``. + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. From d707b6b96d7dee25fc14bc29945d48bdebd2b364 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 16 Apr 2021 13:54:28 +0100 Subject: [PATCH 20/46] undo changes to imports --- pandas/core/arrays/string_arrow.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 09004ab562422..73d0946217b52 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -9,8 +9,6 @@ ) import numpy as np -import pyarrow as pa -import pyarrow.compute as pc from pandas._libs import lib from pandas._typing import ( @@ -42,14 +40,25 @@ ) from pandas.core.strings.object_array import ObjectStringArrayMixin -ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, -} +try: + import pyarrow as pa +except ImportError: + pa = None +else: + # PyArrow backed StringArrays are available starting at 1.0.0, but this + # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute + # and its compute functions existed. 
GH38801 + if LooseVersion(pa.__version__) >= "1.0.0": + import pyarrow.compute as pc + + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } if TYPE_CHECKING: From daaac062c08f31f879ed5f0166da2866c79e5e2b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 17 Apr 2021 12:26:01 +0100 Subject: [PATCH 21/46] StringDtype.construct_array_type - add ref to issue --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a390487f8b191..dd06976f746c0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -155,7 +155,7 @@ def __hash__(self) -> int: # custom __eq__ so have to override __hash__ return super().__hash__() - # TODO: this is a classmethod, but we need to know the storage type. + # https://github.com/pandas-dev/pandas/issues/36126 # error: Signature of "construct_array_type" incompatible with supertype # "ExtensionDtype" def construct_array_type( # type: ignore[override] From 42d382faa38cea74177d15ab7b86df6368d91a21 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 1 May 2021 13:18:27 +0100 Subject: [PATCH 22/46] post merge fixup --- asv_bench/benchmarks/algorithms.py | 19 ++++---- asv_bench/benchmarks/strings.py | 8 +--- pandas/core/arrays/interval.py | 3 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/strings/accessor.py | 3 +- pandas/tests/arrays/string_/test_string.py | 54 +++++++++------------- pandas/tests/strings/test_find_replace.py | 8 ++-- pandas/tests/strings/test_strings.py | 8 ++-- 8 files changed, 43 insertions(+), 62 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index aecc609df574e..8885a0dcc781e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -28,7 +28,7 @@ class Factorize: "datetime64[ns, tz]", "Int64", 
"boolean", - "string_arrow", + "string[pyarrow]", ], ] param_names = ["unique", "sort", "dtype"] @@ -36,15 +36,12 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10 ** 5 string_index = tm.makeStringIndex(N) - try: - from pandas.core.arrays.string_arrow import ArrowStringDtype - - string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) - except ImportError: - string_arrow = None - - if dtype == "string_arrow" and not string_arrow: - raise NotImplementedError + string_arrow = None + if dtype == "string[pyarrow]": + try: + string_arrow = pd.array(string_index, dtype="string[pyarrow]") + except ImportError: + raise NotImplementedError data = { "int": pd.Int64Index(np.arange(N)), @@ -57,7 +54,7 @@ def setup(self, unique, sort, dtype): ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string_arrow": string_arrow, + "string[pyarrow]": string_arrow, }[dtype] if not unique: data = data.repeat(5) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 45a9053954569..5796b3f5440e7 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -50,12 +50,10 @@ def peakmem_cat_frame_construction(self, dtype): class Methods: - params = ["str", "string", "arrow_string"] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) except ImportError: @@ -213,12 +211,10 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains: - params = (["str", "string", "arrow_string"], [True, False]) + params = (["str", "string[python]", "string[pyarrow]"], [True, False]) param_names = ["dtype", "regex"] def setup(self, dtype, regex): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 
5), dtype=dtype) except ImportError: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 50e8cc4c82e0d..8d3a8feb89d67 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -829,7 +829,6 @@ def astype(self, dtype, copy: bool = True): """ from pandas import Index from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -852,7 +851,7 @@ def astype(self, dtype, copy: bool = True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self), dtype=dtype) - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) # TODO: This try/except will be repeated. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 91aa808580ee7..9ead565c2d335 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -34,8 +34,8 @@ from pandas.core import missing from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray -from pandas.core.arrays.string_ import StringDtype from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3c7479b2e4aa8..9a100aa4231b6 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3001,9 +3001,8 @@ def _result_dtype(arr): # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. 
from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype - if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)): + if isinstance(arr.dtype, StringDtype): return arr.dtype.name else: return object diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2894370a55b1a..cbd7734c11017 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -17,21 +17,8 @@ skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -def _is_pyarrow_dtype(dtype): - if isinstance(dtype, str): - if dtype == "string[pyarrow]": - return True - else: - if dtype.storage == "pyarrow": - return True - return False - - @pytest.fixture( params=[ - "string", - "string[python]", - pytest.param("string[pyarrow]", marks=skip_if_no_pyarrow), pd.StringDtype(storage="python"), pytest.param( pd.StringDtype(storage="pyarrow"), @@ -58,12 +45,15 @@ def test_repr(dtype): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - dtype_name = "pyarrow" if _is_pyarrow_dtype(dtype) else "python" - expected = f"0 a\n1 \n2 b\nName: A, dtype: string[{dtype_name}]" + expected = ( + f"0 a\n1 \n2 b\nName: A, dtype: string[{dtype.storage}]" + ) assert repr(df.A) == expected - arr_name = "ArrowStringArray" if _is_pyarrow_dtype(dtype) else "StringArray" - expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string[{dtype_name}]" + arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + expected = ( + f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string[{dtype.storage}]" + ) assert repr(df.A.array) == expected @@ -101,7 +91,7 @@ def test_setitem_with_scalar_string(dtype): def test_astype_roundtrip(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = "ValueError: Could not convert object to NumPy datetime" mark = pytest.mark.xfail(reason=reason, raises=ValueError) 
request.node.add_marker(mark) @@ -122,7 +112,7 @@ def test_astype_roundtrip(dtype, request): def test_add(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " "'ArrowStringArray'" @@ -150,7 +140,7 @@ def test_add(dtype, request): def test_add_2d(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -166,7 +156,7 @@ def test_add_2d(dtype, request): def test_add_sequence(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " "and 'list'" @@ -187,7 +177,7 @@ def test_add_sequence(dtype, request): def test_mul(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = ( "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" ) @@ -270,7 +260,7 @@ def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, requ def test_comparison_methods_array(all_compare_operators, dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": if all_compare_operators in ["__eq__", "__ne__"]: reason = "NotImplementedError: Neither scalar nor ArrowStringArray" else: @@ -341,7 +331,7 @@ def test_from_sequence_no_mutate(copy, cls, request): def test_astype_int(dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -356,7 +346,7 @@ def test_astype_int(dtype, request): def test_astype_float(dtype, any_float_allowed_nullable_dtype, request): # Don't compare arrays (37974) - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": if any_float_allowed_nullable_dtype in {"Float32", "Float64"}: 
reason = "TypeError: Cannot interpret 'Float32Dtype()' as a data type" else: @@ -384,7 +374,7 @@ def test_reduce(skipna, dtype): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max(method, skipna, dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -401,7 +391,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": if box is pd.array: reason = ( "TypeError: '<=' not supported between instances of 'str' and " @@ -432,7 +422,7 @@ def test_reduce_missing(skipna, dtype): def test_fillna_args(dtype, request): # GH 37987 - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": reason = ( "AssertionError: Regex pattern \"Cannot set non-string value '1' into " "a StringArray.\" does not match 'Scalar must be NA or str'" @@ -463,7 +453,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -487,7 +477,7 @@ def test_arrow_roundtrip(dtype): @td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_load_from_zero_chunks(dtype, dtype_object): +def test_arrow_load_from_zero_chunks(dtype): # GH-41040 import pyarrow as pa @@ -498,7 +488,7 @@ def test_arrow_load_from_zero_chunks(dtype, dtype_object): # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) result = table.to_pandas() - assert 
isinstance(result["a"].dtype, dtype_object) + assert isinstance(result["a"].dtype, type(dtype)) tm.assert_frame_equal(result, df) @@ -546,7 +536,7 @@ def test_use_inf_as_na(values, expected, dtype): def test_memory_usage(dtype): # GH 33963 - if _is_pyarrow_dtype(dtype): + if dtype.storage == "pyarrow": pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 0c54042d983ad..99f1196ac89a9 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -18,8 +18,9 @@ params=[ "object", "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -28,10 +29,9 @@ def any_string_dtype(request): Parametrized fixture for string dtypes. * 'object' * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f218d5333b415..b8603ae71a0b1 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -23,8 +23,9 @@ params=[ "object", "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -33,10 +34,9 @@ def any_string_dtype(request): Parametrized fixture for string dtypes. 
* 'object' * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param From 4fb1a0db22f2aa98b54383089a76e87f50c173c1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 1 May 2021 13:24:09 +0100 Subject: [PATCH 23/46] add draft release note --- doc/source/whatsnew/v1.3.0.rst | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b2f4de22ca5c1..7ec7cb80db463 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -164,6 +164,56 @@ a copy will no longer be made (:issue:`32960`) The default behavior when not passing ``copy`` will remain unchanged, i.e. a copy will be made. +.. _whatsnew_130.arrow_string: + +PyArrow backed string data type +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've enhanced the :class:`StringDtype`, an extension type dedicated to string data. +(:issue:`39908`) + +It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use +pandas options or specify the dtype using ``dtype='string[pyarrow]'`` + +.. warning:: + + ``string[pyarrow]`` is currently considered experimental. The implementation + and parts of the API may change without warning. + +The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays: + +1. +2. +3. + + +.. ipython:: python + + pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow")) + +You can use the alias ``"string[pyarrow]"`` as well. + +.. ipython:: python + + s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]") + s + +The usual string accessor methods work. Where appropriate, the return type +of the Series or columns of a DataFrame will also have string dtype. + +.. 
ipython:: python + + s.str.upper() + s.str.split('b', expand=True).dtypes + +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + +See :ref:`text.types` for more. + Centered Datetime-Like Rolling Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 15efb2e57f9e52e7076b7c492ad79ca72839d1c8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 12:45:53 +0100 Subject: [PATCH 24/46] post merge fix-up --- doc/source/whatsnew/v1.3.0.rst | 6 +++--- pandas/conftest.py | 8 ++++---- pandas/tests/extension/base/casting.py | 5 ++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9b209c2b46efe..d71bf046ba9ae 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -184,9 +184,9 @@ pandas options or specify the dtype using ``dtype='string[pyarrow]'`` The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays: -1. -2. -3. +1. +2. +3. .. ipython:: python diff --git a/pandas/conftest.py b/pandas/conftest.py index 1f7a194b60ac5..b859a35c484e1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1157,8 +1157,9 @@ def object_dtype(request): params=[ "object", "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ] ) @@ -1167,10 +1168,9 @@ def any_string_dtype(request): Parametrized fixture for string dtypes. 
* 'object' * 'string' - * 'arrow_string' + * 'string[python]' + * 'string[pyarrow]' """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - return request.param diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 99a5666926e10..ed1047f6e28f6 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -49,15 +49,14 @@ def test_astype_str(self, data): "nullable_string_dtype", [ "string", + "string[python]", pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") ), ], ) def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - result = pd.Series(data[:5]).astype(nullable_string_dtype) expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) From b53cfe015f6ad4e60b2f0cc847bdc75c40e5a436 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 14:23:03 +0100 Subject: [PATCH 25/46] docstrings --- pandas/core/arrays/string_.py | 6 +++--- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/generic.py | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d08bbed7c9c75..1cb5eade8a20c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -80,7 +80,7 @@ class StringDtype(ExtensionDtype): Examples -------- >>> pd.StringDtype() - StringDtype + string[python] """ #: StringDtype.na_value uses pandas.NA @@ -263,7 +263,7 @@ class StringArray(PandasArray): >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") ['This is', 'some text', , 'data.'] - Length: 4, dtype: string + Length: 4, dtype: string[python] Unlike arrays instantiated with ``dtype="object"``, 
``StringArray`` will convert the values to strings. @@ -275,7 +275,7 @@ class StringArray(PandasArray): >>> pd.array(['1', 1], dtype="string") ['1', '1'] - Length: 2, dtype: string + Length: 2, dtype: string[python] However, instantiating StringArrays directly with non-strings will raise an error. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c3aa44a438e1f..9036bfde7537e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -117,7 +117,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[arrow]") + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") ['This is', 'some text', , 'data.'] Length: 4, dtype: string[pyarrow] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d39f13afc426..fef2c1f46200d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6134,12 +6134,12 @@ def convert_dtypes( 2 3 z 20 200.0 >>> dfn.dtypes - a Int32 - b string - c boolean - d string - e Int64 - f Float64 + a Int32 + b string[python] + c boolean + d string[python] + e Int64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. 
@@ -6157,7 +6157,7 @@ def convert_dtypes( 0 a 1 b 2 - dtype: string + dtype: string[python] """ if self.ndim == 1: return self._convert_dtypes( From b7db53f99305f5d54ccb0db2674fac506ce067dc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 14:28:48 +0100 Subject: [PATCH 26/46] benchmarks --- asv_bench/benchmarks/algos/isin.py | 8 +++----- asv_bench/benchmarks/strings.py | 4 +--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 44245295beafc..4b58981694014 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -25,8 +25,8 @@ class IsIn: "category[object]", "category[int]", "str", - "string", - "arrow_string", + "string[python]", + "string[pyarrow]", ] param_names = ["dtype"] @@ -62,9 +62,7 @@ def setup(self, dtype): self.values = np.random.choice(arr, sample_size) self.series = Series(arr).astype("category") - elif dtype in ["str", "string", "arrow_string"]: - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - + elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: self.series = Series(tm.makeStringIndex(N), dtype=dtype) except ImportError: diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 47cc9fcf568ee..02cbff7a1559c 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -226,12 +226,10 @@ def time_contains(self, dtype, regex): class Split: - params = (["str", "string", "arrow_string"], [True, False]) + params = (["str", "string[python]", "string[pyarrow]"], [True, False]) param_names = ["dtype", "expand"] def setup(self, dtype, expand): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--") except ImportError: From 3399f08727ee04d01141e6d2244e4aad0ad54799 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 
14:33:21 +0100 Subject: [PATCH 27/46] pyarrow min --- pandas/tests/extension/arrow/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index f32f1e415ddc7..67a62978aa1bc 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,7 +2,7 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.13.0") +pytest.importorskip("pyarrow", minversion="1.0.0") def test_constructor_from_list(): From 71d1e6c6bbb4f7f2ee28954000a5903a678a7228 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 26 May 2021 17:45:48 +0100 Subject: [PATCH 28/46] post merge fixup --- asv_bench/benchmarks/strings.py | 4 +--- pandas/tests/strings/test_api.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2e109e59c1c6d..32fbf4e6c7de3 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -12,12 +12,10 @@ class Dtypes: - params = ["str", "string", "arrow_string"] + params = ["str", "string[python]", "string[pyarrow]"] param_names = ["dtype"] def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - try: self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) except ImportError: diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ec8b5bfa11ad5..c0ae06802bdb1 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -128,7 +128,7 @@ def test_api_per_method( def test_api_for_categorical(any_string_method, any_string_dtype, request): # https://github.com/pandas-dev/pandas/issues/10661 - if any_string_dtype == "arrow_string": + if any_string_dtype == "string[pyarrow]": # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' mark = pytest.mark.xfail(raises=TypeError, reason="Not 
Implemented") request.node.add_marker(mark) From 9e23c35a3e84024b9c50b056d2481e2b76a477c1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 26 May 2021 21:26:19 +0100 Subject: [PATCH 29/46] misc clean --- pandas/core/arrays/string_.py | 1 - pandas/tests/arrays/string_/test_string.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1cb5eade8a20c..866210a07ca0f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -138,7 +138,6 @@ def construct_from_string(cls, string): f"'construct_from_string' expects a string, got {type(string)}" ) if string == "string": - # TODO: use global default return cls() elif string == "string[python]": return cls(storage="python") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ad708350d3251..ae2fedef1d947 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -460,8 +460,7 @@ def test_arrow_roundtrip(dtype): table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - if not isinstance(dtype, str): - assert isinstance(result["a"].dtype, type(dtype)) + assert isinstance(result["a"].dtype, type(dtype)) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA From 64b3206a66efad4a1bb5bb9d8287bfc81adc94d0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 27 May 2021 16:47:43 +0100 Subject: [PATCH 30/46] update construct_from_string docstring --- pandas/core/arrays/string_.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 591de9b9096cd..aafd34dcf7a13 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -115,13 +115,13 @@ def construct_from_string(cls, string): The type of 
the name. The storage type will be taking from `string`. Valid options and their storage types are - ========================== ============== + ========================== ============================================== string result storage - ========================== ============== - ``'string'`` global default + ========================== ============================================== + ``'string'`` pd.options.mode.string_storage, default python ``'string[python]'`` python ``'string[pyarrow]'`` pyarrow - ========================== ============= + ========================== ============================================== Returns ------- From d83a4ff42fd8fd535bec2b8cd01bcfcf638f24c1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 27 May 2021 16:57:49 +0100 Subject: [PATCH 31/46] update whatsnew for dtype="string" --- doc/source/whatsnew/v1.3.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index aa032c0c4fb70..012a524321f56 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -204,6 +204,14 @@ You can use the alias ``"string[pyarrow]"`` as well. s = pd.Series(['abc', None, 'def'], dtype="string[pyarrow]") s +You can also create a PyArrow backed string array using pandas options. + +.. ipython:: python + + with pd.option_context("string_storage", "pyarrow"): + s = pd.Series(['abc', None, 'def'], dtype="string") + s + The usual string accessor methods work. Where appropriate, the return type of the Series or columns of a DataFrame will also have string dtype. 
From aef11626d8f5de98566b4aeb8ec7032f718b1434 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 30 May 2021 14:42:27 +0100 Subject: [PATCH 32/46] update release note --- doc/source/whatsnew/v1.3.0.rst | 72 ++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8dc6dc52d2687..83680eacb9912 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -179,20 +179,16 @@ We've enhanced the :class:`StringDtype`, an extension type dedicated to string d (:issue:`39908`) It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use -pandas options or specify the dtype using ``dtype='string[pyarrow]'`` +pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the +StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects. + +The PyArrow backed StringArray requires pyarrow 1.0.0 or greater to be installed. .. warning:: ``string[pyarrow]`` is currently considered experimental. The implementation and parts of the API may change without warning. -The ``'string[pyarrow]'`` extension type solves several issues with NumPy backed arrays: - -1. -2. -3. - - .. ipython:: python pd.Series(['abc', None, 'def'], dtype=pd.StringDtype(storage="pyarrow")) @@ -212,8 +208,8 @@ You can also create a PyArrow backed string array using pandas options. s = pd.Series(['abc', None, 'def'], dtype="string") s -The usual string accessor methods work. Where appropriate, the return type -of the Series or columns of a DataFrame will also have string dtype. +The usual string accessor methods work. Where appropriate, the return type of the Series +or columns of a DataFrame will also have string dtype. .. ipython:: python @@ -226,7 +222,61 @@ String accessor methods returning integers will return a value with :class:`Int6 s.str.count("a") -See :ref:`text.types` for more. 
+Some string accessor methods use native PyArrow string kernels operating directly on the +PyArrow memory, others fall back to converting to a NumPy array of Python objects and +using the native Python string functions. String methods using PyArrow kernels are +generally much more performant. + +Some PyArrow string kernels are implemented in later versions of pyarrow than the +minimum version required to create a PyArrow backed StringArray. In these cases, the +string accessor will fall back to the Python implementations. + +Some string accessor methods accept arguments controlling their behaviour which are not +supported by the PyArrow kernels. These cases will also fall back to object mode. + ++--------------------------------+----------+------------------------------------------+ +| Accessor | Minimum | Limitations (otherwise fall back to | +| Method | PyArrow | object mode) | +| | Version | | ++================================+==========+==========================================+ +| :meth:`~Series.str.contains` | 1.0.0 | The ``flags`` argument is not supported. | +| | | If ``regex=True``, pyarrow 4.0.0 is | +| | | required and ``case=False`` is not | +| | | supported. | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.startswith` | 4.0.0 | | +| :meth:`~Series.str.endswith` | | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.replace` | 4.0.0 | The ``flags`` argument, ``case=False``, | +| | | passing a callable for the ``repl`` | +| | | argument or passing a compiled regex is | +| | | not supported.
| ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.match` | 4.0.0 | | +| :meth:`~Series.str.fullmatch` | | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.isalnum` | 1.0.0 | | +| :meth:`~Series.str.isalpha` | | | +| :meth:`~Series.str.isdecimal` | | | +| :meth:`~Series.str.isdigit` | | | +| :meth:`~Series.str.islower` | | | +| :meth:`~Series.str.isnumeric` | | | +| :meth:`~Series.str.istitle` | | | +| :meth:`~Series.str.isupper` | | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.isspace` | 2.0.0 | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.len` | 4.0.0 | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.lower` | 1.0.0 | | +| :meth:`~Series.str.upper` | | | ++--------------------------------+----------+------------------------------------------+ +| :meth:`~Series.str.strip` | 4.0.0 | | +| :meth:`~Series.str.lstrip` | | | +| :meth:`~Series.str.rstrip` | | | ++--------------------------------+----------+------------------------------------------+ + + Centered Datetime-Like Rolling Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 6247a5b2dfddab8497f1874911b20e6272985fb6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 30 May 2021 16:49:30 +0100 Subject: [PATCH 33/46] paramertize test for df.convert_dtypes() --- .../tests/frame/methods/test_convert_dtypes.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index dd7bf0aada449..2d3f0011617fc 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -1,6 +1,8 @@ import numpy as np import 
pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -9,7 +11,16 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected): + @pytest.mark.parametrize( + "string_storage", + [ + "python", + pytest.param( + "pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ], + ) + def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here df = pd.DataFrame( @@ -18,11 +29,12 @@ def test_convert_dtypes(self, convert_integer, expected): "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - result = df.convert_dtypes(True, True, convert_integer, False) + with pd.option_context("string_storage", string_storage): + result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=expected), - "b": pd.Series(["x", "y", "z"], dtype="string"), + "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"), } ) tm.assert_frame_equal(result, expected) From a6d066ca43f44879f4a01c74c805b2bf4b0790b7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 May 2021 13:20:33 +0100 Subject: [PATCH 34/46] fixup pd.array and more testing of string_storage option --- pandas/conftest.py | 16 ++++ pandas/core/arrays/string_.py | 11 +-- pandas/core/arrays/string_arrow.py | 5 ++ pandas/core/construction.py | 25 +++++-- .../tests/arrays/string_/test_string_arrow.py | 74 ++++++++++++++++--- pandas/tests/arrays/test_array.py | 23 ++++-- pandas/tests/arrays/test_datetimelike.py | 21 +++--- pandas/tests/series/methods/test_astype.py | 22 +++++- pandas/tests/strings/test_api.py | 5 +- 9 files changed, 164 insertions(+), 38 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b859a35c484e1..a880fcc08e0fe 
100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1131,6 +1131,22 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. + + * 'python' + * 'pyarrow' + """ + return request.param + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index aafd34dcf7a13..c80c1263c3bf8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -295,7 +295,7 @@ def __init__(self, values, copy=False): super().__init__(values, copy=copy) # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") - NDArrayBacked.__init__(self, self._ndarray, StringDtype()) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) if not isinstance(values, type(self)): self._validate() @@ -311,8 +311,9 @@ def _validate(self): @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): - if dtype: - assert dtype == "string" + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" from pandas.core.arrays.masked import BaseMaskedArray @@ -332,7 +333,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype()) + NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) return new_string_array @@ -501,7 +502,7 @@ def _str_map( from pandas.arrays import BooleanArray if dtype is None: - dtype = StringDtype() + dtype = StringDtype(storage="python") if na_value is None: na_value = self.dtype.na_value diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b0a36b7c02644..9d4992f6f04a8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -35,6 +35,7 @@ is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.missing import isna @@ -154,6 +155,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) cls._chk_pyarrow_available() + if dtype and not (isinstance(dtype, str) and dtype == "string"): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and # numerical issues with Float32Dtype diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b05bc895d0081..47e482b849095 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -113,18 +113,22 @@ def array( Currently, pandas will infer an extension dtype for sequences of - ============================== ===================================== + ============================== ======================================= Scalar Type Array Type - ============================== ===================================== + ============================== ======================================= :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` 
:class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` - :class:`str` :class:`pandas.arrays.StringArray` + :class:`str` :class:`pandas.arrays.StringArray` or + :class:`pandas.arrays.ArrowStringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` - ============================== ===================================== + ============================== ======================================= + + The ExtensionArray created when the scalar type is :class:`str` is determined by + pd.options.mode.string_storage if the dtype is not explicitly given. For all other cases, NumPy's usual inference rules will be used. @@ -240,6 +244,14 @@ def array( ['a', , 'c'] Length: 3, dtype: string[python] + >>> with pd.option_context("string_storage", "pyarrow"): + ... arr = pd.array(["a", None, "c"]) + ... + >>> arr + + ['a', , 'c'] + Length: 3, dtype: string[pyarrow] + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) ['2000-01-01', '2000-01-01'] @@ -292,10 +304,10 @@ def array( IntegerArray, IntervalArray, PandasArray, - StringArray, TimedeltaArray, period_array, ) + from pandas.core.arrays.string_ import StringDtype if lib.is_scalar(data): msg = f"Cannot pass scalar '{data}' to 'pandas.array'." 
@@ -345,7 +357,8 @@ def array( return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": - return StringArray._from_sequence(data, copy=copy) + # StringArray/ArrowStringArray depending on pd.options.mode.string_storage + return StringDtype().construct_array_type()._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index be89db9f25d20..bb16754182c87 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -8,27 +8,33 @@ pa = pytest.importorskip("pyarrow", minversion="1.0.0") +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) from pandas.core.arrays.string_arrow import ArrowStringArray def test_eq_all_na(): - a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) + a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) result = a == a expected = pd.array([pd.NA, pd.NA], dtype="boolean") tm.assert_extension_array_equal(result, expected) -def test_config(): - # python by default - assert pd.StringDtype().storage == "python" - arr = pd.array(["a", "b"]) - assert arr.dtype.storage == "python" +def test_config(string_storage): + with pd.option_context("string_storage", string_storage): + assert StringDtype().storage == string_storage + result = pd.array(["a", "b"]) + assert result.dtype.storage == string_storage - with pd.option_context("mode.string_storage", "pyarrow"): - assert pd.StringDtype().storage == "pyarrow" - arr = pd.array(["a", "b"]) - assert arr.dtype.storage == "pyarrow" + expected = ( + StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"]) + ) + tm.assert_equal(result, expected) + +def test_config_bad_storage_raises(): msg = re.escape("Value must be one of python|pyarrow") with pytest.raises(ValueError, match=msg): 
pd.options.mode.string_storage = "foo" @@ -50,3 +56,51 @@ def test_constructor_not_string_type_raises(array, chunked): ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) + + +def test_from_sequence_wrong_dtype_raises(): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pd.option_context("string_storage", "pyarrow"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype="string") + + StringArray._from_sequence(["a", None, "c"], dtype="string[python]") + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") + + with pd.option_context("string_storage", "python"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + + StringArray._from_sequence(["a", None, "c"], 
dtype=StringDtype("python")) + + with pytest.raises(AssertionError, match=None): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index bfe588883d9f3..61d56df485ab1 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -18,7 +18,6 @@ IntegerArray, IntervalArray, SparseArray, - StringArray, TimedeltaArray, ) from pandas.core.arrays import ( @@ -132,8 +131,16 @@ ([1, None], "Int16", pd.array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), + ( + ["a", None], + "string", + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), + ( + ["a", None], + pd.StringDtype(), + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), @@ -253,8 +260,14 @@ def test_array_copy(): ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), # string - (["a", "b"], StringArray._from_sequence(["a", "b"])), - (["a", None], StringArray._from_sequence(["a", None])), + ( + ["a", "b"], + pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]), + ), + ( + ["a", None], + pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False])), ([True, None], BooleanArray._from_sequence([True, None])), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index c6f8efe7b939e..0bd10b36a8b5c 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ 
b/pandas/tests/arrays/test_datetimelike.py @@ -298,7 +298,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings(self, arr1d, box, request): + def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage): if isinstance(arr1d, DatetimeArray): tz = arr1d.tz ts1, ts2 = arr1d[1:3] @@ -341,14 +341,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request): ): arr.searchsorted("foo") - with pytest.raises( - TypeError, - match=re.escape( - f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got 'StringArray' instead." - ), - ): - arr.searchsorted([str(arr[1]), "baz"]) + arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + + with pd.option_context("string_storage", string_storage): + with pytest.raises( + TypeError, + match=re.escape( + f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{arr_type}' instead." 
+ ), + ): + arr.searchsorted([str(arr[1]), "baz"]) def test_getitem_near_implementation_bounds(self): # We only check tz-naive for DTA bc the bounds are slightly different diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 157b76c630ef9..ae3958995864c 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -12,6 +12,7 @@ from pandas._libs.tslibs import iNaT import pandas.util._test_decorators as td +import pandas as pd from pandas import ( NA, Categorical, @@ -377,7 +378,9 @@ class TestAstypeString: # currently no way to parse IntervalArray from a list of strings ], ) - def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): + def test_astype_string_to_extension_dtype_roundtrip( + self, data, dtype, request, string_storage + ): if dtype == "boolean" or ( dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data ): @@ -385,9 +388,24 @@ def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): reason="TODO StringArray.astype() with missing values #GH40566" ) request.node.add_marker(mark) + + if string_storage == "pyarrow" and dtype in ( + "category", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + "UInt16", + "period[M]", + ): + mark = pytest.mark.xfail( + reason="TypeError: Cannot interpret ... 
as a data type" + ) + request.node.add_marker(mark) + # GH-40351 s = Series(data, dtype=dtype) - tm.assert_series_equal(s, s.astype("string").astype(dtype)) + with pd.option_context("string_storage", string_storage): + result = s.astype("string").astype(dtype) + tm.assert_series_equal(result, s) class TestAstypeCategorical: diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index c0ae06802bdb1..6cbf2dd606692 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -6,6 +6,7 @@ MultiIndex, Series, _testing as tm, + get_option, ) from pandas.core import strings as strings @@ -128,7 +129,9 @@ def test_api_per_method( def test_api_for_categorical(any_string_method, any_string_dtype, request): # https://github.com/pandas-dev/pandas/issues/10661 - if any_string_dtype == "string[pyarrow]": + if any_string_dtype == "string[pyarrow]" or ( + any_string_dtype == "string" and get_option("string_storage") == "pyarrow" + ): # unsupported operand type(s) for +: 'ArrowStringArray' and 'str' mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented") request.node.add_marker(mark) From 8adb08d481777e7a6aca2fe5f390d3c36b0c1ae9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 May 2021 13:52:03 +0100 Subject: [PATCH 35/46] use string_storage fixture more --- pandas/tests/arrays/string_/test_string.py | 27 +++++-------------- pandas/tests/extension/test_string.py | 13 +++------ .../frame/methods/test_convert_dtypes.py | 11 -------- 3 files changed, 9 insertions(+), 42 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ae2fedef1d947..5179378b86ba0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,30 +14,15 @@ import pandas._testing as tm from pandas.core.arrays.string_arrow import ArrowStringArray -skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") - 
-@pytest.fixture( - params=[ - pd.StringDtype(storage="python"), - pytest.param( - pd.StringDtype(storage="pyarrow"), - marks=skip_if_no_pyarrow, - ), - ] -) -def dtype(request): - return request.param +@pytest.fixture +def dtype(string_storage): + return pd.StringDtype(storage=string_storage) -@pytest.fixture( - params=[ - pd.arrays.StringArray, - pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), - ] -) -def cls(request): - return request.param +@pytest.fixture +def cls(dtype): + return dtype.construct_array_type() def test_repr(dtype): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 269ede7ca93ae..02e1cb31fd41a 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,8 +18,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base @@ -48,14 +46,9 @@ def chunked(request): return request.param -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")), - ] -) -def dtype(request): - return StringDtype(storage=request.param) +@pytest.fixture +def dtype(string_storage): + return StringDtype(storage=string_storage) @pytest.fixture diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 2d3f0011617fc..a2d539d784d3c 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm @@ -11,15 +9,6 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - @pytest.mark.parametrize( - "string_storage", - [ - "python", - pytest.param( - "pyarrow", 
marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ], - ) def test_convert_dtypes(self, convert_integer, expected, string_storage): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here From 56714c9fd3a9a0158a19bc533b2848dd2275112b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 May 2021 19:11:16 +0100 Subject: [PATCH 36/46] post merge fixup --- pandas/core/arrays/string_.py | 6 ++++++ pandas/tests/arrays/string_/test_string_arrow.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c80c1263c3bf8..f33311d4d9114 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -19,6 +19,7 @@ Scalar, type_t, ) +from pandas.compat import pa_version_under1p0 from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ( @@ -94,6 +95,11 @@ def __init__(self, storage=None): raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) + if storage == "pyarrow" and pa_version_under1p0: + raise ImportError( + "pyarrow>=1.0.0 is required for PyArrow backed StringArray." 
+ ) + self.storage = storage @property diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 443984cef7687..c3f951adf7f89 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -11,11 +11,15 @@ StringArray, StringDtype, ) -from pandas.core.arrays.string_arrow import ( - ArrowStringArray, +from pandas.core.arrays.string_arrow import ArrowStringArray + +skip_if_no_pyarrow = pytest.mark.skipif( + pa_version_under1p0, + reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray", ) +@skip_if_no_pyarrow def test_eq_all_na(): a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) result = a == a @@ -41,10 +45,7 @@ def test_config_bad_storage_raises(): pd.options.mode.string_storage = "foo" -@pytest.mark.skipif( - pa_version_under1p0, - reason="pyarrow>=1.0.0 is required for PyArrow backed StringArray", -) +@skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) def test_constructor_not_string_type_raises(array, chunked): @@ -67,6 +68,7 @@ def test_constructor_not_string_type_raises(array, chunked): ArrowStringArray(arr) +@skip_if_no_pyarrow def test_from_sequence_wrong_dtype_raises(): with pd.option_context("string_storage", "python"): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") From 1761a84eb2e7e89956334881be1ed27801b8bf38 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 10:15:17 +0100 Subject: [PATCH 37/46] remove accessor methods section from release note --- doc/source/whatsnew/v1.3.0.rst | 56 +--------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a0e8a4905f20b..12f5ced79b934 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -222,61 +222,7 @@ String accessor methods returning 
integers will return a value with :class:`Int6 s.str.count("a") -Some string accessor methods use native PyArrow string kernels operating directly on the -PyArrow memory, others fallback to converting to a NumPy array of Python objects and -using the native Python string functions. String methods using Pyarrow kernels are -generally much more performant. - -Some PyArrow string kernels are implemented in later versions of pyarrow that the -minimum version required to create a PyArrow backed StringArray. In these cases, the -string accessor will fall back to the Python implementations. - -Some string accessor methods accept arguments controlling their behaviour which are not -supported by the PyArrow kernels. These cases will also fall back to object mode. - -+--------------------------------+----------+------------------------------------------+ -| Accessor | Minimum | Limitations (otherwise fall back to | -| Method | PyArrow | object mode) | -| | Version | | -+================================+==========+==========================================+ -| :meth:`~Series.str.contains` | 1.0.0 | The ``flags`` argument is not supported. | -| | | If ``regex=True``, pyarrow 4.0.0 is | -| | | required and ``case=False`` is not | -| | | supported. | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.startswith` | 4.0.0 | | -| :meth:`~Series.str.endswith` | | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.replace` | 4.0.0 | The ``flags`` argument, ``case=False``, | -| | | passing a callable for the ``repr`` | -| | | argument or passing a compiled regex is | -| | | not supported. 
| -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.match` | 4.0.0 | | -| :meth:`~Series.str.fullmatch` | | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.isalnum` | 1.0.0 | | -| :meth:`~Series.str.isalpha` | | | -| :meth:`~Series.str.isdecimal` | | | -| :meth:`~Series.str.isdigit` | | | -| :meth:`~Series.str.islower` | | | -| :meth:`~Series.str.isnumeric` | | | -| :meth:`~Series.str.istitle` | | | -| :meth:`~Series.str.isupper` | | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.isspace` | 2.0.0 | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.len` | 4.0.0 | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.lower` | 1.0.0 | | -| :meth:`~Series.str.upper` | | | -+--------------------------------+----------+------------------------------------------+ -| :meth:`~Series.str.strip` | 4.0.0 | | -| :meth:`~Series.str.lstrip` | | | -| :meth:`~Series.str.rstrip` | | | -+--------------------------------+----------+------------------------------------------+ - - +See :ref:`text.types` for more. 
Centered Datetime-Like Rolling Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 3e26baa7c29402010538db70be59c59681fe9889 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 10:27:42 +0100 Subject: [PATCH 38/46] consistent dtype naming in benchmark --- asv_bench/benchmarks/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 8885a0dcc781e..e48a2060a3b34 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -23,7 +23,7 @@ class Factorize: "int", "uint", "float", - "string", + "object", "datetime64[ns]", "datetime64[ns, tz]", "Int64", @@ -47,7 +47,7 @@ def setup(self, unique, sort, dtype): "int": pd.Int64Index(np.arange(N)), "uint": pd.UInt64Index(np.arange(N)), "float": pd.Float64Index(np.random.randn(N)), - "string": string_index, + "object": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" From 6b470b1ed53f045bcb9bb5caf20ba1aaca3e2137 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 12:30:49 +0100 Subject: [PATCH 39/46] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/construction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 12f5ced79b934..81b9b4132b8c9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -178,7 +178,7 @@ PyArrow backed string data type We've enhanced the :class:`StringDtype`, an extension type dedicated to string data. (:issue:`39908`) -It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`, use +It is now possible to specify a ``storage`` keyword option to :class:`StringDtype`. 
Use pandas options or specify the dtype using ``dtype='string[pyarrow]'`` to allow the StringArray to be backed by a PyArrow array instead of a NumPy array of Python objects. diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 061d9aadfbd7b..92eff02ec1307 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -125,7 +125,7 @@ def array( ============================== ======================================= The ExtensionArray created when the scalar type is :class:`str` is determined by - pd.options.mode.string_storage if the dtype is not explicitly given. + ``pd.options.mode.string_storage`` if the dtype is not explicitly given. For all other cases, NumPy's usual inference rules will be used. From 2ec6de0446abe9ba9c33de331d97c232fb1e55ba Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 14:53:03 +0100 Subject: [PATCH 40/46] name and str() change to "string" --- pandas/_testing/asserters.py | 13 +++++++++++-- pandas/core/arrays/string_.py | 13 +++++++------ pandas/core/arrays/string_arrow.py | 2 +- pandas/core/construction.py | 4 ++-- pandas/core/generic.py | 14 +++++++------- pandas/core/strings/accessor.py | 2 +- pandas/tests/arrays/string_/test_string.py | 8 ++------ pandas/tests/extension/test_string.py | 8 ++++++-- 8 files changed, 37 insertions(+), 27 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 2d695458e32e6..ccd73810981d3 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -48,6 +48,7 @@ TimedeltaArray, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.arrays.string_ import StringDtype from pandas.io.formats.printing import pprint_thing @@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None) if isinstance(left, np.ndarray): left = pprint_thing(left) - elif is_categorical_dtype(left) or isinstance(left, PandasDtype): + elif ( + 
is_categorical_dtype(left) + or isinstance(left, PandasDtype) + or isinstance(left, StringDtype) + ): left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) - elif is_categorical_dtype(right) or isinstance(right, PandasDtype): + elif ( + is_categorical_dtype(right) + or isinstance(right, PandasDtype) + or isinstance(right, StringDtype) + ): right = repr(right) msg += f""" diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f33311d4d9114..5a3c27b7ae0af 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -84,6 +84,8 @@ class StringDtype(ExtensionDtype): string[python] """ + name = "string" + #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA _metadata = ("storage",) @@ -102,10 +104,6 @@ def __init__(self, storage=None): self.storage = storage - @property - def name(self): - return f"string[{self.storage}]" - @property def type(self) -> type[str]: return str @@ -182,6 +180,9 @@ def construct_array_type( # type: ignore[override] return ArrowStringArray def __repr__(self): + return f"string[{self.storage}]" + + def __str__(self): return self.name def __from_arrow__( @@ -268,7 +269,7 @@ class StringArray(PandasArray): >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") ['This is', 'some text', , 'data.'] - Length: 4, dtype: string[python] + Length: 4, dtype: string Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` will convert the values to strings. @@ -280,7 +281,7 @@ class StringArray(PandasArray): >>> pd.array(['1', 1], dtype="string") ['1', '1'] - Length: 2, dtype: string[python] + Length: 2, dtype: string However, instantiating StringArrays directly with non-strings will raise an error. 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 09c4c519bfc2b..019b94f5dd207 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -127,7 +127,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]") ['This is', 'some text', , 'data.'] - Length: 4, dtype: string[pyarrow] + Length: 4, dtype: string """ def __init__(self, values): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 92eff02ec1307..2632b5ba2d287 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -239,7 +239,7 @@ def array( >>> pd.array(["a", None, "c"]) ['a', , 'c'] - Length: 3, dtype: string[python] + Length: 3, dtype: string >>> with pd.option_context("string_storage", "pyarrow"): ... arr = pd.array(["a", None, "c"]) @@ -247,7 +247,7 @@ def array( >>> arr ['a', , 'c'] - Length: 3, dtype: string[pyarrow] + Length: 3, dtype: string >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a15d602e0d724..49dc71954fd8f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6139,12 +6139,12 @@ def convert_dtypes( 2 3 z 20 200.0 >>> dfn.dtypes - a Int32 - b string[python] - c boolean - d string[python] - e Int64 - f Float64 + a Int32 + b string + c boolean + d string + e Int64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. 
@@ -6162,7 +6162,7 @@ def convert_dtypes( 0 a 1 b 2 - dtype: string[python] + dtype: string """ if self.ndim == 1: return self._convert_dtypes( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7643019ff8c55..aa867ae4dd401 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3080,7 +3080,7 @@ def _result_dtype(arr): from pandas.core.arrays.string_ import StringDtype if isinstance(arr.dtype, StringDtype): - return arr.dtype.name + return arr.dtype else: return object diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 214f2184ee2fe..92d0d19901b21 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -32,15 +32,11 @@ def test_repr(dtype): expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - expected = ( - f"0 a\n1 \n2 b\nName: A, dtype: string[{dtype.storage}]" - ) + expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" - expected = ( - f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string[{dtype.storage}]" - ) + expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 02e1cb31fd41a..3d0edb70d1ced 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -94,7 +94,9 @@ def data_for_grouping(dtype, chunked): class TestDtype(base.BaseDtypeTests): - pass + def test_eq_with_str(self, dtype): + assert dtype == f"string[{dtype.storage}]" + super().test_eq_with_str(dtype) class TestInterface(base.BaseInterfaceTests): @@ -106,7 +108,9 @@ def test_view(self, data, request): class TestConstructors(base.BaseConstructorsTests): - pass + def test_from_dtype(self, data): + # base test uses string representation of 
dtype + pass class TestReshaping(base.BaseReshapingTests): From a0b7a70f347a9575268ecadde647faed578ebbf1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 15:28:09 +0100 Subject: [PATCH 41/46] remove testing of string dtype without storage specified. --- pandas/conftest.py | 4 ---- pandas/tests/extension/base/casting.py | 1 - pandas/tests/frame/methods/test_astype.py | 1 - pandas/tests/series/methods/test_astype.py | 2 -- pandas/tests/series/methods/test_update.py | 1 - 5 files changed, 9 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 690a0a524e69a..f1c0280bc52bb 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1120,7 +1120,6 @@ def string_dtype(request): @pytest.fixture( params=[ - "string", "string[python]", pytest.param( "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") @@ -1131,7 +1130,6 @@ def nullable_string_dtype(request): """ Parametrized fixture for string dtypes. - * 'string' * 'string[python]' * 'string[pyarrow]' """ @@ -1179,7 +1177,6 @@ def object_dtype(request): @pytest.fixture( params=[ "object", - "string", "string[python]", pytest.param( "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") @@ -1190,7 +1187,6 @@ def any_string_dtype(request): """ Parametrized fixture for string dtypes.
* 'object' - * 'string' * 'string[python]' * 'string[pyarrow]' """ diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index ed1047f6e28f6..9c59c79f677de 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -48,7 +48,6 @@ def test_astype_str(self, data): @pytest.mark.parametrize( "nullable_string_dtype", [ - "string", "string[python]", pytest.param( "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index e8b533f0c8817..881f8db305240 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -584,7 +584,6 @@ def test_astype_empty_dtype_dict(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index c610475581952..99a7ba910eb74 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -249,7 +249,6 @@ def test_td64_series_astype_object(self): @pytest.mark.parametrize( "data, dtype", [ - (["x", "y", "z"], "string"), (["x", "y", "z"], "string[python]"), pytest.param( ["x", "y", "z"], @@ -387,7 +386,6 @@ def test_astype_string_to_extension_dtype_roundtrip( reason="TODO StringArray.astype() with missing values #GH40566" ) request.node.add_marker(mark) - # GH-40351 s = Series(data, dtype=dtype) result = s.astype(nullable_string_dtype).astype(dtype) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index 98cfb4cd6414d..d9d6641d54237 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -86,7 +86,6 @@ def test_update_from_non_series(self, series, other, expected): 
@pytest.mark.parametrize( "data, other, expected, dtype", [ - (["a", None], [None, "b"], ["a", "b"], "string"), (["a", None], [None, "b"], ["a", "b"], "string[python]"), pytest.param( ["a", None], From d9dcd20f99ca6fc2f0a08eb2a3f4a54f29309af3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 15:42:28 +0100 Subject: [PATCH 42/46] update StringDtype docstring --- pandas/core/arrays/string_.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5a3c27b7ae0af..b57181987d90b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -70,6 +70,11 @@ class StringDtype(ExtensionDtype): In particular, StringDtype.na_value may change to no longer be ``numpy.nan``. + Parameters + ---------- + storage : {"python", "pyarrow"}, optional + If not given, the value of ``pd.options.mode.string_storage``. + Attributes ---------- None @@ -82,6 +87,9 @@ class StringDtype(ExtensionDtype): -------- >>> pd.StringDtype() string[python] + + >>> pd.StringDtype(storage="pyarrow") + string[pyarrow] """ name = "string" From 4a37470d4802df49ea03982178d939348016bea3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 16:17:19 +0100 Subject: [PATCH 43/46] add ArrowStringArray to pd.arrays namespace --- doc/source/reference/arrays.rst | 1 + pandas/arrays/__init__.py | 2 ++ pandas/core/arrays/__init__.py | 2 ++ pandas/core/arrays/string_arrow.py | 3 ++- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 43e2509469488..c6fda85b0486d 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -480,6 +480,7 @@ we recommend using :class:`StringDtype` (with the alias ``"string"``). :template: autosummary/class_without_autosummary.rst arrays.StringArray + arrays.ArrowStringArray .. 
autosummary:: :toctree: api/ diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 0fa070b6e4fc4..89d362eb77e68 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. """ from pandas.core.arrays import ( + ArrowStringArray, BooleanArray, Categorical, DatetimeArray, @@ -18,6 +19,7 @@ ) __all__ = [ + "ArrowStringArray", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 22f15ca9650db..e301e82a0ee75 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -17,12 +17,14 @@ ) from pandas.core.arrays.sparse import SparseArray from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.core.arrays.timedeltas import TimedeltaArray __all__ = [ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 019b94f5dd207..1539c6db6317f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -673,7 +673,8 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------------ # String methods interface - _str_na_value = StringDtype.na_value + # error: Cannot determine type of 'na_value' + _str_na_value = StringDtype.na_value # type: ignore[has-type] def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True From 1d59c7a84c9fbef748e60febd92ce983da9534ba Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Jun 2021 16:50:28 +0100 Subject: [PATCH 44/46] add common base class, BaseStringArray --- pandas/core/arrays/string_.py | 13 ++++++++----- pandas/core/arrays/string_arrow.py | 7 +++++-- pandas/core/dtypes/cast.py | 8 ++------ 
pandas/core/strings/object_array.py | 5 ++--- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b57181987d90b..8d150c8f6ad3d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -43,6 +43,7 @@ IntegerArray, PandasArray, ) +from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array @@ -52,8 +53,6 @@ if TYPE_CHECKING: import pyarrow - from pandas.core.arrays.string_arrow import ArrowStringArray - @register_extension_dtype class StringDtype(ExtensionDtype): @@ -172,7 +171,7 @@ def __hash__(self) -> int: # "ExtensionDtype" def construct_array_type( # type: ignore[override] self, - ) -> type_t[StringArray | ArrowStringArray]: + ) -> type_t[BaseStringArray]: """ Return the array type associated with this dtype. @@ -195,7 +194,7 @@ def __str__(self): def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray - ) -> StringArray | ArrowStringArray: + ) -> BaseStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """ @@ -225,7 +224,11 @@ def __from_arrow__( return StringArray(np.array([], dtype="object")) -class StringArray(PandasArray): +class BaseStringArray(ExtensionArray): + pass + + +class StringArray(BaseStringArray, PandasArray): """ Extension array for string data. 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1539c6db6317f..ab8599f0f05ba 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -47,7 +47,10 @@ from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype -from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_ import ( + BaseStringArray, + StringDtype, +) from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -86,7 +89,7 @@ def _chk_pyarrow_available() -> None: # fallback for the ones that pyarrow doesn't yet support -class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): +class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin): """ Extension array for string data in a ``pyarrow.ChunkedArray``. diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5c7211a5d1852..73463db401ea5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -420,18 +420,14 @@ def maybe_cast_to_extension_array( ------- ExtensionArray or obj """ - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string": return obj try: diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 7ce4abe904f3b..02bdb7f181583 100644 --- a/pandas/core/strings/object_array.py +++ 
b/pandas/core/strings/object_array.py @@ -173,8 +173,7 @@ def scalar_rep(x): return self._str_map(scalar_rep, dtype=str) else: - from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_ import BaseStringArray def rep(x, r): if x is libmissing.NA: @@ -186,7 +185,7 @@ def rep(x, r): repeats = np.asarray(repeats, dtype=object) result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, (StringArray, ArrowStringArray)): + if isinstance(self, BaseStringArray): # Not going through map, so we have to do this here. result = type(self)._from_sequence(result) return result From 51f1b1d7ce878b40826cb96d7e661aae9ab2b726 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 4 Jun 2021 20:40:24 +0100 Subject: [PATCH 45/46] fixup roundtrip tests --- pandas/conftest.py | 4 ++++ pandas/tests/arrays/string_/test_string.py | 20 ++++++++++++-------- pandas/tests/io/test_parquet.py | 5 +++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f1c0280bc52bb..e106f7f425fa0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1152,6 +1152,10 @@ def string_storage(request): return request.param +# Alias so we can test with cartesian product of string_storage +string_storage2 = string_storage + + @pytest.fixture(params=tm.BYTES_DTYPES) def bytes_dtype(request): """ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 92d0d19901b21..5731f02430a9d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -431,7 +431,7 @@ def test_arrow_array(dtype): @td.skip_if_no("pyarrow") -def test_arrow_roundtrip(dtype): +def test_arrow_roundtrip(dtype, string_storage2): # roundtrip possible from arrow 1.0.0 import pyarrow as pa @@ -439,15 +439,17 @@ def test_arrow_roundtrip(dtype): df = pd.DataFrame({"a": data}) table = 
pa.table(df) assert table.field("a").type == "string" - result = table.to_pandas() - assert isinstance(result["a"].dtype, type(dtype)) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA @td.skip_if_no("pyarrow") -def test_arrow_load_from_zero_chunks(dtype): +def test_arrow_load_from_zero_chunks(dtype, string_storage2): # GH-41040 import pyarrow as pa @@ -457,9 +459,11 @@ def test_arrow_load_from_zero_chunks(dtype): assert table.field("a").type == "string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - result = table.to_pandas() - assert isinstance(result["a"].dtype, type(dtype)) - tm.assert_frame_equal(result, df) + with pd.option_context("string_storage", string_storage2): + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + expected = df.astype(f"string[{string_storage2}]") + tm.assert_frame_equal(result, expected) def test_value_counts_na(dtype): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8535cb0fd8bad..d100c584b698a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -810,10 +810,11 @@ def test_additional_extension_arrays(self, pa): check_round_trip(df, pa) @td.skip_if_no("pyarrow", min_version="1.0.0") - def test_pyarrow_backed_string_array(self, pa): + def test_pyarrow_backed_string_array(self, pa, string_storage): # test ArrowStringArray supported through the __arrow_array__ protocol df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) - check_round_trip(df, pa, expected=df) + with pd.option_context("string_storage", 
string_storage): + check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) @td.skip_if_no("pyarrow") def test_additional_extension_types(self, pa): From ef02a435ec481eeac62fe00ebc87d501489e13fc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Jun 2021 11:27:30 +0100 Subject: [PATCH 46/46] remove link --- doc/source/whatsnew/v1.3.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 57ca921ff5ba3..c2f25b389c9eb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -223,8 +223,6 @@ String accessor methods returning integers will return a value with :class:`Int6 s.str.count("a") -See :ref:`text.types` for more. - Centered Datetime-Like Rolling Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^