From 4c2e37a60c4390f02cc45f31626fac9ed2504ca0 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 01/78] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 254 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 127 ++++++++++ setup.py | 2 +- 4 files changed, 387 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 32a2a30fcfd43..b987a30a0ecd2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -452,9 +452,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? + if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..f9670d3f0da5f --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,254 @@ +from typing import TYPE_CHECKING, Tuple, Type, Union + +import pyarrow as pa + +from pandas._libs import missing as libmissing + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +from pandas.core.arrays.base import ExtensionArray + +if TYPE_CHECKING: + import numpy as np + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. 
warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls(pa.array(scalars, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + # def _values_for_factorize(self): + # arr = self._ndarray.copy() + # mask = self.isna() + # arr[mask] = -1 + # return arr, -1 + + def __setitem__(self, key, value): + raise NotImplementedError("__setitem__") + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + # def astype(self, dtype, copy=True): + # dtype = pandas_dtype(dtype) + # if isinstance(dtype, StringDtype): + # if copy: + # return self.copy() + # return self + # elif isinstance(dtype, _IntegerDtype): + # arr = self._ndarray.copy() + # mask = self.isna() + # arr[mask] = 0 + # values = arr.astype(dtype.numpy_dtype) + # return IntegerArray(values, mask, copy=False) + + # return super().astype(dtype, copy) + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + # def value_counts(self, dropna=False): + # from pandas import value_counts + + # return value_counts(self._ndarray, dropna=dropna).astype("Int64") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. 
+ """ + size = 0 + for chunk in self.data.chunks: + for buf in chunk.buffers(): + if buf is not None: + size += buf.size + return size diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..756d2dd0a739f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,127 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + # TODO(ARROW-9407): Accept pd.NA in Arrow + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +# class TestConstructors(base.BaseConstructorsTests): +# pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +# class TestGetitem(base.BaseGetitemTests): +# pass + + +# class TestSetitem(base.BaseSetitemTests): +# pass + + +# class TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index 1885546e001fe..d83092514aca8 100755 --- a/setup.py +++ b/setup.py @@ -434,7 +434,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = [] extra_link_args = [] if 
debugging_symbols_requested: extra_compile_args.append("-g") From d477ee7520afb5f5606967ec0caaa5cf2a6e1730 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:07:06 +0200 Subject: [PATCH 02/78] Implement getitem --- pandas/core/arrays/string_arrow.py | 76 +++++++++++++++++++-- pandas/tests/extension/test_string_arrow.py | 4 -- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f9670d3f0da5f..28b6231fdb516 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,7 @@ -from typing import TYPE_CHECKING, Tuple, Type, Union +from collections.abc import Iterable +from typing import Tuple, Type, Union +import numpy as np import pyarrow as pa from pandas._libs import missing as libmissing @@ -7,10 +9,10 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype +import pandas as pd +from pandas.api.types import is_integer from pandas.core.arrays.base import ExtensionArray - -if TYPE_CHECKING: - import numpy as np +from pandas.core.indexers import check_array_indexer @register_extension_dtype @@ -150,7 +152,9 @@ def __init__(self, values): @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - return cls(pa.array(scalars, type=pa.string())) + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -209,6 +213,60 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): # arr[mask] = -1 # return arr, -1 + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + raise NotImplementedError("Iterable") + # if not is_array_like(item): + # item = np.array(item) + # if is_integer_dtype(item) or (len(item) == 0): + # return self.take(item) + # elif is_bool_dtype(item): + # indices = np.array(item) + # indices = np.argwhere(indices).flatten() + # return self.take(indices) + # else: + # raise IndexError( + # """Only integers, slices and integer or + # boolean arrays are valid indices.""" + # ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + return None + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return value.as_py() + def __setitem__(self, key, value): raise NotImplementedError("__setitem__") @@ -252,3 +310,11 @@ def nbytes(self) -> int: if buf is not None: size += buf.size return size + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + return self.data.is_null() diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 756d2dd0a739f..208a79e7be460 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -25,7 +25,6 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence([pd.NA, "A"]) @@ -36,19 +35,16 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return pd.NA @pytest.fixture def data_for_grouping(): - # TODO(ARROW-9407): Accept pd.NA in Arrow return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) From 206f4930afbaa010436583a935b4b05205953fcc Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:10:26 +0200 Subject: [PATCH 03/78] Add basic copy implementation --- pandas/core/arrays/string_arrow.py | 16 ++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 28b6231fdb516..65e79da13bc99 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -318,3 +318,19 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ return self.data.is_null() + + def copy(self): + # type: () -> ExtensionArray + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 208a79e7be460..e94ffcd44e3e9 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -56,8 +56,8 @@ class TestInterface(base.BaseInterfaceTests): pass -# class TestConstructors(base.BaseConstructorsTests): -# pass +class TestConstructors(base.BaseConstructorsTests): + pass # class TestReshaping(base.BaseReshapingTests): From d58dba6cf40caed93d43460d050117e3ec766989 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 13 Jul 2020 12:17:38 +0200 Subject: [PATCH 04/78] Implement getitem for iterables --- pandas/core/arrays/string_arrow.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 65e79da13bc99..1c553c1778b15 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.dtypes import register_extension_dtype import pandas as pd -from pandas.api.types import is_integer +from pandas.api.types import is_array_like, is_bool_dtype, is_integer, is_integer_dtype from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -241,20 +241,19 @@ def __getitem__(self, item): item = check_array_indexer(self, item) if isinstance(item, Iterable): - raise NotImplementedError("Iterable") - # if not is_array_like(item): - # item = np.array(item) - # if is_integer_dtype(item) or (len(item) == 0): - # return self.take(item) - # elif is_bool_dtype(item): - # indices = np.array(item) - # indices = np.argwhere(indices).flatten() - # return self.take(indices) - # else: - # raise IndexError( - # """Only integers, slices and integer or - # boolean arrays are valid indices.""" - # ) + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) elif is_integer(item): if item < 0: item += len(self) From 7a9e2c3a40e4103ae2353a2d7af717a41eeb1ff6 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 12:19:37 +0200 Subject: [PATCH 05/78] Remove commented code --- pandas/core/arrays/string_arrow.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1c553c1778b15..b9d60b5034ae3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -207,12 +207,6 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) - # def _values_for_factorize(self): - # arr = self._ndarray.copy() - # mask = self.isna() - # arr[mask] = -1 - # return arr, -1 - def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. 
@@ -272,32 +266,12 @@ def __setitem__(self, key, value): def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") - # def astype(self, dtype, copy=True): - # dtype = pandas_dtype(dtype) - # if isinstance(dtype, StringDtype): - # if copy: - # return self.copy() - # return self - # elif isinstance(dtype, _IntegerDtype): - # arr = self._ndarray.copy() - # mask = self.isna() - # arr[mask] = 0 - # values = arr.astype(dtype.numpy_dtype) - # return IntegerArray(values, mask, copy=False) - - # return super().astype(dtype, copy) - def _reduce(self, name, skipna=True, **kwargs): if name in ["min", "max"]: return getattr(self, name)(skipna=skipna) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - # def value_counts(self, dropna=False): - # from pandas import value_counts - - # return value_counts(self._ndarray, dropna=dropna).astype("Int64") - @property def nbytes(self) -> int: """ From ffc4c0f70c0aaa520b3ca81fb7da938c6172ac92 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 13:44:24 +0200 Subject: [PATCH 06/78] Implement more Setitem/Getitem variants --- pandas/core/arrays/string_arrow.py | 169 +++++++++++++++++++- pandas/tests/extension/test_string_arrow.py | 12 +- 2 files changed, 169 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b9d60b5034ae3..29f369b117948 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,16 +1,23 @@ from collections.abc import Iterable -from typing import Tuple, Type, Union +from typing import Any, Sequence, Tuple, Type, Union import numpy as np import pyarrow as pa from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype import pandas as pd -from pandas.api.types import is_array_like, is_bool_dtype, is_integer, is_integer_dtype +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -260,9 +267,6 @@ def __getitem__(self, item): else: return value.as_py() - def __setitem__(self, key, value): - raise NotImplementedError("__setitem__") - def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") @@ -292,8 +296,7 @@ def isna(self) -> np.ndarray: """ return self.data.is_null() - def copy(self): - # type: () -> ExtensionArray + def copy(self) -> ExtensionArray: """ Return a copy of the array. @@ -307,3 +310,155 @@ def copy(self): ExtensionArray """ return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = self.data == other.data + elif is_scalar(other): + result = self.data == pa.scalar(other) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO: Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. 
``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. 
+ """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if allow_fill: + if (indices_array < 0).any(): + raise NotImplementedError("allow_fill=True") + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + if (indices_array < 0).any(): + raise NotImplementedError("negative indices") + return type(self)(self.data.take(indices)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index e94ffcd44e3e9..437d51060fb7f 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -53,7 +53,9 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - pass + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) class TestConstructors(base.BaseConstructorsTests): @@ -64,12 +66,12 @@ class TestConstructors(base.BaseConstructorsTests): # pass -# class TestGetitem(base.BaseGetitemTests): -# pass +class TestGetitem(base.BaseGetitemTests): + pass -# class TestSetitem(base.BaseSetitemTests): -# pass +class TestSetitem(base.BaseSetitemTests): + pass # class TestMissing(base.BaseMissingTests): From c1305ab833db1bed89929b6f040edbf82f63c54c Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 13 Jul 2020 13:54:41 +0200 Subject: [PATCH 07/78] Review comments by @jorisvandenbossche --- pandas/core/arrays/string_arrow.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 29f369b117948..29f39084fab3e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np import pyarrow as pa @@ -22,6 +22,14 @@ from pandas.core.indexers import check_array_indexer +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + @register_extension_dtype class ArrowStringDtype(ExtensionDtype): """ @@ -259,13 +267,13 @@ def __getitem__(self, item): if item < 0: item += len(self) if item >= len(self): - return None + raise IndexError("index out of bounds") value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - return value.as_py() + return _as_pandas_scalar(value) def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") @@ -281,12 +289,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - size = 0 - for chunk in self.data.chunks: - for buf in chunk.buffers(): - if buf is not None: - size += buf.size - return size + return self.data.nbytes def isna(self) -> np.ndarray: """ @@ -294,7 +297,8 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ - return self.data.is_null() + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values def copy(self) -> ExtensionArray: """ From 13a42f74a62b0ab37b35c1346306b0f45a4479eb Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 13 Jul 2020 14:20:06 +0200 Subject: [PATCH 08/78] Add Arrow issue numbers --- pandas/core/arrays/string_arrow.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 29f39084fab3e..bf2f07ed637fd 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -328,7 +328,7 @@ def __eq__(self, other: Any) -> ArrayLike: else: raise NotImplementedError("Neither scalar nor ArrowStringArray") - # TODO: Add a .to_numpy() to ChunkedArray + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) def __setitem__(self, key, value): @@ -377,10 +377,12 @@ def __setitem__(self, key, value): # Convert all possible input key types to an array of integers if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) key_array = np.argwhere(key).flatten() elif isinstance(key, slice): key_array = np.array(range(len(self))[key]) else: + # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) if pd.api.types.is_scalar(value): @@ -458,11 +460,13 @@ def take( if allow_fill: if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL raise NotImplementedError("allow_fill=True") else: # Nothing to fill return type(self)(self.data.take(indices)) else: # allow_fill=False if (indices_array < 0).any(): + # TODO(ARROW-9432): Treat negative indices as indices from the right. raise NotImplementedError("negative indices") return type(self)(self.data.take(indices)) From decd0220a10f3735c0106efc870276f7af0c2c96 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 15 Jul 2020 16:32:47 +0200 Subject: [PATCH 09/78] Adopt to kernel renamings --- pandas/core/arrays/string_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bf2f07ed637fd..9311e07226366 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,6 +3,7 @@ import numpy as np import pyarrow as pa +import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -322,9 +323,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = self.data == other.data + result = pc.equal(self.data, other.data) elif is_scalar(other): - result = self.data == pa.scalar(other) + result = pc.equal(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") From 3145e44d6e8c3a3b4f684cbff332689e5d056c69 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 15 Jul 2020 16:58:49 +0200 Subject: [PATCH 10/78] Handle take(indices<0, allow_fill=False) --- pandas/core/arrays/string_arrow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9311e07226366..402f17772a572 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -467,7 +467,9 @@ def take( # Nothing to fill return type(self)(self.data.take(indices)) else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. if (indices_array < 0).any(): - # TODO(ARROW-9432): Treat negative indices as indices from the right. 
-                raise NotImplementedError("negative indices")
-            return type(self)(self.data.take(indices))
+                # Don't modify in-place
+                indices_array = np.copy(indices_array)
+                indices_array[indices_array < 0] += len(self.data)
+            return type(self)(self.data.take(indices_array))

From e22b3481657cc7b5ba1dcc075c8e4d2effa7f0b6 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Wed, 15 Jul 2020 17:17:52 +0200
Subject: [PATCH 11/78] Handle fill_value better

---
 pandas/core/arrays/string_arrow.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 402f17772a572..8248a3e91c0fe 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -459,10 +459,19 @@ def take(
         else:
             indices_array = indices
 
+        if len(self.data) == 0 and (indices_array >= 0).any():
+            raise IndexError("cannot do a non-empty take")
+        if indices_array.max() >= len(self.data):
+            raise IndexError("out of bounds value in 'indices'.")
+
         if allow_fill:
             if (indices_array < 0).any():
                 # TODO(ARROW-9433): Treat negative indices as NULL
-                raise NotImplementedError("allow_fill=True")
+                indices_array = pa.array(indices_array, mask=indices_array < 0)
+                result = self.data.take(indices_array)
+                if pd.isna(fill_value):
+                    return type(self)(result)
+                return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
             else:
                 # Nothing to fill
                 return type(self)(self.data.take(indices))

From 2446562047018793f7d0c445c904c3abcd06be18 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Mon, 19 Oct 2020 16:33:37 +0100
Subject: [PATCH 12/78] fix doctest

---
 pandas/__init__.py                 |  1 +
 pandas/core/api.py                 |  2 ++
 pandas/core/arrays/string_arrow.py | 43 ++++++++++++++++++------------
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/pandas/__init__.py b/pandas/__init__.py
index cf7ae2505b72d..a4e9e04560241 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -65,6 +65,7 @@
     IntervalDtype,
     DatetimeTZDtype,
     StringDtype,
+    ArrowStringDtype,
     BooleanDtype,
     # missing
     NA,
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 67e86c2076329..d8210d114e213 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -57,3 +57,5 @@
 
 # DataFrame needs to be imported after NamedAgg to avoid a circular import
 from pandas.core.frame import DataFrame  # isort:skip
+
+from pandas.core.arrays.string_arrow import ArrowStringDtype  # isort:skip
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8248a3e91c0fe..7b8275c1d0f9a 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,9 +1,9 @@
+from __future__ import annotations
+
 from collections.abc import Iterable
-from typing import Any, Optional, Sequence, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union
 
 import numpy as np
-import pyarrow as pa
-import pyarrow.compute as pc
 
 from pandas._libs import missing as libmissing
 from pandas._typing import ArrayLike
@@ -22,6 +22,9 @@
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import check_array_indexer
 
+if TYPE_CHECKING:
+    import pyarrow as pa
+
 
 def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]:
     scalar = arrow_scalar.as_py()
@@ -36,7 +39,7 @@ class ArrowStringDtype(ExtensionDtype):
     """
     Extension dtype for string data in a ``pyarrow.ChunkedArray``.
 
-    .. versionadded:: 1.1.0
+    .. versionadded:: 1.2.0
 
     .. 
warning:: @@ -57,6 +60,9 @@ class ArrowStringDtype(ExtensionDtype): ArrowStringDtype """ + import pyarrow as pa + import pyarrow.compute as pc + name = "arrow_string" #: StringDtype.na_value uses pandas.NA @@ -118,7 +124,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: @@ -158,10 +164,13 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ + import pyarrow as pa + import pyarrow.compute as pc + def __init__(self, values): - if isinstance(values, pa.Array): - self.data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): + if isinstance(values, self.pa.Array): + self.data = self.pa.chunked_array([values]) + elif isinstance(values, self.pa.ChunkedArray): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") @@ -170,7 +179,7 @@ def __init__(self, values): def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(pa.array(scalars_corrected, type=pa.string())) + return cls(cls.pa.array(scalars_corrected, type=cls.pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -254,7 +263,7 @@ def __getitem__(self, item): if not is_array_like(item): item = np.array(item) if len(item) == 0: - return type(self)(pa.chunked_array([], type=pa.string())) + return type(self)(self.pa.chunked_array([], type=self.pa.string())) elif is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): @@ -271,7 +280,7 @@ def __getitem__(self, item): raise IndexError("index out of bounds") value = self.data[item] - if isinstance(value, pa.ChunkedArray): + if isinstance(value, self.pa.ChunkedArray): return type(self)(value) else: return _as_pandas_scalar(value) @@ -323,9 +332,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = pc.equal(self.data, other.data) + result = self.pc.equal(self.data, other.data) elif is_scalar(other): - result = pc.equal(self.data, pa.scalar(other)) + result = self.pc.equal(self.data, self.pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") @@ -367,10 +376,10 @@ def __setitem__(self, key, value): # Slice data and insert inbetween new_data = [ *self.data[0:key].chunks, - pa.array([value], type=pa.string()), + self.pa.array([value], type=self.pa.string()), *self.data[(key + 1) :].chunks, ] - self.data = pa.chunked_array(new_data) + self.data = self.pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. 
@@ -467,11 +476,11 @@ def take( if allow_fill: if (indices_array < 0).any(): # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=indices_array < 0) + indices_array = self.pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) if pd.isna(fill_value): return type(self)(result) - return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + return type(self)(self.pc.fill_null(result, self.pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From a0dcc85b0f447e482c54efc21cc82395dabdb677 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 19:45:14 +0100 Subject: [PATCH 13/78] Revert "fix doctest" This reverts commit 2446562047018793f7d0c445c904c3abcd06be18. --- pandas/__init__.py | 1 - pandas/core/api.py | 2 -- pandas/core/arrays/string_arrow.py | 43 ++++++++++++------------------ 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index a4e9e04560241..cf7ae2505b72d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,6 @@ IntervalDtype, DatetimeTZDtype, StringDtype, - ArrowStringDtype, BooleanDtype, # missing NA, diff --git a/pandas/core/api.py b/pandas/core/api.py index d8210d114e213..67e86c2076329 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -57,5 +57,3 @@ # DataFrame needs to be imported after NamedAgg to avoid a circular import from pandas.core.frame import DataFrame # isort:skip - -from pandas.core.arrays.string_arrow import ArrowStringDtype # isort:skip diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7b8275c1d0f9a..8248a3e91c0fe 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,9 +1,9 @@ -from __future__ import annotations - from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np +import pyarrow as pa +import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -22,9 +22,6 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer -if TYPE_CHECKING: - import pyarrow as pa - def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: scalar = arrow_scalar.as_py() @@ -39,7 +36,7 @@ class ArrowStringDtype(ExtensionDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 + .. versionadded:: 1.1.0 .. warning:: @@ -60,9 +57,6 @@ class ArrowStringDtype(ExtensionDtype): ArrowStringDtype """ - import pyarrow as pa - import pyarrow.compute as pc - name = "arrow_string" #: StringDtype.na_value uses pandas.NA @@ -124,7 +118,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 + .. versionadded:: 1.1.0 .. 
warning:: @@ -164,13 +158,10 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ - import pyarrow as pa - import pyarrow.compute as pc - def __init__(self, values): - if isinstance(values, self.pa.Array): - self.data = self.pa.chunked_array([values]) - elif isinstance(values, self.pa.ChunkedArray): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") @@ -179,7 +170,7 @@ def __init__(self, values): def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(cls.pa.array(scalars_corrected, type=cls.pa.string())) + return cls(pa.array(scalars_corrected, type=pa.string())) @property def dtype(self) -> ArrowStringDtype: @@ -263,7 +254,7 @@ def __getitem__(self, item): if not is_array_like(item): item = np.array(item) if len(item) == 0: - return type(self)(self.pa.chunked_array([], type=self.pa.string())) + return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): return self.take(item) elif is_bool_dtype(item): @@ -280,7 +271,7 @@ def __getitem__(self, item): raise IndexError("index out of bounds") value = self.data[item] - if isinstance(value, self.pa.ChunkedArray): + if isinstance(value, pa.ChunkedArray): return type(self)(value) else: return _as_pandas_scalar(value) @@ -332,9 +323,9 @@ def __eq__(self, other: Any) -> ArrayLike: if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = self.pc.equal(self.data, other.data) + result = pc.equal(self.data, other.data) elif is_scalar(other): - result = self.pc.equal(self.data, self.pa.scalar(other)) + result = pc.equal(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") @@ -376,10 +367,10 @@ def __setitem__(self, key, value): # Slice data and insert inbetween new_data = [ *self.data[0:key].chunks, - self.pa.array([value], type=self.pa.string()), + pa.array([value], type=pa.string()), *self.data[(key + 1) :].chunks, ] - self.data = self.pa.chunked_array(new_data) + self.data = pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. 
@@ -476,11 +467,11 @@ def take( if allow_fill: if (indices_array < 0).any(): # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = self.pa.array(indices_array, mask=indices_array < 0) + indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) if pd.isna(fill_value): return type(self)(result) - return type(self)(self.pc.fill_null(result, self.pa.scalar(fill_value))) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From 5c4217345246347c655654f8d395e5fe02f35efb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 19:46:28 +0100 Subject: [PATCH 14/78] change version for versionadded --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..1a183b5535cbf 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -36,7 +36,7 @@ class ArrowStringDtype(ExtensionDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: @@ -118,7 +118,7 @@ class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. warning:: From 28c3ef275728cd5071f9ded151d61cc919a521e7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Oct 2020 20:35:46 +0100 Subject: [PATCH 15/78] code checks --- pandas/core/arrays/string_arrow.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1a183b5535cbf..98df636f8435a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,4 +1,4 @@ -from collections.abc import Iterable +from collections import abc from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np @@ -53,7 +53,8 @@ class ArrowStringDtype(ExtensionDtype): Examples -------- - >>> pd.ArrowStringDtype() + >>> from pandas.core.arrays.string_arrow import ArrowStringDtype + >>> ArrowStringDtype() ArrowStringDtype """ @@ -223,8 +224,7 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) - def __getitem__(self, item): - # type (Any) -> Any + def __getitem__(self, item: Any) -> Any: """Select a subset of self. Parameters @@ -250,7 +250,7 @@ def __getitem__(self, item): """ item = check_array_indexer(self, item) - if isinstance(item, Iterable): + if isinstance(item, abc.Iterable): if not is_array_like(item): item = np.array(item) if len(item) == 0: @@ -332,8 +332,7 @@ def __eq__(self, other: Any) -> ArrayLike: # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) - def __setitem__(self, key, value): - # type: (Union[int, np.ndarray], Any) -> None + def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: """Set one or more values inplace. 
Parameters From 1740524f6d374d7c040be3c916eb71ba6a0e42ce Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 21 Oct 2020 20:46:11 +0100 Subject: [PATCH 16/78] skip tests for pyarrow<1.0 --- pandas/tests/extension/test_string_arrow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 437d51060fb7f..7b3f585ce2fb5 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -4,9 +4,12 @@ import pytest import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype from pandas.tests.extension import base +pytest.importorskip("pyarrow", minversion="1.0") + +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype + @pytest.fixture def dtype(): From 34bf57d259f4943db893b3ac96732a74e86a1b2c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 13:45:03 +0000 Subject: [PATCH 17/78] raise ImportError in constructors on pyarrow < 1.0.0. or not installed --- pandas/core/arrays/string_arrow.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 98df636f8435a..bb3432743663d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,9 +1,10 @@ +from __future__ import annotations + from collections import abc +from distutils.version import LooseVersion from typing import Any, Optional, Sequence, Tuple, Type, Union import numpy as np -import pyarrow as pa -import pyarrow.compute as pc from pandas._libs import missing as libmissing from pandas._typing import ArrayLike @@ -22,6 +23,16 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer +try: + import pyarrow as pa +except ImportError: + pa = None +else: + try: + import pyarrow.compute as pc + except ImportError: + pass + def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: scalar = arrow_scalar.as_py() @@ -160,6 +171,7 @@ class ArrowStringArray(ExtensionArray): """ def __init__(self, values): + self._chk_pyarrow_available() if isinstance(values, pa.Array): self.data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -167,8 +179,18 @@ def __init__(self, values): else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + @classmethod + def _chk_pyarrow_available(cls) -> None: + # TODO: maybe update import_optional_dependency to allow a minimum + # version to be specified rather than use the global minimum + if pa is None or LooseVersion(pa.__version__) < "1.0.0": + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." 
+ raise ImportError(msg) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): + cls._chk_pyarrow_available() + # TODO(ARROW-9407): Accept pd.NA in Arrow scalars_corrected = [None if pd.isna(x) else x for x in scalars] return cls(pa.array(scalars_corrected, type=pa.string())) From f92241e3341872cc6b9849d46c3aa6033267a27b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 16:43:06 +0000 Subject: [PATCH 18/78] remove size, shape and ndim --- pandas/core/arrays/string_arrow.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bb3432743663d..a0b719e5116e7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,7 @@ from collections import abc from distutils.version import LooseVersion -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Type, Union import numpy as np @@ -210,28 +210,6 @@ def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self.data - @property - def size(self) -> int: - """ - Return the number of elements in this array. - - Returns - ------- - size : int - """ - return len(self.data) - - @property - def shape(self) -> Tuple[int]: - """Return the shape of the data.""" - # This may be patched by pandas to support pseudo-2D operations. - return (len(self.data),) - - @property - def ndim(self) -> int: - """Return the number of dimensions of the underlying data.""" - return 1 - def __len__(self) -> int: """ Length of this array. From c09382d303e3097d324dfb43b824f615412750fa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Nov 2020 18:03:16 +0000 Subject: [PATCH 19/78] activate all extension array tests --- pandas/tests/extension/test_string_arrow.py | 72 ++++++++++----------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 7b3f585ce2fb5..d6c8838a55523 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -65,8 +65,8 @@ class TestConstructors(base.BaseConstructorsTests): pass -# class TestReshaping(base.BaseReshapingTests): -# pass +class TestReshaping(base.BaseReshapingTests): + pass class TestGetitem(base.BaseGetitemTests): @@ -77,52 +77,52 @@ class TestSetitem(base.BaseSetitemTests): pass -# class TestMissing(base.BaseMissingTests): -# pass +class TestMissing(base.BaseMissingTests): + pass + + +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + if op_name in ["min", "max"]: + return None -# class TestNoReduce(base.BaseNoReduceTests): -# @pytest.mark.parametrize("skipna", [True, False]) -# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): -# op_name = all_numeric_reductions -# -# if op_name in ["min", "max"]: -# return None -# -# s = pd.Series(data) -# with pytest.raises(TypeError): -# getattr(s, op_name)(skipna=skipna) + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) -# class TestMethods(base.BaseMethodsTests): -# @pytest.mark.skip(reason="returns nullable") -# def test_value_counts(self, all_data, dropna): -# return super().test_value_counts(all_data, dropna) +class 
TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) -# class TestCasting(base.BaseCastingTests): -# pass +class TestCasting(base.BaseCastingTests): + pass -# class TestComparisonOps(base.BaseComparisonOpsTests): -# def _compare_other(self, s, data, op_name, other): -# result = getattr(s, op_name)(other) -# expected = getattr(s.astype(object), op_name)(other).astype("boolean") -# self.assert_series_equal(result, expected) +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) -# def test_compare_scalar(self, data, all_compare_operators): -# op_name = all_compare_operators -# s = pd.Series(data) -# self._compare_other(s, data, op_name, "abc") + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") -# class TestParsing(base.BaseParsingTests): -# pass +class TestParsing(base.BaseParsingTests): + pass -# class TestPrinting(base.BasePrintingTests): -# pass +class TestPrinting(base.BasePrintingTests): + pass -# class TestGroupBy(base.BaseGroupbyTests): -# pass +class TestGroupBy(base.BaseGroupbyTests): + pass From bac64c10c322af2e8304b69cdc28fe091fda3300 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 13:43:02 +0000 Subject: [PATCH 20/78] string array tests --- pandas/tests/arrays/string_/test_string.py | 340 +++++++++++++++------ 1 file changed, 248 insertions(+), 92 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 089bbcf4e0e3f..30fe82758313e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -7,10 +7,46 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +skip_if_no_pyarrow = td.skip_if_no("pyarrow", min_version="1.0.0") -def test_repr(): - df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + +@pytest.fixture( + params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] +) +def dtype(request): + return request.param + + +@pytest.fixture +def dtype_object(dtype): + if dtype == "string": + return pd.StringDtype + else: + return ArrowStringDtype + + +@pytest.fixture( + params=[ + pd.arrays.StringArray, + pytest.param(ArrowStringArray, marks=skip_if_no_pyarrow), + ] +) +def cls(request): + return request.param + + +def test_repr(dtype, request): + if dtype == "arrow_string": + reason = ( + "AssertionError: assert ' A\n0 a\n1 None\n2 b' " + "== ' A\n0 a\n1 \n2 b'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected @@ -21,27 +57,36 @@ def test_repr(): assert repr(df.A.array) == expected -def test_none_to_nan(): - a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) +def test_none_to_nan(cls): + a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None assert a[1] is pd.NA -def test_setitem_validates(): - a = pd.arrays.StringArray._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="10"): - a[0] = 10 +def 
test_setitem_validates(cls): + arr = cls._from_sequence(["a", "b"]) - with pytest.raises(ValueError, match="strings"): - a[:] = np.array([1, 2]) + if cls is pd.arrays.StringArray: + msg = "Cannot set non-string value '10' into a StringArray." + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[0] = 10 + + if cls is pd.arrays.StringArray: + msg = "Must provide strings." + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + arr[:] = np.array([1, 2]) -def test_setitem_with_scalar_string(): +def test_setitem_with_scalar_string(dtype): # is_float_dtype considers some strings, like 'd', to be floats # which can cause issues. - arr = pd.array(["a", "c"], dtype="string") + arr = pd.array(["a", "c"], dtype=dtype) arr[0] = "d" - expected = pd.array(["d", "c"], dtype="string") + expected = pd.array(["d", "c"], dtype=dtype) tm.assert_extension_array_equal(arr, expected) @@ -53,46 +98,64 @@ def test_setitem_with_scalar_string(): (["a b", "a bc. de"], operator.methodcaller("capitalize")), ], ) -def test_string_methods(input, method): - a = pd.Series(input, dtype="string") +def test_string_methods(input, method, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(input, dtype=dtype) b = pd.Series(input, dtype="object") result = method(a.str) expected = method(b.str) - assert result.dtype.name == "string" + assert result.dtype.name == dtype tm.assert_series_equal(result.astype(object), expected) -def test_astype_roundtrip(): +def test_astype_roundtrip(dtype): s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None - result = s.astype("string").astype("datetime64[ns]") + result = s.astype(dtype).astype("datetime64[ns]") tm.assert_series_equal(result, s) -def test_add(): - a = pd.Series(["a", "b", "c", None, None], dtype="string") - b = pd.Series(["x", "y", None, "z", None], dtype="string") +def test_add(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' and " + "'ArrowStringArray'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.Series(["a", "b", "c", None, None], dtype=dtype) + b = pd.Series(["x", "y", None, "z", None], dtype=dtype) result = a + b - expected = pd.Series(["ax", "by", None, None, None], dtype="string") + expected = pd.Series(["ax", "by", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype="string") + expected = pd.Series(["xa", "yb", None, None, None], dtype=dtype) tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype=dtype) tm.assert_series_equal(result, expected) -def test_add_2d(): - a = pd.array(["a", "b", "c"], dtype="string") +def test_add_2d(dtype, request): + if dtype == "arrow_string": + reason = "Failed: DID NOT RAISE " + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", "c"], dtype=dtype) b = np.array([["a", "b", "c"]], dtype=object) with pytest.raises(ValueError, match="3 != 1"): a + b @@ -102,23 +165,38 @@ def test_add_2d(): s + b -def test_add_sequence(): - a = 
pd.array(["a", "b", None, None], dtype="string") +def test_add_sequence(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for +: 'ArrowStringArray' " + "and 'list'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None, None], dtype=dtype) other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype="string") + expected = pd.array(["ax", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype="string") + expected = pd.array(["xa", None, None, None], dtype=dtype) tm.assert_extension_array_equal(result, expected) -def test_mul(): - a = pd.array(["a", "b", None], dtype="string") +def test_mul(dtype, request): + if dtype == "arrow_string": + reason = ( + "TypeError: unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + a = pd.array(["a", "b", None], dtype=dtype) result = a * 2 - expected = pd.array(["aa", "bb", None], dtype="string") + expected = pd.array(["aa", "bb", None], dtype=dtype) tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -126,40 +204,51 @@ def test_mul(): @pytest.mark.xfail(reason="GH-28527") -def test_add_strings(): - array = pd.array(["a", "b", "c", "d"], dtype="string") +def test_add_strings(dtype): + array = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "u", "v", "w"]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype(dtype) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") -def test_add_frame(): - array = pd.array(["a", "b", np.nan, np.nan], dtype="string") +def test_add_frame(dtype): + array = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators): +def test_comparison_methods_scalar(all_compare_operators, dtype, request): + if dtype == "arrow_string": + if all_compare_operators in ["__eq__", "__ne__"]: + reason = ( + "pyarrow.lib.ArrowInvalid: Could not convert with type NAType: " + "did not recognize Python value type when inferring an Arrow data type" + ) + else: + reason = "AssertionError: left is not an ExtensionArray" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) @@ -171,10 +260,18 @@ 
def test_comparison_methods_scalar(all_compare_operators): tm.assert_extension_array_equal(result, expected) -def test_comparison_methods_array(all_compare_operators): +def test_comparison_methods_array(all_compare_operators, dtype, request): + if dtype == "arrow_string": + if all_compare_operators in ["__eq__", "__ne__"]: + reason = "NotImplementedError: Neither scalar nor ArrowStringArray" + else: + reason = "AssertionError: left is not an ExtensionArray" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype="string") + a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) expected = np.empty_like(a, dtype="object") @@ -187,30 +284,43 @@ def test_comparison_methods_array(all_compare_operators): tm.assert_extension_array_equal(result, expected) -def test_constructor_raises(): - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) +def test_constructor_raises(cls): + if cls is pd.arrays.StringArray: + msg = "StringArray requires a sequence of strings or pandas.NA" + else: + msg = "Unsupported type '' for ArrowStringArray" + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", "b"], dtype="S1")) - with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match=msg): + cls(np.array([])) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.nan], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", None], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match="strings or pandas.NA"): - pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequence_no_mutate(copy): +def test_from_sequence_no_mutate(copy, cls, request): + if cls is ArrowStringArray: + reason = ( + "ValueError: Unsupported type '' for " + "ArrowStringArray" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + nan_arr = np.array(["a", np.nan], dtype=object) na_arr = np.array(["a", pd.NA], dtype=object) - result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) - expected = pd.arrays.StringArray(na_arr) + result = cls._from_sequence(nan_arr, copy=copy) + expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) @@ -218,8 +328,13 @@ def test_from_sequence_no_mutate(copy): tm.assert_numpy_array_equal(nan_arr, expected) -def test_astype_int(): - arr = pd.array(["1", pd.NA, "3"], dtype="string") +def test_astype_int(dtype, request): + if dtype == "arrow_string": + reason = "TypeError: Cannot interpret 'Int64Dtype()' as a data type" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) result = arr.astype("Int64") expected = pd.array([1, pd.NA, 3], dtype="Int64") @@ -228,16 +343,21 @@ def test_astype_int(): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce(skipna): - arr = pd.Series(["a", "b", "c"], 
dtype="string") +def test_reduce(skipna, dtype): + arr = pd.Series(["a", "b", "c"], dtype=dtype) result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_min_max(method, skipna): - arr = pd.Series(["a", "b", "c", None], dtype="string") +def test_min_max(method, skipna, dtype, request): + if dtype == "arrow_string": + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.Series(["a", "b", "c", None], dtype=dtype) result = getattr(arr, method)(skipna=skipna) if skipna: expected = "a" if method == "min" else "c" @@ -247,14 +367,20 @@ def test_min_max(method, skipna): @pytest.mark.parametrize("method", ["min", "max"]) -@pytest.mark.parametrize( - "arr", - [ - pd.Series(["a", "b", "c", None], dtype="string"), - pd.array(["a", "b", "c", None], dtype="string"), - ], -) -def test_min_max_numpy(method, arr): +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_min_max_numpy(method, box, dtype, request): + if dtype == "arrow_string": + if box is pd.array: + reason = ( + "TypeError: '<=' not supported between instances of 'str' and " + "'NoneType'" + ) + else: + reason = "AttributeError: 'ArrowStringArray' object has no attribute 'max'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = box(["a", "b", "c", None], dtype=dtype) result = getattr(np, method)(arr) expected = "a" if method == "min" else "c" assert result == expected @@ -262,8 +388,8 @@ def test_min_max_numpy(method, arr): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") -def test_reduce_missing(skipna): - arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") +def test_reduce_missing(skipna, dtype): + arr = pd.Series([None, "a", None, "b", "c", None], dtype=dtype) result = arr.sum(skipna=skipna) if skipna: assert result == "abc" @@ -272,34 +398,49 @@ def test_reduce_missing(skipna): @td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(): +def test_arrow_array(dtype, request): # protocol added in 0.15.0 import pyarrow as pa - data = pd.array(["a", "b", "c"], dtype="string") + if dtype == "arrow_string": + reason = ( + "TypeError: Argument 'other' has incorrect type " + "(expected pyarrow.lib.ChunkedArray, got pyarrow.lib.StringArray)" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) assert arr.equals(expected) @td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(): +def test_arrow_roundtrip(dtype, dtype_object): # roundtrip possible from arrow 1.0.0 import pyarrow as pa - data = pd.array(["a", "b", None], dtype="string") + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == "string" result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.StringDtype) + assert isinstance(result["a"].dtype, dtype_object) tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA -def test_value_counts_na(): - arr = pd.array(["a", "b", "a", pd.NA], dtype="string") +def test_value_counts_na(dtype, request): + if dtype == "arrow_string": + reason = ( + 
"AttributeError: 'ArrowStringArray' object has no attribute 'value_counts'" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False) expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64") tm.assert_series_equal(result, expected) @@ -312,12 +453,13 @@ def test_value_counts_na(): @pytest.mark.parametrize( "values, expected", [ - (pd.array(["a", "b", "c"]), np.array([False, False, False])), - (pd.array(["a", "b", None]), np.array([False, False, True])), + (["a", "b", "c"], np.array([False, False, False])), + (["a", "b", None], np.array([False, False, True])), ], ) -def test_use_inf_as_na(values, expected): +def test_use_inf_as_na(values, expected, dtype): # https://github.com/pandas-dev/pandas/issues/33655 + values = pd.array(values, dtype=dtype) with pd.option_context("mode.use_inf_as_na", True): result = values.isna() tm.assert_numpy_array_equal(result, expected) @@ -331,17 +473,31 @@ def test_use_inf_as_na(values, expected): tm.assert_frame_equal(result, expected) -def test_memory_usage(): +def test_memory_usage(dtype, request): # GH 33963 - series = pd.Series(["a", "b", "c"], dtype="string") + + if dtype == "arrow_string": + reason = "assert 147 < 147" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + series = pd.Series(["a", "b", "c"], dtype=dtype) assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) -@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) -def test_astype_from_float_dtype(dtype): +@pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(float_dtype, dtype, request): # https://github.com/pandas-dev/pandas/issues/36451 - s = pd.Series([0.1], dtype=dtype) - result = s.astype("string") - expected = pd.Series(["0.1"], dtype="string") + + if dtype == "arrow_string": + reason = ( + "pyarrow.lib.ArrowTypeError: Expected bytes, got a 'numpy.float64' object" + ) + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + s = pd.Series([0.1], dtype=float_dtype) + result = s.astype(dtype) + expected = pd.Series(["0.1"], dtype=dtype) tm.assert_series_equal(result, expected) From 0956147a8e2a3d1f459f88a753e319f9c34206a0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 14:43:28 +0000 Subject: [PATCH 21/78] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a0b719e5116e7..276f4e29ca946 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -202,7 +202,7 @@ def dtype(self) -> ArrowStringDtype: """ return ArrowStringDtype() - def __array__(self, *args, **kwargs) -> "np.ndarray": + def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.data.__array__(*args, **kwargs) From 963e1cf2b82e5c49ee86ee3cb46fe27e461739c6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 15:03:20 +0000 Subject: [PATCH 22/78] add a to_numpy() method and use from __array__ --- pandas/core/arrays/string_arrow.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 
276f4e29ca946..52f9f523cda2a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,7 +6,7 @@ import numpy as np -from pandas._libs import missing as libmissing +from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike from pandas.core.dtypes.base import ExtensionDtype @@ -204,12 +204,21 @@ def dtype(self) -> ArrowStringDtype: def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.data.__array__(*args, **kwargs) + return self.to_numpy(dtype=dtype) def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" return self.data + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. + """ + # TODO: copy and na_value arguments are ignored + return self.data.__array__(dtype=dtype) + def __len__(self) -> int: """ Length of this array. From 87b8e679374456979d6a74683f8c1641532000d3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 16:00:44 +0000 Subject: [PATCH 23/78] mypy fixup --- pandas/tests/arrays/string_/test_string.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 30fe82758313e..8f2c1171deac1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -13,7 +13,15 @@ @pytest.fixture( - params=["string", pytest.param("arrow_string", marks=skip_if_no_pyarrow)] + params=[ + # pandas\tests\arrays\string_\test_string.py:16: error: List item 1 has + # incompatible type "ParameterSet"; expected + # "Sequence[Collection[object]]" [list-item] + "string", + pytest.param( + "arrow_string", marks=skip_if_no_pyarrow + ), # type:ignore[list-item] + ] ) def dtype(request): return request.param From 1ed0585e79449ba524a5a64cd1abef23d212983c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 3 Nov 2020 16:32:00 +0000 Subject: [PATCH 24/78] remove workaround for ARROW-9407 and ci test on pyarrow=1.0.0 --- ci/deps/azure-38-locale.yaml | 2 +- pandas/core/arrays/string_arrow.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index 8ce58e07a8542..f879111a32e67 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -34,7 +34,7 @@ dependencies: - xlsxwriter - xlwt - moto - - pyarrow>=0.15 + - pyarrow=1.0.0 - pip - pip: - pyxlsb diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 52f9f523cda2a..719dc39bd3515 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -190,10 +190,7 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() - - # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if pd.isna(x) else x for x in scalars] - return cls(pa.array(scalars_corrected, type=pa.string())) + return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property def dtype(self) -> ArrowStringDtype: From 82b84bfbe1de64b57af3aeaf476660199517c896 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 12:46:36 +0000 Subject: [PATCH 25/78] add _dtype class attribute --- pandas/core/arrays/string_arrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 719dc39bd3515..ba04bff2efa8c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -170,6 +170,8 @@ class ArrowStringArray(ExtensionArray): Length: 4, dtype: arrow_string """ + _dtype = ArrowStringDtype() + def __init__(self, values): self._chk_pyarrow_available() if isinstance(values, pa.Array): @@ -197,7 +199,7 @@ def dtype(self) -> ArrowStringDtype: """ An instance of 'ArrowStringDtype'. """ - return ArrowStringDtype() + return self._dtype def __array__(self, dtype=None) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" From b1a3032e250b6df35bfa4578fbeb5397b2cdebf3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 13:16:11 +0000 Subject: [PATCH 26/78] remove redundant integer indexing OOB and negative indexing checks in __getitem__ --- pandas/core/arrays/string_arrow.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ba04bff2efa8c..91938172b540f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -272,11 +272,6 @@ def __getitem__(self, item: Any) -> Any: "Only integers, slices and integer or " "boolean arrays are valid indices." ) - elif is_integer(item): - if item < 0: - item += len(self) - if item >= len(self): - raise IndexError("index out of bounds") value = self.data[item] if isinstance(value, pa.ChunkedArray): From 08d34f406433d50fabef3d62c340369b2a724a5f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 15:56:03 +0000 Subject: [PATCH 27/78] check pyarrow array is string type in constructor --- pandas/core/arrays/string_arrow.py | 5 +++++ pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 91938172b540f..6d7abd3d0b156 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -181,6 +181,11 @@ def __init__(self, values): else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + if not pa.types.is_string(self.data.type): + raise ValueError( + "ArrowStringArray requires an array of strings or pandas.NA" + ) + @classmethod def _chk_pyarrow_available(cls) -> None: # TODO: maybe update import_optional_dependency to allow a minimum diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8f2c1171deac1..6f500632f9030 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -314,6 +314,22 @@ def test_constructor_raises(cls): cls(np.array(["a", pd.NaT], dtype=object)) +@td.skip_if_no("pyarrow", min_version="1.0.0") +def test_constructor_pyarrow_not_string_raises(cls): + import pyarrow as pa + + if cls is pd.arrays.StringArray: + msg = "'values' must be a NumPy array" + else: + msg = "ArrowStringArray requires an array of strings or pandas.NA" + + with pytest.raises(ValueError, match=msg): + cls(pa.array([1, 2, 3])) + + with pytest.raises(ValueError, match=msg): + cls(pa.chunked_array(pa.array([1, 2, 3]))) + + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray: From ae49807ed406b622667a1896177dc9e560d074f2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 21:24:51 +0000 Subject: [PATCH 28/78] basic 
_from_factorized pending discussion on performant factorisation --- pandas/core/arrays/string_arrow.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6d7abd3d0b156..4b58356f1c5a9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,6 +237,10 @@ def __len__(self) -> int: def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) + @classmethod + def _from_factorized(cls, values, original): + return cls._from_sequence(values) + def __getitem__(self, item: Any) -> Any: """Select a subset of self. From 2e5d4c746361f64847458a0e979cdb83536c26ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 21:47:43 +0000 Subject: [PATCH 29/78] update constructor error message and move test --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 16 ---------------- .../tests/arrays/string_/test_string_arrow.py | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 17 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4b58356f1c5a9..9192e044a7c77 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -183,7 +183,7 @@ def __init__(self, values): if not pa.types.is_string(self.data.type): raise ValueError( - "ArrowStringArray requires an array of strings or pandas.NA" + "ArrowStringArray requires a PyArrow (chunked) array of string type" ) @classmethod diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6f500632f9030..8f2c1171deac1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -314,22 +314,6 @@ def test_constructor_raises(cls): cls(np.array(["a", pd.NaT], dtype=object)) -@td.skip_if_no("pyarrow", min_version="1.0.0") -def test_constructor_pyarrow_not_string_raises(cls): - import pyarrow as pa - - if cls is pd.arrays.StringArray: - msg = "'values' must be a NumPy array" - else: - msg = "ArrowStringArray requires an array of strings or pandas.NA" - - with pytest.raises(ValueError, match=msg): - cls(pa.array([1, 2, 3])) - - with pytest.raises(ValueError, match=msg): - cls(pa.chunked_array(pa.array([1, 2, 3]))) - - @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): if cls is ArrowStringArray: diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..c0589cc96a95f --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,19 @@ +import re + +import pytest + +from pandas.core.arrays.string_arrow import ArrowStringArray + +pa = pytest.importorskip("pyarrow", minversion="1.0.0") + + +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_not_string_type_raises(chunked): + arr = pa.array([1, 2, 3]) + if chunked: + arr = pa.chunked_array(arr) + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) + with pytest.raises(ValueError, match=msg): + ArrowStringArray(arr) From c8318cc5a11c63e71b929ce4f68f9a8d30eafff5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 22:40:53 +0000 Subject: [PATCH 30/78] add _concat_same_type classmethod --- 
pandas/core/arrays/string_arrow.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9192e044a7c77..1b0cd0a37eb7e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -241,6 +241,25 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return cls._from_sequence(values) + @classmethod + def _concat_same_type(cls, to_concat) -> ArrowStringArray: + """ + Concatenate multiple ArrowStringArray. + + Parameters + ---------- + to_concat : sequence of ArrowStringArray + + Returns + ------- + ArrowStringArray + """ + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) + def __getitem__(self, item: Any) -> Any: """Select a subset of self. From 1a200a2b00dc25e57dce1a7f59fa40ab6fb1ae27 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 22:47:11 +0000 Subject: [PATCH 31/78] _as_pandas_scalar to method --- pandas/core/arrays/string_arrow.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1b0cd0a37eb7e..9780db50c9b45 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,7 @@ from collections import abc from distutils.version import LooseVersion -from typing import Any, Optional, Sequence, Type, Union +from typing import Any, Sequence, Type, Union import numpy as np @@ -34,14 +34,6 @@ pass -def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: - scalar = arrow_scalar.as_py() - if scalar is None: - return libmissing.NA - else: - return scalar - - @register_extension_dtype class ArrowStringDtype(ExtensionDtype): """ @@ -305,7 +297,14 @@ def __getitem__(self, item: Any) -> Any: if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - return _as_pandas_scalar(value) + return self._as_pandas_scalar(value) + + def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): + scalar = arrow_scalar.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar def fillna(self, value=None, method=None, limit=None): raise NotImplementedError("fillna") From e10be804ec30688c76c6deac9a69768a56973ec7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 10:08:01 +0000 Subject: [PATCH 32/78] copy/paste fillna from fletcher as baseline (29 failed) --- pandas/core/arrays/string_arrow.py | 59 +++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9780db50c9b45..e99fb58446548 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -307,7 +307,64 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): return scalar def fillna(self, value=None, method=None, limit=None): - raise NotImplementedError("fillna") + """Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. 
+ method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + filled : ExtensionArray with NA/NaN filled + """ + from pandas.api.types import is_array_like + from pandas.util._validators import validate_fillna_kwargs + import pandas.core.missing as pd_missing + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + "Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self)) + ) + value = value[mask] + + if mask.any(): + if method is not None: + # pandas 1.2+ doesn't expose pad_1d anymore + if not hasattr(pd_missing, "pad_1d"): + func = pd_missing.get_fill_func(method) + else: + func = ( + pd_missing.pad_1d if method == "pad" else pd_missing.backfill_1d + ) + new_values = func(self.astype(object), limit=limit, mask=mask) + new_values = self._from_sequence(new_values, self._dtype.arrow_dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values def _reduce(self, name, skipna=True, **kwargs): if name in ["min", "max"]: From c1d308739aff23679979c6390651d407b4bdc0a2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 10:24:15 +0000 Subject: [PATCH 33/78] minor cleanup of fillna (29 failed) --- pandas/core/arrays/string_arrow.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e99fb58446548..4d0978eee87df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -8,6 +8,7 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype @@ -307,7 +308,8 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): return scalar def fillna(self, value=None, method=None, limit=None): - """Fill NA/NaN values using the specified method. + """ + Fill NA/NaN values using the specified method. 
Parameters ---------- @@ -331,9 +333,6 @@ def fillna(self, value=None, method=None, limit=None): ------- filled : ExtensionArray with NA/NaN filled """ - from pandas.api.types import is_array_like - from pandas.util._validators import validate_fillna_kwargs - import pandas.core.missing as pd_missing value, method = validate_fillna_kwargs(value, method) @@ -349,15 +348,9 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: - # pandas 1.2+ doesn't expose pad_1d anymore - if not hasattr(pd_missing, "pad_1d"): - func = pd_missing.get_fill_func(method) - else: - func = ( - pd_missing.pad_1d if method == "pad" else pd_missing.backfill_1d - ) + func = libmissing.pad_1d if method == "pad" else libmissing.backfill_1d new_values = func(self.astype(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values, self._dtype.arrow_dtype) + new_values = self._from_sequence(new_values) else: # fill with value new_values = self.copy() From 34f563dcaca1bee86dee8656af95aa8a7ded834e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 11:21:48 +0000 Subject: [PATCH 34/78] correct mistake in previous commit (25 failed) --- pandas/core/arrays/string_arrow.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d0978eee87df..65def9efb4a9d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -23,6 +23,7 @@ ) from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer +from pandas.core.missing import get_fill_func try: import pyarrow as pa @@ -341,14 +342,14 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): raise ValueError( - "Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self)) + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" ) value = value[mask] if mask.any(): if method is not None: - func = libmissing.pad_1d if method == "pad" else libmissing.backfill_1d + func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: From f5fc4fd967fa752982b80f996c21740eccd59c3a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 12:15:28 +0000 Subject: [PATCH 35/78] add OpsMixin (23 failed) --- pandas/core/arrays/string_arrow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 65def9efb4a9d..c48f1c62b65e9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -7,7 +7,6 @@ import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype @@ -21,6 +20,7 @@ is_integer_dtype, is_scalar, ) +from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer from pandas.core.missing import get_fill_func @@ -120,7 +120,7 @@ def __eq__(self, other) -> bool: return False -class ArrowStringArray(ExtensionArray): +class ArrowStringArray(OpsMixin, ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. 
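
Once ``ArrowStringArray`` inherits from ``OpsMixin`` (first hunk above), the comparison dunders (``__eq__``, ``__ne__``, ``__lt__`` and friends) are supplied by the mixin and all funnel into the single ``_cmp_method`` hook that the next hunk introduces, so the array only has to map each operator name onto the matching ``pyarrow.compute`` kernel. Below is a minimal sketch of that dispatch pattern; it is illustrative only: the class name is invented for the example and the dunders are written out by hand rather than generated by ``OpsMixin``.

    import operator

    import pyarrow as pa
    import pyarrow.compute as pc


    class CmpDispatchSketch:
        """Toy stand-in showing how an OpsMixin-style class funnels all
        comparison operators into one _cmp_method hook (illustrative only)."""

        def __init__(self, values):
            self.data = pa.chunked_array([pa.array(values, type=pa.string())])

        def _cmp_method(self, other, op):
            # Translate the Python operator into the matching Arrow compute kernel.
            kernels = {
                "eq": pc.equal,
                "ne": pc.not_equal,
                "lt": pc.less,
                "gt": pc.greater,
                "le": pc.less_equal,
                "ge": pc.greater_equal,
            }
            pc_func = kernels[op.__name__]
            if isinstance(other, str):
                # Nulls propagate: comparing a null slot yields a null result.
                return pc_func(self.data, pa.scalar(other))
            return NotImplemented

        # OpsMixin generates these dunders for the real array; they are written
        # out by hand here so the sketch stays self-contained.
        def __eq__(self, other):
            return self._cmp_method(other, operator.eq)

        def __lt__(self, other):
            return self._cmp_method(other, operator.lt)


    arr = CmpDispatchSketch(["a", "b", None])
    print(arr == "a")  # boolean ChunkedArray: true, false, null

Returning ``NotImplemented`` for operand types the array does not handle lets Python fall back to the reflected operation, the same convention the real ``_cmp_method`` adopts in a later patch in this series.
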
@@ -397,10 +397,10 @@ def copy(self) -> ExtensionArray: """ return type(self)(self.data) - def __eq__(self, other: Any) -> ArrayLike: - """ - Return for `self == other` (element-wise equality). - """ + def _cmp_method(self, other, op): + if op.__name__ != "eq": + return NotImplemented + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): From a5a7c85b2fb12aa8d911224d3b3d972be39e245b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 12:41:44 +0000 Subject: [PATCH 36/78] add binops (18 failed) --- pandas/core/arrays/string_arrow.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c48f1c62b65e9..7c507c037654a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -398,15 +398,21 @@ def copy(self) -> ExtensionArray: return type(self)(self.data) def _cmp_method(self, other, op): - if op.__name__ != "eq": - return NotImplemented - + ops = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + op = ops[op.__name__] if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = pc.equal(self.data, other.data) + result = op(self.data, other.data) elif is_scalar(other): - result = pc.equal(self.data, pa.scalar(other)) + result = op(self.data, pa.scalar(other)) else: raise NotImplementedError("Neither scalar nor ArrowStringArray") From f651563571074af5d9cd8ed2f572431e1eaf3daa Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 13:29:45 +0000 Subject: [PATCH 37/78] return Boolean array for comparison ops (12 failed) --- pandas/core/arrays/string_arrow.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7c507c037654a..b1798eda3361f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,6 +2,7 @@ from collections import abc from distutils.version import LooseVersion +import operator from typing import Any, Sequence, Type, Union import numpy as np @@ -406,15 +407,25 @@ def _cmp_method(self, other, op): "le": pc.less_equal, "ge": pc.greater_equal, } - op = ops[op.__name__] + pc_func = ops[op.__name__] if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): return NotImplemented if isinstance(other, ArrowStringArray): - result = op(self.data, other.data) + result = pc_func(self.data, other.data) elif is_scalar(other): - result = op(self.data, pa.scalar(other)) + result = pc_func(self.data, pa.scalar(other)) else: - raise NotImplementedError("Neither scalar nor ArrowStringArray") + rops = { + "eq": operator.eq, + "ne": operator.ne, + "lt": operator.gt, + "gt": operator.lt, + "le": operator.ge, + "ge": operator.le, + } + rop = rops[op.__name__] + result = rop(other, self) + return pd.array(result, dtype="boolean") # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return pd.array(result.to_pandas().values) From f5419b92cbf9f51accf48b8167ef77cad22930ea Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 14:13:40 +0000 Subject: [PATCH 38/78] fix ValueError: zero-size array to reduction operation maximum which has no identity (6 failed) --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b1798eda3361f..4d9a4f4a0ba6f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -558,7 +558,7 @@ def take( if len(self.data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.max() >= len(self.data): + if indices_array.size > 0 and indices_array.max() >= len(self.data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: From 3af5ce023310171e2c7d0153fe5b5c1c63b39ca7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 14:58:49 +0000 Subject: [PATCH 39/78] copy/paste value_counts from fletcher as baseline (5 failed) --- pandas/core/arrays/string_arrow.py | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d9a4f4a0ba6f..0122d80904dd0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -3,7 +3,7 @@ from collections import abc from distutils.version import LooseVersion import operator -from typing import Any, Sequence, Type, Union +from typing import TYPE_CHECKING, Any, Sequence, Type, Union import numpy as np @@ -36,6 +36,9 @@ except ImportError: pass +if TYPE_CHECKING: + from pandas import Series + @register_extension_dtype class ArrowStringDtype(ExtensionDtype): @@ -579,3 +582,32 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + vc = self.data.value_counts() + + # Index cannot hold ExtensionArrays yet + index = pd.Index(type(self)(vc.field(0)).astype(object)) + # No missings, so we can adhere to the interface and return a numpy array. 
+ counts = np.array(vc.field(1)) + + if dropna and self.data.null_count > 0: + raise NotImplementedError("yo") + + return pd.Series(counts, index=index) From bdf4ad2abc302411343f06b52faa76c396b0a84f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 5 Nov 2020 15:19:33 +0000 Subject: [PATCH 40/78] tidy imports --- pandas/core/arrays/string_arrow.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0122d80904dd0..4f548252601fd 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -12,8 +12,9 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, @@ -402,6 +403,8 @@ def copy(self) -> ExtensionArray: return type(self)(self.data) def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray + ops = { "eq": pc.equal, "ne": pc.not_equal, @@ -411,7 +414,7 @@ def _cmp_method(self, other, op): "ge": pc.greater_equal, } pc_func = ops[op.__name__] - if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): return NotImplemented if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) @@ -428,10 +431,10 @@ def _cmp_method(self, other, op): } rop = rops[op.__name__] result = rop(other, self) - return pd.array(result, dtype="boolean") + return BooleanArray._from_sequence(result) # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return pd.array(result.to_pandas().values) + return BooleanArray._from_sequence(result.to_pandas().values) def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: """Set one or more values inplace. @@ -457,9 +460,9 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: key = check_array_indexer(self, key) if is_integer(key): - if not pd.api.types.is_scalar(value): + if not is_scalar(value): raise ValueError("Must pass scalars with scalar indexer") - elif pd.isna(value): + elif isna(value): value = None elif not isinstance(value, str): raise ValueError("Scalar must be NA or str") @@ -486,7 +489,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) - if pd.api.types.is_scalar(value): + if is_scalar(value): value = np.broadcast_to(value, len(key_array)) else: value = np.asarray(value) @@ -569,7 +572,7 @@ def take( # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -600,14 +603,16 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ + from pandas import Index, Series + vc = self.data.value_counts() # Index cannot hold ExtensionArrays yet - index = pd.Index(type(self)(vc.field(0)).astype(object)) + index = Index(type(self)(vc.field(0)).astype(object)) # No missings, so we can adhere to the interface and return a numpy array. 
counts = np.array(vc.field(1)) if dropna and self.data.null_count > 0: raise NotImplementedError("yo") - return pd.Series(counts, index=index) + return Series(counts, index=index) From e044c7f763b76ac64a5a53cd4af740cf04a21730 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 12:34:29 +0000 Subject: [PATCH 41/78] fix test_take_non_na_fill_value (4 failed) --- pandas/core/arrays/string_arrow.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4f548252601fd..bda3036c8dae4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -574,7 +574,12 @@ def take( result = self.data.take(indices_array) if isna(fill_value): return type(self)(result) - return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[result.isna()] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill return type(self)(self.data.take(indices)) From c5625a891f41092b1f1b51b51dd2be192e1b9599 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 12:56:50 +0000 Subject: [PATCH 42/78] fix test_take_pandas_style_negative_raises (3 failed) --- pandas/core/arrays/string_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bda3036c8dae4..24d498520410b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -24,7 +24,7 @@ ) from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import check_array_indexer, validate_indices from pandas.core.missing import get_fill_func try: @@ -569,6 +569,7 @@ def take( if allow_fill: if (indices_array < 0).any(): + validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) From 50889fbd2a519a1e61dd1e0fb114ba9477c64b02 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 13:36:58 +0000 Subject: [PATCH 43/78] parametrize string extension tests (3 failed) --- pandas/tests/extension/test_string.py | 47 ++++--- pandas/tests/extension/test_string_arrow.py | 128 -------------------- 2 files changed, 32 insertions(+), 143 deletions(-) delete mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27a157d2127f6..18659a0e48cd5 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -3,39 +3,49 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd -from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ArrowStringDtype from pandas.tests.extension import base -@pytest.fixture -def dtype(): - return StringDtype() +@pytest.fixture( + params=[ + StringDtype, + pytest.param( + ArrowStringDtype, marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def dtype(request): + return request.param() @pytest.fixture -def data(): 
+def data(dtype): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return StringArray._from_sequence(strings) + return dtype.construct_array_type()._from_sequence(strings) @pytest.fixture -def data_missing(): +def data_missing(dtype): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([pd.NA, "A"]) + return dtype.construct_array_type()._from_sequence([pd.NA, "A"]) @pytest.fixture -def data_for_sorting(): - return StringArray._from_sequence(["B", "C", "A"]) +def data_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", "C", "A"]) @pytest.fixture -def data_missing_for_sorting(): - return StringArray._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(dtype): + return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) @pytest.fixture @@ -44,8 +54,10 @@ def na_value(): @pytest.fixture -def data_for_grouping(): - return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) +def data_for_grouping(dtype): + return dtype.construct_array_type()._from_sequence( + ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] + ) class TestDtype(base.BaseDtypeTests): @@ -53,7 +65,12 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - pass + def test_view(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseInterfaceTests.test_view(self, data) class TestConstructors(base.BaseConstructorsTests): diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py deleted file mode 100644 index d6c8838a55523..0000000000000 --- a/pandas/tests/extension/test_string_arrow.py +++ /dev/null @@ -1,128 +0,0 @@ -import string - -import numpy as np -import pytest - -import pandas as pd -from pandas.tests.extension import base - -pytest.importorskip("pyarrow", minversion="1.0") - -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype - - -@pytest.fixture -def dtype(): - return ArrowStringDtype() - - -@pytest.fixture -def data(): - strings = np.random.choice(list(string.ascii_letters), size=100) - while strings[0] == strings[1]: - strings = np.random.choice(list(string.ascii_letters), size=100) - - return ArrowStringArray._from_sequence(strings) - - -@pytest.fixture -def data_missing(): - """Length 2 array with [NA, Valid]""" - return ArrowStringArray._from_sequence([pd.NA, "A"]) - - -@pytest.fixture -def data_for_sorting(): - return ArrowStringArray._from_sequence(["B", "C", "A"]) - - -@pytest.fixture -def data_missing_for_sorting(): - return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) - - -@pytest.fixture -def na_value(): - return pd.NA - - -@pytest.fixture -def data_for_grouping(): - return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - @pytest.mark.xfail(reason="Fails until implement, remove before merge") - def test_view(self, data): - base.BaseInterfaceTests.test_view(self, data) - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestSetitem(base.BaseSetitemTests): - pass - - -class 
TestMissing(base.BaseMissingTests): - pass - - -class TestNoReduce(base.BaseNoReduceTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - op_name = all_numeric_reductions - - if op_name in ["min", "max"]: - return None - - s = pd.Series(data) - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - - -class TestMethods(base.BaseMethodsTests): - @pytest.mark.skip(reason="returns nullable") - def test_value_counts(self, all_data, dropna): - return super().test_value_counts(all_data, dropna) - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): - result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other).astype("boolean") - self.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - s = pd.Series(data) - self._compare_other(s, data, op_name, "abc") - - -class TestParsing(base.BaseParsingTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestGroupBy(base.BaseGroupbyTests): - pass From 0e1773bde82c6e1e85d8a52a53434150e1e6efbe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 14:19:28 +0000 Subject: [PATCH 44/78] xfail other 2 tests expecting views (1 failed) --- pandas/tests/extension/test_string.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 18659a0e48cd5..3f95117b238a7 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -78,7 +78,12 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_transpose(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseReshapingTests.test_transpose(self, data) class TestGetitem(base.BaseGetitemTests): @@ -86,7 +91,12 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - pass + def test_setitem_preserves_views(self, data, dtype, request): + if isinstance(dtype, ArrowStringDtype): + reason = "Fails until implement, remove before merge" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + base.BaseSetitemTests.test_setitem_preserves_views(self, data) class TestMissing(base.BaseMissingTests): From 7bb9574792b7afedc1a3eb698e798f6a002d991b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 6 Nov 2020 15:11:15 +0000 Subject: [PATCH 45/78] add ensure_string_array to _from_sequence (1 failed) --- pandas/_libs/lib.pyx | 2 +- pandas/core/arrays/string_arrow.py | 4 ++++ pandas/core/dtypes/cast.py | 6 +++++- pandas/tests/arrays/string_/test_string.py | 10 +--------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0b0334d52c1e9..6abf9c06f7289 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -634,7 +634,7 @@ cpdef ndarray[object] ensure_string_array( ---------- arr : array-like The values to be converted to str, if needed. - na_value : Any + na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. 
convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 24d498520410b..c19884a091411 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -196,6 +196,10 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() + # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value + scalars = lib.ensure_string_array( + scalars, na_value=cls._dtype.na_value, copy=copy + ) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 692da8f8e021e..c2f28b17e7227 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -360,13 +360,17 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg # Everything can be be converted to StringArrays, but we may not want to convert - if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": + if ( + issubclass(cls, (StringArray, ArrowStringArray)) + and lib.infer_dtype(obj) != "string" + ): return obj try: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8f2c1171deac1..250af3ab13d3a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -495,16 +495,8 @@ def test_memory_usage(dtype, request): @pytest.mark.parametrize("float_dtype", [np.float16, np.float32, np.float64]) -def test_astype_from_float_dtype(float_dtype, dtype, request): +def test_astype_from_float_dtype(float_dtype, dtype): # https://github.com/pandas-dev/pandas/issues/36451 - - if dtype == "arrow_string": - reason = ( - "pyarrow.lib.ArrowTypeError: Expected bytes, got a 'numpy.float64' object" - ) - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - s = pd.Series([0.1], dtype=float_dtype) result = s.astype(dtype) expected = pd.Series(["0.1"], dtype=dtype) From 51d7d0a9fc1786732622ab684dbb8666bb1026c1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 15:44:56 +0000 Subject: [PATCH 46/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c19884a091411..4d8879a781c48 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -198,7 +198,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value scalars = lib.ensure_string_array( - scalars, na_value=cls._dtype.na_value, copy=copy + scalars, na_value=cls._dtype.na_value, copy=False ) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @@ -625,4 +625,4 @@ def value_counts(self, dropna: bool = True) -> Series: if dropna and self.data.null_count > 0: raise NotImplementedError("yo") - return Series(counts, index=index) + return Series(counts, 
index=index).astype("Int64") From 3cf5c9183561046f6842f7e8cc3d82a9afd41622 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 16:17:28 +0000 Subject: [PATCH 47/78] return NotImplemented in comparisons (7 failed) --- pandas/core/arrays/string_arrow.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4d8879a781c48..d733612310ae7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,7 +2,6 @@ from collections import abc from distutils.version import LooseVersion -import operator from typing import TYPE_CHECKING, Any, Sequence, Type, Union import numpy as np @@ -425,17 +424,7 @@ def _cmp_method(self, other, op): elif is_scalar(other): result = pc_func(self.data, pa.scalar(other)) else: - rops = { - "eq": operator.eq, - "ne": operator.ne, - "lt": operator.gt, - "gt": operator.lt, - "le": operator.ge, - "ge": operator.le, - } - rop = rops[op.__name__] - result = rop(other, self) - return BooleanArray._from_sequence(result) + return NotImplemented # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray return BooleanArray._from_sequence(result.to_pandas().values) From 07239a05ccd651d71290726a420b98f47ff178fe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 16:54:08 +0000 Subject: [PATCH 48/78] move arrow function lookup dict to module scope (7 failed) --- pandas/core/arrays/string_arrow.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d733612310ae7..b453d0cbb6863 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -35,6 +35,16 @@ import pyarrow.compute as pc except ImportError: pass + else: + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + if TYPE_CHECKING: from pandas import Series @@ -408,15 +418,7 @@ def copy(self) -> ExtensionArray: def _cmp_method(self, other, op): from pandas.arrays import BooleanArray - ops = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } - pc_func = ops[op.__name__] + pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): return NotImplemented if isinstance(other, ArrowStringArray): From 9a7cfc5b21ff5515eda7110ea56aa36802af1ecd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 12 Nov 2020 19:15:48 +0000 Subject: [PATCH 49/78] remove isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)) check --- pandas/core/arrays/string_arrow.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b453d0cbb6863..bd05df48e1226 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -11,7 +11,6 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna from pandas.api.types import ( @@ -419,8 +418,6 @@ def _cmp_method(self, other, op): from pandas.arrays import BooleanArray pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndex)): - return NotImplemented if isinstance(other, ArrowStringArray): 
result = pc_func(self.data, other.data) elif is_scalar(other): From 2ba0dcddfe675dbadf7b4cc19d9180bd31b1e89e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:12:19 +0000 Subject: [PATCH 50/78] remove na_value=cls._dtype.na_value from ensure_string_array call (7 failed) --- pandas/core/arrays/string_arrow.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bd05df48e1226..121984b548074 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -205,9 +205,7 @@ def _chk_pyarrow_available(cls) -> None: def _from_sequence(cls, scalars, dtype=None, copy=False): cls._chk_pyarrow_available() # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array( - scalars, na_value=cls._dtype.na_value, copy=False - ) + scalars = lib.ensure_string_array(scalars, copy=False) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) @property From 97c56e28a29b3ef11818613477e0a5b5ee36da38 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:19:26 +0000 Subject: [PATCH 51/78] coloate _from_sequence_of_strings with _from_sequence (7 failed) --- pandas/core/arrays/string_arrow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 121984b548074..d100489800401 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -208,6 +208,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): scalars = lib.ensure_string_array(scalars, copy=False) return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + @property def dtype(self) -> ArrowStringDtype: """ @@ -242,10 +246,6 @@ def __len__(self) -> int: """ return len(self.data) - @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - @classmethod def _from_factorized(cls, values, original): return cls._from_sequence(values) From d6d3543bd048a6bcb3e0bc4f918cc44e23884207 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:21:43 +0000 Subject: [PATCH 52/78] revert change to extra_compile_args in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 177cf4104133e..9a9d12ce4d2ba 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From d71a895ae59a49e9ddf38c679697e1fc6f3f8f11 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:38:08 +0000 Subject: [PATCH 53/78] sync fillna docstring with base --- pandas/core/arrays/string_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d100489800401..379568c18fd3a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -336,7 +336,7 @@ def fillna(self, value=None, method=None, limit=None): method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use 
for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + backfill / bfill: use NEXT valid observation to fill gap. limit : int, default None If method is specified, this is the maximum number of consecutive NaN values to forward/backward fill. In other words, if there is @@ -347,9 +347,9 @@ def fillna(self, value=None, method=None, limit=None): Returns ------- - filled : ExtensionArray with NA/NaN filled + ExtensionArray + With NA/NaN filled. """ - value, method = validate_fillna_kwargs(value, method) mask = self.isna() From f342b62716e735ddc1bb1fddd56fc3da0eb40eb5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:43:29 +0000 Subject: [PATCH 54/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b453d0cbb6863..b0efb73e48c88 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -368,7 +368,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: func = get_fill_func(method) - new_values = func(self.astype(object), limit=limit, mask=mask) + new_values = func(self.to_numpy(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: # fill with value diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3f95117b238a7..3653ddf846510 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -65,12 +65,12 @@ class TestDtype(base.BaseDtypeTests): class TestInterface(base.BaseInterfaceTests): - def test_view(self, data, dtype, request): - if isinstance(dtype, ArrowStringDtype): + def test_view(self, data, request): + if isinstance(data.dtype, ArrowStringDtype): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseInterfaceTests.test_view(self, data) + super().test_view(self, data) class TestConstructors(base.BaseConstructorsTests): From b3c63479ea0c9eedd6af58c93453a7769fffcd33 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 15:51:11 +0000 Subject: [PATCH 55/78] other base.Base*Tests -> super() --- pandas/tests/extension/test_string.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 3653ddf846510..2e820940716a8 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -70,7 +70,7 @@ def test_view(self, data, request): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - super().test_view(self, data) + super().test_view(data) class TestConstructors(base.BaseConstructorsTests): @@ -83,7 +83,7 @@ def test_transpose(self, data, dtype, request): reason = "Fails until implement, remove before merge" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseReshapingTests.test_transpose(self, data) + super().test_transpose(data) class TestGetitem(base.BaseGetitemTests): @@ -96,7 +96,7 @@ def test_setitem_preserves_views(self, data, dtype, request): reason = "Fails until implement, remove before merge" 
mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) - base.BaseSetitemTests.test_setitem_preserves_views(self, data) + super().test_setitem_preserves_views(data) class TestMissing(base.BaseMissingTests): From 26bca2535133ad5c0a634ce2fc9744e4f4281e28 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 16:24:22 +0000 Subject: [PATCH 56/78] len(item) == 0 -> not len(item) --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1fb387a053e5e..9a22c7c1eae9b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -298,7 +298,7 @@ def __getitem__(self, item: Any) -> Any: if isinstance(item, abc.Iterable): if not is_array_like(item): item = np.array(item) - if len(item) == 0: + if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): return self.take(item) From 9579444451c44548a6f0b2970d443eb82901996e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 16:50:56 +0000 Subject: [PATCH 57/78] update copy docstring and return type --- pandas/core/arrays/string_arrow.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9a22c7c1eae9b..23e5bc58edb86 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -397,18 +397,13 @@ def isna(self) -> np.ndarray: # TODO: Implement .to_numpy for ChunkedArray return self.data.is_null().to_pandas().values - def copy(self) -> ExtensionArray: + def copy(self) -> ArrowStringArray: """ - Return a copy of the array. - - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. + Return a shallow copy of the array. 
Returns ------- - ExtensionArray + ArrowStringArray """ return type(self)(self.data) From 88094a7726a8f6eb520796cc27d6e743e0b6dd34 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 18:56:20 +0000 Subject: [PATCH 58/78] test_constructor_not_string_type_raises with np.ndarray --- .../tests/arrays/string_/test_string_arrow.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index c0589cc96a95f..ec7f57940a67f 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -1,5 +1,6 @@ import re +import numpy as np import pytest from pandas.core.arrays.string_arrow import ArrowStringArray @@ -8,12 +9,18 @@ @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_not_string_type_raises(chunked): - arr = pa.array([1, 2, 3]) +@pytest.mark.parametrize("array", [np, pa]) +def test_constructor_not_string_type_raises(array, chunked): + arr = array.array([1, 2, 3]) if chunked: + if array is np: + pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) - msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) + if array is np: + msg = "Unsupported type '' for ArrowStringArray" + else: + msg = re.escape( + "ArrowStringArray requires a PyArrow (chunked) array of string type" + ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) From ba0cee8fb9a202ce4a4fd45cd2ea4814f42fde14 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 19:41:43 +0000 Subject: [PATCH 59/78] update test_from_sequence_no_mutate (7 failed) --- pandas/tests/arrays/string_/test_string.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 250af3ab13d3a..cd9bd404043a3 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -316,11 +316,8 @@ def test_constructor_raises(cls): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): - if cls is ArrowStringArray: - reason = ( - "ValueError: Unsupported type '' for " - "ArrowStringArray" - ) + if cls is ArrowStringArray and copy is False: + reason = "AssertionError: numpy array are different" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) @@ -328,7 +325,13 @@ def test_from_sequence_no_mutate(copy, cls, request): na_arr = np.array(["a", pd.NA], dtype=object) result = cls._from_sequence(nan_arr, copy=copy) - expected = cls(na_arr) + + if cls is ArrowStringArray: + import pyarrow as pa + + expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + else: + expected = cls(na_arr) tm.assert_extension_array_equal(result, expected) From 6709ac3c198ff48ffbcdf85b0861272b2b87de02 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:24:56 +0000 Subject: [PATCH 60/78] change xfail message for base extension array tests (7 failed) --- pandas/tests/extension/test_string.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 2e820940716a8..db1940226e04e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -67,8 +67,7 @@ class TestDtype(base.BaseDtypeTests): class 
TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): if isinstance(data.dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_view(data) @@ -80,8 +79,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, dtype, request): if isinstance(dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_transpose(data) @@ -93,8 +91,7 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): def test_setitem_preserves_views(self, data, dtype, request): if isinstance(dtype, ArrowStringDtype): - reason = "Fails until implement, remove before merge" - mark = pytest.mark.xfail(reason=reason) + mark = pytest.mark.xfail(reason="not implemented") request.node.add_marker(mark) super().test_setitem_preserves_views(data) From 11388b4e84494dd97e4f88cfe6273c40a270cdd5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:37:28 +0000 Subject: [PATCH 61/78] change xfail reason message in test_value_counts_na --- pandas/tests/arrays/string_/test_string.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cd9bd404043a3..8552b83568cf2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -445,9 +445,7 @@ def test_arrow_roundtrip(dtype, dtype_object): def test_value_counts_na(dtype, request): if dtype == "arrow_string": - reason = ( - "AttributeError: 'ArrowStringArray' object has no attribute 'value_counts'" - ) + reason = "TypeError: boolean value of NA is ambiguous" mark = pytest.mark.xfail(reason=reason) request.node.add_marker(mark) From eb284e767059cbcb2d9ba2f50d33ec13ed65400d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 13 Nov 2020 20:45:43 +0000 Subject: [PATCH 62/78] skip test_memory_usage for ArrowStringArray --- pandas/tests/arrays/string_/test_string.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8552b83568cf2..210b3791d431e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -486,9 +486,7 @@ def test_memory_usage(dtype, request): # GH 33963 if dtype == "arrow_string": - reason = "assert 147 < 147" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) + pytest.skip("not applicable") series = pd.Series(["a", "b", "c"], dtype=dtype) From 9b7070923224daf065ff6ac47c558700f88eec4d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 12:42:05 +0000 Subject: [PATCH 63/78] part implementation of na_value in to_numpy --- pandas/core/arrays/string_arrow.py | 9 +++++++-- pandas/tests/arrays/string_/test_string.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 23e5bc58edb86..58e076df4ac6b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -233,8 +233,13 @@ def to_numpy( """ Convert to a NumPy ndarray. 
""" - # TODO: copy and na_value arguments are ignored - return self.data.__array__(dtype=dtype) + # TODO: copy argument is ignored + + if na_value is lib.no_default: + na_value = self._dtype.na_value + result = self.data.__array__(dtype=dtype) + result[isna(result)] = na_value + return result def __len__(self) -> int: """ diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 210b3791d431e..fa1580db7fe64 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -500,3 +500,18 @@ def test_astype_from_float_dtype(float_dtype, dtype): result = s.astype(dtype) expected = pd.Series(["0.1"], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_to_numpy_returns_pdna_default(dtype): + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = np.array(arr) + expected = np.array(["a", pd.NA, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value(dtype, nulls_fixture): + na_value = nulls_fixture + arr = pd.array(["a", pd.NA, "b"], dtype=dtype) + result = arr.to_numpy(na_value=na_value) + expected = np.array(["a", na_value, "b"], dtype=object) + tm.assert_numpy_array_equal(result, expected) From 6757feb97071a487400fb9bc9ba44b43daa95c03 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 13:37:24 +0000 Subject: [PATCH 64/78] remove is_array_like in __getitem__ --- pandas/core/arrays/string_arrow.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 58e076df4ac6b..b76af7f886e60 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union @@ -10,16 +9,17 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - -from pandas.api.types import ( +from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_integer, is_integer_dtype, + is_list_like, is_scalar, ) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer, validate_indices @@ -298,11 +298,9 @@ def __getitem__(self, item: Any) -> Any: For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ - item = check_array_indexer(self, item) - if isinstance(item, abc.Iterable): - if not is_array_like(item): - item = np.array(item) + if is_list_like(item): + item = check_array_indexer(self, item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 460ea3811e89d34d5ad10c276e2b1f88825ca6d7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 14:48:47 +0000 Subject: [PATCH 65/78] Revert "remove is_array_like in __getitem__" This reverts commit 6757feb97071a487400fb9bc9ba44b43daa95c03. 
--- pandas/core/arrays/string_arrow.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b76af7f886e60..58e076df4ac6b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union @@ -9,17 +10,16 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import ( +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.api.types import ( is_array_like, is_bool_dtype, is_integer, is_integer_dtype, - is_list_like, is_scalar, ) -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer, validate_indices @@ -298,9 +298,11 @@ def __getitem__(self, item: Any) -> Any: For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. """ + item = check_array_indexer(self, item) - if is_list_like(item): - item = check_array_indexer(self, item) + if isinstance(item, abc.Iterable): + if not is_array_like(item): + item = np.array(item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 7bee5e29309ed1ff99a225b54e719cd7b35c9d72 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 14:50:20 +0000 Subject: [PATCH 66/78] remove just is_array_like in __getitem__ --- pandas/core/arrays/string_arrow.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 58e076df4ac6b..bc9ab5157b6b1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -301,8 +301,6 @@ def __getitem__(self, item: Any) -> Any: item = check_array_indexer(self, item) if isinstance(item, abc.Iterable): - if not is_array_like(item): - item = np.array(item) if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item): From 91f37632160c32da39fdd5fc8d34fe50b7944403 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 15:21:03 +0000 Subject: [PATCH 67/78] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bc9ab5157b6b1..7b781b95f9dd9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -303,7 +303,7 @@ def __getitem__(self, item: Any) -> Any: if isinstance(item, abc.Iterable): if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item): + elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item): return type(self)(self.data.filter(item)) From 36b662ab344b4915f806782cda23877de76fe19f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 15:23:18 +0000 Subject: [PATCH 68/78] Apply suggestions from code review Co-authored-by: Joris Van den Bossche --- 
pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7b781b95f9dd9..f41e65e55af9b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -300,12 +300,12 @@ def __getitem__(self, item: Any) -> Any: """ item = check_array_indexer(self, item) - if isinstance(item, abc.Iterable): + if isinstance(item, np.ndarray): if not len(item): return type(self)(pa.chunked_array([], type=pa.string())) elif is_integer_dtype(item.dtype): return self.take(item) - elif is_bool_dtype(item): + elif is_bool_dtype(item.dtype): return type(self)(self.data.filter(item)) else: raise IndexError( From 7a9ef9c83d1b1591db9c8de62df530cb99234d00 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:04:35 +0000 Subject: [PATCH 69/78] lint fixup --- pandas/core/arrays/string_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f41e65e55af9b..36d97c82750da 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections import abc from distutils.version import LooseVersion from typing import TYPE_CHECKING, Any, Sequence, Type, Union From 5db87883bbab17a518715c044d32f51ec75895f9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:07:47 +0000 Subject: [PATCH 70/78] xfail test_astype_roundtrip --- pandas/tests/arrays/string_/test_string.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fa1580db7fe64..ac0cef6391426 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -121,7 +121,12 @@ def test_string_methods(input, method, dtype, request): tm.assert_series_equal(result.astype(object), expected) -def test_astype_roundtrip(dtype): +def test_astype_roundtrip(dtype, request): + if dtype == "arrow_string": + reason = "ValueError: Could not convert object to NumPy datetime" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None From c76c39f6b8fff32ece610271a56a1aa327323f2e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 16:39:08 +0000 Subject: [PATCH 71/78] update expected in test_arrow_array --- pandas/tests/arrays/string_/test_string.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ac0cef6391426..ae2cae4cd4c53 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -414,21 +414,16 @@ def test_reduce_missing(skipna, dtype): @td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(dtype, request): +def test_arrow_array(dtype): # protocol added in 0.15.0 import pyarrow as pa - if dtype == "arrow_string": - reason = ( - "TypeError: Argument 'other' has incorrect type " - "(expected pyarrow.lib.ChunkedArray, got pyarrow.lib.StringArray)" - ) - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) + if dtype == "arrow_string": + expected = 
pa.chunked_array(expected) + assert arr.equals(expected) From 24a782dc35382ec024661742176789c9f087f120 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 15 Nov 2020 13:11:59 +0000 Subject: [PATCH 72/78] add fallback for scalar comparison ops --- pandas/core/arrays/string_arrow.py | 10 ++++++- pandas/tests/arrays/string_/test_string.py | 35 ++++++++++++++-------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 36d97c82750da..9262147f801ce 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,7 +416,15 @@ def _cmp_method(self, other, op): if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) elif is_scalar(other): - result = pc_func(self.data, pa.scalar(other)) + try: + result = pc_func(self.data, pa.scalar(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + result[valid] = op(np.array(self)[valid], other) + return BooleanArray(result, mask) + else: return NotImplemented diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ae2cae4cd4c53..07e9484994c26 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -247,20 +247,8 @@ def test_add_frame(dtype): tm.assert_frame_equal(result, expected) -def test_comparison_methods_scalar(all_compare_operators, dtype, request): - if dtype == "arrow_string": - if all_compare_operators in ["__eq__", "__ne__"]: - reason = ( - "pyarrow.lib.ArrowInvalid: Could not convert with type NAType: " - "did not recognize Python value type when inferring an Arrow data type" - ) - else: - reason = "AssertionError: left is not an ExtensionArray" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - +def test_comparison_methods_scalar(all_compare_operators, dtype): op_name = all_compare_operators - a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) @@ -268,11 +256,32 @@ def test_comparison_methods_scalar(all_compare_operators, dtype, request): expected = pd.array(expected, dtype="boolean") tm.assert_extension_array_equal(result, expected) + +def test_comparison_methods_scalar_pd_na(all_compare_operators, dtype): + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) expected = pd.array([None, None, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) +def test_comparison_methods_scalar_not_string(all_compare_operators, dtype, request): + if all_compare_operators not in ["__eq__", "__ne__"]: + reason = "comparison op not supported between instances of 'str' and 'int'" + mark = pytest.mark.xfail(reason=reason) + request.node.add_marker(mark) + + op_name = all_compare_operators + a = pd.array(["a", None, "c"], dtype=dtype) + other = 42 + result = getattr(a, op_name)(other) + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected = pd.array(expected_data, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + def test_comparison_methods_array(all_compare_operators, dtype, request): if dtype == "arrow_string": if all_compare_operators in ["__eq__", "__ne__"]: From 353bff9de4bb21f5f7bb59006a6540247e7ebec5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 15 Nov 2020 14:01:41 
+0000 Subject: [PATCH 73/78] dispatch to pyarrow for comparion with np.ndarray (1 failed) --- pandas/core/arrays/string_arrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 9262147f801ce..680752e6a7e07 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -415,6 +415,8 @@ def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowStringArray): result = pc_func(self.data, other.data) + elif isinstance(other, np.ndarray): + result = pc_func(self.data, other) elif is_scalar(other): try: result = pc_func(self.data, pa.scalar(other)) @@ -424,7 +426,6 @@ def _cmp_method(self, other, op): result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) return BooleanArray(result, mask) - else: return NotImplemented From be939474bc0ecb840619da039e02630a7f5daf26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 12:50:27 +0000 Subject: [PATCH 74/78] fix test_reindex_non_na_fill_value --- pandas/core/arrays/string_arrow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 680752e6a7e07..cb44c5ae71518 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -564,7 +564,8 @@ def take( raise IndexError("out of bounds value in 'indices'.") if allow_fill: - if (indices_array < 0).any(): + fill_mask = indices_array < 0 + if fill_mask.any(): validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) @@ -574,7 +575,7 @@ def take( # TODO: ArrowNotImplementedError: Function fill_null has no # kernel matching input types (array[string], scalar[string]) result = type(self)(result) - result[result.isna()] = fill_value + result[fill_mask] = fill_value return result # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: From 52440a75f506c0b6fbb6b10d8f3b7cfabfc84987 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 13:04:03 +0000 Subject: [PATCH 75/78] use fill_mask in pa indices_array --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cb44c5ae71518..2343c3b2bba4d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -568,7 +568,7 @@ def take( if fill_mask.any(): validate_indices(indices_array, len(self.data)) # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=indices_array < 0) + indices_array = pa.array(indices_array, mask=fill_mask) result = self.data.take(indices_array) if isna(fill_value): return type(self)(result) From bd05c2c0aff018739be9661206e382cd063cd386 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 16 Nov 2020 13:31:22 +0000 Subject: [PATCH 76/78] add comment to __gettem__ --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2343c3b2bba4d..5e4d3d5f17185 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -312,6 +312,8 @@ def __getitem__(self, item: Any) -> Any: "boolean arrays are valid indices." 
) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. value = self.data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) From 27c8de581aface7d69e1fadc95de379227b084a8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 17 Nov 2020 10:51:54 +0000 Subject: [PATCH 77/78] add comment on pyarrow compute --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 5e4d3d5f17185..be3ce7330314c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,6 +29,8 @@ except ImportError: pa = None else: + # our min supported version of pyarrow, 0.15.1, does not have a compute + # module try: import pyarrow.compute as pc except ImportError: From b6713e95bf36f7cdfa5f12c4cc57b49020a3033a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 17 Nov 2020 11:05:58 +0000 Subject: [PATCH 78/78] privatize `data` --- pandas/core/arrays/string_arrow.py | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index be3ce7330314c..184fbc050036b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -183,13 +183,13 @@ class ArrowStringArray(OpsMixin, ExtensionArray): def __init__(self, values): self._chk_pyarrow_available() if isinstance(values, pa.Array): - self.data = pa.chunked_array([values]) + self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): - self.data = values + self._data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - if not pa.types.is_string(self.data.type): + if not pa.types.is_string(self._data.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of string type" ) @@ -226,7 +226,7 @@ def __array__(self, dtype=None) -> np.ndarray: def __arrow_array__(self, type=None): """Convert myself to a pyarrow Array or ChunkedArray.""" - return self.data + return self._data def to_numpy( self, dtype=None, copy: bool = False, na_value=lib.no_default @@ -238,7 +238,7 @@ def to_numpy( if na_value is lib.no_default: na_value = self._dtype.na_value - result = self.data.__array__(dtype=dtype) + result = self._data.__array__(dtype=dtype) result[isna(result)] = na_value return result @@ -250,7 +250,7 @@ def __len__(self) -> int: ------- length : int """ - return len(self.data) + return len(self._data) @classmethod def _from_factorized(cls, values, original): @@ -271,7 +271,7 @@ def _concat_same_type(cls, to_concat) -> ArrowStringArray: """ return cls( pa.chunked_array( - [array for ea in to_concat for array in ea.data.iterchunks()] + [array for ea in to_concat for array in ea._data.iterchunks()] ) ) @@ -307,7 +307,7 @@ def __getitem__(self, item: Any) -> Any: elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): - return type(self)(self.data.filter(item)) + return type(self)(self._data.filter(item)) else: raise IndexError( "Only integers, slices and integer or " @@ -316,7 +316,7 @@ def __getitem__(self, item: Any) -> Any: # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. 
- value = self.data[item] + value = self._data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: @@ -392,7 +392,7 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. """ - return self.data.nbytes + return self._data.nbytes def isna(self) -> np.ndarray: """ @@ -401,7 +401,7 @@ def isna(self) -> np.ndarray: This should return a 1-D array the same length as 'self'. """ # TODO: Implement .to_numpy for ChunkedArray - return self.data.is_null().to_pandas().values + return self._data.is_null().to_pandas().values def copy(self) -> ArrowStringArray: """ @@ -411,19 +411,19 @@ def copy(self) -> ArrowStringArray: ------- ArrowStringArray """ - return type(self)(self.data) + return type(self)(self._data) def _cmp_method(self, other, op): from pandas.arrays import BooleanArray pc_func = ARROW_CMP_FUNCS[op.__name__] if isinstance(other, ArrowStringArray): - result = pc_func(self.data, other.data) + result = pc_func(self._data, other._data) elif isinstance(other, np.ndarray): - result = pc_func(self.data, other) + result = pc_func(self._data, other) elif is_scalar(other): try: - result = pc_func(self.data, pa.scalar(other)) + result = pc_func(self._data, pa.scalar(other)) except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask @@ -469,11 +469,11 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # Slice data and insert inbetween new_data = [ - *self.data[0:key].chunks, + *self._data[0:key].chunks, pa.array([value], type=pa.string()), - *self.data[(key + 1) :].chunks, + *self._data[(key + 1) :].chunks, ] - self.data = pa.chunked_array(new_data) + self._data = pa.chunked_array(new_data) else: # Convert to integer indices and iteratively assign. # TODO: Make a faster variant of this in Arrow upstream. @@ -562,18 +562,18 @@ def take( else: indices_array = indices - if len(self.data) == 0 and (indices_array >= 0).any(): + if len(self._data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self.data): + if indices_array.size > 0 and indices_array.max() >= len(self._data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: fill_mask = indices_array < 0 if fill_mask.any(): - validate_indices(indices_array, len(self.data)) + validate_indices(indices_array, len(self._data)) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=fill_mask) - result = self.data.take(indices_array) + result = self._data.take(indices_array) if isna(fill_value): return type(self)(result) # TODO: ArrowNotImplementedError: Function fill_null has no @@ -584,14 +584,14 @@ def take( # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: # Nothing to fill - return type(self)(self.data.take(indices)) + return type(self)(self._data.take(indices)) else: # allow_fill=False # TODO(ARROW-9432): Treat negative indices as indices from the right. 
if (indices_array < 0).any(): # Don't modify in-place indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self.data) - return type(self)(self.data.take(indices_array)) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) def value_counts(self, dropna: bool = True) -> Series: """ @@ -612,14 +612,14 @@ def value_counts(self, dropna: bool = True) -> Series: """ from pandas import Index, Series - vc = self.data.value_counts() + vc = self._data.value_counts() # Index cannot hold ExtensionArrays yet index = Index(type(self)(vc.field(0)).astype(object)) # No missings, so we can adhere to the interface and return a numpy array. counts = np.array(vc.field(1)) - if dropna and self.data.null_count > 0: + if dropna and self._data.null_count > 0: raise NotImplementedError("yo") return Series(counts, index=index).astype("Int64")
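
Note (not part of the patch series): the comparison and value_counts hunks above lean on pyarrow.compute kernels and on ChunkedArray.value_counts(); the snippet below is a minimal standalone sketch of that pattern, assuming a pyarrow version recent enough to ship the pyarrow.compute module, with purely illustrative data.

import pyarrow as pa
import pyarrow.compute as pc

# A string ChunkedArray with a null, mirroring what ArrowStringArray wraps.
data = pa.chunked_array([pa.array(["a", None, "c"], type=pa.string())])

# Comparison against a scalar via a pyarrow.compute kernel; the null slot stays null.
result = pc.equal(data, pa.scalar("a", type=pa.string()))
print(result.to_pandas().values)  # object ndarray: [True, None, False]

# value_counts on a ChunkedArray yields a struct array of (values, counts),
# which is what the value_counts hunk unpacks with vc.field(0) / vc.field(1).
vc = data.value_counts()
print(vc.field(0))  # distinct strings
print(vc.field(1))  # corresponding counts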