diff --git a/pandas/tests/extension/arrow/__init__.py b/pandas/tests/extension/arrow/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py deleted file mode 100644 index b4e7a99a3d6f5..0000000000000 --- a/pandas/tests/extension/arrow/arrays.py +++ /dev/null @@ -1,197 +0,0 @@ -""" -Rudimentary Apache Arrow-backed ExtensionArray. - -At the moment, just a boolean array / type is implemented. -Eventually, we'll want to parametrize the type and support -multiple dtypes. Not all methods are implemented yet, and the -current implementation is not efficient. -""" -from __future__ import annotations - -import itertools -import operator - -import numpy as np -import pyarrow as pa - -from pandas._typing import type_t - -import pandas as pd -from pandas.api.extensions import ( - ExtensionDtype, - register_extension_dtype, - take, -) -from pandas.api.types import is_scalar -from pandas.core.arrays.arrow import ArrowExtensionArray as _ArrowExtensionArray -from pandas.core.construction import extract_array - - -@register_extension_dtype -class ArrowBoolDtype(ExtensionDtype): - - type = np.bool_ - kind = "b" - name = "arrow_bool" - na_value = pa.NULL - - @classmethod - def construct_array_type(cls) -> type_t[ArrowBoolArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowBoolArray - - @property - def _is_boolean(self) -> bool: - return True - - -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - - type = str - kind = "U" - name = "arrow_string" - na_value = pa.NULL - - @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - -class ArrowExtensionArray(_ArrowExtensionArray): - _data: pa.ChunkedArray - - @classmethod - def _from_sequence(cls, values, dtype=None, copy=False): - # TODO: respect dtype, copy - - if isinstance(values, cls): - # in particular for empty cases the pa.array(np.asarray(...)) - # does not round-trip - return cls(values._data) - - elif not len(values): - if isinstance(values, list): - dtype = bool if cls is ArrowBoolArray else str - values = np.array([], dtype=dtype) - - arr = pa.chunked_array([pa.array(np.asarray(values))]) - return cls(arr) - - def __repr__(self) -> str: - return f"{type(self).__name__}({repr(self._data)})" - - def __contains__(self, obj) -> bool: - if obj is None or obj is self.dtype.na_value: - # None -> EA.__contains__ only checks for self._dtype.na_value, not - # any compatible NA value. - # self.dtype.na_value -> isn't recognized by pd.isna - return bool(self.isna().any()) - return bool(super().__contains__(obj)) - - def __getitem__(self, item): - if is_scalar(item): - return self._data.to_pandas()[item] - else: - vals = self._data.to_pandas()[item] - return type(self)._from_sequence(vals) - - def astype(self, dtype, copy=True): - # needed to fix this astype for the Series constructor. - if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: - if copy: - return self.copy() - return self - return super().astype(dtype, copy) - - @property - def dtype(self): - return self._dtype - - def _logical_method(self, other, op): - if not isinstance(other, type(self)): - raise NotImplementedError() - - result = op(np.array(self._data), np.array(other._data)) - return ArrowBoolArray( - pa.chunked_array([pa.array(result, mask=pd.isna(self._data.to_pandas()))]) - ) - - def __eq__(self, other): - if not isinstance(other, type(self)): - # TODO: use some pyarrow function here? - return np.asarray(self).__eq__(other) - - return self._logical_method(other, operator.eq) - - def take(self, indices, allow_fill=False, fill_value=None): - data = self._data.to_pandas() - data = extract_array(data, extract_numpy=True) - - if allow_fill and fill_value is None: - fill_value = self.dtype.na_value - - result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) - return self._from_sequence(result, dtype=self.dtype) - - @classmethod - def _concat_same_type(cls, to_concat): - chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) - arr = pa.chunked_array(chunks) - return cls(arr) - - def __invert__(self): - return type(self)._from_sequence(~self._data.to_pandas()) - - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if skipna: - arr = self[~self.isna()] - else: - arr = self - - try: - op = getattr(arr, name) - except AttributeError as err: - raise TypeError from err - return op(**kwargs) - - def any(self, axis=0, out=None): - # Explicitly return a plain bool to reproduce GH-34660 - return bool(self._data.to_pandas().any()) - - def all(self, axis=0, out=None): - # Explicitly return a plain bool to reproduce GH-34660 - return bool(self._data.to_pandas().all()) - - -class ArrowBoolArray(ArrowExtensionArray): - def __init__(self, values) -> None: - if not isinstance(values, pa.ChunkedArray): - raise ValueError - - assert values.type == pa.bool_() - self._data = values - self._dtype = ArrowBoolDtype() # type: ignore[assignment] - - -class ArrowStringArray(ArrowExtensionArray): - def __init__(self, values) -> None: - if not isinstance(values, pa.ChunkedArray): - raise ValueError - - assert values.type == pa.string() - self._data = values - self._dtype = ArrowStringDtype() # type: ignore[assignment] diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py deleted file mode 100644 index 0205b8aad9431..0000000000000 --- a/pandas/tests/extension/arrow/test_bool.py +++ /dev/null @@ -1,104 +0,0 @@ -import numpy as np -import pytest - -from pandas.compat import ( - is_ci_environment, - is_platform_windows, -) - -import pandas as pd -import pandas._testing as tm -from pandas.api.types import is_bool_dtype -from pandas.tests.extension import base - -pytest.importorskip("pyarrow", minversion="1.0.1") - -from pandas.tests.extension.arrow.arrays import ( # isort:skip - ArrowBoolArray, - ArrowBoolDtype, -) - - -@pytest.fixture -def dtype(): - return ArrowBoolDtype() - - -@pytest.fixture -def data(): - values = np.random.randint(0, 2, size=100, dtype=bool) - values[1] = ~values[0] - return ArrowBoolArray._from_sequence(values) - - -@pytest.fixture -def data_missing(): - return ArrowBoolArray._from_sequence([None, True]) - - -def test_basic_equals(data): - # https://github.com/pandas-dev/pandas/issues/34660 - assert pd.Series(data).equals(pd.Series(data)) - - -class BaseArrowTests: - pass - - -class TestDtype(BaseArrowTests, base.BaseDtypeTests): - pass - - -class TestInterface(BaseArrowTests, base.BaseInterfaceTests): - def test_copy(self, data): - # __setitem__ does not work, so we only have a smoke-test - data.copy() - - def test_view(self, data): - # __setitem__ does not work, so we only have a smoke-test - data.view() - - @pytest.mark.xfail( - raises=AssertionError, - reason="Doesn't recognize data._na_value as NA", - ) - def test_contains(self, data, data_missing): - super().test_contains(data, data_missing) - - -class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): - @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") - def test_series_constructor_no_data_with_index(self, dtype, na_value): - # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays - super().test_series_constructor_no_data_with_index(dtype, na_value) - - @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") - def test_series_constructor_scalar_na_with_index(self, dtype, na_value): - # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays - super().test_series_constructor_scalar_na_with_index(dtype, na_value) - - @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword") - def test_empty(self, dtype): - super().test_empty(dtype) - - -class TestReduce(base.BaseNoReduceTests): - def test_reduce_series_boolean(self): - pass - - -@pytest.mark.skipif( - is_ci_environment() and is_platform_windows(), - reason="Causes stack overflow on Windows CI", -) -class TestReduceBoolean(base.BaseBooleanReduceTests): - pass - - -def test_is_bool_dtype(data): - assert is_bool_dtype(data) - assert pd.core.common.is_bool_indexer(data) - s = pd.Series(range(len(data))) - result = s[data] - expected = s[np.asarray(data)] - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py deleted file mode 100644 index 67a62978aa1bc..0000000000000 --- a/pandas/tests/extension/arrow/test_string.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest - -import pandas as pd - -pytest.importorskip("pyarrow", minversion="1.0.0") - - -def test_constructor_from_list(): - # GH 27673 - result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) - assert isinstance(result.dtype, pd.StringDtype) - assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/arrow/test_timestamp.py b/pandas/tests/extension/arrow/test_timestamp.py deleted file mode 100644 index 5b81940e5a6c0..0000000000000 --- a/pandas/tests/extension/arrow/test_timestamp.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -import datetime - -import pytest - -from pandas._typing import type_t - -import pandas as pd -from pandas.api.extensions import ( - ExtensionDtype, - register_extension_dtype, -) - -pytest.importorskip("pyarrow", minversion="1.0.1") - -import pyarrow as pa # isort:skip - -from pandas.tests.extension.arrow.arrays import ArrowExtensionArray # isort:skip - - -@register_extension_dtype -class ArrowTimestampUSDtype(ExtensionDtype): - - type = datetime.datetime - kind = "M" - name = "arrow_timestamp_us" - na_value = pa.NULL - - @classmethod - def construct_array_type(cls) -> type_t[ArrowTimestampUSArray]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowTimestampUSArray - - -class ArrowTimestampUSArray(ArrowExtensionArray): - def __init__(self, values) -> None: - if not isinstance(values, pa.ChunkedArray): - raise ValueError - - assert values.type == pa.timestamp("us") - self._data = values - self._dtype = ArrowTimestampUSDtype() # type: ignore[assignment] - - -def test_constructor_extensionblock(): - # GH 34986 - arr = ArrowTimestampUSArray._from_sequence( - [None, datetime.datetime(2010, 9, 8, 7, 6, 5, 4)] - ) - pd.DataFrame({"timestamp": arr}) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0c8221cb73eee..92e4dbaea4eea 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -35,6 +35,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_bool_dtype from pandas.tests.extension import base pa = pytest.importorskip("pyarrow", minversion="1.0.1") @@ -1350,6 +1351,10 @@ def test_where_series(self, data, na_value, as_frame, request): ) super().test_where_series(data, na_value, as_frame) + def test_basic_equals(self, data): + # https://github.com/pandas-dev/pandas/issues/34660 + assert pd.Series(data).equals(pd.Series(data)) + class TestBaseArithmeticOps(base.BaseArithmeticOpsTests): @@ -1749,3 +1754,14 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): result = ser.mode(dropna=dropna) expected = pd.Series(data_for_grouping.take(exp_idx)) tm.assert_series_equal(result, expected) + + +def test_is_bool_dtype(): + # GH 22667 + data = ArrowExtensionArray(pa.array([True, False, True])) + assert is_bool_dtype(data) + assert pd.core.common.is_bool_indexer(data) + s = pd.Series(range(len(data))) + result = s[data] + expected = s[np.asarray(data)] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e2e4475cd520d..1bb89f50e9de0 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -121,6 +121,13 @@ def test_from_dtype(self, data): # base test uses string representation of dtype pass + def test_constructor_from_list(self): + # GH 27673 + pytest.importorskip("pyarrow", minversion="1.0.0") + result = pd.Series(["E"], dtype=StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, StringDtype) + assert result.dtype.storage == "pyarrow" + class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, request):