From 94722db763b8df4a475a777699c873b8453287f6 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Feb 2023 10:26:37 -0800 Subject: [PATCH 1/7] API: ArrowDtype.type --- pandas/core/arrays/arrow/dtype.py | 41 ++++++++++++++++++++++++++++++- pandas/core/internals/blocks.py | 7 +++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 90c86cd6d55ef..6dd2c6a52f542 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -1,9 +1,19 @@ from __future__ import annotations +from datetime import ( + date, + datetime, + time, + timedelta, +) import re import numpy as np +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas._typing import ( TYPE_CHECKING, DtypeObj, @@ -90,7 +100,36 @@ def type(self): """ Returns pyarrow.DataType. """ - return type(self.pyarrow_dtype) + pa_type = self.pyarrow_dtype + if pa.types.is_integer(pa_type): + return int + elif pa.types.is_floating(pa_type): + return float + elif pa.types.is_string(pa_type): + return str + elif pa.types.is_binary(pa_type): + return bytes + elif pa.types.is_boolean(pa_type): + return bool + elif pa.types.is_duration(pa_type): + if pa_type.unit == "ns": + return Timedelta + else: + return timedelta + elif pa.types.is_timestamp(pa_type): + if pa_type.unit == "ns": + return Timestamp + else: + return datetime + elif pa.types.is_date(pa_type): + return date + elif pa.types.is_time(pa_type): + return time + elif pa.types.is_null(pa_type): + # TODO: None? pd.NA? pa.null? + return type(pa_type) + else: + raise NotImplementedError(pa_type) @property def name(self) -> str: # type: ignore[override] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 401b9e60308be..747a18519e7b3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -15,7 +15,6 @@ import numpy as np from pandas._libs import ( - Timestamp, internals as libinternals, lib, writers, @@ -63,6 +62,7 @@ is_string_dtype, ) from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, ExtensionDtype, PandasDtype, PeriodDtype, @@ -2180,9 +2180,8 @@ def get_block_type(dtype: DtypeObj): ------- cls : class, subclass of Block """ - # We use vtype and kind checks because they are much more performant + # We use kind checks because it is much more performant # than is_foo_dtype - vtype = dtype.type kind = dtype.kind cls: type[Block] @@ -2190,7 +2189,7 @@ def get_block_type(dtype: DtypeObj): if isinstance(dtype, SparseDtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock - elif vtype is Timestamp: + elif isinstance(dtype, DatetimeTZDtype): cls = DatetimeTZBlock elif isinstance(dtype, PeriodDtype): cls = NDArrayBackedExtensionBlock From 9717e23d588a6757eabc81404a050de6e3c7ebfd Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Feb 2023 13:05:59 -0800 Subject: [PATCH 2/7] handle Decimal, update docstring, update tests --- pandas/core/arrays/arrow/dtype.py | 5 +++- pandas/tests/extension/test_arrow.py | 34 +--------------------------- 2 files changed, 5 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 6dd2c6a52f542..331e66698cc35 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -6,6 +6,7 @@ time, timedelta, ) +from decimal import Decimal import re import numpy as np @@ -98,7 +99,7 @@ def __repr__(self) -> str: @property def type(self): """ - Returns pyarrow.DataType. + Returns associated scalar type. """ pa_type = self.pyarrow_dtype if pa.types.is_integer(pa_type): @@ -125,6 +126,8 @@ def type(self): return date elif pa.types.is_time(pa_type): return time + elif pa.types.is_decimal(pa_type): + return Decimal elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null? return type(pa_type) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1dac8faa3a9e2..d3971f3e7cc41 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -322,39 +322,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): class TestGetitemTests(base.BaseGetitemTests): - def test_getitem_scalar(self, data): - # In the base class we expect data.dtype.type; but this (intentionally) - # returns Python scalars or pd.NA - pa_type = data._data.type - if pa.types.is_integer(pa_type): - exp_type = int - elif pa.types.is_floating(pa_type): - exp_type = float - elif pa.types.is_string(pa_type): - exp_type = str - elif pa.types.is_binary(pa_type): - exp_type = bytes - elif pa.types.is_boolean(pa_type): - exp_type = bool - elif pa.types.is_duration(pa_type): - exp_type = timedelta - elif pa.types.is_timestamp(pa_type): - if pa_type.unit == "ns": - exp_type = pd.Timestamp - else: - exp_type = datetime - elif pa.types.is_date(pa_type): - exp_type = date - elif pa.types.is_time(pa_type): - exp_type = time - else: - raise NotImplementedError(data.dtype) - - result = data[0] - assert isinstance(result, exp_type), type(result) - - result = pd.Series(data)[0] - assert isinstance(result, exp_type), type(result) + pass class TestBaseAccumulateTests(base.BaseAccumulateTests): From bf6dda93a33011f4ca4d521342ef66f7a1e6d281 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Feb 2023 13:43:15 -0800 Subject: [PATCH 3/7] patch __contains__ --- pandas/core/arrays/arrow/array.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 075beca106e6a..f4e03faf4c00c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -494,6 +494,18 @@ def __len__(self) -> int: """ return len(self._data) + def __contains__(self, key) -> bool: + # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 + if isna(key) and key is not self.dtype.na_value: + if self.dtype.kind == "f" and lib.is_float(key) and isna(key): + return (pc.is_null(self._data, nan_is_null=True) & ~self.isna()).any() + + # e.g. date or timestamp types we do not allow None here to match pd.NA + return False + # TODO: maybe complex? object? + + return super().__contains__(key) + @property def _hasna(self) -> bool: return self._data.null_count > 0 From 9174758cb6a19191277515ae4708f6e53d3eccfe Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 11 Feb 2023 08:22:07 -0800 Subject: [PATCH 4/7] mypy fixup --- pandas/core/arrays/arrow/array.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index f4e03faf4c00c..bf42c8d9019d2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -516,7 +516,9 @@ def isna(self) -> npt.NDArray[np.bool_]: This should return a 1-D array the same length as 'self'. """ - return self._data.is_null().to_numpy() + # error: Incompatible return value type (got "Union[bool, bool_]", + # expected "bool") + return self._data.is_null().to_numpy() # type: ignore[return-value] def argsort( self, From afd8745cba6f4406fc82d440a9695a66e36f4f8d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Feb 2023 07:42:25 -0800 Subject: [PATCH 5/7] mypy fixup --- pandas/core/arrays/arrow/array.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e2638a510e537..8c69b5600ac85 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -510,7 +510,8 @@ def __contains__(self, key) -> bool: # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 if isna(key) and key is not self.dtype.na_value: if self.dtype.kind == "f" and lib.is_float(key) and isna(key): - return (pc.is_null(self._data, nan_is_null=True) & ~self.isna()).any() + res = (pc.is_null(self._data, nan_is_null=True) & ~self.isna()).any() + return bool(res) # e.g. date or timestamp types we do not allow None here to match pd.NA return False @@ -528,9 +529,7 @@ def isna(self) -> npt.NDArray[np.bool_]: This should return a 1-D array the same length as 'self'. """ - # error: Incompatible return value type (got "Union[bool, bool_]", - # expected "bool") - return self._data.is_null().to_numpy() # type: ignore[return-value] + return self._data.is_null().to_numpy() def argsort( self, From d849ad3c5c56b753f90c84361f88fa41a9fdd089 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Feb 2023 08:38:50 -0800 Subject: [PATCH 6/7] mypy fixup --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8c69b5600ac85..c7cecfdc20f73 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -517,7 +517,7 @@ def __contains__(self, key) -> bool: return False # TODO: maybe complex? object? - return super().__contains__(key) + return bool(super().__contains__(key)) @property def _hasna(self) -> bool: From 7ce43753231ba5abcaa629a9c8ea19b8b1571fd4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 15 Feb 2023 15:04:06 -0800 Subject: [PATCH 7/7] use pc.any(pc.is_nan... --- pandas/core/arrays/arrow/array.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c7cecfdc20f73..18caee3249d32 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -510,8 +510,7 @@ def __contains__(self, key) -> bool: # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 if isna(key) and key is not self.dtype.na_value: if self.dtype.kind == "f" and lib.is_float(key) and isna(key): - res = (pc.is_null(self._data, nan_is_null=True) & ~self.isna()).any() - return bool(res) + return pc.any(pc.is_nan(self._data)).as_py() # e.g. date or timestamp types we do not allow None here to match pd.NA return False