From 4f96f3612bd8d71b192a9e29560a289868be61a1 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 24 Jan 2022 16:47:40 -0600 Subject: [PATCH 1/6] refactor: use public pandas APIs where possible --- db_dtypes/__init__.py | 12 +++--------- db_dtypes/core.py | 7 +------ 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 056be28..1314722 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -22,13 +22,7 @@ import numpy import packaging.version import pandas -import pandas.compat.numpy.function -import pandas.core.algorithms -import pandas.core.arrays -import pandas.core.dtypes.base -import pandas.core.dtypes.dtypes -import pandas.core.dtypes.generic -import pandas.core.nanops +import pandas.api.extensions import pyarrow import pyarrow.compute @@ -44,7 +38,7 @@ pandas_release = packaging.version.parse(pandas.__version__).release -@pandas.core.dtypes.dtypes.register_extension_dtype +@pandas.api.extensions.register_extension_dtype class TimeDtype(core.BaseDatetimeDtype): """ Extension dtype for time data. @@ -194,7 +188,7 @@ def __arrow_array__(self, type=None): ) -@pandas.core.dtypes.dtypes.register_extension_dtype +@pandas.api.extensions.register_extension_dtype class DateDtype(core.BaseDatetimeDtype): """ Extension dtype for time data. diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 3ade198..8a41dda 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -19,12 +19,7 @@ from pandas._libs import NaT import pandas.api.extensions import pandas.compat.numpy.function -import pandas.core.algorithms -import pandas.core.arrays -import pandas.core.dtypes.base -from pandas.core.dtypes.common import is_dtype_equal, is_list_like, pandas_dtype -import pandas.core.dtypes.dtypes -import pandas.core.dtypes.generic +from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype import pandas.core.nanops from db_dtypes import pandas_backports From eff42ed3f41fe3f23cc7e3f8c41bc4e5b0df5a14 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 25 Jan 2022 15:59:49 -0600 Subject: [PATCH 2/6] no need to override take --- db_dtypes/__init__.py | 4 ++-- db_dtypes/core.py | 39 ++++----------------------------------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 1314722..a518a0b 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -107,7 +107,7 @@ def _datetime( .as_py() ) - if scalar is None: + if pandas.isna(scalar): return None if isinstance(scalar, datetime.time): return pandas.Timestamp( @@ -232,7 +232,7 @@ def _datetime( if isinstance(scalar, (pyarrow.Date32Scalar, pyarrow.Date64Scalar)): scalar = scalar.as_py() - if scalar is None: + if pandas.isna(scalar): return None elif isinstance(scalar, datetime.date): return pandas.Timestamp( diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 8a41dda..6f045cd 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Sequence +from typing import Optional import numpy import pandas -from pandas._libs import NaT +from pandas import NaT import pandas.api.extensions -import pandas.compat.numpy.function +import pandas.compat.numpy.function # TODO: move to pandas_backports from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype import pandas.core.nanops @@ -102,38 +102,7 @@ def isna(self): return pandas.isna(self._ndarray) def _validate_scalar(self, value): - if pandas.isna(value): - return None - - if not isinstance(value, self.dtype.type): - raise ValueError(value) - - return value - - def take( - self, - indices: Sequence[int], - *, - allow_fill: bool = False, - fill_value: Any = None, - ): - indices = numpy.asarray(indices, dtype=numpy.intp) - data = self._ndarray - if allow_fill: - fill_value = self._validate_scalar(fill_value) - fill_value = ( - numpy.datetime64() if fill_value is None else self._datetime(fill_value) - ) - if (indices < -1).any(): - raise ValueError( - "take called with negative indexes other than -1," - " when a fill value is provided." - ) - out = data.take(indices) - if allow_fill: - out[indices == -1] = fill_value - - return self.__class__(out) + return self._datetime(value) # TODO: provide implementations of dropna, fillna, unique, # factorize, argsort, searchsoeted for better performance over From 60da4d0a0b265c42c194f8bcae670baa77b442b5 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 25 Jan 2022 16:24:55 -0600 Subject: [PATCH 3/6] backport take implementation --- db_dtypes/core.py | 9 +-------- db_dtypes/pandas_backports.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 6f045cd..97a9c65 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -18,7 +18,6 @@ import pandas from pandas import NaT import pandas.api.extensions -import pandas.compat.numpy.function # TODO: move to pandas_backports from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype import pandas.core.nanops @@ -104,10 +103,6 @@ def isna(self): def _validate_scalar(self, value): return self._datetime(value) - # TODO: provide implementations of dropna, fillna, unique, - # factorize, argsort, searchsoeted for better performance over - # abstract implementations. - def any( self, *, @@ -116,9 +111,7 @@ def any( keepdims: bool = False, skipna: bool = True, ): - pandas.compat.numpy.function.validate_any( - (), {"out": out, "keepdims": keepdims} - ) + pandas_backports.numpy_validate_any((), {"out": out, "keepdims": keepdims}) result = pandas.core.nanops.nanany(self._ndarray, axis=axis, skipna=skipna) return result diff --git a/db_dtypes/pandas_backports.py b/db_dtypes/pandas_backports.py index 003224f..9ac1150 100644 --- a/db_dtypes/pandas_backports.py +++ b/db_dtypes/pandas_backports.py @@ -20,15 +20,20 @@ """ import operator +from typing import Any import numpy import packaging.version import pandas -from pandas._libs.lib import is_integer +from pandas._libs.lib import is_integer # TODO: use public version if available +import pandas.compat.numpy.function pandas_release = packaging.version.parse(pandas.__version__).release +# Create aliases for private methods in case they move in a future version. +numpy_validate_any = pandas.compat.numpy.function.validate_any + def import_default(module_name, force=False, default=None): """ @@ -55,7 +60,7 @@ def import_default(module_name, force=False, default=None): return getattr(module, name, default) -@import_default("pandas.core.arraylike") +@import_default("pandas.core.arraylike") # TODO: is there a public API for this? class OpsMixin: def _cmp_method(self, other, op): # pragma: NO COVER return NotImplemented @@ -81,6 +86,8 @@ def __ge__(self, other): __add__ = __radd__ = __sub__ = lambda self, other: NotImplemented +# TODO: use public API if possible +# https://github.com/pandas-dev/pandas/pull/45544/files @import_default("pandas.core.arrays._mixins", pandas_release < (1, 3)) class NDArrayBackedExtensionArray(pandas.core.arrays.base.ExtensionArray): @@ -130,6 +137,28 @@ def copy(self): def repeat(self, n): return self.__class__(self._ndarray.repeat(n), self._dtype) + def take( + self, + indices, + *, + allow_fill: bool = False, + fill_value: Any = None, + axis: int = 0, + ): + from pandas.core.algorithms import take + + if allow_fill: + fill_value = self._validate_scalar(fill_value) + + new_data = take( + self._ndarray, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + axis=axis, + ) + return self._from_backing_data(new_data) + @classmethod def _concat_same_type(cls, to_concat, axis=0): dtypes = {str(x.dtype) for x in to_concat} From 66d5b6646dbb048301f2656649659da6bb4d99b6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 25 Jan 2022 16:32:49 -0600 Subject: [PATCH 4/6] move remaining private pandas methods to backports --- db_dtypes/core.py | 23 +++++++++-------------- db_dtypes/pandas_backports.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 97a9c65..6dd288c 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -19,7 +19,6 @@ from pandas import NaT import pandas.api.extensions from pandas.api.types import is_dtype_equal, is_list_like, pandas_dtype -import pandas.core.nanops from db_dtypes import pandas_backports @@ -112,7 +111,7 @@ def any( skipna: bool = True, ): pandas_backports.numpy_validate_any((), {"out": out, "keepdims": keepdims}) - result = pandas.core.nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + result = pandas_backports.nanany(self._ndarray, axis=axis, skipna=skipna) return result def all( @@ -123,22 +122,20 @@ def all( keepdims: bool = False, skipna: bool = True, ): - pandas.compat.numpy.function.validate_all( - (), {"out": out, "keepdims": keepdims} - ) - result = pandas.core.nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + pandas_backports.numpy_validate_all((), {"out": out, "keepdims": keepdims}) + result = pandas_backports.nanall(self._ndarray, axis=axis, skipna=skipna) return result def min(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): - pandas.compat.numpy.function.validate_min((), kwargs) - result = pandas.core.nanops.nanmin( + pandas_backports.numpy_validate_min((), kwargs) + result = pandas_backports.nanmin( values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) return self._box_func(result) def max(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): - pandas.compat.numpy.function.validate_max((), kwargs) - result = pandas.core.nanops.nanmax( + pandas_backports.numpy_validate_max((), kwargs) + result = pandas_backports.nanmax( values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) return self._box_func(result) @@ -154,11 +151,9 @@ def median( keepdims: bool = False, skipna: bool = True, ): - pandas.compat.numpy.function.validate_median( + pandas_backports.numpy_validate_median( (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}, ) - result = pandas.core.nanops.nanmedian( - self._ndarray, axis=axis, skipna=skipna - ) + result = pandas_backports.nanmedian(self._ndarray, axis=axis, skipna=skipna) return self._box_func(result) diff --git a/db_dtypes/pandas_backports.py b/db_dtypes/pandas_backports.py index 9ac1150..f7d9c9f 100644 --- a/db_dtypes/pandas_backports.py +++ b/db_dtypes/pandas_backports.py @@ -25,14 +25,26 @@ import numpy import packaging.version import pandas -from pandas._libs.lib import is_integer # TODO: use public version if available +from pandas.api.types import is_integer import pandas.compat.numpy.function +import pandas.core.nanops pandas_release = packaging.version.parse(pandas.__version__).release # Create aliases for private methods in case they move in a future version. +nanall = pandas.core.nanops.nanall +nanany = pandas.core.nanops.nanany +nanmax = pandas.core.nanops.nanmax +nanmin = pandas.core.nanops.nanmin +numpy_validate_all = pandas.compat.numpy.function.validate_all numpy_validate_any = pandas.compat.numpy.function.validate_any +numpy_validate_max = pandas.compat.numpy.function.validate_max +numpy_validate_min = pandas.compat.numpy.function.validate_min + +if pandas_release >= (1, 2): + nanmedian = pandas.core.nanops.nanmedian + numpy_validate_median = pandas.compat.numpy.function.validate_median def import_default(module_name, force=False, default=None): From a4e89c20db28b7212655450d8753304348b7c146 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 25 Jan 2022 16:39:03 -0600 Subject: [PATCH 5/6] add note about _validate_scalar to docstring --- db_dtypes/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db_dtypes/core.py b/db_dtypes/core.py index 6dd288c..05daf37 100644 --- a/db_dtypes/core.py +++ b/db_dtypes/core.py @@ -100,6 +100,10 @@ def isna(self): return pandas.isna(self._ndarray) def _validate_scalar(self, value): + """ + Validate and convert a scalar value to datetime64[ns] for storage in + backing NumPy array. + """ return self._datetime(value) def any( From 5de6407b947c2461f1f1d79446d8b052c8ca648b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 25 Jan 2022 17:11:04 -0600 Subject: [PATCH 6/6] comment why we can't use public mixin --- db_dtypes/pandas_backports.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/db_dtypes/pandas_backports.py b/db_dtypes/pandas_backports.py index f7d9c9f..4b733cc 100644 --- a/db_dtypes/pandas_backports.py +++ b/db_dtypes/pandas_backports.py @@ -72,7 +72,11 @@ def import_default(module_name, force=False, default=None): return getattr(module, name, default) -@import_default("pandas.core.arraylike") # TODO: is there a public API for this? +# pandas.core.arraylike.OpsMixin is private, but the related public API +# "ExtensionScalarOpsMixin" is not sufficient for adding dates to times. +# It results in unsupported operand type(s) for +: 'datetime.time' and +# 'datetime.date' +@import_default("pandas.core.arraylike") class OpsMixin: def _cmp_method(self, other, op): # pragma: NO COVER return NotImplemented @@ -98,8 +102,8 @@ def __ge__(self, other): __add__ = __radd__ = __sub__ = lambda self, other: NotImplemented -# TODO: use public API if possible -# https://github.com/pandas-dev/pandas/pull/45544/files +# TODO: use public API once pandas 1.5 / 2.x is released. +# See: https://github.com/pandas-dev/pandas/pull/45544 @import_default("pandas.core.arrays._mixins", pandas_release < (1, 3)) class NDArrayBackedExtensionArray(pandas.core.arrays.base.ExtensionArray):