From 246b78768a0ef9d514d918dde51f094d4bddaa5d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Mar 2020 10:26:28 +0100 Subject: [PATCH 01/10] ENH/PERF: use mask in factorize --- pandas/_libs/hashtable_class_helper.pxi.in | 35 +++++++++++++++++----- pandas/core/algorithms.py | 10 +++++-- pandas/core/arrays/integer.py | 12 +++++++- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce3bc519b311..866b16178be48 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable): def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - bint return_inverse=False): + object mask=None, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". return_inverse : boolean, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. 
@@ -409,12 +413,17 @@ cdef class {{name}}HashTable(HashTable): {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud - bint use_na_value + bint use_na_value, use_mask + uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None + use_mask = mask is not None + + if use_mask: + mask_values = mask.view("uint8") if use_na_value: # We need this na_value2 because we want to allow users @@ -430,7 +439,11 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and ( + if ignore_na and use_mask: + if mask_values[i]: + labels[i] = na_sentinel + continue + elif ignore_na and ( {{if not name.lower().startswith(("uint", "int"))}} val != val or {{endif}} @@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable): return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -512,6 +525,10 @@ cdef class {{name}}HashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". 
Returns ------- @@ -522,7 +539,7 @@ cdef class {{name}}HashTable(HashTable): """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=True, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, @@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -873,6 +890,8 @@ cdef class StringHashTable(HashTable): that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for StringHashTable. Returns ------- @@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1112,6 +1131,8 @@ cdef class PyObjectHashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for PyObjectHashTable. 
Returns ------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9afdb82467f90..444b3610ef0a6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -455,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def _factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. @@ -473,6 +473,10 @@ def _factorize_array( parameter when you know that you don't have any values pandas would consider missing in the array (NaN for float data, iNaT for datetimes, etc.). + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". Returns ------- @@ -482,7 +486,9 @@ def _factorize_array( hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) + uniques, codes = table.factorize( + values, na_sentinel=na_sentinel, na_value=na_value, mask=mask + ) codes = ensure_platform_int(codes) return codes, uniques diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f2880c5cbee42..e03b89dfda2b1 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,6 +27,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +from pandas.core.algorithms import _factorize_array import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison @@ -481,7 +482,16 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than 
object-dtype / np.nan. - return self.to_numpy(na_value=np.nan), np.nan + return self.to_numpy(dtype=float, na_value=np.nan), np.nan + + def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + arr = self._data + mask = self._mask + + codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + + uniques = IntegerArray(uniques, np.zeros(len(uniques), dtype=bool)) + return codes, uniques def _values_for_argsort(self) -> np.ndarray: """ From af1cdeaab0b5b2f86f21b060ee348701037affbf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 27 Mar 2020 12:57:19 +0100 Subject: [PATCH 02/10] fix typing --- pandas/core/arrays/integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e03b89dfda2b1..24f5760193432 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -484,7 +484,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # use masked algorithms, rather than object-dtype / np.nan. 
return self.to_numpy(dtype=float, na_value=np.nan), np.nan - def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "IntegerArray"]: arr = self._data mask = self._mask From be5a21dc9a567ee4e2ec6e6be4690c21dae4da04 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Apr 2020 16:27:23 +0200 Subject: [PATCH 03/10] override in factorize, moved to base masked --- pandas/core/arrays/boolean.py | 13 ++++--------- pandas/core/arrays/integer.py | 19 ------------------- pandas/core/arrays/masked.py | 17 +++++++++++++++-- pandas/tests/extension/base/methods.py | 2 ++ 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 442d4ca8cef6d..370b00d3ca562 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -83,6 +83,10 @@ def type(self) -> Type[np.bool_]: def kind(self) -> str: return "b" + @property + def numpy_dtype(self) -> np.dtype: + return np.dtype("bool") + @classmethod def construct_array_type(cls) -> Type["BooleanArray"]: """ @@ -314,15 +318,6 @@ def map_string(s): scalars = [map_string(x) for x in strings] return cls._from_sequence(scalars, dtype, copy) - def _values_for_factorize(self) -> Tuple[np.ndarray, int]: - data = self._data.astype("int8") - data[self._mask] = -1 - return data, -1 - - @classmethod - def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray": - return cls._from_sequence(values, dtype=original.dtype) - _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 9bd80cca747da..3a84ce59bd0e0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,7 +27,6 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops -from 
pandas.core.algorithms import _factorize_array from pandas.core.array_algos import masked_reductions import pandas.core.common as com from pandas.core.indexers import check_array_indexer @@ -367,10 +366,6 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype, copy) - @classmethod - def _from_factorized(cls, values, original) -> "IntegerArray": - return integer_array(values, dtype=original.dtype) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): @@ -480,20 +475,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: - # TODO: https://github.com/pandas-dev/pandas/issues/30037 - # use masked algorithms, rather than object-dtype / np.nan. - return self.to_numpy(dtype=float, na_value=np.nan), np.nan - - def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "IntegerArray"]: - arr = self._data - mask = self._mask - - codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) - - uniques = IntegerArray(uniques, np.zeros(len(uniques), dtype=bool)) - return codes, uniques - def _values_for_argsort(self) -> np.ndarray: """ Return values for sorting. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d23d26d870f75..d2abe7eb340b2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1,14 +1,15 @@ -from typing import TYPE_CHECKING, Optional, Type, TypeVar +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar import numpy as np from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar +from pandas.util._decorators import doc from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype from pandas.core.dtypes.missing import isna, notna -from pandas.core.algorithms import take +from pandas.core.algorithms import _factorize_array, take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.indexers import check_array_indexer @@ -217,6 +218,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: mask = mask.copy() return type(self)(data, mask, copy=False) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + arr = self._data + mask = self._mask + + codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + + # the hashtables don't handle all different types of bits + uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) + uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) + return codes, uniques + def value_counts(self, dropna: bool = True) -> "Series": """ Returns a Series containing counts of each unique value. 
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 22e53dbc89f01..c3d25e8cb26c2 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -133,6 +133,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel): tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) + assert len(uniques_1) == len(pd.unique(uniques_1)) + assert uniques_1.dtype == data_for_grouping.dtype def test_factorize_empty(self, data): codes, uniques = pd.factorize(data[:0]) From b0a88b9e2edc81c8764f933d2bc80caf0dfe5ab6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Apr 2020 16:40:25 +0200 Subject: [PATCH 04/10] add Int64/boolean to factorize benchmarks --- asv_bench/benchmarks/algorithms.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 1768e682b3db4..48c75262b8df4 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -34,7 +34,16 @@ class Factorize: params = [ [True, False], [True, False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "Int64", + "boolean", + ], ] param_names = ["unique", "sort", "dtype"] @@ -49,13 +58,21 @@ def setup(self, unique, sort, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), + "Int64": pd.array(np.arange(N), dtype="Int64"), + "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), }[dtype] if not unique: data = data.repeat(5) self.idx = data + if dtype in ("Int64", "boolean") and sort: + # sort is not a keyword on EAs + raise NotImplementedError def time_factorize(self, unique, sort, dtype): - self.idx.factorize(sort=sort) + if sort: + self.idx.factorize(sort=sort) 
+ else: + self.idx.factorize() class Duplicated: From 2e948428d2240f10661cd66950a2bab8c9d9b8d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Apr 2020 16:43:37 +0200 Subject: [PATCH 05/10] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f847c698278..141edd171875d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,6 +274,7 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). +- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). .. --------------------------------------------------------------------------- From c97d35712fc4fdd3dcce6082837cd3aa220b7fcb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 4 Apr 2020 09:15:49 +0200 Subject: [PATCH 06/10] add # type: ignore --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d2abe7eb340b2..52e31144a1468 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -226,7 +226,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits - uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) + uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) # type: ignore uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) return codes, uniques From a6bc6fc1aedf388aa9977473bdbe937bcee3c60b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Apr 2020 
08:56:21 +0200 Subject: [PATCH 07/10] update benchmark --- asv_bench/benchmarks/algorithms.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 48c75262b8df4..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -63,16 +63,10 @@ def setup(self, unique, sort, dtype): }[dtype] if not unique: data = data.repeat(5) - self.idx = data - if dtype in ("Int64", "boolean") and sort: - # sort is not a keyword on EAs - raise NotImplementedError + self.data = data def time_factorize(self, unique, sort, dtype): - if sort: - self.idx.factorize(sort=sort) - else: - self.idx.factorize() + pd.factorize(self.data, sort=sort) class Duplicated: From 889b1b997242232c811ad0e3a39d954a03018b4b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Apr 2020 22:33:21 +0200 Subject: [PATCH 08/10] add BaseMaskedDtype --- pandas/core/arrays/boolean.py | 16 ++-------------- pandas/core/arrays/integer.py | 6 ++---- pandas/core/arrays/masked.py | 20 +++++++++++++++++++- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d11bf4bb41df2..254ee0b34411e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -9,7 +9,6 @@ from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, @@ -30,14 +29,14 @@ from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer -from .masked import BaseMaskedArray +from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow # noqa: F401 @register_extension_dtype -class BooleanDtype(ExtensionDtype): +class BooleanDtype(BaseMaskedDtype): """ 
Extension dtype for boolean data. @@ -64,17 +63,6 @@ class BooleanDtype(ExtensionDtype): name = "boolean" - @property - def na_value(self) -> libmissing.NAType: - """ - BooleanDtype uses :attr:`pandas.NA` as the missing NA value. - - .. warning:: - - `na_value` may change in a future release. - """ - return libmissing.NA - @property def type(self) -> Type[np.bool_]: return np.bool_ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 649f7f3c4751d..74ea28953325d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -9,7 +9,6 @@ from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, @@ -34,13 +33,13 @@ from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric -from .masked import BaseMaskedArray +from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow # noqa: F401 -class _IntegerDtype(ExtensionDtype): +class _IntegerDtype(BaseMaskedDtype): """ An ExtensionDtype to hold a single size & kind of integer dtype. 
@@ -53,7 +52,6 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 52e31144a1468..dc8b9da039edd 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -4,8 +4,10 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar +from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype from pandas.core.dtypes.missing import isna, notna @@ -20,6 +22,18 @@ BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray") +class BaseMaskedDtype(ExtensionDtype): + """ + Base class for dtypes for BaseMaskedArray subclasses. + """ + + na_value = libmissing.NA + + @property + def numpy_dtype(self) -> np.dtype: + raise AbstractMethodError + + class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ Base class for masked arrays (which use _data and _mask to store the data). 
@@ -38,6 +52,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): self._data = values self._mask = mask + @property + def dtype(self) -> BaseMaskedDtype: + raise AbstractMethodError(self) + def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -226,7 +244,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits - uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) # type: ignore + uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) return codes, uniques From 0370045436d35fb5d6f663fa1150a0135a1765ef Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Apr 2020 22:53:32 +0200 Subject: [PATCH 09/10] use class variable annotation --- pandas/core/arrays/masked.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index dc8b9da039edd..56d355499438c 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -43,6 +43,7 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar + dtype: BaseMaskedDtype def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if copy: @@ -52,10 +53,6 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): self._data = values self._mask = mask - @property - def dtype(self) -> BaseMaskedDtype: - raise AbstractMethodError(self) - def __getitem__(self, item): if is_integer(item): if self._mask[item]: From 6ed52397baa62d339aba2a3fa44f3e719d8a5aa0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 1 May 2020 13:55:17 +0200 Subject: [PATCH 10/10] use abstract dtype --- pandas/core/arrays/masked.py | 5 
++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 59adbb16bc8f6..127de82e318a2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -43,7 +43,6 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar - dtype: BaseMaskedDtype def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): # values is supposed to already be validated in the subclass @@ -64,6 +63,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): self._data = values self._mask = mask + @property + def dtype(self) -> BaseMaskedDtype: + raise AbstractMethodError(self) + def __getitem__(self, item): if is_integer(item): if self._mask[item]: