diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 1768e682b3db4..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -34,7 +34,16 @@ class Factorize: params = [ [True, False], [True, False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "Int64", + "boolean", + ], ] param_names = ["unique", "sort", "dtype"] @@ -49,13 +58,15 @@ def setup(self, unique, sort, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), + "Int64": pd.array(np.arange(N), dtype="Int64"), + "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), }[dtype] if not unique: data = data.repeat(5) - self.idx = data + self.data = data def time_factorize(self, unique, sort, dtype): - self.idx.factorize(sort=sort) + pd.factorize(self.data, sort=sort) class Duplicated: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3ce0db2cf38d0..95cb4ccbbb796 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -523,6 +523,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). +- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c251c92cb072a..ad65f9707610b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable): def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - bint return_inverse=False): + object mask=None, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". return_inverse : boolean, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. 
@@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable): {{dtype}}_t val, na_value2 khiter_t k {{name}}VectorData *ud - bint use_na_value + bint use_na_value, use_mask + uint8_t[:] mask_values if return_inverse: labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None + use_mask = mask is not None + + if use_mask: + mask_values = mask.view("uint8") if use_na_value: # We need this na_value2 because we want to allow users @@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and ( + if ignore_na and use_mask: + if mask_values[i]: + labels[i] = na_sentinel + continue + elif ignore_na and ( {{if not name.lower().startswith(("uint", "int"))}} val != val or {{endif}} @@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable): return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". 
Returns ------- @@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable): """ uniques_vector = {{name}}Vector() return self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, + na_value=na_value, ignore_na=True, mask=mask, return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, @@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable): that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for StringHashTable. Returns ------- @@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable): return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + mask : ndarray[bool], optional + Not yet implemented for PyObjectHashTable. 
Returns ------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b97063cfa7fd0..90927adf19885 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -461,7 +461,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def _factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. @@ -479,6 +479,10 @@ def _factorize_array( parameter when you know that you don't have any values pandas would consider missing in the array (NaN for float data, iNaT for datetimes, etc.). + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or + condition "val != val". Returns ------- @@ -488,7 +492,9 @@ def _factorize_array( hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) + uniques, codes = table.factorize( + values, na_sentinel=na_sentinel, na_value=na_value, mask=mask + ) codes = ensure_platform_int(codes) return codes, uniques diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 685a9ec48228f..b1d318d4d9678 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -9,7 +9,6 @@ from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, @@ -30,14 +29,14 @@ from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer -from .masked import BaseMaskedArray +from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import pyarrow # 
noqa: F401 @register_extension_dtype -class BooleanDtype(ExtensionDtype): +class BooleanDtype(BaseMaskedDtype): """ Extension dtype for boolean data. @@ -64,17 +63,6 @@ class BooleanDtype(ExtensionDtype): name = "boolean" - @property - def na_value(self) -> libmissing.NAType: - """ - BooleanDtype uses :attr:`pandas.NA` as the missing NA value. - - .. warning:: - - `na_value` may change in a future release. - """ - return libmissing.NA - @property def type(self) -> Type[np.bool_]: return np.bool_ @@ -83,6 +71,10 @@ def type(self) -> Type[np.bool_]: def kind(self) -> str: return "b" + @property + def numpy_dtype(self) -> np.dtype: + return np.dtype("bool") + @classmethod def construct_array_type(cls) -> Type["BooleanArray"]: """ @@ -304,15 +296,6 @@ def map_string(s): scalars = [map_string(x) for x in strings] return cls._from_sequence(scalars, dtype, copy) - def _values_for_factorize(self) -> Tuple[np.ndarray, int]: - data = self._data.astype("int8") - data[self._mask] = -1 - return data, -1 - - @classmethod - def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray": - return cls._from_sequence(values, dtype=original.dtype) - _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 743267534bfaa..5a90ea4a36a21 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -10,7 +10,6 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, @@ -34,13 +33,13 @@ from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric -from .masked import BaseMaskedArray +from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: import 
pyarrow # noqa: F401 -class _IntegerDtype(ExtensionDtype): +class _IntegerDtype(BaseMaskedDtype): """ An ExtensionDtype to hold a single size & kind of integer dtype. @@ -53,7 +52,6 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -372,10 +370,6 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype, copy) - @classmethod - def _from_factorized(cls, values, original) -> "IntegerArray": - return integer_array(values, dtype=original.dtype) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): @@ -485,11 +479,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: - # TODO: https://github.com/pandas-dev/pandas/issues/30037 - # use masked algorithms, rather than object-dtype / np.nan. - return self.to_numpy(na_value=np.nan), np.nan - def _values_for_argsort(self) -> np.ndarray: """ Return values for sorting. 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fc5b307bd5754..127de82e318a2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1,14 +1,17 @@ -from typing import TYPE_CHECKING, Optional, Type, TypeVar +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar import numpy as np from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar +from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype from pandas.core.dtypes.missing import isna, notna -from pandas.core.algorithms import take +from pandas.core.algorithms import _factorize_array, take from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.indexers import check_array_indexer @@ -19,6 +22,18 @@ BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray") +class BaseMaskedDtype(ExtensionDtype): + """ + Base class for dtypes for BaseMaskedArray subclasses. + """ + + na_value = libmissing.NA + + @property + def numpy_dtype(self) -> np.dtype: + raise AbstractMethodError(self) + + class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ Base class for masked arrays (which use _data and _mask to store the data). 
@@ -48,6 +63,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): self._data = values self._mask = mask + @property + def dtype(self) -> BaseMaskedDtype: + raise AbstractMethodError(self) + def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -228,6 +247,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: mask = mask.copy() return type(self)(data, mask, copy=False) + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + arr = self._data + mask = self._mask + + codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + + # the hashtables don't handle all different types of bits + uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) + uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) + return codes, uniques + def value_counts(self, dropna: bool = True) -> "Series": """ Returns a Series containing counts of each unique value. diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4a6d827b36b02..874a8dfd4253f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -147,6 +147,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel): tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) + assert len(uniques_1) == len(pd.unique(uniques_1)) + assert uniques_1.dtype == data_for_grouping.dtype def test_factorize_empty(self, data): codes, uniques = pd.factorize(data[:0])