Skip to content

Commit 89257e2

Browse files
jorisvandenbossche and rhshadrach
authored and committed
ENH/PERF: use mask in factorize for nullable dtypes (pandas-dev#33064)
1 parent 67b0cc4 commit 89257e2

File tree

8 files changed

+94
-50
lines changed

8 files changed

+94
-50
lines changed

asv_bench/benchmarks/algorithms.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,16 @@ class Factorize:
3434
params = [
3535
[True, False],
3636
[True, False],
37-
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
37+
[
38+
"int",
39+
"uint",
40+
"float",
41+
"string",
42+
"datetime64[ns]",
43+
"datetime64[ns, tz]",
44+
"Int64",
45+
"boolean",
46+
],
3847
]
3948
param_names = ["unique", "sort", "dtype"]
4049

@@ -49,13 +58,15 @@ def setup(self, unique, sort, dtype):
4958
"datetime64[ns, tz]": pd.date_range(
5059
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
5160
),
61+
"Int64": pd.array(np.arange(N), dtype="Int64"),
62+
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
5263
}[dtype]
5364
if not unique:
5465
data = data.repeat(5)
55-
self.idx = data
66+
self.data = data
5667

5768
def time_factorize(self, unique, sort, dtype):
58-
self.idx.factorize(sort=sort)
69+
pd.factorize(self.data, sort=sort)
5970

6071

6172
class Duplicated:

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ Performance improvements
523523
sparse values from ``scipy.sparse`` matrices using the
524524
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
525525
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
526+
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
526527
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
527528

528529

pandas/_libs/hashtable_class_helper.pxi.in

+28-7
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
365365
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
366366
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
367367
object na_value=None, bint ignore_na=False,
368-
bint return_inverse=False):
368+
object mask=None, bint return_inverse=False):
369369
"""
370370
Calculate unique values and labels (no sorting!)
371371

@@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable):
388388
Whether NA-values should be ignored for calculating the uniques. If
389389
True, the labels corresponding to missing values will be set to
390390
na_sentinel.
391+
mask : ndarray[bool], optional
392+
If not None, the mask is used as indicator for missing values
393+
(True = missing, False = valid) instead of `na_value` or
394+
condition "val != val".
391395
return_inverse : boolean, default False
392396
Whether the mapping of the original array values to their location
393397
in the vector of uniques should be returned.
@@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable):
406410
{{dtype}}_t val, na_value2
407411
khiter_t k
408412
{{name}}VectorData *ud
409-
bint use_na_value
413+
bint use_na_value, use_mask
414+
uint8_t[:] mask_values
410415

411416
if return_inverse:
412417
labels = np.empty(n, dtype=np.int64)
413418
ud = uniques.data
414419
use_na_value = na_value is not None
420+
use_mask = mask is not None
421+
422+
if use_mask:
423+
mask_values = mask.view("uint8")
415424

416425
if use_na_value:
417426
# We need this na_value2 because we want to allow users
@@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable):
427436
for i in range(n):
428437
val = values[i]
429438

430-
if ignore_na and (
439+
if ignore_na and use_mask:
440+
if mask_values[i]:
441+
labels[i] = na_sentinel
442+
continue
443+
elif ignore_na and (
431444
{{if not name.lower().startswith(("uint", "int"))}}
432445
val != val or
433446
{{endif}}
@@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable):
491504
return_inverse=return_inverse)
492505

493506
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
494-
object na_value=None):
507+
object na_value=None, object mask=None):
495508
"""
496509
Calculate unique values and labels (no sorting!)
497510

@@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable):
509522
any value "val" satisfying val != val is considered missing.
510523
If na_value is not None, then _additionally_, any value "val"
511524
satisfying val == na_value is considered missing.
525+
mask : ndarray[bool], optional
526+
If not None, the mask is used as indicator for missing values
527+
(True = missing, False = valid) instead of `na_value` or
528+
condition "val != val".
512529

513530
Returns
514531
-------
@@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable):
519536
"""
520537
uniques_vector = {{name}}Vector()
521538
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
522-
na_value=na_value, ignore_na=True,
539+
na_value=na_value, ignore_na=True, mask=mask,
523540
return_inverse=True)
524541

525542
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable):
852869
return_inverse=return_inverse)
853870

854871
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
855-
object na_value=None):
872+
object na_value=None, object mask=None):
856873
"""
857874
Calculate unique values and labels (no sorting!)
858875

@@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable):
870887
that is not a string is considered missing. If na_value is
871888
not None, then _additionally_ any value "val" satisfying
872889
val == na_value is considered missing.
890+
mask : ndarray[bool], optional
891+
Not yet implemented for StringHashTable.
873892

874893
Returns
875894
-------
@@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable):
10911110
return_inverse=return_inverse)
10921111

10931112
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1094-
object na_value=None):
1113+
object na_value=None, object mask=None):
10951114
"""
10961115
Calculate unique values and labels (no sorting!)
10971116

@@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable):
11091128
any value "val" satisfying val != val is considered missing.
11101129
If na_value is not None, then _additionally_, any value "val"
11111130
satisfying val == na_value is considered missing.
1131+
mask : ndarray[bool], optional
1132+
Not yet implemented for PyObjectHashTable.
11121133

11131134
Returns
11141135
-------

pandas/core/algorithms.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
461461

462462

463463
def _factorize_array(
464-
values, na_sentinel: int = -1, size_hint=None, na_value=None
464+
values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None,
465465
) -> Tuple[np.ndarray, np.ndarray]:
466466
"""
467467
Factorize an array-like to codes and uniques.
@@ -479,6 +479,10 @@ def _factorize_array(
479479
parameter when you know that you don't have any values pandas would
480480
consider missing in the array (NaN for float data, iNaT for
481481
datetimes, etc.).
482+
mask : ndarray[bool], optional
483+
If not None, the mask is used as indicator for missing values
484+
(True = missing, False = valid) instead of `na_value` or
485+
condition "val != val".
482486
483487
Returns
484488
-------
@@ -488,7 +492,9 @@ def _factorize_array(
488492
hash_klass, values = _get_data_algo(values)
489493

490494
table = hash_klass(size_hint or len(values))
491-
uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
495+
uniques, codes = table.factorize(
496+
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
497+
)
492498

493499
codes = ensure_platform_int(codes)
494500
return codes, uniques

pandas/core/arrays/boolean.py

+6-23
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from pandas.compat import set_function_name
1010
from pandas.compat.numpy import function as nv
1111

12-
from pandas.core.dtypes.base import ExtensionDtype
1312
from pandas.core.dtypes.cast import astype_nansafe
1413
from pandas.core.dtypes.common import (
1514
is_bool_dtype,
@@ -30,14 +29,14 @@
3029
from pandas.core.array_algos import masked_reductions
3130
from pandas.core.indexers import check_array_indexer
3231

33-
from .masked import BaseMaskedArray
32+
from .masked import BaseMaskedArray, BaseMaskedDtype
3433

3534
if TYPE_CHECKING:
3635
import pyarrow # noqa: F401
3736

3837

3938
@register_extension_dtype
40-
class BooleanDtype(ExtensionDtype):
39+
class BooleanDtype(BaseMaskedDtype):
4140
"""
4241
Extension dtype for boolean data.
4342
@@ -64,17 +63,6 @@ class BooleanDtype(ExtensionDtype):
6463

6564
name = "boolean"
6665

67-
@property
68-
def na_value(self) -> libmissing.NAType:
69-
"""
70-
BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
71-
72-
.. warning::
73-
74-
`na_value` may change in a future release.
75-
"""
76-
return libmissing.NA
77-
7866
@property
7967
def type(self) -> Type[np.bool_]:
8068
return np.bool_
@@ -83,6 +71,10 @@ def type(self) -> Type[np.bool_]:
8371
def kind(self) -> str:
8472
return "b"
8573

74+
@property
75+
def numpy_dtype(self) -> np.dtype:
76+
return np.dtype("bool")
77+
8678
@classmethod
8779
def construct_array_type(cls) -> Type["BooleanArray"]:
8880
"""
@@ -304,15 +296,6 @@ def map_string(s):
304296
scalars = [map_string(x) for x in strings]
305297
return cls._from_sequence(scalars, dtype, copy)
306298

307-
def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
308-
data = self._data.astype("int8")
309-
data[self._mask] = -1
310-
return data, -1
311-
312-
@classmethod
313-
def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
314-
return cls._from_sequence(values, dtype=original.dtype)
315-
316299
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
317300

318301
def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):

pandas/core/arrays/integer.py

+2-13
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from pandas.compat.numpy import function as nv
1111
from pandas.util._decorators import cache_readonly
1212

13-
from pandas.core.dtypes.base import ExtensionDtype
1413
from pandas.core.dtypes.cast import astype_nansafe
1514
from pandas.core.dtypes.common import (
1615
is_bool_dtype,
@@ -34,13 +33,13 @@
3433
from pandas.core.ops.common import unpack_zerodim_and_defer
3534
from pandas.core.tools.numeric import to_numeric
3635

37-
from .masked import BaseMaskedArray
36+
from .masked import BaseMaskedArray, BaseMaskedDtype
3837

3938
if TYPE_CHECKING:
4039
import pyarrow # noqa: F401
4140

4241

43-
class _IntegerDtype(ExtensionDtype):
42+
class _IntegerDtype(BaseMaskedDtype):
4443
"""
4544
An ExtensionDtype to hold a single size & kind of integer dtype.
4645
@@ -53,7 +52,6 @@ class _IntegerDtype(ExtensionDtype):
5352
name: str
5453
base = None
5554
type: Type
56-
na_value = libmissing.NA
5755

5856
def __repr__(self) -> str:
5957
sign = "U" if self.is_unsigned_integer else ""
@@ -372,10 +370,6 @@ def _from_sequence_of_strings(
372370
scalars = to_numeric(strings, errors="raise")
373371
return cls._from_sequence(scalars, dtype, copy)
374372

375-
@classmethod
376-
def _from_factorized(cls, values, original) -> "IntegerArray":
377-
return integer_array(values, dtype=original.dtype)
378-
379373
_HANDLED_TYPES = (np.ndarray, numbers.Number)
380374

381375
def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
@@ -485,11 +479,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
485479
data = self.to_numpy(dtype=dtype, **kwargs)
486480
return astype_nansafe(data, dtype, copy=False)
487481

488-
def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
489-
# TODO: https://github.com/pandas-dev/pandas/issues/30037
490-
# use masked algorithms, rather than object-dtype / np.nan.
491-
return self.to_numpy(na_value=np.nan), np.nan
492-
493482
def _values_for_argsort(self) -> np.ndarray:
494483
"""
495484
Return values for sorting.

pandas/core/arrays/masked.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1-
from typing import TYPE_CHECKING, Optional, Type, TypeVar
1+
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
22

33
import numpy as np
44

55
from pandas._libs import lib, missing as libmissing
66
from pandas._typing import Scalar
7+
from pandas.errors import AbstractMethodError
8+
from pandas.util._decorators import doc
79

10+
from pandas.core.dtypes.base import ExtensionDtype
811
from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
912
from pandas.core.dtypes.missing import isna, notna
1013

11-
from pandas.core.algorithms import take
14+
from pandas.core.algorithms import _factorize_array, take
1215
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
1316
from pandas.core.indexers import check_array_indexer
1417

@@ -19,6 +22,18 @@
1922
BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
2023

2124

25+
class BaseMaskedDtype(ExtensionDtype):
26+
"""
27+
Base class for dtypes for BaseMaskedArray subclasses.
28+
"""
29+
30+
na_value = libmissing.NA
31+
32+
@property
33+
def numpy_dtype(self) -> np.dtype:
34+
raise AbstractMethodError
35+
36+
2237
class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
2338
"""
2439
Base class for masked arrays (which use _data and _mask to store the data).
@@ -48,6 +63,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
4863
self._data = values
4964
self._mask = mask
5065

66+
@property
67+
def dtype(self) -> BaseMaskedDtype:
68+
raise AbstractMethodError(self)
69+
5170
def __getitem__(self, item):
5271
if is_integer(item):
5372
if self._mask[item]:
@@ -228,6 +247,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
228247
mask = mask.copy()
229248
return type(self)(data, mask, copy=False)
230249

250+
@doc(ExtensionArray.factorize)
251+
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
252+
arr = self._data
253+
mask = self._mask
254+
255+
codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
256+
257+
# the hashtables don't handle all different types of bits
258+
uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
259+
uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
260+
return codes, uniques
261+
231262
def value_counts(self, dropna: bool = True) -> "Series":
232263
"""
233264
Returns a Series containing counts of each unique value.

pandas/tests/extension/base/methods.py

+2
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
147147

148148
tm.assert_numpy_array_equal(codes_1, codes_2)
149149
self.assert_extension_array_equal(uniques_1, uniques_2)
150+
assert len(uniques_1) == len(pd.unique(uniques_1))
151+
assert uniques_1.dtype == data_for_grouping.dtype
150152

151153
def test_factorize_empty(self, data):
152154
codes, uniques = pd.factorize(data[:0])

0 commit comments

Comments
 (0)