Skip to content

Commit a375061

Browse files
authored
REGR: Performance decrease in factorize (#48620)
1 parent 71fc89c commit a375061

File tree

2 files changed

+13
-11
lines changed

2 files changed

+13
-11
lines changed

doc/source/whatsnew/v1.5.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`)
18+
- Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
1819
-
1920

2021
.. ---------------------------------------------------------------------------

pandas/core/algorithms.py

+12-11
Original file line numberDiff line numberDiff line change
@@ -569,17 +569,6 @@ def factorize_array(
569569

570570
hash_klass, values = _get_hashtable_algo(values)
571571

572-
# factorize can now handle differentiating various types of null values.
573-
# However, for backwards compatibility we only use the null for the
574-
# provided dtype. This may be revisited in the future, see GH#48476.
575-
null_mask = isna(values)
576-
if null_mask.any():
577-
na_value = na_value_for_dtype(values.dtype, compat=False)
578-
# Don't modify (potentially user-provided) array
579-
# error: No overload variant of "where" matches argument types "Any", "object",
580-
# "ndarray[Any, Any]"
581-
values = np.where(null_mask, na_value, values) # type: ignore[call-overload]
582-
583572
table = hash_klass(size_hint or len(values))
584573
uniques, codes = table.factorize(
585574
values,
@@ -813,6 +802,18 @@ def factorize(
813802
na_sentinel_arg = None
814803
else:
815804
na_sentinel_arg = na_sentinel
805+
806+
if not dropna and not sort and is_object_dtype(values):
807+
# factorize can now handle differentiating various types of null values.
808+
# These can only occur when the array has object dtype.
809+
# However, for backwards compatibility we only use the null for the
810+
# provided dtype. This may be revisited in the future, see GH#48476.
811+
null_mask = isna(values)
812+
if null_mask.any():
813+
na_value = na_value_for_dtype(values.dtype, compat=False)
814+
# Don't modify (potentially user-provided) array
815+
values = np.where(null_mask, na_value, values)
816+
816817
codes, uniques = factorize_array(
817818
values,
818819
na_sentinel=na_sentinel_arg,

0 commit comments

Comments
 (0)