Skip to content

Commit 56a7184

Browse files
rhshadrachasv-bot
and
asv-bot
authored
BUG: algorithms.factorize moves null values when sort=False (#46601)
* BUG: algos.factorizes moves null values when sort=False * Implementation for pyarrow < 4.0 * fixup * fixups * test fixup * type-hints * improvements * remove override of _from_factorized in string_.py * Rework na_sentinel/dropna/ignore_na * fixups * fixup for pyarrow < 4.0 * whatsnew note * doc fixup * fixups * fixup whatsnew note * whatsnew note; comment on old vs new API Co-authored-by: asv-bot <[email protected]>
1 parent 8819247 commit 56a7184

File tree

12 files changed

+179
-58
lines changed

12 files changed

+179
-58
lines changed

doc/source/whatsnew/v1.5.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ Other enhancements
299299
- Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`)
300300
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`)
301301
- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
302+
- The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
302303

303304
.. ---------------------------------------------------------------------------
304305
.. _whatsnew_150.notable_bug_fixes:
@@ -922,6 +923,7 @@ Numeric
922923
- Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
923924
- Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
924925
- Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
926+
- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
925927

926928
Conversion
927929
^^^^^^^^^^
@@ -1093,6 +1095,7 @@ Groupby/resample/rolling
10931095
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had a NaN values in a :class:`MultiIndex` (:issue:`46783`)
10941096
- Bug in :meth:`DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list which misses the resample key (:issue:`47362`)
10951097
- Bug in :meth:`DataFrame.groupby` would lose index columns when the DataFrame is empty for transforms, like fillna (:issue:`47787`)
1098+
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead the order that they are encountered (:issue:`46584`)
10961099
-
10971100

10981101
Reshaping

pandas/_libs/hashtable_class_helper.pxi.in

+6-6
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ cdef class {{name}}HashTable(HashTable):
711711
return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
712712

713713
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
714-
object na_value=None, object mask=None):
714+
object na_value=None, object mask=None, ignore_na=True):
715715
"""
716716
Calculate unique values and labels (no sorting!)
717717

@@ -743,7 +743,7 @@ cdef class {{name}}HashTable(HashTable):
743743
"""
744744
uniques_vector = {{name}}Vector()
745745
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
746-
na_value=na_value, ignore_na=True, mask=mask,
746+
na_value=na_value, ignore_na=ignore_na, mask=mask,
747747
return_inverse=True)
748748

749749
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1092,7 +1092,7 @@ cdef class StringHashTable(HashTable):
10921092
return_inverse=return_inverse)
10931093

10941094
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1095-
object na_value=None, object mask=None):
1095+
object na_value=None, object mask=None, ignore_na=True):
10961096
"""
10971097
Calculate unique values and labels (no sorting!)
10981098

@@ -1122,7 +1122,7 @@ cdef class StringHashTable(HashTable):
11221122
"""
11231123
uniques_vector = ObjectVector()
11241124
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1125-
na_value=na_value, ignore_na=True,
1125+
na_value=na_value, ignore_na=ignore_na,
11261126
return_inverse=True)
11271127

11281128
def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1347,7 +1347,7 @@ cdef class PyObjectHashTable(HashTable):
13471347
return_inverse=return_inverse)
13481348

13491349
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1350-
object na_value=None, object mask=None):
1350+
object na_value=None, object mask=None, ignore_na=True):
13511351
"""
13521352
Calculate unique values and labels (no sorting!)
13531353

@@ -1377,7 +1377,7 @@ cdef class PyObjectHashTable(HashTable):
13771377
"""
13781378
uniques_vector = ObjectVector()
13791379
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1380-
na_value=na_value, ignore_na=True,
1380+
na_value=na_value, ignore_na=ignore_na,
13811381
return_inverse=True)
13821382

13831383
def get_labels(self, ndarray[object] values, ObjectVector uniques,

pandas/core/algorithms.py

+48-19
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ def f(c, v):
521521

522522
def factorize_array(
523523
values: np.ndarray,
524-
na_sentinel: int = -1,
524+
na_sentinel: int | None = -1,
525525
size_hint: int | None = None,
526526
na_value: object = None,
527527
mask: npt.NDArray[np.bool_] | None = None,
@@ -552,6 +552,10 @@ def factorize_array(
552552
codes : ndarray[np.intp]
553553
uniques : ndarray
554554
"""
555+
ignore_na = na_sentinel is not None
556+
if not ignore_na:
557+
na_sentinel = -1
558+
555559
original = values
556560
if values.dtype.kind in ["m", "M"]:
557561
# _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -564,7 +568,11 @@ def factorize_array(
564568

565569
table = hash_klass(size_hint or len(values))
566570
uniques, codes = table.factorize(
567-
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
571+
values,
572+
na_sentinel=na_sentinel,
573+
na_value=na_value,
574+
mask=mask,
575+
ignore_na=ignore_na,
568576
)
569577

570578
# re-cast e.g. i8->dt64/td64, uint8->bool
@@ -738,6 +746,10 @@ def factorize(
738746
# responsible only for factorization. All data coercion, sorting and boxing
739747
# should happen here.
740748

749+
# GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
750+
# na_sentinel=None corresponds to use_na_sentinel=False
751+
# na_sentinel=-1 correspond to use_na_sentinel=True
752+
# Other na_sentinel values will not be supported when the deprecation is enforced.
741753
na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
742754
if isinstance(values, ABCRangeIndex):
743755
return values.factorize(sort=sort)
@@ -749,10 +761,7 @@ def factorize(
749761

750762
# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
751763
# of values, assign na_sentinel=-1 to replace code value for NaN.
752-
dropna = True
753-
if na_sentinel is None:
754-
na_sentinel = -1
755-
dropna = False
764+
dropna = na_sentinel is not None
756765

757766
if (
758767
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
@@ -765,38 +774,58 @@ def factorize(
765774

766775
elif not isinstance(values.dtype, np.dtype):
767776
if (
768-
na_sentinel == -1
769-
and "use_na_sentinel" in inspect.signature(values.factorize).parameters
770-
):
777+
na_sentinel == -1 or na_sentinel is None
778+
) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
771779
# Avoid using catch_warnings when possible
772780
# GH#46910 - TimelikeOps has deprecated signature
773781
codes, uniques = values.factorize( # type: ignore[call-arg]
774-
use_na_sentinel=True
782+
use_na_sentinel=na_sentinel is not None
775783
)
776784
else:
785+
na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
777786
with warnings.catch_warnings():
778787
# We've already warned above
779788
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
780-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
789+
codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)
781790

782791
else:
783792
values = np.asarray(values) # convert DTA/TDA/MultiIndex
793+
# TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
794+
# na_sentinel is None we append NA on the end because safe_sort does not
795+
# handle null values in uniques.
796+
if na_sentinel is None and sort:
797+
na_sentinel_arg = -1
798+
elif na_sentinel is None:
799+
na_sentinel_arg = None
800+
else:
801+
na_sentinel_arg = na_sentinel
784802
codes, uniques = factorize_array(
785-
values, na_sentinel=na_sentinel, size_hint=size_hint
803+
values,
804+
na_sentinel=na_sentinel_arg,
805+
size_hint=size_hint,
786806
)
787807

788808
if sort and len(uniques) > 0:
809+
if na_sentinel is None:
810+
# TODO: Can remove when na_sentinel=na_sentinel as in TODO above
811+
na_sentinel = -1
789812
uniques, codes = safe_sort(
790813
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
791814
)
792815

793-
code_is_na = codes == na_sentinel
794-
if not dropna and code_is_na.any():
795-
# na_value is set based on the dtype of uniques, and compat set to False is
796-
# because we do not want na_value to be 0 for integers
797-
na_value = na_value_for_dtype(uniques.dtype, compat=False)
798-
uniques = np.append(uniques, [na_value])
799-
codes = np.where(code_is_na, len(uniques) - 1, codes)
816+
if not dropna and sort:
817+
# TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
818+
if na_sentinel is None:
819+
na_sentinel_arg = -1
820+
else:
821+
na_sentinel_arg = na_sentinel
822+
code_is_na = codes == na_sentinel_arg
823+
if code_is_na.any():
824+
# na_value is set based on the dtype of uniques, and compat set to False is
825+
# because we do not want na_value to be 0 for integers
826+
na_value = na_value_for_dtype(uniques.dtype, compat=False)
827+
uniques = np.append(uniques, [na_value])
828+
codes = np.where(code_is_na, len(uniques) - 1, codes)
800829

801830
uniques = _reconstruct_data(uniques, original.dtype, original)
802831

pandas/core/arrays/arrow/array.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -551,20 +551,32 @@ def factorize(
551551
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
552552
) -> tuple[np.ndarray, ExtensionArray]:
553553
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
554-
if resolved_na_sentinel is None:
555-
raise NotImplementedError("Encoding NaN values is not yet implemented")
554+
if pa_version_under4p0:
555+
encoded = self._data.dictionary_encode()
556556
else:
557-
na_sentinel = resolved_na_sentinel
558-
encoded = self._data.dictionary_encode()
557+
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
558+
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
559559
indices = pa.chunked_array(
560560
[c.indices for c in encoded.chunks], type=encoded.type.index_type
561561
).to_pandas()
562562
if indices.dtype.kind == "f":
563-
indices[np.isnan(indices)] = na_sentinel
563+
indices[np.isnan(indices)] = (
564+
resolved_na_sentinel if resolved_na_sentinel is not None else -1
565+
)
564566
indices = indices.astype(np.int64, copy=False)
565567

566568
if encoded.num_chunks:
567569
uniques = type(self)(encoded.chunk(0).dictionary)
570+
if resolved_na_sentinel is None and pa_version_under4p0:
571+
# TODO: share logic with BaseMaskedArray.factorize
572+
# Insert na with the proper code
573+
na_mask = indices.values == -1
574+
na_index = na_mask.argmax()
575+
if na_mask[na_index]:
576+
uniques = uniques.insert(na_index, self.dtype.na_value)
577+
na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
578+
indices[indices >= na_code] += 1
579+
indices[indices == -1] = na_code
568580
else:
569581
uniques = type(self)(pa.array([], type=encoded.type.value_type))
570582

pandas/core/arrays/base.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -1081,14 +1081,10 @@ def factorize(
10811081
# 2. ExtensionArray.factorize.
10821082
# Complete control over factorization.
10831083
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
1084-
if resolved_na_sentinel is None:
1085-
raise NotImplementedError("Encoding NaN values is not yet implemented")
1086-
else:
1087-
na_sentinel = resolved_na_sentinel
10881084
arr, na_value = self._values_for_factorize()
10891085

10901086
codes, uniques = factorize_array(
1091-
arr, na_sentinel=na_sentinel, na_value=na_value
1087+
arr, na_sentinel=resolved_na_sentinel, na_value=na_value
10921088
)
10931089

10941090
uniques_ea = self._from_factorized(uniques, self)

pandas/core/arrays/masked.py

+26-6
Original file line numberDiff line numberDiff line change
@@ -886,19 +886,39 @@ def factorize(
886886
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
887887
) -> tuple[np.ndarray, ExtensionArray]:
888888
resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
889-
if resolved_na_sentinel is None:
890-
raise NotImplementedError("Encoding NaN values is not yet implemented")
891-
else:
892-
na_sentinel = resolved_na_sentinel
893889
arr = self._data
894890
mask = self._mask
895891

896-
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
892+
# Pass non-None na_sentinel; recode and add NA to uniques if necessary below
893+
na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel
894+
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask)
897895

898896
# check that factorize_array correctly preserves dtype.
899897
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
900898

901-
uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
899+
has_na = mask.any()
900+
if resolved_na_sentinel is not None or not has_na:
901+
size = len(uniques)
902+
else:
903+
# Make room for an NA value
904+
size = len(uniques) + 1
905+
uniques_mask = np.zeros(size, dtype=bool)
906+
if resolved_na_sentinel is None and has_na:
907+
na_index = mask.argmax()
908+
# Insert na with the proper code
909+
if na_index == 0:
910+
na_code = np.intp(0)
911+
else:
912+
# mypy error: Slice index must be an integer or None
913+
# https://github.com/python/mypy/issues/2410
914+
na_code = codes[:na_index].argmax() + 1 # type: ignore[misc]
915+
codes[codes >= na_code] += 1
916+
codes[codes == -1] = na_code
917+
# dummy value for uniques; not used since uniques_mask will be True
918+
uniques = np.insert(uniques, na_code, 0)
919+
uniques_mask[na_code] = True
920+
uniques_ea = type(self)(uniques, uniques_mask)
921+
902922
return codes, uniques_ea
903923

904924
@doc(ExtensionArray._values_for_argsort)

pandas/core/arrays/sparse/array.py

+4
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,10 @@ def factorize(
879879
codes, uniques = algos.factorize(
880880
np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
881881
)
882+
if na_sentinel is lib.no_default:
883+
na_sentinel = -1
884+
if use_na_sentinel is lib.no_default or use_na_sentinel:
885+
codes[codes == -1] = na_sentinel
882886
uniques_sp = SparseArray(uniques, dtype=self.dtype)
883887
return codes, uniques_sp
884888

pandas/core/arrays/string_.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,8 @@ def __arrow_array__(self, type=None):
380380
def _values_for_factorize(self):
381381
arr = self._ndarray.copy()
382382
mask = self.isna()
383-
arr[mask] = -1
384-
return arr, -1
383+
arr[mask] = None
384+
return arr, None
385385

386386
def __setitem__(self, key, value):
387387
value = extract_array(value, extract_numpy=True)

pandas/core/groupby/grouper.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -658,9 +658,10 @@ def group_index(self) -> Index:
658658

659659
@cache_readonly
660660
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
661-
if self._passed_categorical:
661+
if self._dropna and self._passed_categorical:
662662
# we make a CategoricalIndex out of the cat grouper
663-
# preserving the categories / ordered attributes
663+
# preserving the categories / ordered attributes;
664+
# doesn't (yet - GH#46909) handle dropna=False
664665
cat = self.grouping_vector
665666
categories = cat.categories
666667

0 commit comments

Comments
 (0)