Skip to content

Commit 670c2e8

Browse files
committed
BUG: algos.factorize moves null values when sort=False
1 parent b6f21f3 commit 670c2e8

File tree

13 files changed

+155
-42
lines changed

13 files changed

+155
-42
lines changed

doc/source/whatsnew/v1.5.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ Other enhancements
9696
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
9797
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
9898
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
99+
- The method :meth:`.ExtensionArray.factorize` has gained the argument ``dropna`` for determining how null values are to be treated. (:issue:`46601`)
99100

100101
.. ---------------------------------------------------------------------------
101102
.. _whatsnew_150.notable_bug_fixes:
@@ -605,6 +606,7 @@ Groupby/resample/rolling
605606
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
606607
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
607608
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
609+
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` with ``dropna=False`` and ``sort=False`` would put any null groups at the end instead of the order in which they are encountered (:issue:`46584`)
608610

609611
Reshaping
610612
^^^^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

+6-6
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable):
658658
return_inverse=return_inverse)
659659

660660
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
661-
object na_value=None, object mask=None):
661+
object na_value=None, object mask=None, ignore_na=True):
662662
"""
663663
Calculate unique values and labels (no sorting!)
664664

@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable):
690690
"""
691691
uniques_vector = {{name}}Vector()
692692
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
693-
na_value=na_value, ignore_na=True, mask=mask,
693+
na_value=na_value, ignore_na=ignore_na, mask=mask,
694694
return_inverse=True)
695695

696696
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable):
10371037
return_inverse=return_inverse)
10381038

10391039
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1040-
object na_value=None, object mask=None):
1040+
object na_value=None, object mask=None, ignore_na=True):
10411041
"""
10421042
Calculate unique values and labels (no sorting!)
10431043

@@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable):
10671067
"""
10681068
uniques_vector = ObjectVector()
10691069
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1070-
na_value=na_value, ignore_na=True,
1070+
na_value=na_value, ignore_na=ignore_na,
10711071
return_inverse=True)
10721072

10731073
def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable):
12901290
return_inverse=return_inverse)
12911291

12921292
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1293-
object na_value=None, object mask=None):
1293+
object na_value=None, object mask=None, ignore_na=True):
12941294
"""
12951295
Calculate unique values and labels (no sorting!)
12961296

@@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable):
13201320
"""
13211321
uniques_vector = ObjectVector()
13221322
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1323-
na_value=na_value, ignore_na=True,
1323+
na_value=na_value, ignore_na=ignore_na,
13241324
return_inverse=True)
13251325

13261326
def get_labels(self, ndarray[object] values, ObjectVector uniques,

pandas/core/algorithms.py

+30-10
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ def factorize_array(
502502
size_hint: int | None = None,
503503
na_value=None,
504504
mask: npt.NDArray[np.bool_] | None = None,
505+
dropna: bool = True,
505506
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
506507
"""
507508
Factorize a numpy array to codes and uniques.
@@ -523,6 +524,11 @@ def factorize_array(
523524
If not None, the mask is used as indicator for missing values
524525
(True = missing, False = valid) instead of `na_value` or
525526
condition "val != val".
527+
dropna : bool, default True
528+
Whether null values will appear in uniques. When False, null values
529+
will receive a nonnegative code instead of na_sentinel.
530+
531+
.. versionadded:: 1.5.0
526532
527533
Returns
528534
-------
@@ -541,7 +547,11 @@ def factorize_array(
541547

542548
table = hash_klass(size_hint or len(values))
543549
uniques, codes = table.factorize(
544-
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
550+
values,
551+
na_sentinel=na_sentinel,
552+
na_value=na_value,
553+
mask=mask,
554+
ignore_na=dropna,
545555
)
546556

547557
# re-cast e.g. i8->dt64/td64, uint8->bool
@@ -728,25 +738,35 @@ def factorize(
728738

729739
if not isinstance(values.dtype, np.dtype):
730740
# i.e. ExtensionDtype
731-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
741+
# TODO: pass ignore_na=dropna. When sort is True we ignore_na here and append
742+
# on the end because safe_sort does not handle null values in uniques
743+
codes, uniques = values.factorize(
744+
na_sentinel=na_sentinel, dropna=dropna or sort
745+
)
732746
else:
733747
values = np.asarray(values) # convert DTA/TDA/MultiIndex
748+
# TODO: pass ignore_na=dropna; see above
734749
codes, uniques = factorize_array(
735-
values, na_sentinel=na_sentinel, size_hint=size_hint
750+
values,
751+
na_sentinel=na_sentinel,
752+
size_hint=size_hint,
753+
dropna=dropna or sort,
736754
)
737755

738756
if sort and len(uniques) > 0:
739757
uniques, codes = safe_sort(
740758
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
741759
)
742760

743-
code_is_na = codes == na_sentinel
744-
if not dropna and code_is_na.any():
745-
# na_value is set based on the dtype of uniques, and compat set to False is
746-
# because we do not want na_value to be 0 for integers
747-
na_value = na_value_for_dtype(uniques.dtype, compat=False)
748-
uniques = np.append(uniques, [na_value])
749-
codes = np.where(code_is_na, len(uniques) - 1, codes)
761+
if not dropna and sort:
762+
# TODO: Can remove if we pass ignore_na=dropna; see above
763+
code_is_na = codes == na_sentinel
764+
if code_is_na.any():
765+
# na_value is set based on the dtype of uniques, and compat set to False is
766+
# because we do not want na_value to be 0 for integers
767+
na_value = na_value_for_dtype(uniques.dtype, compat=False)
768+
uniques = np.append(uniques, [na_value])
769+
codes = np.where(code_is_na, len(uniques) - 1, codes)
750770

751771
uniques = _reconstruct_data(uniques, original.dtype, original)
752772

pandas/core/arrays/arrow/array.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,11 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
105105
return type(self)(self._data)
106106

107107
@doc(ExtensionArray.factorize)
108-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
109-
encoded = self._data.dictionary_encode()
108+
def factorize(
109+
self, na_sentinel: int = -1, dropna=True
110+
) -> tuple[np.ndarray, ExtensionArray]:
111+
null_encoding = "mask" if dropna else "encode"
112+
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
110113
indices = pa.chunked_array(
111114
[c.indices for c in encoded.chunks], type=encoded.type.index_type
112115
).to_pandas()

pandas/core/arrays/base.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1002,14 +1002,21 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021002
"""
10031003
return self.astype(object), np.nan
10041004

1005-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
1005+
def factorize(
1006+
self, na_sentinel: int = -1, dropna: bool = True
1007+
) -> tuple[np.ndarray, ExtensionArray]:
10061008
"""
10071009
Encode the extension array as an enumerated type.
10081010
10091011
Parameters
10101012
----------
10111013
na_sentinel : int, default -1
10121014
Value to use in the `codes` array to indicate missing values.
1015+
dropna : bool, default True
1016+
Whether null values will appear in uniques. When False, null values
1017+
will receive a nonnegative code instead of na_sentinel.
1018+
1019+
.. versionadded:: 1.5.0
10131020
10141021
Returns
10151022
-------
@@ -1044,7 +1051,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10441051
arr, na_value = self._values_for_factorize()
10451052

10461053
codes, uniques = factorize_array(
1047-
arr, na_sentinel=na_sentinel, na_value=na_value
1054+
arr, na_sentinel=na_sentinel, na_value=na_value, dropna=dropna
10481055
)
10491056

10501057
uniques_ea = self._from_factorized(uniques, self)

pandas/core/arrays/datetimelike.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,7 @@ def _with_freq(self, freq):
18781878

18791879
# --------------------------------------------------------------
18801880

1881-
def factorize(self, na_sentinel=-1, sort: bool = False):
1881+
def factorize(self, na_sentinel=-1, sort: bool = False, dropna: bool = True):
18821882
if self.freq is not None:
18831883
# We must be unique, so can short-circuit (and retain freq)
18841884
codes = np.arange(len(self), dtype=np.intp)
@@ -1888,7 +1888,8 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
18881888
uniques = uniques[::-1]
18891889
return codes, uniques
18901890
# FIXME: shouldn't get here; we are ignoring sort
1891-
return super().factorize(na_sentinel=na_sentinel)
1891+
result = super().factorize(na_sentinel=na_sentinel, dropna=dropna)
1892+
return result
18921893

18931894

18941895
# -------------------------------------------------------------------

pandas/core/arrays/masked.py

+25-3
Original file line numberDiff line numberDiff line change
@@ -868,16 +868,38 @@ def searchsorted(
868868
return self._data.searchsorted(value, side=side, sorter=sorter)
869869

870870
@doc(ExtensionArray.factorize)
871-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
871+
def factorize(
872+
self, na_sentinel: int = -1, dropna: bool = True
873+
) -> tuple[np.ndarray, ExtensionArray]:
872874
arr = self._data
873875
mask = self._mask
874876

875-
codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
877+
codes, uniques = factorize_array(
878+
arr, na_sentinel=na_sentinel, mask=mask, dropna=True
879+
)
876880

877881
# check that factorize_array correctly preserves dtype.
878882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
879883

880-
uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
884+
# Make room for a null value if we're not ignoring it and it exists
885+
size = len(uniques) if dropna or not mask.any() else len(uniques) + 1
886+
uniques_mask = np.zeros(size, dtype=bool)
887+
if not dropna:
888+
na_index = mask.argmax()
889+
if mask[na_index]:
890+
# Insert na with the proper code
891+
na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1
892+
if na_sentinel < 0:
893+
# codes can never equal na_sentinel and be >= na_code
894+
codes[codes >= na_code] += 1
895+
else:
896+
codes[(codes >= na_code) & (codes != na_sentinel)] += 1
897+
codes[codes == na_sentinel] = na_code
898+
# dummy value for uniques; not used since uniques_mask will be True
899+
uniques = np.insert(uniques, na_code, 0)
900+
uniques_mask[na_code] = True
901+
uniques_ea = type(self)(uniques, uniques_mask)
902+
881903
return codes, uniques_ea
882904

883905
@doc(ExtensionArray._values_for_argsort)

pandas/core/arrays/sparse/array.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -847,13 +847,18 @@ def _values_for_factorize(self):
847847
# Still override this for hash_pandas_object
848848
return np.asarray(self), self.fill_value
849849

850-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
850+
def factorize(
851+
self, na_sentinel: int = -1, dropna: bool = True
852+
) -> tuple[np.ndarray, SparseArray]:
851853
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
852854
# The sparsity on this is backwards from what Sparse would want. Want
853855
# ExtensionArray.factorize -> Tuple[EA, EA]
854856
# Given that we have to return a dense array of codes, why bother
855857
# implementing an efficient factorize?
856-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
858+
na_sentinel_arg = na_sentinel if dropna else None
859+
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel_arg)
860+
if not dropna:
861+
codes[codes == -1] = na_sentinel
857862
uniques_sp = SparseArray(uniques, dtype=self.dtype)
858863
return codes, uniques_sp
859864

pandas/core/arrays/string_.py

+7
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,13 @@ def _values_for_factorize(self):
382382
arr[mask] = -1
383383
return arr, -1
384384

385+
@classmethod
386+
def _from_factorized(cls, values, original):
387+
assert values.dtype == original._ndarray.dtype
388+
# When dropna (i.e. ignore_na) is False, can get -1 from nulls
389+
values[values == -1] = None
390+
return original._from_backing_data(values)
391+
385392
def __setitem__(self, key, value):
386393
value = extract_array(value, extract_numpy=True)
387394
if isinstance(value, type(self)):

pandas/core/groupby/grouper.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -657,9 +657,10 @@ def group_index(self) -> Index:
657657

658658
@cache_readonly
659659
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
660-
if self._passed_categorical:
660+
if self._dropna and self._passed_categorical:
661661
# we make a CategoricalIndex out of the cat grouper
662-
# preserving the categories / ordered attributes
662+
# preserving the categories / ordered attributes;
663+
# doesn't (yet) handle dropna=False
663664
cat = self.grouping_vector
664665
categories = cat.categories
665666

pandas/tests/groupby/test_groupby_dropna.py

+49
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,52 @@ def test_groupby_nan_included():
348348
tm.assert_numpy_array_equal(result_values, expected_values)
349349
assert np.isnan(list(result.keys())[2])
350350
assert list(result.keys())[0:2] == ["g1", "g2"]
351+
352+
353+
@pytest.mark.parametrize(
354+
"key",
355+
[
356+
pd.Series([2, np.nan, 1, 2]),
357+
pd.Series([2, np.nan, 1, 2], dtype="UInt8"),
358+
pd.Series([2, np.nan, 1, 2], dtype="Int8"),
359+
pd.Series([2, np.nan, 1, 2], dtype="UInt16"),
360+
pd.Series([2, np.nan, 1, 2], dtype="Int16"),
361+
pd.Series([2, np.nan, 1, 2], dtype="UInt32"),
362+
pd.Series([2, np.nan, 1, 2], dtype="Int32"),
363+
pd.Series([2, np.nan, 1, 2], dtype="UInt64"),
364+
pd.Series([2, np.nan, 1, 2], dtype="Int64"),
365+
pd.Series([2, np.nan, 1, 2], dtype="Float32"),
366+
pd.Series([2, np.nan, 1, 2], dtype="Int64"),
367+
pd.Series([2, np.nan, 1, 2], dtype="Float64"),
368+
pd.Series(["y", None, "x", "y"], dtype="category"),
369+
pd.Series(["y", pd.NA, "x", "y"], dtype="string"),
370+
pd.Series(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"),
371+
pd.Series(
372+
["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
373+
dtype="datetime64[ns]",
374+
),
375+
pd.Series(
376+
[
377+
pd.Period("2012-02-01", freq="D"),
378+
pd.NA,
379+
pd.Period("2012-01-01", freq="D"),
380+
pd.Period("2012-02-01", freq="D"),
381+
]
382+
),
383+
pd.Series(pd.arrays.SparseArray([2, np.nan, 1, 2])),
384+
],
385+
)
386+
@pytest.mark.parametrize("test_series", [True, False])
387+
def test_no_sort_keep_na(key, test_series):
388+
df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
389+
gb = df.groupby("key", dropna=False, sort=False)
390+
if test_series:
391+
gb = gb["a"]
392+
result = gb.sum()
393+
expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
394+
if test_series:
395+
expected = expected["a"]
396+
if expected.index.is_categorical():
397+
# TODO: Slicing reorders categories?
398+
expected.index = expected.index.reorder_categories(["y", "x"])
399+
tm.assert_equal(result, expected)

pandas/tests/groupby/transform/test_transform.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -1311,12 +1311,8 @@ def test_transform_cumcount():
13111311

13121312

13131313
@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
1314-
def test_null_group_lambda_self(request, sort, dropna, keys):
1314+
def test_null_group_lambda_self(sort, dropna, keys):
13151315
# GH 17093
1316-
if not sort and not dropna:
1317-
msg = "GH#46584: null values get sorted when sort=False"
1318-
request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))
1319-
13201316
size = 50
13211317
nulls1 = np.random.choice([False, True], size)
13221318
nulls2 = np.random.choice([False, True], size)

0 commit comments

Comments
 (0)