From 5dac4bda30ebca8cdf74bd5f28b5f004f755ad90 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 1 Apr 2022 10:14:36 -0400 Subject: [PATCH 1/4] CLN: Simplify factorize --- pandas/core/algorithms.py | 49 ++++++++------------------------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6a04cbf4b5846..a818c3fabb9d7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -772,10 +772,6 @@ def factorize( if not isinstance(values, ABCMultiIndex): values = extract_array(values, extract_numpy=True) - # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques - # of values, assign na_sentinel=-1 to replace code value for NaN. - dropna = na_sentinel is not None - if ( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) and values.freq is not None @@ -803,43 +799,17 @@ def factorize( else: values = np.asarray(values) # convert DTA/TDA/MultiIndex - # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and - # na_sentinel is None we append NA on the end because safe_sort does not - # handle null values in uniques. - if na_sentinel is None and sort: - na_sentinel_arg = -1 - elif na_sentinel is None: - na_sentinel_arg = None - else: - na_sentinel_arg = na_sentinel codes, uniques = factorize_array( values, - na_sentinel=na_sentinel_arg, + na_sentinel=na_sentinel, size_hint=size_hint, ) if sort and len(uniques) > 0: - if na_sentinel is None: - # TODO: Can remove when na_sentinel=na_sentinel as in TODO above - na_sentinel = -1 uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - if not dropna and sort: - # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above - if na_sentinel is None: - na_sentinel_arg = -1 - else: - na_sentinel_arg = na_sentinel - code_is_na = codes == na_sentinel_arg - if code_is_na.any(): - # na_value is set based on the dtype of uniques, and compat set to False is - # because we do not want na_value to be 0 for integers - na_value = na_value_for_dtype(uniques.dtype, compat=False) - uniques = np.append(uniques, [na_value]) - codes = np.where(code_is_na, len(uniques) - 1, codes) - uniques = _reconstruct_data(uniques, original.dtype, original) return _re_wrap_factorize(original, uniques, codes) @@ -1918,24 +1888,25 @@ def safe_sort( # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - mask = codes == na_sentinel - if verify: - mask = mask | (codes < -len(values)) | (codes >= len(values)) + if na_sentinel is not None: + mask = codes == na_sentinel + if verify: + mask = mask | (codes < -len(values)) | (codes >= len(values)) - if mask is not None: + if na_sentinel is not None and mask is not None: np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) def _sort_mixed(values) -> np.ndarray: - """order ints before strings in 1d arrays, safe in py3""" + """order ints before strings before nulls in 1d arrays""" str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - none_pos = np.array([x is None for x in values], dtype=bool) - nums = np.sort(values[~str_pos & ~none_pos]) + null_pos = np.array([isna(x) for x in values], dtype=bool) + nums = np.sort(values[~str_pos & ~null_pos]) strs = np.sort(values[str_pos]) return np.concatenate( - [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])] + [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])] ) From a11398906a569358c0b17eb5f79e8b9f7f7e746e Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 15 Sep 2022 22:47:18 -0400 Subject: [PATCH 2/4] Update type-hint and docstring --- pandas/core/algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a818c3fabb9d7..b5c6b7a18eda3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1764,7 +1764,7 @@ def diff(arr, n: int, axis: int = 0): def safe_sort( values, codes=None, - na_sentinel: int = -1, + na_sentinel: int | None = -1, assume_unique: bool = False, verify: bool = True, ) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]: @@ -1781,8 +1781,8 @@ def safe_sort( codes : list_like, optional Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. - na_sentinel : int, default -1 - Value in ``codes`` to mark "not found". + na_sentinel : int or None, default -1 + Value in ``codes`` to mark "not found", or None to encode null values as normal. Ignored when ``codes`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up From ccd61ece0c682dfaad89797e6adf940f4f2bc222 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 16 Sep 2022 20:58:11 -0400 Subject: [PATCH 3/4] Update test --- pandas/tests/test_sorting.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 537792ea8263c..ba1943878cfad 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -506,10 +506,10 @@ def test_extension_array_codes(self, verify, na_sentinel): tm.assert_numpy_array_equal(codes, expected_codes) -def test_mixed_str_nan(): - values = np.array(["b", np.nan, "a", "b"], dtype=object) +def test_mixed_str_null(nulls_fixture): + values = np.array(["b", nulls_fixture, "a", "b"], dtype=object) result = safe_sort(values) - expected = np.array([np.nan, "a", "b", "b"], dtype=object) + expected = np.array(["a", "b", "b", nulls_fixture], dtype=object) tm.assert_numpy_array_equal(result, expected) From f189a2fd063acdbafa6a508798b5bfd96abf7854 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 30 Sep 2022 08:18:07 -0400 Subject: [PATCH 4/4] cleanup --- pandas/tests/groupby/test_groupby_dropna.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index de3cc3fbdc6a4..360e3096ceb63 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -93,9 +93,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) expected = pd.DataFrame(outputs, index=mi) - print(grouped) - print(expected) - tm.assert_frame_equal(grouped, expected)