From 8e102f235abb83083496e1243a2c583411ae5f21 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Fri, 1 Apr 2022 10:14:36 -0400
Subject: [PATCH] BUG: GroupBy equates different types of null values

---
 pandas/core/algorithms.py                   | 54 +++-----------
 pandas/tests/groupby/test_groupby_dropna.py | 82 +++++++++++++++++----
 2 files changed, 79 insertions(+), 57 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 77de1636a92eb..643a3ee05cc1a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -78,10 +78,7 @@
     ABCSeries,
     ABCTimedeltaArray,
 )
-from pandas.core.dtypes.missing import (
-    isna,
-    na_value_for_dtype,
-)
+from pandas.core.dtypes.missing import isna

 from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
@@ -761,10 +758,6 @@ def factorize(
     if not isinstance(values, ABCMultiIndex):
         values = extract_array(values, extract_numpy=True)

-    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
-    # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = na_sentinel is not None
-
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
         and values.freq is not None
@@ -792,43 +785,17 @@ def factorize(
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex

-    # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
-    # na_sentinel is None we append NA on the end because safe_sort does not
-    # handle null values in uniques.
-    if na_sentinel is None and sort:
-        na_sentinel_arg = -1
-    elif na_sentinel is None:
-        na_sentinel_arg = None
-    else:
-        na_sentinel_arg = na_sentinel
     codes, uniques = factorize_array(
         values,
-        na_sentinel=na_sentinel_arg,
+        na_sentinel=na_sentinel,
         size_hint=size_hint,
     )

     if sort and len(uniques) > 0:
-        if na_sentinel is None:
-            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
-            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )

-    if not dropna and sort:
-        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
-        if na_sentinel is None:
-            na_sentinel_arg = -1
-        else:
-            na_sentinel_arg = na_sentinel
-        code_is_na = codes == na_sentinel_arg
-        if code_is_na.any():
-            # na_value is set based on the dtype of uniques, and compat set to False is
-            # because we do not want na_value to be 0 for integers
-            na_value = na_value_for_dtype(uniques.dtype, compat=False)
-            uniques = np.append(uniques, [na_value])
-            codes = np.where(code_is_na, len(uniques) - 1, codes)
-
     uniques = _reconstruct_data(uniques, original.dtype, original)

     return _re_wrap_factorize(original, uniques, codes)
@@ -1898,24 +1865,25 @@ def safe_sort(
         # may deal with them here without performance loss using `mode='wrap'`
         new_codes = reverse_indexer.take(codes, mode="wrap")

-        mask = codes == na_sentinel
-        if verify:
-            mask = mask | (codes < -len(values)) | (codes >= len(values))
+        if na_sentinel is not None:
+            mask = codes == na_sentinel
+            if verify:
+                mask = mask | (codes < -len(values)) | (codes >= len(values))

-    if mask is not None:
+    if na_sentinel is not None and mask is not None:
         np.putmask(new_codes, mask, na_sentinel)

     return ordered, ensure_platform_int(new_codes)


 def _sort_mixed(values) -> np.ndarray:
-    """order ints before strings in 1d arrays, safe in py3"""
+    """order ints before strings before nulls in 1d arrays"""
     str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-    none_pos = np.array([x is None for x in values], dtype=bool)
-    nums = np.sort(values[~str_pos & ~none_pos])
+    null_pos = np.array([isna(x) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos & ~null_pos])
     strs = np.sort(values[str_pos])
     return np.concatenate(
-        [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
+        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
     )


diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 394b5adcf0370..36e022b75d553 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -1,3 +1,5 @@
+import copy
+
 import numpy as np
 import pytest

@@ -60,18 +62,23 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
         ),
         (
             False,
-            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
+            # "null1" and "null2" are placeholders for fixtures
+            [["A", "B"], ["A", "null1"], ["A", "null2"], ["B", "A"], ["null1", "B"]],
             {
-                "c": [12.0, 13.3, 123.23, 1.0],
-                "d": [12.0, 234.0, 123.0, 1.0],
-                "e": [12.0, 13.0, 1.0, 1.0],
+                "c": [12.0, 12.3, 1.0, 123.23, 1.0],
+                "d": [12.0, 233.0, 1.0, 123.0, 1.0],
+                "e": [12.0, 12.0, 1.0, 1.0, 1.0],
             },
         ),
     ],
 )
-def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
+def test_groupby_dropna_multi_index_dataframe_different_nan_in_two_groups(
     dropna, tuples, outputs, nulls_fixture, nulls_fixture2
 ):
+    if not dropna and type(nulls_fixture).__name__ == type(nulls_fixture2).__name__:
+        pytest.skip(
+            "tested in test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups"
+        )
     # GH 3729 this is to test that NA in different groups with different representations
     df_list = [
         ["A", "B", 12, 12, 12],
@@ -83,6 +90,60 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
     grouped = df.groupby(["a", "b"], dropna=dropna).sum()

+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[2][1] = nulls_fixture2
+        tuples[4][0] = nulls_fixture
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
+        ),
+        (
+            False,
+            # "null" is a placeholder for fixtures
+            [["A", "B"], ["A", "null"], ["B", "A"], ["null", "B"]],
+            {
+                "c": [12.0, 13.3, 123.23, 1.0],
+                "d": [12.0, 234.0, 123.0, 1.0],
+                "e": [12.0, 13.0, 1.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups(
+    dropna, tuples, outputs, nulls_fixture
+):
+    # GH 3729 this is to test that NA in different groups with different representations
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", nulls_fixture, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        [nulls_fixture, "B", 1, 1, 1.0],
+        ["A", nulls_fixture, 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    grouped = df.groupby(["a", "b"], dropna=dropna).sum()
+
+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[3][0] = nulls_fixture
     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

     # Since right now, by default MI will drop NA from levels when we create MI
@@ -439,15 +500,8 @@ def test_no_sort_keep_na(values, dtype, test_series):
     gb = df.groupby("key", dropna=False, sort=False)
     if test_series:
         gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-        msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-        expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
-
+    result = gb.sum()
+    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
    if test_series:
         expected = expected["a"]
     if expected.index.is_categorical():
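Illustrative note (not part of the patch): the behavioural core of the algorithms.py change is the switch in the private helper _sort_mixed from detecting only None (x is None) to detecting any null via isna, so that every null value is placed after the numbers and strings. The snippet below is a minimal, self-contained sketch of that ordering rule; the wrapper name sort_mixed_sketch, the use of the public pd.isna instead of the internal helper, and the sample data are assumptions for illustration, not pandas API.

import numpy as np
import pandas as pd


def sort_mixed_sketch(values: np.ndarray) -> np.ndarray:
    # Mirror of the patched ``_sort_mixed``: numbers first, then strings,
    # then any nulls (np.nan, None, pd.NaT, ...) detected with ``pd.isna``.
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([pd.isna(x) for x in values], dtype=bool)
    nums = np.sort(values[~str_pos & ~null_pos])
    strs = np.sort(values[str_pos])
    return np.concatenate(
        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
    )


data = np.array([None, "b", 2, np.nan, "a", 1], dtype=object)
print(sort_mixed_sketch(data))  # [1 2 'a' 'b' None nan]

Under the previous "x is None" check, a value like np.nan was not flagged as null and fell through into the numeric np.sort call; the updated docstring ("order ints before strings before nulls in 1d arrays") documents the new ordering exercised by the revised tests.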