diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 71d5c46b81ea0..1ab744f3f1b69 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -784,6 +784,7 @@ Groupby/resample/rolling
 - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`)
 - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`)
 - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`)
+- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan``, which resulted in a ``KeyError`` when the ``np.nan``-containing index was selected with :meth:`Series.loc` (:issue:`43814`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index a05f8e581d12f..e57de0f34a876 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -35,6 +35,7 @@
     Categorical,
     ExtensionArray,
 )
+from pandas.core.dtypes.missing import isna
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
 from pandas.core.groupby import ops
@@ -690,12 +691,23 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
             codes, uniques = algorithms.factorize(
                 self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
             )
+
         return codes, uniques
 
     @cache_readonly
     def groups(self) -> dict[Hashable, np.ndarray]:
         return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
 
+    @cache_readonly
+    def _has_na_placeholder(self) -> bool:
+        # GH43943: store a placeholder for (np.nan, pd.NaT, np.datetime64("NaT", "ns"),
+        # np.timedelta64("NaT", "ns")); used to replace codes correctly with -1 in
+        # pandas/core/groupby:reconstructed_codes
+        if not self._dropna:
+            if isna(self.grouping_vector).any():
+                return True
+        return False
+
 
 def get_grouper(
     obj: NDFrameT,
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7915e107afae6..f890443f0375f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -869,7 +869,16 @@ def ngroups(self) -> int:
     def reconstructed_codes(self) -> list[np.ndarray]:
         codes = self.codes
         ids, obs_ids, _ = self.group_info
-        return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
+        # get the levels in which to reconstruct the codes for NA values
+        levels_with_na = [
+            idx
+            for idx, grouping in enumerate(self.groupings)
+            if grouping._has_na_placeholder
+        ]
+        reconstructed_codes = decons_obs_group_ids(
+            ids, obs_ids, self.shape, codes, xnull=True, levels_with_na=levels_with_na
+        )
+        return reconstructed_codes
 
     @final
     @cache_readonly
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 7813182222d67..5bf0ccd91ce59 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -243,7 +243,12 @@ def decons_group_index(comp_labels, shape):
 
 
 def decons_obs_group_ids(
-    comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool
+    comp_ids: npt.NDArray[np.intp],
+    obs_ids,
+    shape,
+    labels,
+    xnull: bool,
+    levels_with_na: list[int] | None = None,
 ):
     """
     Reconstruct labels from observed group ids.
@@ -254,6 +259,21 @@
     xnull : bool
         If nulls are excluded; i.e. -1 labels are passed through.
     """
+
+    def reconstruct_na_in_codes(codes):
+        new_codes = []
+        if levels_with_na:
+            for idx, code_level in enumerate(codes):
+                if idx in levels_with_na:
+                    new_codes.append(
+                        np.where(code_level == max(code_level), -1, code_level)
+                    )
+                else:
+                    new_codes.append(code_level)
+        else:
+            new_codes = codes
+        return new_codes
+
     if not xnull:
         lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
         shape = np.asarray(shape, dtype="i8") + lift
@@ -261,10 +281,12 @@
     if not is_int64_overflow_possible(shape):
         # obs ids are deconstructable! take the fast route!
         out = decons_group_index(obs_ids, shape)
+        out = reconstruct_na_in_codes(out)
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
     indexer = unique_label_indices(comp_ids)
-    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
+    out = [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
+    return reconstruct_na_in_codes(out)
 
 
 def indexer_from_factorized(
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index ab568e24ff029..797b0d5dd36d0 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -370,3 +370,59 @@ def test_groupby_nan_included():
     tm.assert_numpy_array_equal(result_values, expected_values)
     assert np.isnan(list(result.keys())[2])
     assert list(result.keys())[0:2] == ["g1", "g2"]
+
+
+@pytest.mark.parametrize(
+    "na",
+    [np.nan, pd.NaT, pd.NA, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")],
+)
+def test_groupby_codes_with_nan_in_multiindex(na):
+    # GH 43814
+    df = pd.DataFrame(
+        {
+            "temp_playlist": [0, 0, 0, 0],
+            "objId": ["o1", na, "o1", na],
+            "x": [1, 2, 3, 4],
+        }
+    )
+
+    result = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum()
+    expected = pd.MultiIndex.from_arrays(
+        [[0, 0], ["o1", na]], names=["temp_playlist", "objId"]
+    )
+    expected = pd.Series(
+        [4, 6],
+        index=pd.MultiIndex(
+            levels=[[0], ["o1", np.nan]],
+            codes=[[0, 0], [0, -1]],
+            names=["temp_playlist", "objId"],
+        ),
+        name="x",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_multiindex_multiply_with_series():
+    # GH#36060
+    df = pd.DataFrame(
+        {
+            "animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+            "type": [np.nan, np.nan, np.nan, np.nan],
+            "speed": [380.0, 370.0, 24.0, 26.0],
+        }
+    )
+    speed = df.groupby(["animal", "type"], dropna=False)["speed"].first()
+    # Reconstruct the same index to allow for multiplication.
+    ix_wing = pd.MultiIndex.from_tuples(
+        [("Falcon", np.nan), ("Parrot", np.nan)], names=["animal", "type"]
+    )
+    wing = pd.Series([42, 44], index=ix_wing)
+
+    result = wing * speed
+    expected = pd.Series(
+        [15960.0, 1056.0],
+        index=pd.MultiIndex.from_tuples(
+            [("Falcon", np.nan), ("Parrot", np.nan)], names=["animal", "type"]
+        ),
+    )
+    tm.assert_series_equal(result, expected)