pandas-dev · CloseChoice · Oct 9, 2021 · Oct 9, 2021 · Oct 10, 2021 · Oct 10, 2021
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -784,6 +784,7 @@ Groupby/resample/rolling
 - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`)
 - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`)
 - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`)
+- Bug in :meth:`DataFrame.groupby` with ``dropna=False`` when grouping on multiple columns of which at least one contains ``np.nan``, which resulted in a ``KeyError`` when the entry containing ``np.nan`` was selected with :meth:`Series.loc` (:issue:`43814`)
 
 Reshaping
 ^^^^^^^^^

@@ -35,6 +35,7 @@
     Categorical,
     ExtensionArray,
 )
+from pandas.core.dtypes.missing import isna
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
 from pandas.core.groupby import ops
@@ -690,12 +691,23 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
             codes, uniques = algorithms.factorize(
                 self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
             )
+
         return codes, uniques
 
     @cache_readonly
     def groups(self) -> dict[Hashable, np.ndarray]:
         return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
 
+    @cache_readonly
+    def _has_na_placeholder(self) -> bool:
+        # GH43943: True when NA values are kept (dropna is False) and the
+        # grouping vector holds at least one of them, e.g. np.nan, pd.NaT,
+        # np.datetime64("NaT", "ns") or np.timedelta64("NaT", "ns"). Read in
+        # reconstructed_codes to set the NA codes of such levels back to -1.
+        if self._dropna:
+            return False
+        return bool(isna(self.grouping_vector).any())
+
 
 def get_grouper(
     obj: NDFrameT,

@@ -869,7 +869,16 @@ def ngroups(self) -> int:
     def reconstructed_codes(self) -> list[np.ndarray]:
         codes = self.codes
         ids, obs_ids, _ = self.group_info
-        return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
+        # Positions of the groupings that flag an NA placeholder; they are
+        # handed on so the codes of exactly these levels can be set back to -1.
+        levels_with_na = [
+            level
+            for level, grouping in enumerate(self.groupings)
+            if grouping._has_na_placeholder
+        ]
+        return decons_obs_group_ids(
+            ids, obs_ids, self.shape, codes, xnull=True, levels_with_na=levels_with_na
+        )
 
     @final
     @cache_readonly

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -243,7 +243,12 @@ def decons_group_index(comp_labels, shape):
 
 
 def decons_obs_group_ids(
-    comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool
+    comp_ids: npt.NDArray[np.intp],
+    obs_ids,
+    shape,
+    labels,
+    xnull: bool,
+    levels_with_na: list[int] | None = None,
 ):
     """
     Reconstruct labels from observed group ids.
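@@ -254,17 +259,36 @@ def decons_obs_group_ids(
     xnull : bool
         If nulls are excluded; i.e. -1 labels are passed through.
+    levels_with_na : list[int], optional
+        Positions of the levels in which the largest code is replaced by -1.
     """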
+
+    def reconstruct_na_in_codes(codes):
+        # Map the NA placeholder code of every level in levels_with_na to -1.
+        if not levels_with_na:
+            return codes
+        na_levels = set(levels_with_na)
+        # NOTE(review): treats the largest code of a level as the placeholder;
+        # confirm this also holds when the groups are not sorted.
+        return [
+            np.where(level_codes == np.max(level_codes), -1, level_codes)
+            if level in na_levels
+            else level_codes
+            for level, level_codes in enumerate(codes)
+        ]
+
     if not xnull:
         lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
         shape = np.asarray(shape, dtype="i8") + lift
 
     if not is_int64_overflow_possible(shape):
         # obs ids are deconstructable! take the fast route!
         out = decons_group_index(obs_ids, shape)
+        out = reconstruct_na_in_codes(out)
         return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
 
     indexer = unique_label_indices(comp_ids)
-    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
+    out = [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
+    return reconstruct_na_in_codes(out)
 
 
 def indexer_from_factorized(

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -370,3 +370,59 @@ def test_groupby_nan_included():
         tm.assert_numpy_array_equal(result_values, expected_values)
     assert np.isnan(list(result.keys())[2])
     assert list(result.keys())[0:2] == ["g1", "g2"]
+
+
+@pytest.mark.parametrize(
+    "na",
+    [np.nan, pd.NaT, pd.NA, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")],
+)
+def test_groupby_codes_with_nan_in_multiindex(na):
+    # GH 43814
+    df = pd.DataFrame(
+        {
+            "temp_playlist": [0, 0, 0, 0],
+            "objId": ["o1", na, "o1", na],
+            "x": [1, 2, 3, 4],
+        }
+    )
+
+    result = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum()
+    # Every NA variant must end up in a single group that is encoded with code
+    # -1 in the ``objId`` level, so the expected index is spelled out with
+    # explicit levels and codes rather than built from the raw values.
+    expected = pd.Series(
+        [4, 6],
+        index=pd.MultiIndex(
+            levels=[[0], ["o1", np.nan]],
+            codes=[[0, 0], [0, -1]],
+            names=["temp_playlist", "objId"],
+        ),
+        name="x",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_multiindex_multiply_with_series():
+    # GH#36060
+    frame = pd.DataFrame(
+        {
+            "animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+            "type": [np.nan, np.nan, np.nan, np.nan],
+            "speed": [380.0, 370.0, 24.0, 26.0],
+        }
+    )
+    grouped = frame.groupby(["animal", "type"], dropna=False)
+    first_speed = grouped["speed"].first()
+
+    def nan_typed_index():
+        # (animal, NaN) keys shared by the second operand and the expectation.
+        return pd.MultiIndex.from_tuples(
+            [("Falcon", np.nan), ("Parrot", np.nan)], names=["animal", "type"]
+        )
+
+    wingspan = pd.Series([42, 44], index=nan_typed_index())
+
+    # The product is expected to keep both NaN-typed entries of the index.
+    result = wingspan * first_speed
+    expected = pd.Series([15960.0, 1056.0], index=nan_typed_index())
+    tm.assert_series_equal(result, expected)