pandas-dev · CloseChoice · Oct 9, 2021 · Oct 9, 2021 · Oct 10, 2021 · Oct 10, 2021
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -121,6 +121,7 @@ cdef class ObjectFactorizer(Factorizer):
             uniques = ObjectVector()
             uniques.extend(self.uniques.to_array())
             self.uniques = uniques
+        print('WE ARE IN FACTORIZE')
         labels = self.table.get_labels(values, self.uniques,
                                        self.count, na_sentinel, na_value)
         mask = (labels == na_sentinel)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -129,6 +129,7 @@ cdef class IndexEngine:
         # We assume before we get here:
         #  - val is hashable
         self._ensure_mapping_populated()
+        #print('THIS IS THE MAPPING ' + str(self.mapping))
         return val in self.mapping
 
     cpdef get_loc(self, object val):
@@ -158,6 +159,8 @@ cdef class IndexEngine:
             return self._get_loc_duplicates(val)
 
         try:
+            #print('THIS IS val: ' + str(val))
+            #print('THIS IS MAPPING: ' + str(self.mapping))
             return self.mapping.get_item(val)
         except OverflowError as err:
             # GH#41775 OverflowError e.g. if we are uint64 and val is -1
@@ -594,13 +597,16 @@ cdef class BaseMultiIndexCodesEngine:
 
         # Transform labels in a single array, and add 1 so that we are working
         # with positive integers (-1 for NaN becomes 0):
+        #print('\nTHESE ARE LABELS IN INITIALIZATION OF BASEMULTIINDEXCODESENGINE: ' + str(labels) + " THIS IS THE TYPE OF LABELS: " + str(type(labels)))
         codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                                copy=False)
+        #print('\nTHESE ARE THE CODES: ' + str(codes) + " THIS IS THE TYPE OF CODES: " + str(type(codes)) + '\n')
 
         # Map each codes combination in the index to an integer unambiguously
         # (no collisions possible), based on the "offsets", which describe the
         # number of bits to switch labels for each level:
         lab_ints = self._codes_to_ints(codes)
+        #print('\nTHESE ARE THE LAB INTS: ' + str(lab_ints) + " THIS IS THE TYPE OF LAB INTS: " + str(type(lab_ints)) + '\n')
 
         # Initialize underlying index (e.g. libindex.UInt64Engine) with
         # integers representing labels: we will use its get_loc and get_indexer
@@ -731,13 +737,23 @@ cdef class BaseMultiIndexCodesEngine:
         return sorted_indexer[np.argsort(target_order)]
 
     def get_loc(self, object key):
+        #print('get_loc, we want to get key' + str(key))
         if is_definitely_invalid_key(key):
             raise TypeError(f"'{key}' is an invalid key")
         if not isinstance(key, tuple):
             raise KeyError(key)
+        # print('LEVELS: ' + str(type(self.levels[0])) + 'KEYS: ' + str(key))
         try:
-            indices = [0 if checknull(v) else lev.get_loc(v) + 1
-                       for lev, v in zip(self.levels, key)]
+            # indices = [0 if checknull(v) else lev.get_loc(v) + 1
+            #            for lev, v in zip(self.levels, key)]
+            indices = []
+            for key, lev in zip(key, self.levels):
+                try:
+                    indices.append(lev.get_loc(key) + 1)
+                except KeyError:
+                    indices.append(0)
+            # indices = [lev.get_loc(v) + 1
+            #            for lev, v in zip(self.levels, key)]
         except KeyError:
             raise KeyError(key)
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1835,6 +1835,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray:
         axis = self._get_axis_number(axis)
         other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
 
+        # import pdb; pdb.set_trace()
         if self._is_label_reference(key, axis=axis):
             self._check_label_or_level_ambiguity(key, axis=axis)
             values = self.xs(key, axis=other_axes[0])._values
@@ -3846,6 +3847,8 @@ class   animal   locomotion
         self._consolidate_inplace()
 
         if isinstance(index, MultiIndex):
+            #import pdb; pdb.set_trace()
+            import traceback as tb;
             loc, new_index = index._get_loc_level(key, level=0)
             if not drop_level:
                 if lib.is_integer(loc):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3460,8 +3460,13 @@ def get_loc(self, key, method=None, tolerance=None):
                     "tolerance argument only valid if using pad, "
                     "backfill or nearest lookups"
                 )
+            # print(f'key before casting {key}')
             casted_key = self._maybe_cast_indexer(key)
             try:
+                #import pdb; pdb.set_trace()
+                # print(f'casted key {casted_key}')
+                # import pdb; pdb.set_trace()
+                # import traceback as tb; tb.print_stack()
                 return self._engine.get_loc(casted_key)
             except KeyError as err:
                 raise KeyError(key) from err

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -132,7 +132,15 @@ def _codes_to_ints(self, codes):
         """
         # Shift the representation of each level by the pre-calculated number
         # of bits:
+        #print(f'codes {codes}, self.offsets {self.offsets}')
+        #import pdb; pdb.set_trace()
         codes <<= self.offsets
+        #if len(codes.shape) == 1 and list(codes) == [4,0]:
+            # import traceback as tb; tb.print_stack()
+            #codes = np.array([4, 2])
+            #import pdb; pdb.set_trace()
+            #print('hi')
+        #print(f'codes after shifting {codes}')
 
         # Now sum and OR are in fact interchangeable. This is a simple
         # composition of the (disjunct) significant bits of each level (i.e.
@@ -3001,7 +3009,12 @@ def maybe_mi_droplevels(indexer, levels):
                 if len(key) == self.nlevels and self.is_unique:
                     # Complete key in unique index -> standard get_loc
                     try:
-                        return (self._engine.get_loc(key), None)
+                        engine = self._engine
+                        #import pdb; pdb.set_trace()
+                        # print(f'key before handing it to engine {engine} is key: {key}')
+                        # import pdb; pdb.set_trace()
+                        loc_res = engine.get_loc(key)
+                        return (loc_res, None)
                     except KeyError as err:
                         raise KeyError(key) from err
                     except TypeError:

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1110,6 +1110,7 @@ def _get_label(self, label, axis: int):
 
     def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
         # we have an axis0 multi-index, handle or raise
+        #import pdb; pdb.set_trace()
         axis = self.axis or 0
         try:
             # fast path for series or for tup devoid of slices

diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -625,6 +625,7 @@ def test_get_loc_cast_bool(self):
         # GH 19086 : int is casted to bool, but not vice-versa
         levels = [[False, True], np.arange(2, dtype="int64")]
         idx = MultiIndex.from_product(levels)
+        import pdb; pdb.set_trace()
 
         assert idx.get_loc((0, 1)) == 1
         assert idx.get_loc((1, 0)) == 2

diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py
@@ -392,3 +392,17 @@ def test_loc_empty_multiindex():
     result = df
     expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"])
     tm.assert_frame_equal(result, expected)
+
+def test_loc_nan_multiindex():
+    """Select the NaN-keyed row of a grouped Series by its full MultiIndex key."""
+    frame = DataFrame(
+        {
+            "temp_playlist": [0, 0, 0, 0],
+            "objId": ["o1", np.nan, "o1", np.nan],
+            "x": [1, 2, 3, 4],
+        }
+    )
+    # dropna=False keeps the NaN "objId" group; it is the last index entry.
+    grouped = frame.groupby(by=["temp_playlist", "objId"], dropna=False)["x"]
+    collected = grouped.agg(list)
+    assert collected.loc[collected.index[-1]] == [2, 4]