From 549386ceb4bfd72648dc02021f315c4b0fd7cbb0 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 9 Oct 2021 22:35:27 +0200 Subject: [PATCH 01/43] fix problem without adding tests --- pandas/_libs/hashtable.pyx | 1 + pandas/_libs/index.pyx | 20 ++++++++++++++++++-- pandas/core/generic.py | 3 +++ pandas/core/indexes/base.py | 5 +++++ pandas/core/indexes/multi.py | 15 ++++++++++++++- pandas/core/indexing.py | 1 + pandas/tests/indexes/multi/test_indexing.py | 1 + 7 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 6e97c13c644cf..d7a2861013a1a 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -121,6 +121,7 @@ cdef class ObjectFactorizer(Factorizer): uniques = ObjectVector() uniques.extend(self.uniques.to_array()) self.uniques = uniques + print('WE ARE IN FACTORIZE') labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value) mask = (labels == na_sentinel) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ea0bebea8299b..0716e8792536a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -129,6 +129,7 @@ cdef class IndexEngine: # We assume before we get here: # - val is hashable self._ensure_mapping_populated() + #print('THIS IS THE MAPPING ' + str(self.mapping)) return val in self.mapping cpdef get_loc(self, object val): @@ -158,6 +159,8 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: + #print('THIS IS val: ' + str(val)) + #print('THIS IS MAPPING: ' + str(self.mapping)) return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 @@ -594,13 +597,16 @@ cdef class BaseMultiIndexCodesEngine: # Transform labels in a single array, and add 1 so that we are working # with positive integers (-1 for NaN becomes 0): + #print('\nTHIS ARE LABELS IN INITIALIZATION OF BASEMULTIINDEXCODESENGINE: ' + str(labels) + " THIS IS THE TYPE OF LABELS: " + str(type(labels))) codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', copy=False) + #print('\nTHIS ARE THE CODES: ' + str(codes) + " THIS IS THE TYPE OF CODES: " + str(type(codes)) + '\n') # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the # number of bits to switch labels for each level: lab_ints = self._codes_to_ints(codes) + #print('\nTHIS ARE THE LAB INTS: ' + str(lab_ints) + " THIS IS THE TYPE OF LAB INTS: " + str(type(codes)) + '\n') # Initialize underlying index (e.g. libindex.UInt64Engine) with # integers representing labels: we will use its get_loc and get_indexer @@ -731,13 +737,23 @@ cdef class BaseMultiIndexCodesEngine: return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): + #print('get_loc, we want to get key' + str(key)) if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) + # print('LEVELS: ' + str(type(self.levels[0])) + 'KEYS: ' + str(key)) try: - indices = [0 if checknull(v) else lev.get_loc(v) + 1 - for lev, v in zip(self.levels, key)] + # indices = [0 if checknull(v) else lev.get_loc(v) + 1 + # for lev, v in zip(self.levels, key)] + indices = [] + for key, lev in zip(key, self.levels): + try: + indices.append(lev.get_loc(key) + 1) + except KeyError: + indices.append(0) + # indices = [lev.get_loc(v) + 1 + # for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b235f120d98c8..123b0747dce70 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1835,6 +1835,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: axis = self._get_axis_number(axis) other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + # import pdb; pdb.set_trace() if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) values = self.xs(key, axis=other_axes[0])._values @@ -3846,6 +3847,8 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): + #import pdb; pdb.set_trace() + import traceback as tb; loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if lib.is_integer(loc): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2ff9b3973a526..ee83709df4257 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3460,8 +3460,13 @@ def get_loc(self, key, method=None, tolerance=None): "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) + # print(f'key before casting {key}') casted_key = self._maybe_cast_indexer(key) try: + #import pdb; pdb.set_trace() + # print(f'casted key {casted_key}') + # import pdb; pdb.set_trace() + # import traceback as tb; tb.print_stack() return self._engine.get_loc(casted_key) except KeyError as err: raise KeyError(key) from err diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index daca14692ed09..237e718a0f7bd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -132,7 +132,15 @@ def _codes_to_ints(self, codes): """ # Shift the representation of each level by the pre-calculated number # of bits: + #print(f'codes {codes}, self.offsets {self.offsets}') + #import pdb; pdb.set_trace() codes <<= self.offsets + #if len(codes.shape) == 1 and list(codes) == [4,0]: + # import traceback as tb; tb.print_stack() + #codes = np.array([4, 2]) + #import pdb; pdb.set_trace() + #print('hi') + #print(f'codes after shifting {codes}') # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -3001,7 +3009,12 @@ def maybe_mi_droplevels(indexer, levels): if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc try: - return (self._engine.get_loc(key), None) + engine = self._engine + #import pdb; pdb.set_trace() + # print(f'key before handing it to engine {engine} is key: {key}') + # import pdb; pdb.set_trace() + loc_res = engine.get_loc(key) + return (loc_res, None) except KeyError as err: raise KeyError(key) from err except TypeError: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bbb3cb3391dfa..24d543d2af139 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1110,6 +1110,7 @@ def _get_label(self, label, axis: int): def _handle_lowerdim_multi_index_axis0(self, tup: tuple): # we have an axis0 multi-index, handle or raise + #import pdb; pdb.set_trace() axis = self.axis or 0 try: # fast path for series or for tup devoid of slices diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 99322f474dd9e..e7b67fdc9f9c7 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -625,6 +625,7 @@ def test_get_loc_cast_bool(self): # GH 19086 : int is casted to bool, but not vice-versa levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) + import pdb; pdb.set_trace() assert idx.get_loc((0, 1)) == 1 assert idx.get_loc((1, 0)) == 2 From 81970c7163f2685ee79765d8a360e729008d6fb8 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 9 Oct 2021 22:42:22 +0200 Subject: [PATCH 02/43] add test, still not all tests running --- pandas/tests/indexing/multiindex/test_getitem.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 3790a6e9a5319..441591047eb62 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -392,3 +392,17 @@ def test_loc_empty_multiindex(): result = df expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) + +def test_loc_nan_multiindex(): + df = DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + result = agg_df.loc[agg_df.index[-1]] + expected = [2, 4] + assert result == expected From 8443c62941431886d8594aaa172a9cab581cb03d Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 12:20:23 +0200 Subject: [PATCH 03/43] remove print and debug statements --- pandas/_libs/hashtable.pyx | 1 - pandas/_libs/index.pyx | 28 ++++++--------------- pandas/core/generic.py | 3 --- pandas/core/indexes/base.py | 5 ---- pandas/core/indexes/multi.py | 15 +---------- pandas/core/indexing.py | 1 - pandas/tests/indexes/multi/test_indexing.py | 1 - 7 files changed, 9 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index d7a2861013a1a..6e97c13c644cf 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -121,7 +121,6 @@ cdef class ObjectFactorizer(Factorizer): uniques = ObjectVector() uniques.extend(self.uniques.to_array()) self.uniques = uniques - print('WE ARE IN FACTORIZE') labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value) mask = (labels == na_sentinel) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0716e8792536a..f1b174774d6fe 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -129,7 +129,6 @@ cdef class IndexEngine: # We assume before we get here: # - val is hashable self._ensure_mapping_populated() - #print('THIS IS THE MAPPING ' + str(self.mapping)) return val in self.mapping cpdef get_loc(self, object val): @@ -159,8 +158,6 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: - #print('THIS IS val: ' + str(val)) - #print('THIS IS MAPPING: ' + str(self.mapping)) return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 @@ -597,16 +594,13 @@ cdef class BaseMultiIndexCodesEngine: # Transform labels in a single array, and add 1 so that we are working # with positive integers (-1 for NaN becomes 0): - #print('\nTHIS ARE LABELS IN INITIALIZATION OF BASEMULTIINDEXCODESENGINE: ' + str(labels) + " THIS IS THE TYPE OF LABELS: " + str(type(labels))) codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', copy=False) - #print('\nTHIS ARE THE CODES: ' + str(codes) + " THIS IS THE TYPE OF CODES: " + str(type(codes)) + '\n') # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the # number of bits to switch labels for each level: lab_ints = self._codes_to_ints(codes) - #print('\nTHIS ARE THE LAB INTS: ' + str(lab_ints) + " THIS IS THE TYPE OF LAB INTS: " + str(type(codes)) + '\n') # Initialize underlying index (e.g. libindex.UInt64Engine) with # integers representing labels: we will use its get_loc and get_indexer @@ -737,25 +731,19 @@ cdef class BaseMultiIndexCodesEngine: return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): - #print('get_loc, we want to get key' + str(key)) if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) - # print('LEVELS: ' + str(type(self.levels[0])) + 'KEYS: ' + str(key)) - try: - # indices = [0 if checknull(v) else lev.get_loc(v) + 1 - # for lev, v in zip(self.levels, key)] - indices = [] - for key, lev in zip(key, self.levels): - try: - indices.append(lev.get_loc(key) + 1) - except KeyError: + indices = [] + for k, lev in zip(key, self.levels): + try: + indices.append(lev.get_loc(k) + 1) + except KeyError as err: + if checknull(k): indices.append(0) - # indices = [lev.get_loc(v) + 1 - # for lev, v in zip(self.levels, key)] - except KeyError: - raise KeyError(key) + else: + raise KeyError(key) from err # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 123b0747dce70..b235f120d98c8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1835,7 +1835,6 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: axis = self._get_axis_number(axis) other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] - # import pdb; pdb.set_trace() if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) values = self.xs(key, axis=other_axes[0])._values @@ -3847,8 +3846,6 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): - #import pdb; pdb.set_trace() - import traceback as tb; loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if lib.is_integer(loc): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ee83709df4257..2ff9b3973a526 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3460,13 +3460,8 @@ def get_loc(self, key, method=None, tolerance=None): "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) - # print(f'key before casting {key}') casted_key = self._maybe_cast_indexer(key) try: - #import pdb; pdb.set_trace() - # print(f'casted key {casted_key}') - # import pdb; pdb.set_trace() - # import traceback as tb; tb.print_stack() return self._engine.get_loc(casted_key) except KeyError as err: raise KeyError(key) from err diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 237e718a0f7bd..daca14692ed09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -132,15 +132,7 @@ def _codes_to_ints(self, codes): """ # Shift the representation of each level by the pre-calculated number # of bits: - #print(f'codes {codes}, self.offsets {self.offsets}') - #import pdb; pdb.set_trace() codes <<= self.offsets - #if len(codes.shape) == 1 and list(codes) == [4,0]: - # import traceback as tb; tb.print_stack() - #codes = np.array([4, 2]) - #import pdb; pdb.set_trace() - #print('hi') - #print(f'codes after shifting {codes}') # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -3009,12 +3001,7 @@ def maybe_mi_droplevels(indexer, levels): if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc try: - engine = self._engine - #import pdb; pdb.set_trace() - # print(f'key before handing it to engine {engine} is key: {key}') - # import pdb; pdb.set_trace() - loc_res = engine.get_loc(key) - return (loc_res, None) + return (self._engine.get_loc(key), None) except KeyError as err: raise KeyError(key) from err except TypeError: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 24d543d2af139..bbb3cb3391dfa 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1110,7 +1110,6 @@ def _get_label(self, label, axis: int): def _handle_lowerdim_multi_index_axis0(self, tup: tuple): # we have an axis0 multi-index, handle or raise - #import pdb; pdb.set_trace() axis = self.axis or 0 try: # fast path for series or for tup devoid of slices diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index e7b67fdc9f9c7..99322f474dd9e 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -625,7 +625,6 @@ def test_get_loc_cast_bool(self): # GH 19086 : int is casted to bool, but not vice-versa levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) - import pdb; pdb.set_trace() assert idx.get_loc((0, 1)) == 1 assert idx.get_loc((1, 0)) == 2 From 9eae7739a329d3678654405c23aa4d6be4b47afd Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 10:23:21 +0000 Subject: [PATCH 04/43] Fixes from pre-commit [automated commit] --- pandas/tests/indexing/multiindex/test_getitem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 441591047eb62..f635e30570709 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -393,6 +393,7 @@ def test_loc_empty_multiindex(): expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) + def test_loc_nan_multiindex(): df = DataFrame( { @@ -402,7 +403,7 @@ def test_loc_nan_multiindex(): } ) - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) result = agg_df.loc[agg_df.index[-1]] expected = [2, 4] assert result == expected From c497e34d906231b177c46c3d93a493c768b4ae55 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 12:53:58 +0200 Subject: [PATCH 05/43] add new test --- .../tests/indexing/multiindex/test_getitem.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 441591047eb62..c1c77d6a146ae 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -406,3 +406,28 @@ def test_loc_nan_multiindex(): result = agg_df.loc[agg_df.index[-1]] expected = [2, 4] assert result == expected + +def test_loc_nan_multiindex2(): + arrays = [[1, 1, 1], [np.nan, 100, np.nan]] + names = ("idx1", "idx2") + index = MultiIndex.from_arrays(arrays, names=names) + df = DataFrame([1, 2, 3], index=index, columns=['col1']) + result = df.loc[(1, np.nan)] + + expected = DataFrame([1 , 3], index=MultiIndex.from_arrays([[1,1], [np.nan, np.nan]], names=names), columns=['col1']) + tm.assert_frame_equal(result, expected, check_index_type=False) + + +def test_loc_nan_multiindex(): + df = DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + result = agg_df.loc[agg_df.index[-1]] + expected = [2, 4] + assert result == expected From fde027ba89bc73308381fa552231cf4a40607e38 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:21:29 +0200 Subject: [PATCH 06/43] update test as desired in PR comments --- .../tests/indexing/multiindex/test_getitem.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c1c77d6a146ae..42495ba304cd8 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -393,7 +393,9 @@ def test_loc_empty_multiindex(): expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) -def test_loc_nan_multiindex(): +@pytest.mark.parametrize("dropna", [True, False]) +def test_loc_nan_multiindex(dropna): + # GH 43943 df = DataFrame( { "temp_playlist": [0, 0, 0, 0], @@ -402,9 +404,14 @@ def test_loc_nan_multiindex(): } ) - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=dropna)["x"].agg(list) result = agg_df.loc[agg_df.index[-1]] - expected = [2, 4] + if dropna: + # nans are dropped, therefore only [1, 3] is left + expected = [1, 3] + else: + # last index is (0, np.nan) which corresponds to the list [2, 4] + expected = [2, 4] assert result == expected def test_loc_nan_multiindex2(): @@ -418,16 +425,3 @@ def test_loc_nan_multiindex2(): tm.assert_frame_equal(result, expected, check_index_type=False) -def test_loc_nan_multiindex(): - df = DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], - "x": [1, 2, 3, 4], - } - ) - - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) - result = agg_df.loc[agg_df.index[-1]] - expected = [2, 4] - assert result == expected From 246a421fb44bbad6a88b4fdf128061dc220c4b98 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:29:23 +0200 Subject: [PATCH 07/43] add whatsnew entry --- doc/source/whatsnew/v1.3.4.rst | 1 + pandas/tests/indexing/multiindex/test_getitem.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 6f07dc3e1e2f9..060dbc97524df 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) - Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`) - Fixed regression in :meth:`DataFrame.groupby.rolling.cov` and :meth:`DataFrame.groupby.rolling.corr` computing incorrect results if the input groupings were not sorted (:issue:`43386`) +- Fixed regression in :meth:`DataFrame.loc` when `MultiIndex` contained `np.nan` (:issue`43814`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c519588dcec48..baa98601f5a9b 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -393,6 +393,7 @@ def test_loc_empty_multiindex(): expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dropna", [True, False]) def test_loc_nan_multiindex(dropna): # GH 43943 From bbeeb1b993ef6089e13ba39b37ce8907d6082247 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:30:52 +0200 Subject: [PATCH 08/43] fix wrong issue number --- pandas/tests/indexing/multiindex/test_getitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index baa98601f5a9b..c9242789f1486 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -396,7 +396,7 @@ def test_loc_empty_multiindex(): @pytest.mark.parametrize("dropna", [True, False]) def test_loc_nan_multiindex(dropna): - # GH 43943 + # GH 43814 df = DataFrame( { "temp_playlist": [0, 0, 0, 0], From 6b38ba4312ce7a302d650bf7cc8c5147fb0898ba Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 12 Oct 2021 19:45:14 +0200 Subject: [PATCH 09/43] save nan encoding in new state variable and replace it before returning the output --- pandas/_libs/index.pyx | 14 +++++------- pandas/core/groupby/grouper.py | 9 ++++++++ pandas/core/groupby/ops.py | 17 +++++++++++++- pandas/tests/groupby/test_groupby_dropna.py | 18 +++++++++++++++ .../tests/indexing/multiindex/test_getitem.py | 22 ------------------- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f1b174774d6fe..ea0bebea8299b 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -735,15 +735,11 @@ cdef class BaseMultiIndexCodesEngine: raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) - indices = [] - for k, lev in zip(key, self.levels): - try: - indices.append(lev.get_loc(k) + 1) - except KeyError as err: - if checknull(k): - indices.append(0) - else: - raise KeyError(key) from err + try: + indices = [0 if checknull(v) else lev.get_loc(v) + 1 + for lev, v in zip(self.levels, key)] + except KeyError: + raise KeyError(key) # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7577b1e671d60..f62964ff378c0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -486,6 +486,7 @@ def __init__( self._observed = observed self.in_axis = in_axis self._dropna = dropna + self._na_placeholder = None self._passed_categorical = False @@ -690,6 +691,14 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: codes, uniques = algorithms.factorize( self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) + # GH43943, store placeholder for np.nan, will later be replaced by -1 in + # pandas/core/groupby:reconstructed_codes + if not self._dropna: + if any( + isinstance(v, float) and np.isnan(v) for v in self.grouping_vector + ): + self._na_placeholder = max(codes) + return codes, uniques @cache_readonly diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 46e4465667e7e..6fa9c6607cfb0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -847,7 +847,22 @@ def ngroups(self) -> int: def reconstructed_codes(self) -> list[np.ndarray]: codes = self.codes ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + reconstructed_codes = decons_obs_group_ids( + ids, obs_ids, self.shape, codes, xnull=True + ) + + def transform_codes(code_level, grouping): + if grouping._na_placeholder is not None: + return np.where(code_level == max(code_level), -1, code_level) + else: + return code_level + + # import pdb; pdb.set_trace() + transformed_rec_codes = [ + transform_codes(code_level, grouping) + for code_level, grouping in zip(reconstructed_codes, self._groupings) + ] + return transformed_rec_codes @final @cache_readonly diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab568e24ff029..b8f3bb2c4c8a8 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -370,3 +370,21 @@ def test_groupby_nan_included(): tm.assert_numpy_array_equal(result_values, expected_values) assert np.isnan(list(result.keys())[2]) assert list(result.keys())[0:2] == ["g1", "g2"] + + +def test_groupby_codes_with_nan_in_multiindex(): + # GH 43814 + df = pd.DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() + expected = pd.MultiIndex.from_arrays( + [[0, 0], ["o1", np.nan]], names=["temp_playlist", "objId"] + ) + result = grouped_df.index + assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c9242789f1486..3790a6e9a5319 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -392,25 +392,3 @@ def test_loc_empty_multiindex(): result = df expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_loc_nan_multiindex(dropna): - # GH 43814 - df = DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], - "x": [1, 2, 3, 4], - } - ) - - agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) - result = agg_df.loc[agg_df.index[-1]] - if dropna: - # nans are dropped, therefore only [1, 3] is left - expected = [1, 3] - else: - # last index is (0, np.nan) which corresponds to the list [2, 4] - expected = [2, 4] - assert result == expected From d04b7b8f66ade97ffeeba8aaac45500f93211bca Mon Sep 17 00:00:00 2001 From: CloseChoice Date: Tue, 12 Oct 2021 21:56:10 +0200 Subject: [PATCH 10/43] remove wrong whatsnew entry --- doc/source/whatsnew/v1.3.4.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 060dbc97524df..6f07dc3e1e2f9 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -26,7 +26,6 @@ Fixed regressions - Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) - Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`) - Fixed regression in :meth:`DataFrame.groupby.rolling.cov` and :meth:`DataFrame.groupby.rolling.corr` computing incorrect results if the input groupings were not sorted (:issue:`43386`) -- Fixed regression in :meth:`DataFrame.loc` when `MultiIndex` contained `np.nan` (:issue`43814`) .. --------------------------------------------------------------------------- From cc75b447e54b1e3b1df2ab26220d6e3ca16e91a6 Mon Sep 17 00:00:00 2001 From: CloseChoice Date: Tue, 12 Oct 2021 22:00:49 +0200 Subject: [PATCH 11/43] remove debug statement --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6fa9c6607cfb0..fdf4d18802a35 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -857,7 +857,6 @@ def transform_codes(code_level, grouping): else: return code_level - # import pdb; pdb.set_trace() transformed_rec_codes = [ transform_codes(code_level, grouping) for code_level, grouping in zip(reconstructed_codes, self._groupings) From c88b2d8fb4985837f81a5d65b3d2d70e9d49752b Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 13 Oct 2021 13:58:39 +0200 Subject: [PATCH 12/43] add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e638a24f830ef..5053421866ef4 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -503,6 +503,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`, :issue:`43515`) - Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) +- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a `KeyError` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) Reshaping ^^^^^^^^^ From 77e7283cacda1a93ca5af4262ab1417343793a4f Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 17 Oct 2021 20:39:15 +0200 Subject: [PATCH 13/43] fix wrong code quotes in whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 1a687891b2d39..6d21d23c6d967 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -558,7 +558,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) -- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a `KeyError` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) +- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a ``KeyError`` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) Reshaping ^^^^^^^^^ From 72fb02684fabbd6a71f864b53e8f105c5b01b986 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 22 Oct 2021 06:44:53 +0200 Subject: [PATCH 14/43] WIP: add support for (pd.NA, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns") --- pandas/core/groupby/grouper.py | 8 +++--- pandas/core/groupby/ops.py | 3 ++- pandas/tests/groupby/test_groupby_dropna.py | 28 ++++++++++++++++++--- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f62964ff378c0..43023264c5faf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ Categorical, ExtensionArray, ) +from pandas.core.base import isna import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import ops @@ -50,6 +51,8 @@ from pandas.io.formats.printing import pprint_thing +from pandas._libs.tslibs.nattype import NaTType + if TYPE_CHECKING: from pandas.core.generic import NDFrame @@ -693,10 +696,9 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: ) # GH43943, store placeholder for np.nan, will later be replaced by -1 in # pandas/core/groupby:reconstructed_codes + import pdb; pdb.set_trace() if not self._dropna: - if any( - isinstance(v, float) and np.isnan(v) for v in self.grouping_vector - ): + if isna(self.grouping_vector).any(): self._na_placeholder = max(codes) return codes, uniques diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fdf4d18802a35..2f68260f01cfa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -851,9 +851,10 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, self.shape, codes, xnull=True ) + import pdb; pdb.set_trace() def transform_codes(code_level, grouping): if grouping._na_placeholder is not None: - return np.where(code_level == max(code_level), -1, code_level) + return np.where(code_level == grouping._na_placeholder, -1, code_level) else: return code_level diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b8f3bb2c4c8a8..ed468b8d67546 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -372,19 +372,41 @@ def test_groupby_nan_included(): assert list(result.keys())[0:2] == ["g1", "g2"] -def test_groupby_codes_with_nan_in_multiindex(): +@pytest.mark.parametrize( + "na", [np.nan, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")] +) +def test_groupby_codes_with_nan_in_multiindex(na): + # GH 43814 + df = pd.DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", na, "o1", na], + "x": [1, 2, 3, 4], + } + ) + + grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() + expected = pd.MultiIndex.from_arrays( + [[0, 0], ["o1", na]], names=["temp_playlist", "objId"] + ) + result = grouped_df.index + assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) + + +def test_groupby_codes_with_pd_nat_in_multiindex(): # GH 43814 df = pd.DataFrame( { "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], + "objId": ["o1", pd.NaT, "o1", pd.NaT], "x": [1, 2, 3, 4], } ) grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() expected = pd.MultiIndex.from_arrays( - [[0, 0], ["o1", np.nan]], names=["temp_playlist", "objId"] + [[0, 0], ["o1", pd.NaT]], names=["temp_playlist", "objId"] ) result = grouped_df.index + # import pdb; pdb.set_trace() assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) From cf0355d77ae157203ccb1efa75a3286ece3e3a83 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 22 Oct 2021 06:58:40 +0200 Subject: [PATCH 15/43] WIP: add support for (pd.NA, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns") --- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/groupby/ops.py | 1 - pandas/tests/groupby/test_groupby_dropna.py | 19 ------------------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 43023264c5faf..ad9763a68bc66 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -694,9 +694,9 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: codes, uniques = algorithms.factorize( self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) - # GH43943, store placeholder for np.nan, will later be replaced by -1 in + # GH43943, store placeholder for (np.nan, pd.NaT, np.datetime64("NaT", "ns"), + # np.timedelta64("NaT", "ns")), will later be replaced by -1 in # pandas/core/groupby:reconstructed_codes - import pdb; pdb.set_trace() if not self._dropna: if isna(self.grouping_vector).any(): self._na_placeholder = max(codes) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2f68260f01cfa..e8c0bdc5be370 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -851,7 +851,6 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, self.shape, codes, xnull=True ) - import pdb; pdb.set_trace() def transform_codes(code_level, grouping): if grouping._na_placeholder is not None: return np.where(code_level == grouping._na_placeholder, -1, code_level) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ed468b8d67546..2485276503ac9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -391,22 +391,3 @@ def test_groupby_codes_with_nan_in_multiindex(na): ) result = grouped_df.index assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) - - -def test_groupby_codes_with_pd_nat_in_multiindex(): - # GH 43814 - df = pd.DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", pd.NaT, "o1", pd.NaT], - "x": [1, 2, 3, 4], - } - ) - - grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() - expected = pd.MultiIndex.from_arrays( - [[0, 0], ["o1", pd.NaT]], names=["temp_playlist", "objId"] - ) - result = grouped_df.index - # import pdb; pdb.set_trace() - assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) From 427a5151e1ecf185491ab28e04de03d20c2bd70a Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 22 Oct 2021 05:11:39 +0000 Subject: [PATCH 16/43] Fixes from pre-commit [automated commit] --- pandas/core/groupby/grouper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ad9763a68bc66..1a044fd5ec1aa 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -14,6 +14,7 @@ import numpy as np +from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( ArrayLike, NDFrameT, @@ -51,8 +52,6 @@ from pandas.io.formats.printing import pprint_thing -from pandas._libs.tslibs.nattype import NaTType - if TYPE_CHECKING: from pandas.core.generic import NDFrame From 29c37b68f77ce9d14f74a65247ef79901f51a2bc Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 9 Oct 2021 22:35:27 +0200 Subject: [PATCH 17/43] fix problem without adding tests --- pandas/_libs/hashtable.pyx | 1 + pandas/_libs/index.pyx | 20 ++++++++++++++++++-- pandas/core/generic.py | 3 +++ pandas/core/indexes/base.py | 5 +++++ pandas/core/indexes/multi.py | 15 ++++++++++++++- pandas/core/indexing.py | 1 + pandas/tests/indexes/multi/test_indexing.py | 1 + 7 files changed, 43 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 6e97c13c644cf..d7a2861013a1a 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -121,6 +121,7 @@ cdef class ObjectFactorizer(Factorizer): uniques = ObjectVector() uniques.extend(self.uniques.to_array()) self.uniques = uniques + print('WE ARE IN FACTORIZE') labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value) mask = (labels == na_sentinel) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 92837a43e2b69..fdcc4e61f77c9 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -129,6 +129,7 @@ cdef class IndexEngine: # We assume before we get here: # - val is hashable self._ensure_mapping_populated() + #print('THIS IS THE MAPPING ' + str(self.mapping)) return val in self.mapping cpdef get_loc(self, object val): @@ -158,6 +159,8 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: + #print('THIS IS val: ' + str(val)) + #print('THIS IS MAPPING: ' + str(self.mapping)) return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 @@ -619,13 +622,16 @@ cdef class BaseMultiIndexCodesEngine: # Transform labels in a single array, and add 1 so that we are working # with positive integers (-1 for NaN becomes 0): + #print('\nTHIS ARE LABELS IN INITIALIZATION OF BASEMULTIINDEXCODESENGINE: ' + str(labels) + " THIS IS THE TYPE OF LABELS: " + str(type(labels))) codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', copy=False) + #print('\nTHIS ARE THE CODES: ' + str(codes) + " THIS IS THE TYPE OF CODES: " + str(type(codes)) + '\n') # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the # number of bits to switch labels for each level: lab_ints = self._codes_to_ints(codes) + #print('\nTHIS ARE THE LAB INTS: ' + str(lab_ints) + " THIS IS THE TYPE OF LAB INTS: " + str(type(codes)) + '\n') # Initialize underlying index (e.g. libindex.UInt64Engine) with # integers representing labels: we will use its get_loc and get_indexer @@ -756,13 +762,23 @@ cdef class BaseMultiIndexCodesEngine: return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): + #print('get_loc, we want to get key' + str(key)) if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) + # print('LEVELS: ' + str(type(self.levels[0])) + 'KEYS: ' + str(key)) try: - indices = [0 if checknull(v) else lev.get_loc(v) + 1 - for lev, v in zip(self.levels, key)] + # indices = [0 if checknull(v) else lev.get_loc(v) + 1 + # for lev, v in zip(self.levels, key)] + indices = [] + for key, lev in zip(key, self.levels): + try: + indices.append(lev.get_loc(key) + 1) + except KeyError: + indices.append(0) + # indices = [lev.get_loc(v) + 1 + # for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c3ad87082c8ed..452b02dc0ad96 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1835,6 +1835,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: axis = self._get_axis_number(axis) other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + # import pdb; pdb.set_trace() if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) values = self.xs(key, axis=other_axes[0])._values @@ -3804,6 +3805,8 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): + #import pdb; pdb.set_trace() + import traceback as tb; loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if lib.is_integer(loc): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 05047540c6ccd..274ffd33abe78 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3508,8 +3508,13 @@ def get_loc(self, key, method=None, tolerance=None): "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) + # print(f'key before casting {key}') casted_key = self._maybe_cast_indexer(key) try: + #import pdb; pdb.set_trace() + # print(f'casted key {casted_key}') + # import pdb; pdb.set_trace() + # import traceback as tb; tb.print_stack() return self._engine.get_loc(casted_key) except KeyError as err: raise KeyError(key) from err diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e2f1a2d6a1e23..8d72805e59d4d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -132,7 +132,15 @@ def _codes_to_ints(self, codes): """ # Shift the representation of each level by the pre-calculated number # of bits: + #print(f'codes {codes}, self.offsets {self.offsets}') + #import pdb; pdb.set_trace() codes <<= self.offsets + #if len(codes.shape) == 1 and list(codes) == [4,0]: + # import traceback as tb; tb.print_stack() + #codes = np.array([4, 2]) + #import pdb; pdb.set_trace() + #print('hi') + #print(f'codes after shifting {codes}') # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -3001,7 +3009,12 @@ def maybe_mi_droplevels(indexer, levels): if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc try: - return (self._engine.get_loc(key), None) + engine = self._engine + #import pdb; pdb.set_trace() + # print(f'key before handing it to engine {engine} is key: {key}') + # import pdb; pdb.set_trace() + loc_res = engine.get_loc(key) + return (loc_res, None) except KeyError as err: raise KeyError(key) from err except TypeError: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 918a9ea1b8030..a1c48a3213318 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1140,6 +1140,7 @@ def _get_label(self, label, axis: int): def _handle_lowerdim_multi_index_axis0(self, tup: tuple): # we have an axis0 multi-index, handle or raise + #import pdb; pdb.set_trace() axis = self.axis or 0 try: # fast path for series or for tup devoid of slices diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 99322f474dd9e..e7b67fdc9f9c7 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -625,6 +625,7 @@ def test_get_loc_cast_bool(self): # GH 19086 : int is casted to bool, but not vice-versa levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) + import pdb; pdb.set_trace() assert idx.get_loc((0, 1)) == 1 assert idx.get_loc((1, 0)) == 2 From d274bca24676eab593ec2ef7a2d93bf59e45be63 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 9 Oct 2021 22:42:22 +0200 Subject: [PATCH 18/43] add test, still not all tests running --- pandas/tests/indexing/multiindex/test_getitem.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 3790a6e9a5319..441591047eb62 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -392,3 +392,17 @@ def test_loc_empty_multiindex(): result = df expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) + +def test_loc_nan_multiindex(): + df = DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + result = agg_df.loc[agg_df.index[-1]] + expected = [2, 4] + assert result == expected From f72cbf714a0cf5f41011a13899adf8924e99cb83 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 12:20:23 +0200 Subject: [PATCH 19/43] remove print and debug statements --- pandas/_libs/hashtable.pyx | 1 - pandas/_libs/index.pyx | 28 ++++++--------------- pandas/core/generic.py | 3 --- pandas/core/indexes/base.py | 5 ---- pandas/core/indexes/multi.py | 15 +---------- pandas/core/indexing.py | 1 - pandas/tests/indexes/multi/test_indexing.py | 1 - 7 files changed, 9 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index d7a2861013a1a..6e97c13c644cf 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -121,7 +121,6 @@ cdef class ObjectFactorizer(Factorizer): uniques = ObjectVector() uniques.extend(self.uniques.to_array()) self.uniques = uniques - print('WE ARE IN FACTORIZE') labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, na_value) mask = (labels == na_sentinel) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index fdcc4e61f77c9..d07329678d9f1 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -129,7 +129,6 @@ cdef class IndexEngine: # We assume before we get here: # - val is hashable self._ensure_mapping_populated() - #print('THIS IS THE MAPPING ' + str(self.mapping)) return val in self.mapping cpdef get_loc(self, object val): @@ -159,8 +158,6 @@ cdef class IndexEngine: return self._get_loc_duplicates(val) try: - #print('THIS IS val: ' + str(val)) - #print('THIS IS MAPPING: ' + str(self.mapping)) return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 @@ -622,16 +619,13 @@ cdef class BaseMultiIndexCodesEngine: # Transform labels in a single array, and add 1 so that we are working # with positive integers (-1 for NaN becomes 0): - #print('\nTHIS ARE LABELS IN INITIALIZATION OF BASEMULTIINDEXCODESENGINE: ' + str(labels) + " THIS IS THE TYPE OF LABELS: " + str(type(labels))) codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', copy=False) - #print('\nTHIS ARE THE CODES: ' + str(codes) + " THIS IS THE TYPE OF CODES: " + str(type(codes)) + '\n') # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the # number of bits to switch labels for each level: lab_ints = self._codes_to_ints(codes) - #print('\nTHIS ARE THE LAB INTS: ' + str(lab_ints) + " THIS IS THE TYPE OF LAB INTS: " + str(type(codes)) + '\n') # Initialize underlying index (e.g. libindex.UInt64Engine) with # integers representing labels: we will use its get_loc and get_indexer @@ -762,25 +756,19 @@ cdef class BaseMultiIndexCodesEngine: return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): - #print('get_loc, we want to get key' + str(key)) if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) - # print('LEVELS: ' + str(type(self.levels[0])) + 'KEYS: ' + str(key)) - try: - # indices = [0 if checknull(v) else lev.get_loc(v) + 1 - # for lev, v in zip(self.levels, key)] - indices = [] - for key, lev in zip(key, self.levels): - try: - indices.append(lev.get_loc(key) + 1) - except KeyError: + indices = [] + for k, lev in zip(key, self.levels): + try: + indices.append(lev.get_loc(k) + 1) + except KeyError as err: + if checknull(k): indices.append(0) - # indices = [lev.get_loc(v) + 1 - # for lev, v in zip(self.levels, key)] - except KeyError: - raise KeyError(key) + else: + raise KeyError(key) from err # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 452b02dc0ad96..c3ad87082c8ed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1835,7 +1835,6 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: axis = self._get_axis_number(axis) other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] - # import pdb; pdb.set_trace() if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) values = self.xs(key, axis=other_axes[0])._values @@ -3805,8 +3804,6 @@ class animal locomotion self._consolidate_inplace() if isinstance(index, MultiIndex): - #import pdb; pdb.set_trace() - import traceback as tb; loc, new_index = index._get_loc_level(key, level=0) if not drop_level: if lib.is_integer(loc): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 274ffd33abe78..05047540c6ccd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3508,13 +3508,8 @@ def get_loc(self, key, method=None, tolerance=None): "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) - # print(f'key before casting {key}') casted_key = self._maybe_cast_indexer(key) try: - #import pdb; pdb.set_trace() - # print(f'casted key {casted_key}') - # import pdb; pdb.set_trace() - # import traceback as tb; tb.print_stack() return self._engine.get_loc(casted_key) except KeyError as err: raise KeyError(key) from err diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8d72805e59d4d..e2f1a2d6a1e23 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -132,15 +132,7 @@ def _codes_to_ints(self, codes): """ # Shift the representation of each level by the pre-calculated number # of bits: - #print(f'codes {codes}, self.offsets {self.offsets}') - #import pdb; pdb.set_trace() codes <<= self.offsets - #if len(codes.shape) == 1 and list(codes) == [4,0]: - # import traceback as tb; tb.print_stack() - #codes = np.array([4, 2]) - #import pdb; pdb.set_trace() - #print('hi') - #print(f'codes after shifting {codes}') # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -3009,12 +3001,7 @@ def maybe_mi_droplevels(indexer, levels): if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc try: - engine = self._engine - #import pdb; pdb.set_trace() - # print(f'key before handing it to engine {engine} is key: {key}') - # import pdb; pdb.set_trace() - loc_res = engine.get_loc(key) - return (loc_res, None) + return (self._engine.get_loc(key), None) except KeyError as err: raise KeyError(key) from err except TypeError: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a1c48a3213318..918a9ea1b8030 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1140,7 +1140,6 @@ def _get_label(self, label, axis: int): def _handle_lowerdim_multi_index_axis0(self, tup: tuple): # we have an axis0 multi-index, handle or raise - #import pdb; pdb.set_trace() axis = self.axis or 0 try: # fast path for series or for tup devoid of slices diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index e7b67fdc9f9c7..99322f474dd9e 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -625,7 +625,6 @@ def test_get_loc_cast_bool(self): # GH 19086 : int is casted to bool, but not vice-versa levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) - import pdb; pdb.set_trace() assert idx.get_loc((0, 1)) == 1 assert idx.get_loc((1, 0)) == 2 From 7ab0c9ca567a27bd5a68f5d2b2c8058d940ff437 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 10:23:21 +0000 Subject: [PATCH 20/43] Fixes from pre-commit [automated commit] --- pandas/tests/indexing/multiindex/test_getitem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 441591047eb62..f635e30570709 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -393,6 +393,7 @@ def test_loc_empty_multiindex(): expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) + def test_loc_nan_multiindex(): df = DataFrame( { @@ -402,7 +403,7 @@ def test_loc_nan_multiindex(): } ) - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) result = agg_df.loc[agg_df.index[-1]] expected = [2, 4] assert result == expected From a758d4449e9fa8b9c6395282102592adc681f80c Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 12:53:58 +0200 Subject: [PATCH 21/43] add new test --- .../tests/indexing/multiindex/test_getitem.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index f635e30570709..0cfe166e94d48 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -407,3 +407,28 @@ def test_loc_nan_multiindex(): result = agg_df.loc[agg_df.index[-1]] expected = [2, 4] assert result == expected + +def test_loc_nan_multiindex2(): + arrays = [[1, 1, 1], [np.nan, 100, np.nan]] + names = ("idx1", "idx2") + index = MultiIndex.from_arrays(arrays, names=names) + df = DataFrame([1, 2, 3], index=index, columns=['col1']) + result = df.loc[(1, np.nan)] + + expected = DataFrame([1 , 3], index=MultiIndex.from_arrays([[1,1], [np.nan, np.nan]], names=names), columns=['col1']) + tm.assert_frame_equal(result, expected, check_index_type=False) + + +def test_loc_nan_multiindex(): + df = DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) + result = agg_df.loc[agg_df.index[-1]] + expected = [2, 4] + assert result == expected From c3ec7f0843dcc5e00338bc19e259387867955632 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:21:29 +0200 Subject: [PATCH 22/43] update test as desired in PR comments --- .../tests/indexing/multiindex/test_getitem.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 0cfe166e94d48..b08afeee42375 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -395,6 +395,9 @@ def test_loc_empty_multiindex(): def test_loc_nan_multiindex(): +@pytest.mark.parametrize("dropna", [True, False]) +def test_loc_nan_multiindex(dropna): + # GH 43943 df = DataFrame( { "temp_playlist": [0, 0, 0, 0], @@ -403,9 +406,14 @@ def test_loc_nan_multiindex(): } ) - agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) + agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=dropna)["x"].agg(list) result = agg_df.loc[agg_df.index[-1]] - expected = [2, 4] + if dropna: + # nans are dropped, therefore only [1, 3] is left + expected = [1, 3] + else: + # last index is (0, np.nan) which corresponds to the list [2, 4] + expected = [2, 4] assert result == expected def test_loc_nan_multiindex2(): @@ -419,16 +427,3 @@ def test_loc_nan_multiindex2(): tm.assert_frame_equal(result, expected, check_index_type=False) -def test_loc_nan_multiindex(): - df = DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], - "x": [1, 2, 3, 4], - } - ) - - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=False)["x"].agg(list) - result = agg_df.loc[agg_df.index[-1]] - expected = [2, 4] - assert result == expected From 245b14191edfeb58d5db8884ca4c58134c7292da Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:29:23 +0200 Subject: [PATCH 23/43] add whatsnew entry --- doc/source/whatsnew/v1.3.4.rst | 1 + pandas/tests/indexing/multiindex/test_getitem.py | 15 +-------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index b46744d51d74d..e63f500f0ab52 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) - Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`) - Fixed regression in :meth:`DataFrame.groupby.rolling.cov` and :meth:`DataFrame.groupby.rolling.corr` computing incorrect results if the input groupings were not sorted (:issue:`43386`) +- Fixed regression in :meth:`DataFrame.loc` when `MultiIndex` contained `np.nan` (:issue`43814`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index b08afeee42375..baa98601f5a9b 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -394,7 +394,6 @@ def test_loc_empty_multiindex(): tm.assert_frame_equal(result, expected) -def test_loc_nan_multiindex(): @pytest.mark.parametrize("dropna", [True, False]) def test_loc_nan_multiindex(dropna): # GH 43943 @@ -406,7 +405,7 @@ def test_loc_nan_multiindex(dropna): } ) - agg_df = df.groupby(by=['temp_playlist', 'objId'], dropna=dropna)["x"].agg(list) + agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) result = agg_df.loc[agg_df.index[-1]] if dropna: # nans are dropped, therefore only [1, 3] is left @@ -415,15 +414,3 @@ def test_loc_nan_multiindex(dropna): # last index is (0, np.nan) which corresponds to the list [2, 4] expected = [2, 4] assert result == expected - -def test_loc_nan_multiindex2(): - arrays = [[1, 1, 1], [np.nan, 100, np.nan]] - names = ("idx1", "idx2") - index = MultiIndex.from_arrays(arrays, names=names) - df = DataFrame([1, 2, 3], index=index, columns=['col1']) - result = df.loc[(1, np.nan)] - - expected = DataFrame([1 , 3], index=MultiIndex.from_arrays([[1,1], [np.nan, np.nan]], names=names), columns=['col1']) - tm.assert_frame_equal(result, expected, check_index_type=False) - - From ae706c87936e4e584cf6463256c9e67aed78ba51 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 10 Oct 2021 20:30:52 +0200 Subject: [PATCH 24/43] fix wrong issue number --- pandas/tests/indexing/multiindex/test_getitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index baa98601f5a9b..c9242789f1486 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -396,7 +396,7 @@ def test_loc_empty_multiindex(): @pytest.mark.parametrize("dropna", [True, False]) def test_loc_nan_multiindex(dropna): - # GH 43943 + # GH 43814 df = DataFrame( { "temp_playlist": [0, 0, 0, 0], From 30097ca3eacfa90e369a7a5d6ff15bbe739ac720 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 12 Oct 2021 19:45:14 +0200 Subject: [PATCH 25/43] save nan encoding in new state variable and replace it before returning the output --- pandas/_libs/index.pyx | 14 +++++------- pandas/core/groupby/grouper.py | 9 ++++++++ pandas/core/groupby/ops.py | 17 +++++++++++++- pandas/tests/groupby/test_groupby_dropna.py | 18 +++++++++++++++ .../tests/indexing/multiindex/test_getitem.py | 22 ------------------- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d07329678d9f1..92837a43e2b69 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -760,15 +760,11 @@ cdef class BaseMultiIndexCodesEngine: raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) - indices = [] - for k, lev in zip(key, self.levels): - try: - indices.append(lev.get_loc(k) + 1) - except KeyError as err: - if checknull(k): - indices.append(0) - else: - raise KeyError(key) from err + try: + indices = [0 if checknull(v) else lev.get_loc(v) + 1 + for lev, v in zip(self.levels, key)] + except KeyError: + raise KeyError(key) # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype='uint64')) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7577b1e671d60..f62964ff378c0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -486,6 +486,7 @@ def __init__( self._observed = observed self.in_axis = in_axis self._dropna = dropna + self._na_placeholder = None self._passed_categorical = False @@ -690,6 +691,14 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: codes, uniques = algorithms.factorize( self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) + # GH43943, store placeholder for np.nan, will later be replaced by -1 in + # pandas/core/groupby:reconstructed_codes + if not self._dropna: + if any( + isinstance(v, float) and np.isnan(v) for v in self.grouping_vector + ): + self._na_placeholder = max(codes) + return codes, uniques @cache_readonly diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 60c8851f059fe..b65a600634095 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -859,7 +859,22 @@ def ngroups(self) -> int: def reconstructed_codes(self) -> list[np.ndarray]: codes = self.codes ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + reconstructed_codes = decons_obs_group_ids( + ids, obs_ids, self.shape, codes, xnull=True + ) + + def transform_codes(code_level, grouping): + if grouping._na_placeholder is not None: + return np.where(code_level == max(code_level), -1, code_level) + else: + return code_level + + # import pdb; pdb.set_trace() + transformed_rec_codes = [ + transform_codes(code_level, grouping) + for code_level, grouping in zip(reconstructed_codes, self._groupings) + ] + return transformed_rec_codes @final @cache_readonly diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab568e24ff029..b8f3bb2c4c8a8 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -370,3 +370,21 @@ def test_groupby_nan_included(): tm.assert_numpy_array_equal(result_values, expected_values) assert np.isnan(list(result.keys())[2]) assert list(result.keys())[0:2] == ["g1", "g2"] + + +def test_groupby_codes_with_nan_in_multiindex(): + # GH 43814 + df = pd.DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", np.nan, "o1", np.nan], + "x": [1, 2, 3, 4], + } + ) + + grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() + expected = pd.MultiIndex.from_arrays( + [[0, 0], ["o1", np.nan]], names=["temp_playlist", "objId"] + ) + result = grouped_df.index + assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c9242789f1486..3790a6e9a5319 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -392,25 +392,3 @@ def test_loc_empty_multiindex(): result = df expected = DataFrame([1, 2, 3, 4], index=index, columns=["value"]) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_loc_nan_multiindex(dropna): - # GH 43814 - df = DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], - "x": [1, 2, 3, 4], - } - ) - - agg_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].agg(list) - result = agg_df.loc[agg_df.index[-1]] - if dropna: - # nans are dropped, therefore only [1, 3] is left - expected = [1, 3] - else: - # last index is (0, np.nan) which corresponds to the list [2, 4] - expected = [2, 4] - assert result == expected From ef951a6e90edd7bf6f0571cf47828fc9760ca9e3 Mon Sep 17 00:00:00 2001 From: CloseChoice Date: Tue, 12 Oct 2021 21:56:10 +0200 Subject: [PATCH 26/43] remove wrong whatsnew entry --- doc/source/whatsnew/v1.3.4.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index e63f500f0ab52..b46744d51d74d 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -27,7 +27,6 @@ Fixed regressions - Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`) - Fixed regression when iterating over a :class:`DataFrame.groupby.rolling` object causing the resulting DataFrames to have an incorrect index if the input groupings were not sorted (:issue:`43386`) - Fixed regression in :meth:`DataFrame.groupby.rolling.cov` and :meth:`DataFrame.groupby.rolling.corr` computing incorrect results if the input groupings were not sorted (:issue:`43386`) -- Fixed regression in :meth:`DataFrame.loc` when `MultiIndex` contained `np.nan` (:issue`43814`) .. --------------------------------------------------------------------------- From 0fa1784088939191d8eac8dfe38525f63b37be9e Mon Sep 17 00:00:00 2001 From: CloseChoice Date: Tue, 12 Oct 2021 22:00:49 +0200 Subject: [PATCH 27/43] remove debug statement --- pandas/core/groupby/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b65a600634095..fe4e14f992606 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -869,7 +869,6 @@ def transform_codes(code_level, grouping): else: return code_level - # import pdb; pdb.set_trace() transformed_rec_codes = [ transform_codes(code_level, grouping) for code_level, grouping in zip(reconstructed_codes, self._groupings) From 1f780234f5cccdd4674a72759e6a4ef83c8c934b Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 13 Oct 2021 13:58:39 +0200 Subject: [PATCH 28/43] add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e13e6380905f2..db677a50b35f0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -591,7 +591,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) - +- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a `KeyError` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) Reshaping ^^^^^^^^^ From 5b3c61d032af081db8dd9626a62a42bcb228a4e4 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 17 Oct 2021 20:39:15 +0200 Subject: [PATCH 29/43] fix wrong code quotes in whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index db677a50b35f0..3c40243d5a4f3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -591,7 +591,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) -- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a `KeyError` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) +- Bug in :meth:`DataFrame.groupby` when grouping on multiple columns where at least one includes ``np.nan`` which resulted in a ``KeyError`` when the ``np.nan`` containing index was selected with :meth:`Series.loc` (:issue:`43814`) Reshaping ^^^^^^^^^ From 98c8f13cdeedd2024ae0118d2f18550bd15c3f65 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 22 Oct 2021 06:44:53 +0200 Subject: [PATCH 30/43] WIP: add support for (pd.NA, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns") --- pandas/core/groupby/grouper.py | 8 +++--- pandas/core/groupby/ops.py | 3 ++- pandas/tests/groupby/test_groupby_dropna.py | 28 ++++++++++++++++++--- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f62964ff378c0..43023264c5faf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -34,6 +34,7 @@ Categorical, ExtensionArray, ) +from pandas.core.base import isna import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import ops @@ -50,6 +51,8 @@ from pandas.io.formats.printing import pprint_thing +from pandas._libs.tslibs.nattype import NaTType + if TYPE_CHECKING: from pandas.core.generic import NDFrame @@ -693,10 +696,9 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: ) # GH43943, store placeholder for np.nan, will later be replaced by -1 in # pandas/core/groupby:reconstructed_codes + import pdb; pdb.set_trace() if not self._dropna: - if any( - isinstance(v, float) and np.isnan(v) for v in self.grouping_vector - ): + if isna(self.grouping_vector).any(): self._na_placeholder = max(codes) return codes, uniques diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fe4e14f992606..1e95bf8fa1f3e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -863,9 +863,10 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, self.shape, codes, xnull=True ) + import pdb; pdb.set_trace() def transform_codes(code_level, grouping): if grouping._na_placeholder is not None: - return np.where(code_level == max(code_level), -1, code_level) + return np.where(code_level == grouping._na_placeholder, -1, code_level) else: return code_level diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b8f3bb2c4c8a8..ed468b8d67546 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -372,19 +372,41 @@ def test_groupby_nan_included(): assert list(result.keys())[0:2] == ["g1", "g2"] -def test_groupby_codes_with_nan_in_multiindex(): +@pytest.mark.parametrize( + "na", [np.nan, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")] +) +def test_groupby_codes_with_nan_in_multiindex(na): + # GH 43814 + df = pd.DataFrame( + { + "temp_playlist": [0, 0, 0, 0], + "objId": ["o1", na, "o1", na], + "x": [1, 2, 3, 4], + } + ) + + grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() + expected = pd.MultiIndex.from_arrays( + [[0, 0], ["o1", na]], names=["temp_playlist", "objId"] + ) + result = grouped_df.index + assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) + + +def test_groupby_codes_with_pd_nat_in_multiindex(): # GH 43814 df = pd.DataFrame( { "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", np.nan, "o1", np.nan], + "objId": ["o1", pd.NaT, "o1", pd.NaT], "x": [1, 2, 3, 4], } ) grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() expected = pd.MultiIndex.from_arrays( - [[0, 0], ["o1", np.nan]], names=["temp_playlist", "objId"] + [[0, 0], ["o1", pd.NaT]], names=["temp_playlist", "objId"] ) result = grouped_df.index + # import pdb; pdb.set_trace() assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) From ec218d52d02bbfc6879fb045a8c27c5d8ca1679b Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 22 Oct 2021 06:58:40 +0200 Subject: [PATCH 31/43] WIP: add support for (pd.NA, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns") --- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/groupby/ops.py | 1 - pandas/tests/groupby/test_groupby_dropna.py | 19 ------------------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 43023264c5faf..ad9763a68bc66 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -694,9 +694,9 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: codes, uniques = algorithms.factorize( self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) - # GH43943, store placeholder for np.nan, will later be replaced by -1 in + # GH43943, store placeholder for (np.nan, pd.NaT, np.datetime64("NaT", "ns"), + # np.timedelta64("NaT", "ns")), will later be replaced by -1 in # pandas/core/groupby:reconstructed_codes - import pdb; pdb.set_trace() if not self._dropna: if isna(self.grouping_vector).any(): self._na_placeholder = max(codes) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1e95bf8fa1f3e..8c1e64aa53f70 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -863,7 +863,6 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, self.shape, codes, xnull=True ) - import pdb; pdb.set_trace() def transform_codes(code_level, grouping): if grouping._na_placeholder is not None: return np.where(code_level == grouping._na_placeholder, -1, code_level) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ed468b8d67546..2485276503ac9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -391,22 +391,3 @@ def test_groupby_codes_with_nan_in_multiindex(na): ) result = grouped_df.index assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) - - -def test_groupby_codes_with_pd_nat_in_multiindex(): - # GH 43814 - df = pd.DataFrame( - { - "temp_playlist": [0, 0, 0, 0], - "objId": ["o1", pd.NaT, "o1", pd.NaT], - "x": [1, 2, 3, 4], - } - ) - - grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() - expected = pd.MultiIndex.from_arrays( - [[0, 0], ["o1", pd.NaT]], names=["temp_playlist", "objId"] - ) - result = grouped_df.index - # import pdb; pdb.set_trace() - assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) From 6850b73c8cf42932c3c2cac08b56c617ff433903 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 6 Nov 2021 16:05:43 +0100 Subject: [PATCH 32/43] add tests for issue 36060 --- pandas/tests/groupby/test_groupby_dropna.py | 29 ++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 2485276503ac9..70e6061f7f213 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -373,7 +373,8 @@ def test_groupby_nan_included(): @pytest.mark.parametrize( - "na", [np.nan, pd.NaT, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")] + "na", + [np.nan, pd.NaT, pd.NA, np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns")], ) def test_groupby_codes_with_nan_in_multiindex(na): # GH 43814 @@ -391,3 +392,29 @@ def test_groupby_codes_with_nan_in_multiindex(na): ) result = grouped_df.index assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) + + +def test_groupby_multiindex_multiply_with_series(): + # GH#36060 + df = pd.DataFrame( + { + "animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + "type": [np.nan, np.nan, np.nan, np.nan], + "speed": [380.0, 370.0, 24.0, 26.0], + } + ) + speed = df.groupby(["animal", "type"], dropna=False)["speed"].first() + # Reconstruct same index to allow for multiplication. + ix_wing = pd.MultiIndex.from_tuples( + [("Falcon", np.nan), ("Parrot", np.nan)], names=["animal", "type"] + ) + wing = pd.Series([42, 44], index=ix_wing) + + result = wing * speed + expected = pd.Series( + [15960.0, 1056.0], + index=pd.MultiIndex.from_tuples( + [("Falcon", np.nan), ("Parrot", np.nan)], names=["animal", "type"] + ), + ) + tm.assert_series_equal(result, expected) From e7a52e33a3ef1fc616f70265764ef279e592b5d1 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 6 Nov 2021 15:14:40 +0000 Subject: [PATCH 33/43] Fixes from pre-commit [automated commit] --- pandas/core/groupby/grouper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 86cb3c58e28ba..1a044fd5ec1aa 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -52,8 +52,6 @@ from pandas.io.formats.printing import pprint_thing -from pandas._libs.tslibs.nattype import NaTType - if TYPE_CHECKING: from pandas.core.generic import NDFrame From 9682350af8aa915cf7c4a684e5abf52406c3cfa2 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 7 Nov 2021 18:58:57 +0100 Subject: [PATCH 34/43] changes according to PR discussions --- pandas/core/groupby/grouper.py | 4 ++-- pandas/core/groupby/ops.py | 15 ++------------- pandas/core/sorting.py | 27 +++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 86cb3c58e28ba..c5a481e1207cf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -490,7 +490,7 @@ def __init__( self._observed = observed self.in_axis = in_axis self._dropna = dropna - self._na_placeholder = None + self._has_na_placeholder = False self._passed_categorical = False @@ -700,7 +700,7 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: # pandas/core/groupby:reconstructed_codes if not self._dropna: if isna(self.grouping_vector).any(): - self._na_placeholder = max(codes) + self._has_na_placeholder = True return codes, uniques diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8c1e64aa53f70..ee3572e8b90bf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -860,20 +860,9 @@ def reconstructed_codes(self) -> list[np.ndarray]: codes = self.codes ids, obs_ids, _ = self.group_info reconstructed_codes = decons_obs_group_ids( - ids, obs_ids, self.shape, codes, xnull=True + ids, obs_ids, self.shape, codes, xnull=True, groupings=self._groupings ) - - def transform_codes(code_level, grouping): - if grouping._na_placeholder is not None: - return np.where(code_level == grouping._na_placeholder, -1, code_level) - else: - return code_level - - transformed_rec_codes = [ - transform_codes(code_level, grouping) - for code_level, grouping in zip(reconstructed_codes, self._groupings) - ] - return transformed_rec_codes + return reconstructed_codes @final @cache_readonly diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7813182222d67..fb3b3e1ed8264 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -41,6 +41,7 @@ if TYPE_CHECKING: from pandas import MultiIndex from pandas.core.indexes.base import Index + from pandas.core.groupby.grouper import Grouping def get_indexer_indexer( @@ -243,7 +244,12 @@ def decons_group_index(comp_labels, shape): def decons_obs_group_ids( - comp_ids: npt.NDArray[np.intp], obs_ids, shape, labels, xnull: bool + comp_ids: npt.NDArray[np.intp], + obs_ids, + shape, + labels, + xnull: bool, + groupings: list[Grouping] = None, ): """ Reconstruct labels from observed group ids. @@ -254,6 +260,21 @@ def decons_obs_group_ids( xnull : bool If nulls are excluded; i.e. -1 labels are passed through. """ + + def transform_codes(code_level, grouping): + if grouping._has_na_placeholder: + return np.where(code_level == max(code_level), -1, code_level) + else: + return code_level + + def reconstruct_na_in_codes(codes): + if groupings: + codes = [ + transform_codes(code_level, grouping) + for code_level, grouping in zip(codes, groupings) + ] + return codes + if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") shape = np.asarray(shape, dtype="i8") + lift @@ -261,10 +282,12 @@ def decons_obs_group_ids( if not is_int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! out = decons_group_index(obs_ids, shape) + out = reconstruct_na_in_codes(out) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] indexer = unique_label_indices(comp_ids) - return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] + out = [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] + return reconstruct_na_in_codes(out) def indexer_from_factorized( From cb92f953125711921f6467e48cab25d4b2bb5c46 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sun, 7 Nov 2021 19:14:13 +0100 Subject: [PATCH 35/43] use assert_series_equal in test_groupby_dropna --- pandas/tests/groupby/test_groupby_dropna.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 70e6061f7f213..797b0d5dd36d0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -386,12 +386,20 @@ def test_groupby_codes_with_nan_in_multiindex(na): } ) - grouped_df = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() + result = df.groupby(by=["temp_playlist", "objId"], dropna=False)["x"].sum() expected = pd.MultiIndex.from_arrays( [[0, 0], ["o1", na]], names=["temp_playlist", "objId"] ) - result = grouped_df.index - assert all((res == ex).all() for res, ex in zip(result.codes, expected.codes)) + expected = pd.Series( + [4, 6], + index=pd.MultiIndex( + levels=[[0], ["o1", np.nan]], + codes=[[0, 0], [0, -1]], + names=["temp_playlist", "objId"], + ), + name="x", + ) + tm.assert_series_equal(result, expected) def test_groupby_multiindex_multiply_with_series(): From e3c734d40c7e0c91774fadd6eb74e25d4c4ab590 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 10 Nov 2021 17:21:52 +0100 Subject: [PATCH 36/43] make _has_na_placeholder cache_readonly --- pandas/core/groupby/grouper.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7b281f53443eb..20f0cca9ff61e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -488,7 +488,6 @@ def __init__( self._observed = observed self.in_axis = in_axis self._dropna = dropna - self._has_na_placeholder = False self._passed_categorical = False @@ -603,6 +602,9 @@ def _ilevel(self) -> int | None: return index.names.index(level) return level + # @cache_readonly + # def _has + @property def ngroups(self) -> int: return len(self.group_index) @@ -693,12 +695,6 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: codes, uniques = algorithms.factorize( self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) - # GH43943, store placeholder for (np.nan, pd.NaT, np.datetime64("NaT", "ns"), - # np.timedelta64("NaT", "ns")), will later be replaced by -1 in - # pandas/core/groupby:reconstructed_codes - if not self._dropna: - if isna(self.grouping_vector).any(): - self._has_na_placeholder = True return codes, uniques @@ -706,6 +702,15 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: def groups(self) -> dict[Hashable, np.ndarray]: return self._index.groupby(Categorical.from_codes(self.codes, self.group_index)) + @cache_readonly + def _has_na_placeholder(self) -> bool: + # GH43943, store placeholder for (np.nan, pd.NaT, np.datetime64("NaT", "ns"), + # np.timedelta64("NaT", "ns")), is used to replaced codes correctly to -1 + # pandas/core/groupby:reconstructed_codes + if not self._dropna: + if isna(self.grouping_vector).any(): + return True + return False def get_grouper( obj: NDFrameT, From cf48e70ac063d82dae2ea91183530c86937ab8e8 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 10 Nov 2021 17:22:21 +0100 Subject: [PATCH 37/43] make _has_na_placeholder cache_readonly --- pandas/core/groupby/grouper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 20f0cca9ff61e..333853f232439 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -712,6 +712,7 @@ def _has_na_placeholder(self) -> bool: return True return False + def get_grouper( obj: NDFrameT, key=None, From 49b1d655e0b28b32a0f8d94a155daa046c898b77 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 10 Nov 2021 16:25:29 +0000 Subject: [PATCH 38/43] Fixes from pre-commit [automated commit] --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index fb3b3e1ed8264..a953bc1edd557 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -40,8 +40,8 @@ if TYPE_CHECKING: from pandas import MultiIndex - from pandas.core.indexes.base import Index from pandas.core.groupby.grouper import Grouping + from pandas.core.indexes.base import Index def get_indexer_indexer( From d00b39ff0bfef9032c192a9c655196328bcd2fed Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 10 Nov 2021 17:35:05 +0100 Subject: [PATCH 39/43] remove commented out stuff --- pandas/core/groupby/grouper.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 333853f232439..1045747c98553 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -602,9 +602,6 @@ def _ilevel(self) -> int | None: return index.names.index(level) return level - # @cache_readonly - # def _has - @property def ngroups(self) -> int: return len(self.group_index) From d1809012630420e148b68496efa3b16baa326596 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 10 Nov 2021 18:18:46 +0100 Subject: [PATCH 40/43] remove unnecessary import --- pandas/core/groupby/grouper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1045747c98553..c9e13d3923aa3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -14,7 +14,6 @@ import numpy as np -from pandas._libs.tslibs.nattype import NaTType from pandas._typing import ( ArrayLike, NDFrameT, From cc6af9d732b23c9da605193e6007ffb844512d79 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 16 Nov 2021 05:53:35 +0100 Subject: [PATCH 41/43] WIP: intermediate commit for loop solution --- pandas/core/groupby/ops.py | 8 +++++++- pandas/core/sorting.py | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ee3572e8b90bf..f5de5e2a346fc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -859,8 +859,14 @@ def ngroups(self) -> int: def reconstructed_codes(self) -> list[np.ndarray]: codes = self.codes ids, obs_ids, _ = self.group_info + # get the levels in which to reconstruct codes for na + levels_with_na = [ + idx + for idx, grouping in enumerate(self.groupings) + if grouping._has_na_placeholder + ] reconstructed_codes = decons_obs_group_ids( - ids, obs_ids, self.shape, codes, xnull=True, groupings=self._groupings + ids, obs_ids, self.shape, codes, xnull=True, levels_with_na=levels_with_na ) return reconstructed_codes diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a953bc1edd557..f9cf616396244 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -249,7 +249,7 @@ def decons_obs_group_ids( shape, labels, xnull: bool, - groupings: list[Grouping] = None, + levels_with_na: list[int] = None, ): """ Reconstruct labels from observed group ids. @@ -261,19 +261,19 @@ def decons_obs_group_ids( If nulls are excluded; i.e. -1 labels are passed through. """ - def transform_codes(code_level, grouping): - if grouping._has_na_placeholder: - return np.where(code_level == max(code_level), -1, code_level) - else: - return code_level - def reconstruct_na_in_codes(codes): - if groupings: - codes = [ - transform_codes(code_level, grouping) - for code_level, grouping in zip(codes, groupings) - ] - return codes + new_codes = [] + if levels_with_na: + for idx, code_level in enumerate(codes): + if idx in levels_with_na: + new_codes.append( + np.where(code_level == max(code_level), -1, code_level) + ) + else: + new_codes.append(code_level) + else: + new_codes = codes + return new_codes if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") From 02bf699a0d15035a7df5987a6ae1410dd8e3dad7 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 16 Nov 2021 05:56:38 +0100 Subject: [PATCH 42/43] changes for static analysis checks --- pandas/core/sorting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f9cf616396244..5bf0ccd91ce59 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -40,7 +40,6 @@ if TYPE_CHECKING: from pandas import MultiIndex - from pandas.core.groupby.grouper import Grouping from pandas.core.indexes.base import Index @@ -249,7 +248,7 @@ def decons_obs_group_ids( shape, labels, xnull: bool, - levels_with_na: list[int] = None, + levels_with_na: list[int] | None = None, ): """ Reconstruct labels from observed group ids. From 45204444966702d799c301fa1b5055db7895f90c Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Wed, 8 Dec 2021 21:17:48 +0100 Subject: [PATCH 43/43] fix imports --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fab0f6f879db8..e57de0f34a876 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -35,7 +35,7 @@ Categorical, ExtensionArray, ) -from pandas.core.base import isna +from pandas.core.dtypes.missing import isna import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.groupby import ops