From 54b012497a730418fc304d4db646296b24ecb763 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 9 Jul 2021 19:12:11 -0700 Subject: [PATCH] BUG: string slicing on MultiIndex DatetimeIndex level --- pandas/core/indexes/base.py | 11 --- pandas/core/indexes/multi.py | 74 ++++++++++--------- pandas/core/indexing.py | 7 -- .../indexes/datetimes/test_partial_slicing.py | 10 +-- .../indexes/multi/test_partial_indexing.py | 9 ++- pandas/tests/indexing/multiindex/test_loc.py | 20 ++--- 6 files changed, 63 insertions(+), 68 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f1d85f1340a4..961ac2f857245 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -43,7 +43,6 @@ DtypeObj, F, Shape, - T, final, ) from pandas.compat.numpy import function as nv @@ -3720,16 +3719,6 @@ def _filter_indexer_tolerance( # -------------------------------------------------------------------- # Indexer Conversion Methods - def _get_partial_string_timestamp_match_key(self, key: T) -> T: - """ - Translate any partial string timestamp matches in key, returning the - new key. - - Only relevant for MultiIndex. - """ - # GH#10331 - return key - @final def _validate_positional_slice(self, key: slice) -> None: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8903d29782610..bb78498f4fa23 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2577,35 +2577,6 @@ def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None: return indexer - def _get_partial_string_timestamp_match_key(self, key): - """ - Translate any partial string timestamp matches in key, returning the - new key. - - Only relevant for MultiIndex. - """ - # GH#10331 - if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing: - # Convert key '2016-01-01' to - # ('2016-01-01'[, slice(None, None, None)]+) - key = (key,) + (slice(None),) * (len(self.levels) - 1) - - if isinstance(key, tuple): - # Convert (..., '2016-01-01', ...) in tuple to - # (..., slice('2016-01-01', '2016-01-01', None), ...) - new_key = [] - for i, component in enumerate(key): - if ( - isinstance(component, str) - and self.levels[i]._supports_partial_string_indexing - ): - new_key.append(slice(component, component, None)) - else: - new_key.append(component) - key = tuple(new_key) - - return key - def get_slice_bound( self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None ) -> int: @@ -2854,7 +2825,12 @@ def _maybe_to_slice(loc): ) if keylen == self.nlevels and self.is_unique: - return self._engine.get_loc(key) + try: + return self._engine.get_loc(key) + except TypeError: + # e.g. partial string slicing + loc, _ = self.get_loc_level(key, list(range(self.nlevels))) + return loc # -- partial selection or non-unique index # break the key into 2 parts based on the lexsort_depth of the index; @@ -3004,6 +2980,10 @@ def maybe_mi_droplevels(indexer, levels): return (self._engine.get_loc(key), None) except KeyError as err: raise KeyError(key) from err + except TypeError: + # e.g. partial string indexing + # test_partial_string_timestamp_multiindex + pass # partial selection indexer = self.get_loc(key) @@ -3015,7 +2995,19 @@ def maybe_mi_droplevels(indexer, levels): # TODO: in some cases we still need to drop some levels, # e.g. test_multiindex_perf_warn - ilevels = [] + # test_partial_string_timestamp_multiindex + ilevels = [ + i + for i in range(len(key)) + if ( + not isinstance(key[i], str) + or not self.levels[i]._supports_partial_string_indexing + ) + and key[i] != slice(None, None) + ] + if len(ilevels) == self.nlevels: + # TODO: why? + ilevels = [] return indexer, maybe_mi_droplevels(indexer, ilevels) else: @@ -3056,6 +3048,16 @@ def maybe_mi_droplevels(indexer, levels): return indexer, maybe_mi_droplevels(indexer, ilevels) else: indexer = self._get_level_indexer(key, level=level) + if ( + isinstance(key, str) + and self.levels[level]._supports_partial_string_indexing + ): + # check to see if we did an exact lookup vs sliced + check = self.levels[level].get_loc(key) + if not is_integer(check): + # e.g. test_partial_string_timestamp_multiindex + return indexer, self[indexer] + return indexer, maybe_mi_droplevels(indexer, [level]) def _get_level_indexer( @@ -3153,6 +3155,10 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): if level > 0 or self._lexsort_depth == 0: # Desired level is not sorted + if isinstance(idx, slice): + locs = (level_codes >= idx.start) & (level_codes < idx.stop) + return locs + locs = np.array(level_codes == idx, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but unused: @@ -3160,8 +3166,10 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): return locs if isinstance(idx, slice): - start = idx.start - end = idx.stop + # e.g. test_partial_string_timestamp_multiindex + start = level_codes.searchsorted(idx.start, side="left") + # NB: "left" here bc of slice semantics + end = level_codes.searchsorted(idx.stop, side="left") else: start = level_codes.searchsorted(idx, side="left") end = level_codes.searchsorted(idx, side="right") diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 387dcca6897b7..ec9a233c3ab0d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1129,12 +1129,6 @@ def _handle_lowerdim_multi_index_axis0(self, tup: tuple): try: # fast path for series or for tup devoid of slices return self._get_label(tup, axis=axis) - except TypeError as err: - # slices are unhashable - # FIXME: this raises when we have a DatetimeIndex first level and a - # string for the first tup entry - # see test_partial_slicing_with_multiindex - raise IndexingError("No label returned") from err except KeyError as ek: # raise KeyError if number of indexers match @@ -1149,7 +1143,6 @@ def _getitem_axis(self, key, axis: int): key = list(key) labels = self.obj._get_axis(axis) - key = labels._get_partial_string_timestamp_match_key(key) if isinstance(key, slice): self._validate_key(key, axis) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 87c56ea588a5d..c5b47053471eb 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -16,7 +16,6 @@ date_range, ) import pandas._testing as tm -from pandas.core.indexing import IndexingError class TestSlicing: @@ -337,11 +336,10 @@ def test_partial_slicing_with_multiindex(self): result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] tm.assert_series_equal(result, expected) - # this is an IndexingError as we don't do partial string selection on - # multi-levels. - msg = "Too many indexers" - with pytest.raises(IndexingError, match=msg): - df_multi.loc[("2013-06-19", "ACCT1", "ABC")] + # partial string indexing on first level, scalar indexing on the other two + result = df_multi.loc[("2013-06-19", "ACCT1", "ABC")] + expected = df_multi.iloc[:1].droplevel([1, 2]) + tm.assert_frame_equal(result, expected) def test_partial_slicing_with_multiindex_series(self): # GH 4294 diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 286522f6b946d..fb34fc7f570ba 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -72,7 +72,9 @@ def test_partial_string_timestamp_multiindex(df): # partial string match on date and hour, from middle result = df.loc["2016-01-02 12"] - expected = df.iloc[9:12] + # hourly resolution, same as index.levels[0], so we are _not_ slicing on + # that level, so that level gets dropped + expected = df.iloc[9:12].droplevel(0) tm.assert_frame_equal(result, expected) # partial string match on secondary index @@ -81,11 +83,14 @@ def test_partial_string_timestamp_multiindex(df): tm.assert_frame_equal(result, expected) # tuple selector with partial string match on date + # "2016-01-01" has daily resolution, so _is_ a slice on the first level. result = df.loc[("2016-01-01", "a"), :] expected = df.iloc[[0, 3]] + expected = df.iloc[[0, 3]].droplevel(1) tm.assert_frame_equal(result, expected) - # Slicing date on first level should break (of course) + # Slicing date on first level should break (of course) bc the DTI is the + # second level on df_swap with pytest.raises(KeyError, match="'2016-01-01'"): df_swap.loc["2016-01-01"] diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index bc59c51e359ae..b87ce82f18509 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -547,15 +547,17 @@ def test_loc_period_string_indexing(): ), ) result = df.loc[("2013Q1", 1111), "OMS"] - expected = Series( - [np.nan], - dtype=object, - name="OMS", - index=MultiIndex.from_tuples( - [(pd.Period("2013Q1"), 1111)], names=["Period", "CVR"] - ), - ) - tm.assert_series_equal(result, expected) + + alt = df.loc[(a[0], 1111), "OMS"] + assert np.isnan(alt) + + # Because the resolution of the string matches, it is an exact lookup, + # not a slice + assert np.isnan(result) + + # TODO: should it figure this out? + # alt = df.loc["2013Q1", 1111, "OMS"] + # assert np.isnan(alt) def test_loc_datetime_mask_slicing():