From c3c5d64244b36feae38a2ab01b2d17669507c55e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 2 Feb 2020 18:23:58 -0800 Subject: [PATCH 1/4] CLN: remove IndexEngine.get_value --- pandas/_libs/index.pyx | 15 --------------- pandas/core/indexes/multi.py | 3 ++- pandas/core/series.py | 5 +++-- pandas/tests/indexes/multi/test_get_set.py | 2 -- 4 files changed, 5 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index b39afc57f34f6..fc22d65ce5d74 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -72,21 +72,6 @@ cdef class IndexEngine: self._ensure_mapping_populated() return val in self.mapping - cpdef get_value(self, ndarray arr, object key, object tz=None): - """ - Parameters - ---------- - arr : 1-dimensional ndarray - """ - cdef: - object loc - - loc = self.get_loc(key) - if isinstance(loc, slice) or util.is_array(loc): - return arr[loc] - else: - return get_value_at(arr, loc, tz=tz) - cpdef get_loc(self, object val): cdef: Py_ssize_t loc diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c560d81ba95f6..c305b989e9a25 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2329,7 +2329,8 @@ def _try_mi(k): ).__finalize__(self) try: - return self._engine.get_value(s, k) + loc = self._engine.get_loc(k) + return series.iloc[loc] except KeyError as e1: try: return _try_mi(key) diff --git a/pandas/core/series.py b/pandas/core/series.py index bfe9969daaa8e..040fcf392733b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,6 +23,7 @@ from pandas._config import get_option from pandas._libs import lib, properties, reshape, tslibs +from pandas._libs.index import validate_numeric_casting from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -1022,7 +1023,7 @@ def __setitem__(self, key, value): def _set_with_engine(self, key, value): # fails with AttributeError for IntervalIndex loc = self.index._engine.get_loc(key) - libindex.validate_numeric_casting(self.dtype, value) + validate_numeric_casting(self.dtype, value) self._values[loc] = value def _set_with(self, key, value): @@ -1105,7 +1106,7 @@ def _set_value(self, label, value, takeable: bool = False): self._values[label] = value else: loc = self.index.get_loc(label) - libindex.validate_numeric_casting(self.dtype, value) + validate_numeric_casting(self.dtype, value) self._values[loc] = value except KeyError: diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 074072ae581b2..f77458957f03a 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -57,8 +57,6 @@ def test_get_value_duplicates(): ) assert index.get_loc("D") == slice(0, 3) - with pytest.raises(KeyError, match=r"^'D'$"): - index._engine.get_value(np.array([]), "D") def test_get_level_values_all_na(): From 7672f6aa4eb9140ccfe9d5520ab2d0e7bdbfc56e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 3 Feb 2020 17:42:58 -0800 Subject: [PATCH 2/4] checkpoint with tests passing --- pandas/_libs/index.pyx | 10 ---- pandas/_libs/util.pxd | 48 ------------------- pandas/core/arrays/sparse/array.py | 9 +++- pandas/core/indexes/multi.py | 10 +++- .../tests/indexing/multiindex/test_getitem.py | 4 +- 5 files changed, 17 insertions(+), 64 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index fc22d65ce5d74..db88845c5e390 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -35,16 +35,6 @@ cdef inline bint is_definitely_invalid_key(object val): return False -cpdef get_value_at(ndarray arr, object loc, object tz=None): - obj = util.get_value_at(arr, loc) - - if arr.descr.type_num == NPY_DATETIME: - return Timestamp(obj, tz=tz) - elif arr.descr.type_num == NPY_TIMEDELTA: - return Timedelta(obj) - return obj - - # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 15fedbb20beec..828bccf7d5641 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -1,7 +1,5 @@ from pandas._libs.tslibs.util cimport * -from cython cimport Py_ssize_t - cimport numpy as cnp from numpy cimport ndarray @@ -51,49 +49,3 @@ cdef inline void set_array_not_contiguous(ndarray ao) nogil: PyArray_CLEARFLAGS(ao, (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) - -cdef inline Py_ssize_t validate_indexer(ndarray arr, object loc) except -1: - """ - Cast the given indexer `loc` to an integer. If it is negative, i.e. a - python-style indexing-from-the-end indexer, translate it to a - from-the-front indexer. Raise if this is not possible. - - Parameters - ---------- - arr : ndarray - loc : object - - Returns - ------- - idx : Py_ssize_t - - Raises - ------ - IndexError - """ - cdef: - Py_ssize_t idx, size - int casted - - if is_float_object(loc): - casted = int(loc) - if casted == loc: - loc = casted - - idx = loc - size = cnp.PyArray_SIZE(arr) - - if idx < 0 and size > 0: - idx += size - if idx >= size or size == 0 or idx < 0: - raise IndexError('index out of bounds') - - return idx - - -cdef inline object get_value_at(ndarray arr, object loc): - cdef: - Py_ssize_t i - - i = validate_indexer(arr, loc) - return arr[i] diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b476a019c66cc..85557f0525ba8 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -9,7 +9,7 @@ import numpy as np -from pandas._libs import index as libindex, lib +from pandas._libs import lib import pandas._libs.sparse as splib from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT @@ -794,7 +794,12 @@ def _get_val_at(self, loc): if sp_loc == -1: return self.fill_value else: - return libindex.get_value_at(self.sp_values, sp_loc) + val = self.sp_values[sp_loc] + if self.sp_values.dtype.kind in ["m", "M"]: + # TODO: this can be avoided if we ever have sp_values + # of DatetimeArray/TimedeltaArray + val = com.maybe_box_datetimelike(val) + return val def take(self, indices, allow_fill=False, fill_value=None): if is_scalar(indices): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c305b989e9a25..ee73009ec713c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2330,7 +2330,11 @@ def _try_mi(k): try: loc = self._engine.get_loc(k) - return series.iloc[loc] + if not is_scalar(loc): + # TODO: Not the right error to raise, but we can't let this through here. + raise KeyError + result = series.iloc[loc] + return result except KeyError as e1: try: return _try_mi(key) @@ -2338,7 +2342,9 @@ def _try_mi(k): pass try: - return libindex.get_value_at(s, k) + if is_integer(k): + return series._values[k] + raise TypeError except IndexError: raise except TypeError: diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index c15fa34283f21..7e75b5324445e 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -87,8 +87,8 @@ def test_series_getitem_returns_scalar( (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), - (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), - (lambda s: s[len(s)], IndexError, "index out of bounds"), + (lambda s: s.__getitem__(len(s)), IndexError, "is out of bounds"), + (lambda s: s[len(s)], IndexError, "is out of bounds"), ( lambda s: s.iloc[len(s)], IndexError, From 5dd63bfcde840ff0d557f203f50e7122abed1fd7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 4 Feb 2020 15:28:30 -0800 Subject: [PATCH 3/4] rebase on indexing-mi --- pandas/core/arrays/sparse/array.py | 5 +-- pandas/core/common.py | 6 ++- pandas/core/indexes/multi.py | 70 +++++++----------------------- 3 files changed, 21 insertions(+), 60 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 85557f0525ba8..8008805ddcf87 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -795,10 +795,7 @@ def _get_val_at(self, loc): return self.fill_value else: val = self.sp_values[sp_loc] - if self.sp_values.dtype.kind in ["m", "M"]: - # TODO: this can be avoided if we ever have sp_values - # of DatetimeArray/TimedeltaArray - val = com.maybe_box_datetimelike(val) + val = com.maybe_box_datetimelike(val, self.sp_values.dtype) return val def take(self, indices, allow_fill=False, fill_value=None): diff --git a/pandas/core/common.py b/pandas/core/common.py index 745a56ce2be7f..00c7a41477017 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -72,8 +72,12 @@ def consensus_name_attr(objs): return name -def maybe_box_datetimelike(value): +def maybe_box_datetimelike(value, dtype=None): # turn a datetime like into a Timestamp/timedelta as needed + if dtype == object: + # If we dont have datetime64/timedelta64 dtype, we dont want to + # box datetimelike scalars + return value if isinstance(value, (np.datetime64, datetime)): value = tslibs.Timestamp(value) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index bcc4c14bf0a73..79b4cbbea7815 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,4 +1,3 @@ -import datetime from sys import getsizeof from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union import warnings @@ -7,7 +6,7 @@ from pandas._config import get_option -from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs +from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 from pandas._typing import AnyArrayLike, ArrayLike, Scalar from pandas.compat.numpy import function as nv @@ -2321,13 +2320,20 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): def get_value(self, series, key): # Label-based - s = com.values_from_object(series) - k = com.values_from_object(key) + if not is_hashable(key) or is_iterator(key): + # We allow tuples if they are hashable, whereas other Index + # subclasses require scalar. + # We have to explicitly exclude generators, as these are hashable. + raise InvalidIndexError(key) def _try_mi(k): # TODO: what if a level contains tuples?? loc = self.get_loc(k) + new_values = series._values[loc] + if is_scalar(loc): + return new_values + new_index = self[loc] new_index = maybe_droplevels(new_index, k) return series._constructor( @@ -2335,58 +2341,12 @@ def _try_mi(k): ).__finalize__(self) try: - loc = self._engine.get_loc(k) - if not is_scalar(loc): - # TODO: Not the right error to raise, but we can't let this through here. - raise KeyError - result = series.iloc[loc] - return result - except KeyError as e1: - try: - return _try_mi(key) - except KeyError: - pass - - try: - if is_integer(k): - return series._values[k] - raise TypeError - except IndexError: + return _try_mi(key) + except KeyError: + if is_integer(key): + return series._values[key] + else: raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - - # a Timestamp will raise a TypeError in a multi-index - # rather than a KeyError, try it here - # note that a string that 'looks' like a Timestamp will raise - # a KeyError! (GH5725) - if isinstance(key, (datetime.datetime, np.datetime64, str)): - try: - return _try_mi(key) - except KeyError: - raise - except (IndexError, ValueError, TypeError): - pass - - try: - return _try_mi(Timestamp(key)) - except ( - KeyError, - TypeError, - IndexError, - ValueError, - tslibs.OutOfBoundsDatetime, - ): - pass - - raise InvalidIndexError(key) def _convert_listlike_indexer(self, keyarr, kind=None): """ From 06b192779d232e2698f993fc534ad4c4ab5e549a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 4 Feb 2020 19:19:00 -0800 Subject: [PATCH 4/4] remove unused imports --- pandas/_libs/index.pyx | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ec456b2fb893a..4185cc2084469 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,17 +1,12 @@ -from datetime import datetime, timedelta, date import warnings -import cython - import numpy as np cimport numpy as cnp from numpy cimport (ndarray, intp_t, float64_t, float32_t, int64_t, int32_t, int16_t, int8_t, - uint64_t, uint32_t, uint16_t, uint8_t, - # Note: NPY_DATETIME, NPY_TIMEDELTA are only available - # for cimport in cython>=0.27.3 - NPY_DATETIME, NPY_TIMEDELTA) + uint64_t, uint32_t, uint16_t, uint8_t +) cnp.import_array() @@ -23,7 +18,7 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash -from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib +from pandas._libs.tslibs import Timedelta, period as periodlib from pandas._libs.missing import checknull