From 972514c629ad3bd11734b9a3fb5325ba33db345b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 9 Jan 2020 10:18:42 -0800 Subject: [PATCH 1/8] simplify _to_dt64 --- pandas/_libs/tslibs/offsets.pyx | 8 ++++---- pandas/tseries/offsets.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f24dce28cd5f7..31dc2945f0395 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -216,7 +216,7 @@ def _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays] + holidays = [_to_dt64D(dt) for dt in holidays] holidays = tuple(sorted(holidays)) kwargs = {'weekmask': weekmask} @@ -227,7 +227,7 @@ def _get_calendar(weekmask, holidays, calendar): return busdaycalendar, holidays -def _to_dt64(dt, dtype='datetime64'): +def _to_dt64D(dt): # Currently # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') # numpy.datetime64('2013-05-01T02:00:00.000000+0200') @@ -238,8 +238,8 @@ def _to_dt64(dt, dtype='datetime64'): dt = np.int64(dt).astype('datetime64[ns]') else: dt = np.datetime64(dt) - if dt.dtype.name != dtype: - dt = dt.astype(dtype) + if dt.dtype.name != "datetime64[D]": + dt = dt.astype("datetime64[D]") return dt diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 8bb98a271bce8..001daf7886ee6 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -26,7 +26,7 @@ BaseOffset, _get_calendar, _is_normalized, - _to_dt64, + _to_dt64D, apply_index_wraps, as_datetime, roll_yearday, @@ -1090,7 +1090,7 @@ def apply_index(self, i): def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False - day64 = _to_dt64(dt, "datetime64[D]") + day64 = _to_dt64D(dt) return np.is_busday(day64, busdaycal=self.calendar) From 24bafca58b5a8028f2a2aa2623f87f945c200d6b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 10 Jan 2020 10:52:40 -0800 Subject: [PATCH 2/8] revert --- pandas/_libs/tslibs/offsets.pyx | 8 ++++---- pandas/tseries/offsets.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 31dc2945f0395..f24dce28cd5f7 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -216,7 +216,7 @@ def _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64D(dt) for dt in holidays] + holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays] holidays = tuple(sorted(holidays)) kwargs = {'weekmask': weekmask} @@ -227,7 +227,7 @@ def _get_calendar(weekmask, holidays, calendar): return busdaycalendar, holidays -def _to_dt64D(dt): +def _to_dt64(dt, dtype='datetime64'): # Currently # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') # numpy.datetime64('2013-05-01T02:00:00.000000+0200') @@ -238,8 +238,8 @@ def _to_dt64D(dt): dt = np.int64(dt).astype('datetime64[ns]') else: dt = np.datetime64(dt) - if dt.dtype.name != "datetime64[D]": - dt = dt.astype("datetime64[D]") + if dt.dtype.name != dtype: + dt = dt.astype(dtype) return dt diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 001daf7886ee6..8bb98a271bce8 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -26,7 +26,7 @@ BaseOffset, _get_calendar, _is_normalized, - _to_dt64D, + _to_dt64, apply_index_wraps, as_datetime, roll_yearday, @@ -1090,7 +1090,7 @@ def apply_index(self, i): def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False - day64 = _to_dt64D(dt) + day64 = _to_dt64(dt, "datetime64[D]") return np.is_busday(day64, busdaycal=self.calendar) From 67a76c9d1f932f1f388062d7ec2154b40bcd81b1 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 12 Jan 2020 08:16:29 -0800 Subject: [PATCH 3/8] tests passing --- pandas/_libs/index.pyx | 53 ++++++++++++++++------------ pandas/_libs/tslibs/conversion.pxd | 2 -- pandas/_libs/tslibs/conversion.pyx | 25 ------------- pandas/core/indexes/datetimes.py | 5 +++ pandas/tests/indexes/test_engines.py | 47 ++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 49 deletions(-) create mode 100644 pandas/tests/indexes/test_engines.py diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 28d269a9a809e..02d4a11767de0 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,13 +17,13 @@ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8 from pandas._libs.tslibs.nattype cimport c_NaT as NaT +from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash -from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib +from pandas._libs.tslibs import Period, Timestamp, Timedelta, period as periodlib from pandas._libs.missing import checknull cdef int64_t NPY_NAT = util.get_nat() @@ -409,20 +409,27 @@ cdef class DatetimeEngine(Int64Engine): cdef _get_box_dtype(self): return 'M8[ns]' + cdef int64_t _unbox_scalar(self, scalar) except? -1: + # NB: caller is responsible for ensuring tzawareness compat + # before we get here + if not (isinstance(scalar, _Timestamp) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + def __contains__(self, object val): cdef: - int64_t loc + int64_t loc, conv + conv = self._unbox_scalar(val) if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') return values[loc] == conv self._ensure_mapping_populated() - return maybe_datetimelike_to_i8(val) in self.mapping + return conv in self.mapping cdef _get_index_values(self): return self.vgetter().view('i8') @@ -431,22 +438,28 @@ cdef class DatetimeEngine(Int64Engine): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): + # NB: the caller is responsible for ensuring that we are called + # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine) + cdef: int64_t loc if is_definitely_invalid_key(val): raise TypeError + try: + conv = self._unbox_scalar(val) + except TypeError: + raise KeyError(val) + # Welcome to the spaghetti factory if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - + try: - conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') - except TypeError: + except TypeError: # TODO: is this possible? raise KeyError(val) if loc == len(values) or values[loc] != conv: @@ -455,21 +468,12 @@ cdef class DatetimeEngine(Int64Engine): self._ensure_mapping_populated() if not self.unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) try: - return self.mapping.get_item(val.value) + return self.mapping.get_item(conv) except KeyError: raise KeyError(val) - except AttributeError: - pass - - try: - val = maybe_datetimelike_to_i8(val) - return self.mapping.get_item(val) - except (TypeError, ValueError): - raise KeyError(val) def get_indexer(self, values): self._ensure_mapping_populated() @@ -496,6 +500,11 @@ cdef class TimedeltaEngine(DatetimeEngine): cdef _get_box_dtype(self): return 'm8[ns]' + cdef int64_t _unbox_scalar(self, scalar) except? -1: + if not (isinstance(scalar, Timedelta) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + cdef class PeriodEngine(Int64Engine): diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 36e6b14be182a..d4ae3fa8c5b99 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -25,6 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef int64_t pydt_to_i8(object pydt) except? -1 -cdef maybe_datetimelike_to_i8(object val) - cpdef datetime localize_pydatetime(datetime dt, object tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2988d7bae9a5e..f22b7bb6a3687 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -202,31 +202,6 @@ def datetime_to_datetime64(object[:] values): return result, inferred_tz -cdef inline maybe_datetimelike_to_i8(object val): - """ - Try to convert to a nanosecond timestamp. Fall back to returning the - input value. - - Parameters - ---------- - val : object - - Returns - ------- - val : int64 timestamp or original input - """ - cdef: - npy_datetimestruct dts - try: - return val.value - except AttributeError: - if is_datetime64_object(val): - return get_datetime64_value(val) - elif PyDateTime_Check(val): - return convert_datetime_to_tsobject(val, None).value - return val - - # ---------------------------------------------------------------------- # _TSObject Conversion diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 2241921e94694..ec72a3d643c78 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -649,6 +649,11 @@ def get_value(self, series, key): try: value = Index.get_value(self, series, key) + #except AttributeError: + # # not a Timestamp or NaT + # if is_integer(key): + # return series[key] + # raise KeyError(key) except KeyError: try: loc = self._get_string_slice(key) diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py new file mode 100644 index 0000000000000..0f504db4f79fc --- /dev/null +++ b/pandas/tests/indexes/test_engines.py @@ -0,0 +1,47 @@ +import pytest + +import pandas as pd + + +class TestDatetimeEngine: + @pytest.mark.parametrize("scalar", [ + pd.Timedelta(pd.Timestamp("2016-01-01").asm8.view("m8[ns]")), + pd.Timestamp("2016-01-01").value, + pd.Timestamp("2016-01-01").to_pydatetime(), + pd.Timestamp("2016-01-01").to_datetime64(), + ]) + def test_not_contains_requires_timestamp(self, scalar): + dti1 = pd.date_range("2016-01-01", periods=3) + dti2 = dti1.insert(1, pd.NaT) # non-monotonic + dti3 = dti1.insert(3, dti1[0]) # non-unique + dti4 = pd.date_range("2016-01-01", freq="ns", periods=2_000_000) + dti5 = dti4.insert(0, dti4[0]) # over size threshold, not unique + + for dti in [dti1, dti2, dti3, dti4, dti5]: + with pytest.raises(TypeError): + scalar in dti._engine + + with pytest.raises(KeyError): + dti._engine.get_loc(scalar) + + +class TestTimedeltaEngine: + @pytest.mark.parametrize("scalar", [ + pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), + pd.Timedelta(days=42).value, + pd.Timedelta(days=42).to_pytimedelta(), + pd.Timedelta(days=42).to_timedelta64() + ]) + def test_not_contains_requires_timestamp(self, scalar): + tdi1 = pd.timedelta_range("42 days", freq="9h", periods=1234) + tdi2 = tdi1.insert(1, pd.NaT) # non-monotonic + tdi3 = tdi1.insert(3, tdi1[0]) # non-unique + tdi4 = pd.timedelta_range("42 days", freq="ns", periods=2_000_000) + tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique + + for tdi in [tdi1, tdi2, tdi3, tdi4, tdi5]: + with pytest.raises(TypeError): + scalar in tdi._engine + + with pytest.raises(KeyError): + tdi._engine.get_loc(scalar) From 1a5b8413dcc3c234d311cc898cbf0343c2ab3c6f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 12 Jan 2020 08:56:36 -0800 Subject: [PATCH 4/8] no need for try/except --- pandas/_libs/index.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 02d4a11767de0..de293dad3355d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -456,11 +456,8 @@ cdef class DatetimeEngine(Int64Engine): if not self.is_unique: return self._get_loc_duplicates(conv) values = self._get_index_values() - - try: - loc = values.searchsorted(conv, side='left') - except TypeError: # TODO: is this possible? - raise KeyError(val) + + loc = values.searchsorted(conv, side='left') if loc == len(values) or values[loc] != conv: raise KeyError(val) From 890204ca3f4932c480d27399b798e317b0ce424e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 12 Jan 2020 09:06:27 -0800 Subject: [PATCH 5/8] remove commented-out --- pandas/core/indexes/datetimes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ec72a3d643c78..2241921e94694 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -649,11 +649,6 @@ def get_value(self, series, key): try: value = Index.get_value(self, series, key) - #except AttributeError: - # # not a Timestamp or NaT - # if is_integer(key): - # return series[key] - # raise KeyError(key) except KeyError: try: loc = self._get_string_slice(key) From cff78c52b5211f3e56922038d0aaf1bd95f3f423 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 12 Jan 2020 13:35:33 -0800 Subject: [PATCH 6/8] remove unused import --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index de293dad3355d..74815f64360b9 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -23,7 +23,7 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash -from pandas._libs.tslibs import Period, Timestamp, Timedelta, period as periodlib +from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib from pandas._libs.missing import checknull cdef int64_t NPY_NAT = util.get_nat() From d3d78b7d8209dcc2e49545d8616d588922130f47 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 12 Jan 2020 14:11:12 -0800 Subject: [PATCH 7/8] blackify --- pandas/tests/indexes/test_engines.py | 32 +++++++++++++++++----------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py index 0f504db4f79fc..08177e7ba898b 100644 --- a/pandas/tests/indexes/test_engines.py +++ b/pandas/tests/indexes/test_engines.py @@ -4,12 +4,15 @@ class TestDatetimeEngine: - @pytest.mark.parametrize("scalar", [ - pd.Timedelta(pd.Timestamp("2016-01-01").asm8.view("m8[ns]")), - pd.Timestamp("2016-01-01").value, - pd.Timestamp("2016-01-01").to_pydatetime(), - pd.Timestamp("2016-01-01").to_datetime64(), - ]) + @pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(pd.Timestamp("2016-01-01").asm8.view("m8[ns]")), + pd.Timestamp("2016-01-01").value, + pd.Timestamp("2016-01-01").to_pydatetime(), + pd.Timestamp("2016-01-01").to_datetime64(), + ], + ) def test_not_contains_requires_timestamp(self, scalar): dti1 = pd.date_range("2016-01-01", periods=3) dti2 = dti1.insert(1, pd.NaT) # non-monotonic @@ -26,18 +29,21 @@ def test_not_contains_requires_timestamp(self, scalar): class TestTimedeltaEngine: - @pytest.mark.parametrize("scalar", [ - pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), - pd.Timedelta(days=42).value, - pd.Timedelta(days=42).to_pytimedelta(), - pd.Timedelta(days=42).to_timedelta64() - ]) + @pytest.mark.parametrize( + "scalar", + [ + pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), + pd.Timedelta(days=42).value, + pd.Timedelta(days=42).to_pytimedelta(), + pd.Timedelta(days=42).to_timedelta64(), + ], + ) def test_not_contains_requires_timestamp(self, scalar): tdi1 = pd.timedelta_range("42 days", freq="9h", periods=1234) tdi2 = tdi1.insert(1, pd.NaT) # non-monotonic tdi3 = tdi1.insert(3, tdi1[0]) # non-unique tdi4 = pd.timedelta_range("42 days", freq="ns", periods=2_000_000) - tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique + tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique for tdi in [tdi1, tdi2, tdi3, tdi4, tdi5]: with pytest.raises(TypeError): From 365c6b1a2b94f0a73a266a90f600344d1a1375b9 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 13 Jan 2020 07:50:32 -0800 Subject: [PATCH 8/8] add match to pytest.raises --- pandas/tests/indexes/test_engines.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py index 08177e7ba898b..ee224c9c6ec89 100644 --- a/pandas/tests/indexes/test_engines.py +++ b/pandas/tests/indexes/test_engines.py @@ -1,3 +1,5 @@ +import re + import pytest import pandas as pd @@ -20,11 +22,12 @@ def test_not_contains_requires_timestamp(self, scalar): dti4 = pd.date_range("2016-01-01", freq="ns", periods=2_000_000) dti5 = dti4.insert(0, dti4[0]) # over size threshold, not unique + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) for dti in [dti1, dti2, dti3, dti4, dti5]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): scalar in dti._engine - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): dti._engine.get_loc(scalar) @@ -45,9 +48,10 @@ def test_not_contains_requires_timestamp(self, scalar): tdi4 = pd.timedelta_range("42 days", freq="ns", periods=2_000_000) tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) for tdi in [tdi1, tdi2, tdi3, tdi4, tdi5]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): scalar in tdi._engine - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=msg): tdi._engine.get_loc(scalar)