From 10a923fd433d46d94915e27e315d431a75124fd8 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 17:17:32 -0700 Subject: [PATCH 1/6] make simple_new stricter, avoid use of shallow_copy --- pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/arrays/datetimes.py | 25 +++++++++++++++---------- pandas/core/arrays/timedeltas.py | 22 ++++++++++------------ pandas/core/indexes/datetimelike.py | 24 +++++++++++++++++------- pandas/core/indexes/datetimes.py | 2 ++ pandas/core/indexes/timedeltas.py | 28 ++++++++++++++++++++++++++-- 6 files changed, 73 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0247ce8dc6ac4..b2c8ff37d756d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -422,7 +422,9 @@ def _add_nat(self): # and datetime dtypes result = np.zeros(len(self), dtype=np.int64) result.fill(iNaT) - return self._shallow_copy(result, freq=None) + if is_timedelta64_dtype(self): + return type(self)(result, freq=None) + return type(self)(result, tz=self.tz, freq=None) def _sub_nat(self): """Subtract pd.NaT from self""" diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b656690b30e34..99c996e35a265 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -177,16 +177,11 @@ def _simple_new(cls, values, freq=None, tz=None, **kwargs): we require the we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor """ + assert isinstance(values, np.ndarray), type(values) + if values.dtype == 'i8': + values = values.view('M8[ns]') - if getattr(values, 'dtype', None) is None: - # empty, but with dtype compat - if values is None: - values = np.empty(0, dtype=_NS_DTYPE) - return cls(values, freq=freq, tz=tz, **kwargs) - values = np.array(values, copy=False) - - if not is_datetime64_dtype(values): - values = ensure_int64(values).view(_NS_DTYPE) + assert 
values.dtype == 'M8[ns]', values.dtype result = object.__new__(cls) result._data = values @@ -209,6 +204,15 @@ def __new__(cls, values, freq=None, tz=None, dtype=None): # if dtype has an embedded tz, capture it tz = dtl.validate_tz_from_dtype(dtype, tz) + if isinstance(values, cls): + values = values.asi8 + if values.dtype == 'i8': + values = values.view('M8[ns]') + + assert isinstance(values, np.ndarray), type(values) + assert is_datetime64_dtype(values) + values = conversion.ensure_datetime64ns(values, copy=False) + result = cls._simple_new(values, freq=freq, tz=tz) if freq_infer: inferred = result.inferred_freq @@ -843,7 +847,8 @@ def to_perioddelta(self, freq): # TODO: consider privatizing (discussion in GH#23113) from pandas.core.arrays.timedeltas import TimedeltaArrayMixin i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 - return TimedeltaArrayMixin(i8delta) + m8delta = i8delta.view('m8[ns]') + return TimedeltaArrayMixin(m8delta) # ----------------------------------------------------------------- # Properties - Vectorized Timestamp Properties/Methods diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 397297c1b88d0..a1e99b9640e75 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -111,16 +111,9 @@ def dtype(self): _attributes = ["freq"] @classmethod - def _simple_new(cls, values, freq=None, **kwargs): - values = np.array(values, copy=False) - if values.dtype == np.object_: - values = array_to_timedelta64(values) - if values.dtype != _TD_DTYPE: - if is_timedelta64_dtype(values): - # non-nano unit - values = values.astype(_TD_DTYPE) - else: - values = ensure_int64(values).view(_TD_DTYPE) + def _simple_new(cls, values, freq=None): + assert isinstance(values, np.ndarray), type(values) + assert values.dtype == 'm8[ns]', values.dtype result = object.__new__(cls) result._data = values @@ -131,6 +124,10 @@ def __new__(cls, values, freq=None): freq, freq_infer = 
dtl.maybe_infer_freq(freq) + values = np.array(values, copy=False) + if values.dtype == np.object_: + values = array_to_timedelta64(values) + result = cls._simple_new(values, freq=freq) if freq_infer: inferred = result.inferred_freq @@ -166,10 +163,11 @@ def _generate_range(cls, start, end, periods, freq, closed=None): if freq is not None: index = _generate_regular_range(start, end, periods, freq) - index = cls._simple_new(index, freq=freq) else: index = np.linspace(start.value, end.value, periods).astype('i8') - index = cls._simple_new(index, freq=freq) + + index = index.view('m8[ns]') + index = cls._simple_new(index, freq=freq) if not left_closed: index = index[1:] diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 14325f42ff0d8..7d90dee17be1c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -189,13 +189,12 @@ def _round(self, freq, mode, ambiguous): result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() - if 'freq' in attribs: - attribs['freq'] = None + attribs['freq'] = None if 'tz' in attribs: attribs['tz'] = None - return self._ensure_localized( - self._shallow_copy(result, **attribs), ambiguous - ) + + result = self._shallow_copy(result, **attribs) + return self._ensure_localized(result, ambiguous) @Appender((_round_doc + _round_example).format(op="round")) def round(self, freq, ambiguous='raise'): @@ -222,6 +221,18 @@ class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): _resolution = cache_readonly(DatetimeLikeArrayMixin._resolution.fget) resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) + def _shallow_copy(self, values=None, **kwargs): + if isinstance(values, list): + # reached via Index.insert + assert len(values) == 0 + values = np.array([], dtype='i8') + + # unwrap for case where e.g. 
_get_unique_index passes an instance + # of own class instead of ndarray + values = getattr(values, '_data', values) + + return DatetimeLikeArrayMixin._shallow_copy(self, values, **kwargs) + def equals(self, other): """ Determines if two Index objects contain the same elements. @@ -640,8 +651,7 @@ def where(self, cond, other=None): result = np.where(cond, values, other).astype('i8') result = self._ensure_localized(result, from_utc=True) - return self._shallow_copy(result, - **self._get_attributes_dict()) + return self._shallow_copy(result) def _summary(self, name=None): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 39f247a7c4cfe..3e3ef50690399 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -298,6 +298,8 @@ def __new__(cls, data=None, data = data.astype(np.int64, copy=False) subarr = data.view(_NS_DTYPE) + assert isinstance(subarr, np.ndarray), type(subarr) + assert subarr.dtype == 'M8[ns]', dubarr.dtype subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) if dtype is not None: if not is_dtype_equal(subarr.dtype, dtype): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index e5da21478d0a4..d1a0fff682f7b 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -35,6 +35,7 @@ to_timedelta, _coerce_scalar_to_timedelta_type) from pandas._libs import (lib, index as libindex, join as libjoin, Timedelta, NaT) +from pandas._libs.tslibs.timedeltas import array_to_timedelta64 class TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, @@ -166,6 +167,19 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, elif copy: data = np.array(data, copy=True) + data = np.array(data, copy=False) + if data.dtype == np.object_: + data = array_to_timedelta64(data) + + if data.dtype != _TD_DTYPE: + if is_timedelta64_dtype(data): + # non-nano unit + # TODO: watch out for overflows + data = 
data.astype(_TD_DTYPE) + else: + data = ensure_int64(data).view(_TD_DTYPE) + + assert data.dtype == 'm8[ns]', data.dtype subarr = cls._simple_new(data, name=name, freq=freq) # check that we are matching freqs if verify_integrity and len(subarr) > 0: @@ -180,8 +194,18 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, return subarr @classmethod - def _simple_new(cls, values, name=None, freq=None, **kwargs): - result = super(TimedeltaIndex, cls)._simple_new(values, freq, **kwargs) + def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): + + # `dtype` is not always passed, but if it is, it should always + # be m8[ns] + assert dtype == _TD_DTYPE + + assert isinstance(values, np.ndarray), type(values) + if values.dtype == 'i8': + values = values.view('m8[ns]') + assert values.dtype == 'm8[ns]', values.dtype + + result = super(TimedeltaIndex, cls)._simple_new(values, freq) result.name = name result._reset_identity() return result From 9060f1ad6ab9df981e713cedad4977639c29b6c2 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 17:31:14 -0700 Subject: [PATCH 2/6] Avoid use of shallow_copy --- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/indexes/datetimes.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 99c996e35a265..e8596ddb348aa 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -590,7 +590,7 @@ def tz_convert(self, tz): 'tz_localize to localize') # No conversion since timestamps are all UTC to begin with - return self._shallow_copy(tz=tz) + return self._simple_new(self.asi8, tz=tz, freq=self.freq) def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', errors=None): @@ -712,7 +712,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent, ) new_dates = new_dates.view(_NS_DTYPE) - return 
self._shallow_copy(new_dates, tz=tz) + return self._simple_new(new_dates, tz=tz, freq=self.freq) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timestamp methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3e3ef50690399..ceb7590c9b884 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1136,6 +1136,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): is_year_end = wrap_field_accessor(DatetimeArrayMixin.is_year_end) is_leap_year = wrap_field_accessor(DatetimeArrayMixin.is_leap_year) + tz_localize = wrap_array_method(DatetimeArrayMixin.tz_localize, True) + tz_convert = wrap_array_method(DatetimeArrayMixin.tz_convert, True) to_perioddelta = wrap_array_method(DatetimeArrayMixin.to_perioddelta, False) to_period = wrap_array_method(DatetimeArrayMixin.to_period, True) From d5b0bfd9fc05b8651e3d6d2881883ae3ff0058e9 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 17:51:05 -0700 Subject: [PATCH 3/6] make dtype explicit --- pandas/core/arrays/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e8596ddb348aa..aeb6c8eef57f9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1360,7 +1360,7 @@ def _generate_regular_range(cls, start, end, periods, freq): xdr = generate_range(start=start, end=end, periods=periods, offset=freq) - values = np.array([x.value for x in xdr]) + values = np.array([x.value for x in xdr], dtype=np.int64) data = cls._simple_new(values, freq=freq, tz=tz) return data From 233367a61111903ff689ba42febb646773195393 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 18:03:08 -0700 Subject: [PATCH 4/6] cosmetics --- pandas/core/arrays/datetimes.py | 29 +++++++++++------------------ pandas/core/arrays/timedeltas.py | 6 ++---- 2 files 
changed, 13 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index aeb6c8eef57f9..252185987f7b1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -257,28 +257,22 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if tz is not None: # Localize the start and end arguments - start = _maybe_localize_point( - start, getattr(start, 'tz', None), start, freq, tz - ) - end = _maybe_localize_point( - end, getattr(end, 'tz', None), end, freq, tz - ) + start = _maybe_localize_point(start, getattr(start, 'tz', None), + start, freq, tz) + end = _maybe_localize_point(end, getattr(end, 'tz', None), + end, freq, tz) if start and end: # Make sure start and end have the same tz - start = _maybe_localize_point( - start, start.tz, end.tz, freq, tz - ) - end = _maybe_localize_point( - end, end.tz, start.tz, freq, tz - ) + start = _maybe_localize_point(start, start.tz, end.tz, freq, tz) + end = _maybe_localize_point(end, end.tz, start.tz, freq, tz) + if freq is not None: # TODO: consider re-implementing _cached_range; GH#17914 index = _generate_regular_range(cls, start, end, periods, freq) if tz is not None and getattr(index, 'tz', None) is None: - arr = conversion.tz_localize_to_utc( - ensure_int64(index.values), - tz, ambiguous=ambiguous) + arr = conversion.tz_localize_to_utc(ensure_int64(index.values), + tz, ambiguous=ambiguous) index = cls(arr) @@ -291,9 +285,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, else: # Create a linearly spaced date_range in local time arr = np.linspace(start.value, end.value, periods) - index = cls._simple_new( - arr.astype('M8[ns]', copy=False), freq=None, tz=tz - ) + arr = arr.astype('M8[ns]', copy=False) + index = cls._simple_new(arr, freq=None, tz=tz) if not left_closed and len(index) and index[0] == start: index = index[1:] diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 
a1e99b9640e75..d5ae5b8028296 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -166,15 +166,13 @@ def _generate_range(cls, start, end, periods, freq, closed=None): else: index = np.linspace(start.value, end.value, periods).astype('i8') - index = index.view('m8[ns]') - index = cls._simple_new(index, freq=freq) - if not left_closed: index = index[1:] if not right_closed: index = index[:-1] - return index + index = index.view('m8[ns]') + return cls._simple_new(index, freq=freq) # ---------------------------------------------------------------- # Arithmetic Methods From f37ace369fda91767fa132d6f400bdfb06a4015a Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 19:03:54 -0700 Subject: [PATCH 5/6] docstring and simplification for generate_regular_range --- pandas/core/arrays/datetimes.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 252185987f7b1..5674f9dd1b503 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -204,7 +204,7 @@ def __new__(cls, values, freq=None, tz=None, dtype=None): # if dtype has an embedded tz, capture it tz = dtl.validate_tz_from_dtype(dtype, tz) - if isinstance(values, cls): + if isinstance(values, DatetimeArrayMixin): values = values.asi8 if values.dtype == 'i8': values = values.view('M8[ns]') @@ -270,7 +270,7 @@ def _generate_range(cls, start, end, periods, freq, tz=None, # TODO: consider re-implementing _cached_range; GH#17914 index = _generate_regular_range(cls, start, end, periods, freq) - if tz is not None and getattr(index, 'tz', None) is None: + if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc(ensure_int64(index.values), tz, ambiguous=ambiguous) @@ -1318,6 +1318,20 @@ def to_julian_date(self): def _generate_regular_range(cls, start, end, periods, freq): + """ + + Parameters + ---------- + cls : class + start : 
Timestamp or None + end : Timestamp or None + periods : int + freq : DateOffset + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + """ if isinstance(freq, Tick): stride = freq.nanos if periods is None: @@ -1341,21 +1355,20 @@ def _generate_regular_range(cls, start, end, periods, freq): "if a 'period' is given.") data = np.arange(b, e, stride, dtype=np.int64) - data = cls._simple_new(data.view(_NS_DTYPE), None, tz=tz) else: tz = None # start and end should have the same timezone by this point - if isinstance(start, Timestamp): + if start is not None: tz = start.tz - elif isinstance(end, Timestamp): + elif end is not None: tz = end.tz xdr = generate_range(start=start, end=end, periods=periods, offset=freq) - values = np.array([x.value for x in xdr], dtype=np.int64) - data = cls._simple_new(values, freq=freq, tz=tz) + data = np.array([x.value for x in xdr], dtype=np.int64) + data = cls._simple_new(data.view(_NS_DTYPE), freq=freq, tz=tz) return data From 777ddffac81134e7a8e3c8399f09fe18b2b70097 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 30 Oct 2018 20:52:06 -0700 Subject: [PATCH 6/6] flake8 fixups --- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index d5ae5b8028296..686e6517edd5d 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -11,7 +11,7 @@ from pandas import compat from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_int64, is_timedelta64_dtype, is_list_like) + _TD_DTYPE, is_list_like) from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ceb7590c9b884..f87d1c0925e04 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -299,7 +299,7 @@ def __new__(cls, 
data=None, subarr = data.view(_NS_DTYPE) assert isinstance(subarr, np.ndarray), type(subarr) - assert subarr.dtype == 'M8[ns]', dubarr.dtype + assert subarr.dtype == 'M8[ns]', subarr.dtype subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) if dtype is not None: if not is_dtype_equal(subarr.dtype, dtype):