From 13b6fe80179d9f51b6d692d75f1c2fd942d70b57 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 13:30:18 -0800 Subject: [PATCH 1/5] REF: require PeriodArray in PeriodIndex._simple_new --- pandas/core/indexes/datetimelike.py | 28 +++++++++---------- pandas/core/indexes/period.py | 20 ++++--------- pandas/tests/arrays/test_datetimelike.py | 15 +++++----- .../tests/indexes/period/test_constructors.py | 20 ++++++++++--- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index bf1272b223f70..d262fcdc92ebf 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -193,20 +193,21 @@ def sort_values(self, return_indexer=False, ascending=True): # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) - attribs = self._get_attributes_dict() - freq = attribs["freq"] + freq = self.freq if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: freq = freq * -1 - attribs["freq"] = freq if not ascending: sorted_values = sorted_values[::-1] - return self._simple_new(sorted_values, **attribs) + arr = type(self._data)._simple_new( + sorted_values, dtype=self.dtype, freq=freq + ) + return self._simple_new(arr, name=self.name) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): @@ -503,22 +504,21 @@ def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class. 
""" - attribs = self._get_attributes_dict() - attribs["name"] = name + # do not pass tz to set because tzlocal cannot be hashed if len({str(x.dtype) for x in to_concat}) != 1: raise ValueError("to_concat must have the same tz") - new_data = type(self._values)._concat_same_type(to_concat).asi8 + new_data = type(self._data)._concat_same_type(to_concat) - # GH 3232: If the concat result is evenly spaced, we can retain the - # original frequency - is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 - if not is_period_dtype(self) and not is_diff_evenly_spaced: - # reset freq - attribs["freq"] = None + if not is_period_dtype(self.dtype): + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + is_diff_evenly_spaced = len(unique_deltas(new_data.asi8)) == 1 + if is_diff_evenly_spaced: + new_data._freq = self.freq - return self._simple_new(new_data, **attribs) + return self._simple_new(new_data, name=name) def shift(self, periods=1, freq=None): """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 35f96e61704f0..20e390f2dc7d9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -15,7 +15,6 @@ is_datetime64_any_dtype, is_dtype_equal, is_float, - is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -234,21 +233,12 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): Parameters ---------- - values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64] + values : PeriodArray Values that can be converted to a PeriodArray without inference or coercion. - """ - # TODO: raising on floats is tested, but maybe not useful. - # Should the callers know not to pass floats? - # At the very least, I think we can ensure that lists aren't passed. 
- if isinstance(values, list): - values = np.asarray(values) - if is_float_dtype(values): - raise TypeError("PeriodIndex._simple_new does not accept floats.") - if freq: - freq = Period._maybe_convert_freq(freq) - values = PeriodArray(values, freq=freq) + assert isinstance(values, PeriodArray), type(values) + assert freq is None or freq == values.freq, (freq, values.freq) result = object.__new__(cls) result._data = values @@ -834,7 +824,9 @@ def _union(self, other, sort): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) + if not isinstance(rawarr, PeriodArray): + rawarr = PeriodArray(rawarr, freq=self.freq) + rawarr = PeriodIndex._simple_new(rawarr, name=self.name) return rawarr def memory_usage(self, deep=False): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index fa45db93c6102..87b825c8c27bd 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -65,8 +65,8 @@ def test_compare_len1_raises(self): # to the case where one has length-1, which numpy would broadcast data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + idx = self.array_cls._simple_new(data, freq="D") + arr = self.index_cls(idx) with pytest.raises(ValueError, match="Lengths must match"): arr == arr[:1] @@ -79,8 +79,8 @@ def test_take(self): data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] result = arr.take(takers) @@ -97,8 +97,7 @@ def test_take(self): def test_take_fill(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + 
 arr = self.array_cls._simple_new(data, freq="D") result = arr.take([-1, 1], allow_fill=True, fill_value=None) assert result[0] is pd.NaT @@ -121,7 +120,9 @@ def test_take_fill(self): def test_concat_same_type(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls(arr) + idx = idx.insert(0, pd.NaT) arr = self.array_cls(idx) result = arr._concat_same_type([arr[:-1], arr[1:], arr]) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 27ee915e48e5c..b407f45f19dd1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -322,22 +322,34 @@ def test_constructor_mixed(self): def test_constructor_simple_new(self): idx = period_range("2007-01", name="p", periods=2, freq="M") - result = idx._simple_new(idx, name="p", freq=idx.freq) + + with pytest.raises(AssertionError, match="<class .*PeriodIndex'>"): + idx._simple_new(idx, name="p", freq=idx.freq) + + result = idx._simple_new(idx._data, name="p", freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) + with pytest.raises(AssertionError): + # Need ndarray, not Int64Index + type(idx._data)._simple_new(idx.astype("i8"), freq=idx.freq) + + arr = type(idx._data)._simple_new(idx.asi8, freq=idx.freq) + result = idx._simple_new(arr, name="p") tm.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): # GH13079 idx = PeriodIndex([], freq="M", name="p") - result = idx._simple_new(idx, name="p", freq="M") + with pytest.raises(AssertionError, match="<class .*PeriodIndex'>"): + idx._simple_new(idx, name="p", freq="M") + + result = idx._simple_new(idx._data, name="p", freq="M") tm.assert_index_equal(result, idx) @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, 
floats): msg = r"PeriodIndex\._simple_new does not accept floats" - with pytest.raises(TypeError, match=msg): + with pytest.raises(AssertionError, match=" Date: Thu, 23 Jan 2020 16:07:14 -0800 Subject: [PATCH 2/5] tighter control over TimedeltaArray._simple_new args --- pandas/core/arrays/timedeltas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index d77a37ad355a7..a7b16fd86468e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -195,9 +195,12 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): assert dtype == _TD_DTYPE, dtype assert isinstance(values, np.ndarray), type(values) + if values.dtype != _TD_DTYPE: + assert values.dtype == "i8" + values = values.view(_TD_DTYPE) result = object.__new__(cls) - result._data = values.view(_TD_DTYPE) + result._data = values result._freq = to_offset(freq) result._dtype = _TD_DTYPE return result From c1a56a89a5b7b5411e8d01106359114d1ee23914 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 24 Jan 2020 17:33:08 -0800 Subject: [PATCH 3/5] be stricter in shallow_copy --- pandas/core/indexes/datetimelike.py | 18 +++++++++++++++--- pandas/core/indexes/timedeltas.py | 15 ++++----------- pandas/core/resample.py | 8 ++++++-- pandas/tests/resample/test_base.py | 18 +++++++++++++----- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b87dd0f02252f..01cb4dfbb7b47 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -570,7 +570,8 @@ def delete(self, loc): if loc.start in (0, None) or loc.stop in (len(self), None): freq = self.freq - return self._shallow_copy(new_i8s, freq=freq) + arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) + return type(self)._simple_new(arr, 
name=self.name) class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): @@ -611,6 +612,13 @@ def _shallow_copy(self, values=None, **kwargs): if values is None: values = self._data + if isinstance(values, type(self)): + values = values._data + if isinstance(values, np.ndarray): + # TODO: try to avoid this case + freq = getattr(kwargs, "freq", None) + values = type(self._data)(values, dtype=self.dtype, freq=freq) + attributes = self._get_attributes_dict() if "freq" not in kwargs and self.freq is not None: @@ -789,7 +797,10 @@ def _union(self, other, sort): this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): - return this._fast_union(other, sort=sort) + result = this._fast_union(other, sort=sort) + if result.freq is None: + result._set_freq("infer") + return result else: result = Index._union(this, other, sort=sort) if isinstance(result, type(self)): @@ -923,7 +934,8 @@ def insert(self, loc, item): new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_i8s, freq=freq) + arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) + return type(self)._simple_new(arr, name=self.name) except (AttributeError, TypeError): # fall back to object index diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index d0a31b68250ad..66974349c0bb5 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -185,22 +185,15 @@ def __new__( def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if present - - if not isinstance(values, TimedeltaArray): - values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) - else: - if freq is None: - freq = values.freq - assert isinstance(values, TimedeltaArray), type(values) assert dtype == _TD_DTYPE, dtype - assert values.dtype == "m8[ns]", values.dtype + assert 
isinstance(values, TimedeltaArray) + assert freq is None or values.freq == freq - tdarr = TimedeltaArray._simple_new(values._data, freq=freq) result = object.__new__(cls) - result._data = tdarr + result._data = values result._name = name # For groupby perf. See note in indexes/base about _index_data - result._index_data = tdarr._data + result._index_data = values._data result._reset_identity() return result diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fb837409a00f5..508c45398b1bc 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -427,7 +427,9 @@ def _wrap_result(self, result): if isinstance(obj.index, PeriodIndex): result.index = obj.index.asfreq(self.freq) else: - result.index = obj.index._shallow_copy(freq=self.freq) + idx = obj.index + index = type(idx)([], dtype=idx.dtype, freq=self.freq, name=idx.name) + result.index = index result.name = getattr(obj, "name", None) return result @@ -1787,8 +1789,10 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): elif len(obj.index) == 0: new_obj = obj.copy() - new_obj.index = obj.index._shallow_copy(freq=to_offset(freq)) + idx = obj.index + new_index = type(idx)([], dtype=idx.dtype, name=idx.name, freq=freq) + new_obj.index = new_index else: dti = date_range(obj.index[0], obj.index[-1], freq=freq) dti.name = obj.index.name diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index f8a1810e66219..805fb6defb580 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -106,7 +106,9 @@ def test_resample_empty_series(freq, empty_series, resample_method): if isinstance(s.index, PeriodIndex): expected.index = s.index.asfreq(freq=freq) else: - expected.index = s.index._shallow_copy(freq=freq) + idx = s.index + index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) + expected.index = index tm.assert_index_equal(result.index, expected.index) assert result.index.freq == 
expected.index.freq tm.assert_series_equal(result, expected, check_dtype=False) @@ -122,7 +124,9 @@ def test_resample_count_empty_series(freq, empty_series, resample_method): if isinstance(empty_series.index, PeriodIndex): index = empty_series.index.asfreq(freq=freq) else: - index = empty_series.index._shallow_copy(freq=freq) + idx = empty_series.index + index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) + expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) tm.assert_series_equal(result, expected) @@ -144,7 +148,9 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): if isinstance(df.index, PeriodIndex): expected.index = df.index.asfreq(freq=freq) else: - expected.index = df.index._shallow_copy(freq=freq) + idx = df.index + index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) + expected.index = index tm.assert_index_equal(result.index, expected.index) assert result.index.freq == expected.index.freq tm.assert_almost_equal(result, expected, check_dtype=False) @@ -165,7 +171,8 @@ def test_resample_count_empty_dataframe(freq, empty_frame): if isinstance(empty_frame.index, PeriodIndex): index = empty_frame.index.asfreq(freq=freq) else: - index = empty_frame.index._shallow_copy(freq=freq) + idx = empty_frame.index + index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) expected = pd.DataFrame({"a": []}, dtype="int64", index=index) tm.assert_frame_equal(result, expected) @@ -184,7 +191,8 @@ def test_resample_size_empty_dataframe(freq, empty_frame): if isinstance(empty_frame.index, PeriodIndex): index = empty_frame.index.asfreq(freq=freq) else: - index = empty_frame.index._shallow_copy(freq=freq) + idx = empty_frame.index + index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) expected = pd.Series([], dtype="int64", index=index) tm.assert_series_equal(result, expected) From 9c90e2d5544891f2a81499b9cf66649ae749ab06 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 25 Jan 2020 
14:34:35 -0800 Subject: [PATCH 4/5] check none-freq --- pandas/core/indexes/datetimelike.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 01cb4dfbb7b47..7fa92596dc23b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -615,9 +615,10 @@ def _shallow_copy(self, values=None, **kwargs): if isinstance(values, type(self)): values = values._data if isinstance(values, np.ndarray): - # TODO: try to avoid this case - freq = getattr(kwargs, "freq", None) - values = type(self._data)(values, dtype=self.dtype, freq=freq) + # TODO: We would rather not get here + if kwargs.get("freq") is not None: + raise ValueError(kwargs) + values = type(self._data)(values, dtype=self.dtype) attributes = self._get_attributes_dict() From b30aafa1d984fe7211069223ac5fd91e1c5ec601 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 26 Jan 2020 10:29:16 -0800 Subject: [PATCH 5/5] implement _asfreq_compat --- pandas/core/resample.py | 37 ++++++++++++++++++++++-------- pandas/tests/resample/test_base.py | 37 ++++++++---------------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 508c45398b1bc..94ff1f0056663 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -23,6 +23,7 @@ from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper +from pandas.core.indexes.api import Index from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range @@ -424,12 +425,7 @@ def _wrap_result(self, result): if isinstance(result, ABCSeries) and result.empty: obj = self.obj - if isinstance(obj.index, PeriodIndex): - 
result.index = obj.index.asfreq(self.freq) - else: - idx = obj.index - index = type(idx)([], dtype=idx.dtype, freq=self.freq, name=idx.name) - result.index = index + result.index = _asfreq_compat(obj.index, freq=self.freq) result.name = getattr(obj, "name", None) return result @@ -1790,9 +1786,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): elif len(obj.index) == 0: new_obj = obj.copy() - idx = obj.index - new_index = type(idx)([], dtype=idx.dtype, name=idx.name, freq=freq) - new_obj.index = new_index + new_obj.index = _asfreq_compat(obj.index, freq) else: dti = date_range(obj.index[0], obj.index[-1], freq=freq) dti.name = obj.index.name @@ -1801,3 +1795,28 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): new_obj.index = new_obj.index.normalize() return new_obj + + +def _asfreq_compat(index, freq): + """ + Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex. + + Parameters + ---------- + index : PeriodIndex, DatetimeIndex, or TimedeltaIndex + freq : DateOffset + + Returns + ------- + same type as index + """ + if len(index) != 0: + # This should never be reached, always checked by the caller + raise ValueError( + "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex" + ) + if isinstance(index, PeriodIndex): + new_index = index.asfreq(freq=freq) + else: + new_index = Index([], dtype=index.dtype, freq=freq, name=index.name) + return new_index diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 805fb6defb580..c84a5bf653b0a 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -11,6 +11,7 @@ from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.resample import _asfreq_compat # a fixture value can be overridden by the test parameter value. 
Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -103,12 +104,8 @@ def test_resample_empty_series(freq, empty_series, resample_method): result = getattr(s.resample(freq), resample_method)() expected = s.copy() - if isinstance(s.index, PeriodIndex): - expected.index = s.index.asfreq(freq=freq) - else: - idx = s.index - index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) - expected.index = index + expected.index = _asfreq_compat(s.index, freq) + tm.assert_index_equal(result.index, expected.index) assert result.index.freq == expected.index.freq tm.assert_series_equal(result, expected, check_dtype=False) @@ -121,11 +118,7 @@ def test_resample_count_empty_series(freq, empty_series, resample_method): # GH28427 result = getattr(empty_series.resample(freq), resample_method)() - if isinstance(empty_series.index, PeriodIndex): - index = empty_series.index.asfreq(freq=freq) - else: - idx = empty_series.index - index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) + index = _asfreq_compat(empty_series.index, freq) expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) @@ -145,12 +138,8 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): # GH14962 expected = Series([], dtype=object) - if isinstance(df.index, PeriodIndex): - expected.index = df.index.asfreq(freq=freq) - else: - idx = df.index - index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) - expected.index = index + expected.index = _asfreq_compat(df.index, freq) + tm.assert_index_equal(result.index, expected.index) assert result.index.freq == expected.index.freq tm.assert_almost_equal(result, expected, check_dtype=False) @@ -168,11 +157,8 @@ def test_resample_count_empty_dataframe(freq, empty_frame): result = empty_frame.resample(freq).count() - if isinstance(empty_frame.index, PeriodIndex): - index = empty_frame.index.asfreq(freq=freq) - else: - idx = empty_frame.index - index = type(idx)([], 
dtype=idx.dtype, freq=freq, name=idx.name) + index = _asfreq_compat(empty_frame.index, freq) + expected = pd.DataFrame({"a": []}, dtype="int64", index=index) tm.assert_frame_equal(result, expected) @@ -188,11 +174,8 @@ def test_resample_size_empty_dataframe(freq, empty_frame): result = empty_frame.resample(freq).size() - if isinstance(empty_frame.index, PeriodIndex): - index = empty_frame.index.asfreq(freq=freq) - else: - idx = empty_frame.index - index = type(idx)([], dtype=idx.dtype, freq=freq, name=idx.name) + index = _asfreq_compat(empty_frame.index, freq) + expected = pd.Series([], dtype="int64", index=index) tm.assert_series_equal(result, expected)