diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3f881485937d8..b04d2eeba1ed0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -380,6 +380,7 @@ Backwards incompatible API changes - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) +- :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`) Percentage change on groupby changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1133,7 +1134,6 @@ Deprecations - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) - .. 
_whatsnew_0240.deprecations.datetimelike_int_ops: Integer Addition/Subtraction with Datetime-like Classes Is Deprecated @@ -1310,6 +1310,9 @@ Datetimelike - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) - Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`) - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`) +- Bug in :class:`DatetimeIndex` where constructing a :class:`DatetimeIndex` from a :class:`Categorical` or :class:`CategoricalIndex` would incorrectly drop timezone information (:issue:`18664`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would incorrectly lose the index's ``freq`` attribute (:issue:`21282`) +- Clarified error message produced when passing an incorrect ``freq`` argument to :class:`DatetimeIndex` with ``NaT`` as the first entry in the passed data (:issue:`11587`) Timedelta ^^^^^^^^^ @@ -1422,6 +1425,7 @@ Indexing - Bug in :func:`Index.union` and :func:`Index.intersection` where name of the ``Index`` of the result was not computed correctly for certain cases (:issue:`9943`, :issue:`9862`) - Bug in :class:`Index` slicing with boolean :class:`Index` may raise ``TypeError`` (:issue:`22533`) - Bug in ``PeriodArray.__setitem__`` when accepting slice and list-like value (:issue:`23978`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would lose the index's ``freq`` attribute (:issue:`21282`) Missing ^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py 
index ceaf9e748fe5a..a6eacc3bb4bfd 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -351,6 +351,10 @@ def __getitem__(self, key): freq = key.step * self.freq else: freq = self.freq + elif key is Ellipsis: + # GH#21282 indexing with Ellipsis is similar to a full slice, + # should preserve `freq` attribute + freq = self.freq attribs['freq'] = freq @@ -547,9 +551,22 @@ def _validate_frequency(cls, index, freq, **kwargs): if index.size == 0 or inferred == freq.freqstr: return None - on_freq = cls._generate_range(start=index[0], end=None, - periods=len(index), freq=freq, **kwargs) - if not np.array_equal(index.asi8, on_freq.asi8): + try: + on_freq = cls._generate_range(start=index[0], end=None, + periods=len(index), freq=freq, + **kwargs) + if not np.array_equal(index.asi8, on_freq.asi8): + raise ValueError + except ValueError as e: + if "non-fixed" in str(e): + # non-fixed frequencies are not meaningful for timedelta64; + # we retain that error message + raise e + # GH#11587 the main way this is reached is if the `np.array_equal` + # check above is False. This can also be reached if index[0] + # is `NaT`, in which case the call to `cls._generate_range` will + # raise a ValueError, which we re-raise with a more targeted + # message. 
raise ValueError('Inferred frequency {infer} from passed values ' 'does not conform to passed frequency {passed}' .format(infer=inferred, passed=freq.freqstr)) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 4849ee1e3e665..2ecbf9f0ff847 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -14,9 +14,9 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _INT64_DTYPE, _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, - is_extension_type, is_float_dtype, is_int64_dtype, is_object_dtype, - is_period_dtype, is_string_dtype, is_timedelta64_dtype) + _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_extension_type, is_float_dtype, is_int64_dtype, + is_object_dtype, is_period_dtype, is_string_dtype, is_timedelta64_dtype) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -264,6 +264,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if closed is not None: raise ValueError("Closed has to be None if not both of start" "and end are defined") + if start is NaT or end is NaT: + raise ValueError("Neither `start` nor `end` can be NaT") left_closed, right_closed = dtl.validate_endpoints(closed) @@ -1652,6 +1654,13 @@ def maybe_convert_dtype(data, copy): raise TypeError("Passing PeriodDtype data is invalid. " "Use `data.to_timestamp()` instead") + elif is_categorical_dtype(data): + # GH#18664 preserve tz in going DTI->Categorical->DTI + # TODO: cases where we need to do another pass through this func, + # e.g. 
the categories are timedelta64s + data = data.categories.take(data.codes, fill_value=NaT) + copy = False + elif is_extension_type(data) and not is_datetime64tz_dtype(data): # Includes categorical # TODO: We have no tests for these diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 5de79044bc239..88c322ff7c9ff 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -14,12 +14,42 @@ from pandas import ( DatetimeIndex, Index, Timestamp, date_range, datetime, offsets, to_datetime) -from pandas.core.arrays import period_array +from pandas.core.arrays import ( + DatetimeArrayMixin as DatetimeArray, period_array) import pandas.util.testing as tm class TestDatetimeIndex(object): + @pytest.mark.parametrize('dt_cls', [DatetimeIndex, DatetimeArray]) + def test_freq_validation_with_nat(self, dt_cls): + # GH#11587 make sure we get a useful error message when generate_range + # raises + msg = ("Inferred frequency None from passed values does not conform " + "to passed frequency D") + with pytest.raises(ValueError, match=msg): + dt_cls([pd.NaT, pd.Timestamp('2011-01-01')], freq='D') + with pytest.raises(ValueError, match=msg): + dt_cls([pd.NaT, pd.Timestamp('2011-01-01').value], + freq='D') + + def test_categorical_preserves_tz(self): + # GH#18664 retain tz when going DTI-->Categorical-->DTI + # TODO: parametrize over DatetimeIndex/DatetimeArray + # once CategoricalIndex(DTA) works + + dti = pd.DatetimeIndex( + [pd.NaT, '2015-01-01', '1999-04-06 15:14:13', '2015-01-01'], + tz='US/Eastern') + + ci = pd.CategoricalIndex(dti) + carr = pd.Categorical(dti) + cser = pd.Series(ci) + + for obj in [ci, carr, cser]: + result = pd.DatetimeIndex(obj) + tm.assert_index_equal(result, dti) + def test_dti_with_period_data_raises(self): # GH#23675 data = pd.PeriodIndex(['2016Q1', '2016Q2'], freq='Q') diff --git 
a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 11cefec4f34cf..a39100b3ec204 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -80,6 +80,14 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges(TestData): + def test_date_range_nat(self): + # GH#11587 + msg = "Neither `start` nor `end` can be NaT" + with pytest.raises(ValueError, match=msg): + date_range(start='2016-01-01', end=pd.NaT, freq='D') + with pytest.raises(ValueError, match=msg): + date_range(start=pd.NaT, end='2016-01-01', freq='D') + def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): @@ -533,12 +541,12 @@ class TestGenRangeGeneration(object): def test_generate(self): rng1 = list(generate_range(START, END, offset=BDay())) - rng2 = list(generate_range(START, END, time_rule='B')) + rng2 = list(generate_range(START, END, offset='B')) assert rng1 == rng2 def test_generate_cday(self): rng1 = list(generate_range(START, END, offset=CDay())) - rng2 = list(generate_range(START, END, time_rule='C')) + rng2 = list(generate_range(START, END, offset='C')) assert rng1 == rng2 def test_1(self): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 944c925dabe3e..c3b00133228d8 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -16,6 +16,15 @@ class TestGetItem(object): + def test_ellipsis(self): + # GH#21282 + idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', + tz='Asia/Tokyo', name='idx') + + result = idx[...] 
+ assert result.equals(idx) + assert result is not idx + def test_getitem(self): idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index aaa1126e92f3d..29b96604b7ea8 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -13,6 +13,14 @@ class TestGetItem(object): + def test_ellipsis(self): + # GH#21282 + idx = period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + + result = idx[...] + assert result.equals(idx) + assert result is not idx def test_getitem(self): idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 94d694b644eb8..4e98732456d2c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -9,6 +9,14 @@ class TestGetItem(object): + def test_ellipsis(self): + # GH#21282 + idx = timedelta_range('1 day', '31 day', freq='D', name='idx') + + result = idx[...] 
+ assert result.equals(idx) + assert result is not idx + def test_getitem(self): idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx') diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 030887ac731f3..456e0b10e5a96 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -53,17 +53,11 @@ def test_to_m8(): valb = datetime(2007, 10, 1) valu = _to_m8(valb) assert isinstance(valu, np.datetime64) - # assert valu == np.datetime64(datetime(2007,10,1)) - # def test_datetime64_box(): - # valu = np.datetime64(datetime(2007,10,1)) - # valb = _dt_box(valu) - # assert type(valb) == datetime - # assert valb == datetime(2007,10,1) - ##### - # DateOffset Tests - ##### +##### +# DateOffset Tests +##### class Base(object): diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 45f10a2f06fa2..cff9556a4230e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -2457,8 +2457,7 @@ class Nano(Tick): # --------------------------------------------------------------------- -def generate_range(start=None, end=None, periods=None, - offset=BDay(), time_rule=None): +def generate_range(start=None, end=None, periods=None, offset=BDay()): """ Generates a sequence of dates corresponding to the specified time offset. Similar to dateutil.rrule except uses pandas DateOffset @@ -2470,8 +2469,6 @@ def generate_range(start=None, end=None, periods=None, end : datetime (default None) periods : int, (default None) offset : DateOffset, (default BDay()) - time_rule : (legacy) name of DateOffset object to be used, optional - Corresponds with names expected by tseries.frequencies.get_offset Notes ----- @@ -2479,17 +2476,13 @@ def generate_range(start=None, end=None, periods=None, * At least two of (start, end, periods) must be specified. * If both start and end are specified, the returned dates will satisfy start <= date <= end. 
- * If both time_rule and offset are specified, time_rule supersedes offset. Returns ------- dates : generator object - """ - if time_rule is not None: - from pandas.tseries.frequencies import get_offset - - offset = get_offset(time_rule) + from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) start = to_datetime(start) end = to_datetime(end)