From 533bba364cd41ad9ebcd4831dff30d853d2b550e Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 24 Jul 2016 16:04:58 +0900 Subject: [PATCH] BUG: value_counts may raise OutOfBoundsDatetime --- doc/source/whatsnew/v0.19.0.txt | 2 ++ pandas/core/series.py | 12 +++++++----- pandas/indexes/base.py | 8 +++++--- pandas/tests/indexes/test_datetimelike.py | 16 ++++++++++++++-- pandas/tests/test_algos.py | 18 +++++++++++++++--- 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 721da38baf67d..227975819372b 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -747,6 +747,8 @@ Bug Fixes - Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) - Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) - Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) +- Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) +- Bug in ``.value_counts`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/core/series.py b/pandas/core/series.py index c3f5b1b8e641c..e1cff96b9741e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -291,11 +291,13 @@ def _set_axis(self, axis, labels, fastpath=False): if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - labels = DatetimeIndex(labels) - - # need to set here 
becuase we changed the index - if fastpath: - self._data.set_axis(axis, labels) + try: + labels = DatetimeIndex(labels) + # need to set here because we changed the index + if fastpath: + self._data.set_axis(axis, labels) + except tslib.OutOfBoundsDatetime: + pass self._set_subtyp(is_all_dates) object.__setattr__(self, '_index', labels) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 850d049ef9f45..b5ce456bda254 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -258,13 +258,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass elif inferred != 'string': if inferred.startswith('datetime'): - if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): # only when subarr has the same tz from pandas.tseries.index import DatetimeIndex - return DatetimeIndex(subarr, copy=copy, name=name, - **kwargs) + try: + return DatetimeIndex(subarr, copy=copy, + name=name, **kwargs) + except tslib.OutOfBoundsDatetime: + pass elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 378e8c545ec83..9371bef8b8f2e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from datetime import timedelta, time +from datetime import datetime, timedelta, time import numpy as np @@ -12,7 +12,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas.lib import Timestamp +from pandas.tslib import Timestamp, OutOfBoundsDatetime from .common import Base @@ -336,6 +336,18 @@ def test_construction_base_constructor(self): tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) + def test_construction_outofbounds(self): + # GH 13663 + dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1)] + exp = Index(dates, dtype=object) + # coerces to object 
+ tm.assert_index_equal(Index(dates), exp) + + with tm.assertRaises(OutOfBoundsDatetime): + # can't create DatetimeIndex + DatetimeIndex(dates) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cf23d096d99ba..3c77d19aa7f3c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,7 +4,7 @@ import numpy as np from numpy.random import RandomState from numpy import nan -import datetime +from datetime import datetime from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd @@ -121,7 +121,7 @@ def test_mixed_integer(self): def test_unsortable(self): # GH 13714 - arr = np.array([1, 2, datetime.datetime.now(), 0, 3], dtype=object) + arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) if compat.PY2 and not pd._np_version_under1p10: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception with tm.assert_produces_warning(RuntimeWarning): @@ -556,6 +556,18 @@ def test_value_counts_nat(self): tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) + def test_value_counts_datetime_outofbounds(self): + # GH 13663 + s = pd.Series([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1), + datetime(3000, 1, 1), datetime(3000, 1, 1)]) + res = s.value_counts() + + exp_index = pd.Index([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(6000, 1, 1)], dtype=object) + exp = pd.Series([3, 2, 1], index=exp_index) + tm.assert_series_equal(res, exp) + def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() @@ -818,7 +830,7 @@ def _check(arr): def test_pad_backfill_object_segfault(): old = np.array([], dtype='O') - new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') + new = np.array([datetime(2010, 12, 31)], dtype='O') result = _algos.pad_object(old, new) expected = np.array([-1], 
dtype=np.int64)