From c5fc3c640fcb1c126c91cea1773c0ed7084d78f0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 14 Jun 2019 12:04:11 +0200 Subject: [PATCH 01/10] BUG: catch out-of-bounds datetime64 in Series/DataFrame constructor --- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 5 ++++- pandas/tests/series/test_constructors.py | 5 +++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f66e9ed46aa0..a25b1bc650ac8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1063,7 +1063,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): dtype = value.dtype if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) + value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == 'm' and dtype != _TD_DTYPE: value = to_timedelta(value) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 863b9f7fb16d7..0fb56e1899606 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -8,7 +8,7 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import IncompatibleFrequency +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( @@ -701,6 +701,9 @@ def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): elif not is_extension_type(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) + except OutOfBoundsDatetime: + if raise_cast_failure: + raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f2345a0822f6d..908d2e7acdfa3 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -909,6 +909,11 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) + def test_constructor_datetime64_outofbound(self): + # GH-26206 out of bound non-ns unit + with pytest.raises(pd.errors.OutOfBoundsDatetime): + pd.Series(np.array(['2262-04-12'], dtype='datetime64[D]')) + def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed='right') From a0c7051cc5b3f50ad3ecc28bd18361a7c417f705 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 17 Jun 2019 10:34:28 +0200 Subject: [PATCH 02/10] Update tests according to discussion --- pandas/tests/series/test_constructors.py | 27 +++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 908d2e7acdfa3..29f64606a0a19 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -909,10 +909,31 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) - def test_constructor_datetime64_outofbound(self): - # GH-26206 out of bound non-ns unit + @pytest.mark.parametrize("a", [ + np.array(['2263-01-01'], dtype='datetime64[D]'), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64('2263-01-01', 'D')], dtype=object), + np.array(["2263-01-01"], dtype=object) + ], ids=['datetime64[D]', 'object-datetime.datetime', + 'object-numpy-scalar', 'object-string']) + def test_constructor_datetime_outofbound(self, a): + # GH-26853 (+ bug GH-26206 out of bound non-ns unit) + + # No dtype specified (dtype inference) + # datetime64[non-ns] raise error, other cases result in object dtype + # and preserve original data + if a.dtype.kind == 'M': + with pytest.raises(pd.errors.OutOfBoundsDatetime): + pd.Series(a) + else: + result = pd.Series(a) + assert result.dtype == 'object' + tm.assert_numpy_array_equal(result.to_numpy(), a) + + # Explicit dtype specified + # Forced conversion fails for all -> all cases raise error with pytest.raises(pd.errors.OutOfBoundsDatetime): - pd.Series(np.array(['2262-04-12'], dtype='datetime64[D]')) + pd.Series(a, dtype='datetime64[ns]') def test_construction_interval(self): # construction from interval & array of intervals From b69f82edbe6e5dc4324c3bde31b49cc38721d9fd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 17 Jun 2019 10:49:23 +0200 Subject: [PATCH 03/10] update fix --- pandas/core/dtypes/cast.py | 2 ++ pandas/core/internals/construction.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a25b1bc650ac8..c68d469d291e7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1038,6 +1038,8 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): .tz_convert(dtype.tz)) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values + except OutOfBoundsDatetime: + raise except (AttributeError, ValueError, TypeError): pass diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 0fb56e1899606..031680c11d7b7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -702,8 +702,8 @@ def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: - if raise_cast_failure: - raise + # in case of out of bound datetime64 -> always raise + raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know From 4d28cade5abd08e10b329857097cfd0be174d711 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 17 Jun 2019 11:00:07 +0200 Subject: [PATCH 04/10] lint --- pandas/tests/series/test_constructors.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 29f64606a0a19..db8886b9348e5 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -910,12 +910,12 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): assert_series_equal(result, expected) @pytest.mark.parametrize("a", [ - np.array(['2263-01-01'], dtype='datetime64[D]'), - np.array([datetime(2263, 1, 1)], dtype=object), - np.array([np.datetime64('2263-01-01', 'D')], dtype=object), - np.array(["2263-01-01"], dtype=object) - ], ids=['datetime64[D]', 'object-datetime.datetime', - 'object-numpy-scalar', 'object-string']) + np.array(['2263-01-01'], dtype='datetime64[D]'), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64('2263-01-01', 'D')], dtype=object), + np.array(["2263-01-01"], dtype=object) + ], ids=['datetime64[D]', 'object-datetime.datetime', + 'object-numpy-scalar', 'object-string']) def test_constructor_datetime_outofbound(self, a): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) From 2e431b6dc3f9490463eeaea2c4b7a70f038723fc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Jun 2019 13:20:22 +0200 Subject: [PATCH 05/10] add whatsnew --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2b1a61186dca6..9d095995258b8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -567,6 +567,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) +- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data (:issue:`26206`). Timedelta ^^^^^^^^^ From ef89acf8bf6d37916cf7af0c5a415013613254ac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Jun 2019 13:36:18 +0200 Subject: [PATCH 06/10] parametrize example --- pandas/tests/series/test_constructors.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index db8886b9348e5..7dc67b961f281 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -909,6 +909,12 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) + @pytest.mark.parametrize("klass", [ + Series, + lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], + lambda x, **kwargs: DataFrame(x, **kwargs)[0], + Index, + ]) @pytest.mark.parametrize("a", [ np.array(['2263-01-01'], dtype='datetime64[D]'), np.array([datetime(2263, 1, 1)], dtype=object), @@ -916,7 +922,7 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): np.array(["2263-01-01"], dtype=object) ], ids=['datetime64[D]', 'object-datetime.datetime', 'object-numpy-scalar', 'object-string']) - def test_constructor_datetime_outofbound(self, a): + def test_constructor_datetime_outofbound(self, a, klass): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) # No dtype specified (dtype inference) @@ -924,16 +930,16 @@ def test_constructor_datetime_outofbound(self, a): # and preserve original data if a.dtype.kind == 'M': with pytest.raises(pd.errors.OutOfBoundsDatetime): - pd.Series(a) + klass(a) else: - result = pd.Series(a) + result = klass(a) assert result.dtype == 'object' tm.assert_numpy_array_equal(result.to_numpy(), a) # Explicit dtype specified # Forced conversion fails for all -> all cases raise error with pytest.raises(pd.errors.OutOfBoundsDatetime): - pd.Series(a, dtype='datetime64[ns]') + klass(a, dtype='datetime64[ns]') def test_construction_interval(self): # construction from interval & array of intervals From bbb59b1c99e735108cf210e612159e936dc5e74b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Jun 2019 13:40:40 +0200 Subject: [PATCH 07/10] xfail dataframe from array case --- pandas/tests/series/test_constructors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7dc67b961f281..d57b10f1624f3 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -912,7 +912,8 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): @pytest.mark.parametrize("klass", [ Series, lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], - lambda x, **kwargs: DataFrame(x, **kwargs)[0], + pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], + marks=pytest.mark.xfail), Index, ]) @pytest.mark.parametrize("a", [ From bf2e77c6ded856ee6fdcb48cd6c6382d7d8f3015 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Jun 2019 13:49:36 +0200 Subject: [PATCH 08/10] move tests --- pandas/tests/series/test_constructors.py | 33 -------------------- pandas/tests/test_base.py | 38 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d57b10f1624f3..f2345a0822f6d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -909,39 +909,6 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) - @pytest.mark.parametrize("klass", [ - Series, - lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], - pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], - marks=pytest.mark.xfail), - Index, - ]) - @pytest.mark.parametrize("a", [ - np.array(['2263-01-01'], dtype='datetime64[D]'), - np.array([datetime(2263, 1, 1)], dtype=object), - np.array([np.datetime64('2263-01-01', 'D')], dtype=object), - np.array(["2263-01-01"], dtype=object) - ], ids=['datetime64[D]', 'object-datetime.datetime', - 'object-numpy-scalar', 'object-string']) - def test_constructor_datetime_outofbound(self, a, klass): - # GH-26853 (+ bug GH-26206 out of bound non-ns unit) - - # No dtype specified (dtype inference) - # datetime64[non-ns] raise error, other cases result in object dtype - # and preserve original data - if a.dtype.kind == 'M': - with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a) - else: - result = klass(a) - assert result.dtype == 'object' - tm.assert_numpy_array_equal(result.to_numpy(), a) - - # Explicit dtype specified - # Forced conversion fails for all -> all cases raise error - with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a, dtype='datetime64[ns]') - def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed='right') diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3b4f85e680f6e..b718e35518da5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1341,3 +1341,41 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) + + +class TestConstruction: + # test certain constructor behaviours on dtype inference across Series, + # Index and DataFrame + + @pytest.mark.parametrize("klass", [ + Series, + lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], + pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], + marks=pytest.mark.xfail), + Index, + ]) + @pytest.mark.parametrize("a", [ + np.array(['2263-01-01'], dtype='datetime64[D]'), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64('2263-01-01', 'D')], dtype=object), + np.array(["2263-01-01"], dtype=object) + ], ids=['datetime64[D]', 'object-datetime.datetime', + 'object-numpy-scalar', 'object-string']) + def test_constructor_datetime_outofbound(self, a, klass): + # GH-26853 (+ bug GH-26206 out of bound non-ns unit) + + # No dtype specified (dtype inference) + # datetime64[non-ns] raise error, other cases result in object dtype + # and preserve original data + if a.dtype.kind == 'M': + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a) + else: + result = klass(a) + assert result.dtype == 'object' + tm.assert_numpy_array_equal(result.to_numpy(), a) + + # Explicit dtype specified + # Forced conversion fails for all -> all cases raise error + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a, dtype='datetime64[ns]') \ No newline at end of file From fd742b777806b5389f8a5df0556d2e9077b70198 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 18 Jun 2019 14:17:03 +0200 Subject: [PATCH 09/10] lint --- pandas/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index b718e35518da5..d24ed9433f4f7 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1378,4 +1378,4 @@ def test_constructor_datetime_outofbound(self, a, klass): # Explicit dtype specified # Forced conversion fails for all -> all cases raise error with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a, dtype='datetime64[ns]') \ No newline at end of file + klass(a, dtype='datetime64[ns]') From 968d85491ea47bc1c30bbe1a85240f8d2d4dd544 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 19 Jun 2019 14:30:17 +0200 Subject: [PATCH 10/10] update whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9d095995258b8..3e72774dace71 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -567,7 +567,7 @@ Datetimelike - Bug in :meth:`isin` for datetimelike indexes; :class:`DatetimeIndex`, :class:`TimedeltaIndex` and :class:`PeriodIndex` where the ``levels`` parameter was ignored. (:issue:`26675`) - Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` - Bug when comparing a :class:`PeriodIndex` against a zero-dimensional numpy array (:issue:`26689`) -- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data (:issue:`26206`). +- Bug in constructing a ``Series`` or ``DataFrame`` from a numpy ``datetime64`` array with a non-ns unit and out-of-bound timestamps generating rubbish data, which will now correctly raise an ``OutOfBoundsDatetime`` error (:issue:`26206`). Timedelta ^^^^^^^^^