diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 485591b9357ea..04aaa536b5ca7 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -76,13 +76,26 @@ New Behavior: type(s.tolist()[0]) +.. _whatsnew_0182.api.to_datetime_coerce: +``.to_datetime()`` when coercing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A bug is fixed in ``.to_datetime()`` when passing integers or floats with no ``unit`` and with ``errors='coerce'`` (:issue:`13180`). +Previously, if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes, with ``errors='coerce'``, it would convert all values to ``NaT``. +Previous Behavior: +.. code-block:: ipython + In [2]: pd.to_datetime([1, 'foo'], errors='coerce') + Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) +This will now convert integers/floats with the default unit of ``ns``. +.. ipython:: python + + pd.to_datetime([1, 'foo'], errors='coerce') .. _whatsnew_0182.api.other: @@ -136,7 +149,6 @@ Bug Fixes - - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 93bd7f0eec7c5..e3a0e056f4da1 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -103,7 +103,8 @@ def test_convert_objects(self): with tm.assert_produces_warning(FutureWarning): result = s.convert_objects(convert_dates='coerce', convert_numeric=False) - assert_series_equal(result, s) + expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2) + assert_series_equal(result, expected) # preserver if non-object s = Series([1], dtype='float32') @@ -270,7 +271,7 @@ def test_convert(self): s = Series(['foo', 'bar', 1, 1.0], dtype='O') result = s._convert(datetime=True, coerce=True) - expected = Series([lib.NaT] * 4) + expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2) assert_series_equal(result, expected) # preserver if 
non-object diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 3d8e389ba30f2..880713964ec90 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -762,6 +762,15 @@ def test_to_datetime_unit(self): with self.assertRaises(ValueError): to_datetime([1, 2, 111111111], unit='D') + # coerce we can process + expected = DatetimeIndex([Timestamp('1970-01-02'), + Timestamp('1970-01-03')] + ['NaT'] * 1) + result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + + result = to_datetime([1, 2, 111111111], unit='D', errors='coerce') + tm.assert_index_equal(result, expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') dates = np.asarray(rng) @@ -2283,6 +2292,123 @@ def test_to_datetime_tz_psycopg2(self): dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) + def test_unit(self): + # GH 11758 + # test proper behavior with errors + + with self.assertRaises(ValueError): + to_datetime([1], unit='D', format='%Y%m%d') + + values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan, + 'NaT', ''] + result = to_datetime(values, unit='D', errors='ignore') + expected = Index([11111111, Timestamp('1970-01-02'), + Timestamp('1970-01-02'), pd.NaT, + pd.NaT, pd.NaT, pd.NaT, pd.NaT], + dtype=object) + tm.assert_index_equal(result, expected) + + result = to_datetime(values, unit='D', errors='coerce') + expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', + 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, unit='D', errors='raise') + + values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] + + result = to_datetime(values, errors='ignore', unit='s') + expected = Index([1420043460000, pd.NaT, pd.NaT, + pd.NaT, pd.NaT], dtype=object) + tm.assert_index_equal(result, 
expected) + + result = to_datetime(values, errors='coerce', unit='s') + expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + tm.assert_index_equal(result, expected) + + with self.assertRaises(tslib.OutOfBoundsDatetime): + to_datetime(values, errors='raise', unit='s') + + # if we have a string, then we raise a ValueError + # and NOT an OutOfBoundsDatetime + for val in ['foo', Timestamp('20130101')]: + try: + to_datetime(val, errors='raise', unit='s') + except tslib.OutOfBoundsDatetime: + raise AssertionError("incorrect exception raised") + except ValueError: + pass + + def test_unit_consistency(self): + + # consistency of conversions + expected = Timestamp('1970-05-09 14:25:11') + result = pd.to_datetime(11111111, unit='s', errors='raise') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='coerce') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='ignore') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + def test_unit_with_numeric(self): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr1 = [1.434692e+18, 1.432766e+18] + arr2 = np.array(arr1).astype(int) + for errors in ['ignore', 'raise', 'coerce']: + result = pd.to_datetime(arr1, errors=errors) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(['NaT', + '2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr = ['foo', 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20', + 'NaT', + 'NaT']) + 
arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + def test_unit_mixed(self): + + # mixed integers/datetimes + expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) + arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + expected = DatetimeIndex(['NaT', + 'NaT', + '2013-01-01']) + arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) @@ -4229,68 +4355,6 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) - def test_unit_errors(self): - # GH 11758 - # test proper behavior with erros - - with self.assertRaises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d') - - values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan, - 'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore') - expected = Index([11111111, Timestamp('1970-01-02'), - Timestamp('1970-01-02'), pd.NaT, - pd.NaT, pd.NaT, pd.NaT, pd.NaT], - dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, unit='D', errors='coerce') - expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', - 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') - - values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] - - result = to_datetime(values, errors='ignore', unit='s') - expected = Index([1420043460000, pd.NaT, pd.NaT, - pd.NaT, pd.NaT], dtype=object) - 
tm.assert_index_equal(result, expected) - - result = to_datetime(values, errors='coerce', unit='s') - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') - - # if we have a string, then we raise a ValueError - # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: - try: - to_datetime(val, errors='raise', unit='s') - except tslib.OutOfBoundsDatetime: - raise AssertionError("incorrect exception raised") - except ValueError: - pass - - # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='coerce') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='ignore') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - def test_roundtrip(self): # test value to string and back conversions diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index a46149035dbae..d5e87d1df2462 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -221,7 +221,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch + unit : string, default 'ns' + unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number. 
infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a240558025090..3a006372900a9 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2082,6 +2082,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): unit)) elif is_ignore: raise AssertionError + iresult[i] = NPY_NAT except: if is_raise: raise OutOfBoundsDatetime("cannot convert input {0}" @@ -2149,7 +2150,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', ndarray[int64_t] iresult ndarray[object] oresult pandas_datetimestruct dts - bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0 + bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0 bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' _TSObject _ts int out_local=0, out_tzoffset=0 @@ -2215,25 +2216,32 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', continue raise - # if we are coercing, dont' allow integers - elif is_integer_object(val) and not is_coerce: - if val == NPY_NAT: + # these must be ns unit by-definition + elif is_integer_object(val) or is_float_object(val): + + if val != val or val == NPY_NAT: iresult[i] = NPY_NAT - else: + elif is_raise or is_ignore: iresult[i] = val seen_integer=1 - elif is_float_object(val) and not is_coerce: - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT else: - iresult[i] = val - seen_integer=1 + # coerce + # we now need to parse this as if unit='ns' + # we can ONLY accept integers at this point + # if we have previously seen (or later see) + # datetimes/strings, then we must coerce + seen_integer = 1 + try: + iresult[i] = cast_from_unit(val, 'ns') + except: + iresult[i] = NPY_NAT else: try: if len(val) == 0 or val in _nat_strings: iresult[i] = NPY_NAT continue + seen_string=1 _string_to_dts(val, &dts, &out_local, &out_tzoffset) value = 
pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) if out_local == 1: @@ -2276,11 +2284,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', continue raise - # don't allow mixed integers and datetime like - # higher levels can catch and is_coerce to object, for - # example - if seen_integer and seen_datetime: - raise ValueError("mixed datetimes and integers in passed array") + if seen_datetime and seen_integer: + # we have mixed datetimes & integers + + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + raise TypeError return result except OutOfBoundsDatetime: