diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 014f251ffb90a..db1b1ba07088a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -131,13 +131,89 @@ Other Enhancements - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) - - .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.pandas_to_datetime: + +Numerical values need an explicit unit in pd.to_datetime +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :func:`to_datetime` requires a unit with numerical arg (scalar or iterable), if not provided it raises an error (:issue:`15836`) +For example: + +.. ipython:: python + + # Old behaviour: + In [1]: pd.to_datetime(42) + Out[1]: Timestamp('1970-01-01 00:00:00.000000042') + + # New behaviour + In [1]: pd.to_datetime(42) + --------------------------------------------------------------------------- + ValueError Traceback (most recent call last) + in () + ----> 1 pd.to_datetime(42) + + /home/anthony/src/pandas/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin) + 461 elif ((not isinstance(arg, DataFrame)) and + 462 (check_numerical_arg() and unit is None and format is None)): + --> 463 raise ValueError("a unit is required in case of numerical arg") + 464 + 465 # handle origin + + ValueError: a unit is required in case of numerical arg + + In [2]: pd.to_datetime(42, unit='ns') + Out[2]: Timestamp('1970-01-01 00:00:00.000000042') + +Furthermore, this change fixes a bug with boolean values. + +.. 
ipython:: python + # Old behaviour + In [1]: pd.to_datetime(True, unit='ms') + Out[1]: Timestamp('1970-01-01 00:00:00.001000') + + # New behaviour + In [2]: pd.to_datetime(True, unit='ms') + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + in () + ----> 1 pd.to_datetime(True, unit='ms') + + /home/anthony/src/pandas/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin) + 533 result = _convert_listlike(arg, box, format) + 534 else: + --> 535 result = _convert_listlike(np.array([arg]), box, format)[0] + 536 + 537 return result + + /home/anthony/src/pandas/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz) + 374 arg = getattr(arg, 'values', arg) + 375 result = tslib.array_with_unit_to_datetime(arg, unit, + --> 376 errors=errors) + 377 if box: + 378 if errors == 'ignore': + + /home/anthony/src/pandas/pandas/_libs/tslib.pyx in pandas._libs.tslib.array_with_unit_to_datetime() + 2210 + 2211 + -> 2212 cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): + 2213 """ + 2214 convert the ndarray according to the unit + + /home/anthony/src/pandas/pandas/_libs/tslib.pyx in pandas._libs.tslib.array_with_unit_to_datetime() + 2246 raise TypeError("{0} is not convertible to datetime" + 2247 .format(values.dtype)) + -> 2248 + 2249 # try a quick conversion to i8 + 2250 # if we have nulls that are not type-compat + + TypeError: bool is not convertible to datetime + +Now boolean values raise an error every time. .. 
_whatsnew_0210.api_breaking.deps: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 5dd30072fb7aa..06c2e4962a3fd 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -31,7 +31,7 @@ cdef extern from "Python.h": from libc.stdlib cimport free from util cimport (is_integer_object, is_float_object, is_datetime64_object, - is_timedelta64_object, INT64_MAX) + is_bool_object, is_timedelta64_object, INT64_MAX) cimport util # this is our datetime.pxd @@ -2242,6 +2242,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): m = cast_from_unit(None, unit) if is_raise: + if np.issubdtype(values.dtype, np.bool_): + raise TypeError("{0} is not convertible to datetime" + .format(values.dtype)) # try a quick conversion to i8 # if we have nulls that are not type-compat @@ -2277,6 +2280,16 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): if _checknull_with_nat(val): iresult[i] = NPY_NAT + elif is_bool_object(val): + if is_raise: + raise TypeError( + "{0} is not convertible to datetime" + .format(values.dtype) + ) + elif is_ignore: + raise AssertionError + iresult[i] = NPY_NAT + elif is_integer_object(val) or is_float_object(val): if val != val or val == NPY_NAT: @@ -2320,7 +2333,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): else: if is_raise: - raise ValueError("non convertible value {0}" + raise ValueError("non convertible value {0} " "with the unit '{1}'".format( val, unit)) @@ -2344,6 +2357,8 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): if _checknull_with_nat(val): oresult[i] = NaT + elif is_bool_object(val): + oresult[i] = val elif is_integer_object(val) or is_float_object(val): if val != val or val == NPY_NAT: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c2cf6afc1a7b5..3e9af1c491741 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -155,7 +155,7 @@ def trans(x): # noqa if 
dtype.tz: # convert to datetime and change timezone from pandas import to_datetime - result = to_datetime(result).tz_localize('utc') + result = to_datetime(result, unit='ns').tz_localize('utc') result = result.tz_convert(dtype.tz) except: @@ -963,11 +963,13 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): dtype): try: if is_datetime64: - value = to_datetime(value, errors=errors)._values + value = to_datetime(value, unit='ns', + errors=errors)._values elif is_datetime64tz: # input has to be UTC at this point, so just # localize - value = (to_datetime(value, errors=errors) + value = (to_datetime(value, unit='ns', + errors=errors) .tz_localize('UTC') .tz_convert(dtype.tz) ) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 5a04c550f4502..90d44223d2172 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -277,6 +277,7 @@ def __new__(cls, data=None, dayfirst = kwargs.pop('dayfirst', None) yearfirst = kwargs.pop('yearfirst', None) + unit = kwargs.pop('unit', None) freq_infer = False if not isinstance(freq, DateOffset): @@ -333,7 +334,7 @@ def __new__(cls, data=None, if not (is_datetime64_dtype(data) or is_datetimetz(data) or is_integer_dtype(data)): data = tools.to_datetime(data, dayfirst=dayfirst, - yearfirst=yearfirst) + unit=unit, yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c0f234a36803d..51d0157167d60 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -235,7 +235,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : string, default 'ns' + unit : string, default None unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. 
Example, with unit='ms' and origin='unix' (the default), this @@ -342,6 +342,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, pandas.to_timedelta : Convert argument to timedelta. """ from pandas.core.indexes.datetimes import DatetimeIndex + from pandas.core.frame import DataFrame tz = 'utc' if utc else None @@ -451,8 +452,15 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): except (ValueError, TypeError): raise e + def check_numerical_arg(): + return ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or + (is_numeric_dtype(np.asarray(arg)) and np.asarray(arg).size)) + if arg is None: return None + elif ((not isinstance(arg, DataFrame)) and + (check_numerical_arg() and unit is None and format is None)): + raise ValueError("a unit is required in case of numerical arg") # handle origin if origin == 'julian': @@ -479,8 +487,7 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): # arg must be a numeric original = arg - if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or - is_numeric_dtype(np.asarray(arg))): + if not check_numerical_arg(): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( @@ -605,7 +612,7 @@ def f(value): if len(excess): raise ValueError("extra keys have been passed " "to the datetime assemblage: " - "[{excess}]".format(','.join(excess=excess))) + "[{}]".format(','.join(excess))) def coerce(values): # we allow coercion to if errors allows diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 50669ee357bbd..c27c06de18f82 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -25,6 +25,35 @@ compat) +@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns']) +def units(request): + return request.param + + +@pytest.fixture +def epoch_1960(): + # for origin as 1960-01-01 + return Timestamp('1960-01-01') + + 
+@pytest.fixture +def units_from_epochs(): + return list(range(5)) + + +@pytest.fixture(params=[epoch_1960(), + epoch_1960().to_pydatetime(), + epoch_1960().to_datetime64(), + str(epoch_1960())]) +def epochs(request): + return request.param + + +@pytest.fixture +def julian_dates(): + return pd.date_range('2014-1-1', periods=10).to_julian_date().values + + class TimeConversionFormats(object): def test_to_datetime_format(self): @@ -306,25 +335,6 @@ def test_to_datetime_tz_psycopg2(self): dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - def test_datetime_bool(self): - # GH13176 - with pytest.raises(TypeError): - to_datetime(False) - assert to_datetime(False, errors="coerce") is NaT - assert to_datetime(False, errors="ignore") is False - with pytest.raises(TypeError): - to_datetime(True) - assert to_datetime(True, errors="coerce") is NaT - assert to_datetime(True, errors="ignore") is True - with pytest.raises(TypeError): - to_datetime([False, datetime.today()]) - with pytest.raises(TypeError): - to_datetime(['20130101', True]) - tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], - errors="coerce"), - DatetimeIndex([to_datetime(0), NaT, - NaT, to_datetime(0)])) - def test_datetime_invalid_datatype(self): # GH13176 @@ -334,7 +344,27 @@ def test_datetime_invalid_datatype(self): pd.to_datetime(pd.to_datetime) -class ToDatetimeUnit(object): +class TestToDatetimeUnit(object): + + def test_datetime_bool(self, units): + # GH13176 + with pytest.raises(TypeError): + to_datetime(False, unit=units) + assert to_datetime(False, unit=units, errors="coerce") is NaT + assert (not to_datetime(False, unit=units, errors="ignore")) + with pytest.raises(TypeError): + to_datetime(True, unit=units) + assert to_datetime(True, unit=units, errors="coerce") is NaT + assert to_datetime(True, unit=units, errors="ignore") + with pytest.raises(TypeError): + to_datetime([False, datetime.today()], unit=units) + with pytest.raises(TypeError): + to_datetime([True, '20130101'], 
unit=units) + + tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], + errors="coerce"), + DatetimeIndex([to_datetime(0, unit=units), NaT, + NaT, to_datetime(0, unit=units)])) def test_unit(self): # GH 11758 @@ -409,10 +439,10 @@ def test_unit_with_numeric(self): arr1 = [1.434692e+18, 1.432766e+18] arr2 = np.array(arr1).astype('int64') for errors in ['ignore', 'raise', 'coerce']: - result = pd.to_datetime(arr1, errors=errors) + result = pd.to_datetime(arr1, unit='ns', errors=errors) tm.assert_index_equal(result, expected) - result = pd.to_datetime(arr2, errors=errors) + result = pd.to_datetime(arr2, unit='ns', errors=errors) tm.assert_index_equal(result, expected) # but we want to make sure that we are coercing @@ -421,7 +451,7 @@ def test_unit_with_numeric(self): '2015-06-19 05:33:20', '2015-05-27 22:33:20']) arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, unit='ns', errors='coerce') tm.assert_index_equal(result, expected) expected = DatetimeIndex(['2015-06-19 05:33:20', @@ -429,7 +459,7 @@ def test_unit_with_numeric(self): 'NaT', 'NaT']) arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, unit='ns', errors='coerce') tm.assert_index_equal(result, expected) def test_unit_mixed(self): @@ -437,21 +467,21 @@ def test_unit_mixed(self): # mixed integers/datetimes expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, unit='ns', errors='coerce') tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, unit='ns', errors='raise') expected = DatetimeIndex(['NaT', 'NaT', '2013-01-01']) arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce') + result = 
pd.to_datetime(arr, unit='ns', errors='coerce') tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, unit='ns', errors='raise') def test_dataframe(self): @@ -1488,35 +1518,6 @@ def test_normalize_date(): assert (result == datetime(2012, 9, 7)) -@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns']) -def units(request): - return request.param - - -@pytest.fixture -def epoch_1960(): - # for origin as 1960-01-01 - return Timestamp('1960-01-01') - - -@pytest.fixture -def units_from_epochs(): - return list(range(5)) - - -@pytest.fixture(params=[epoch_1960(), - epoch_1960().to_pydatetime(), - epoch_1960().to_datetime64(), - str(epoch_1960())]) -def epochs(request): - return request.param - - -@pytest.fixture -def julian_dates(): - return pd.date_range('2014-1-1', periods=10).to_julian_date().values - - class TestOrigin(object): def test_to_basic(self, julian_dates): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 0900d21b250ed..195519b0e66fa 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -98,7 +98,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): assert m is not None, "incompatible typestr -> {0}".format(typestr) tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" - values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) + values = DatetimeIndex(np.arange(N) * 1e9, unit='ns', tz=tz) elif typestr in ('timedelta', 'td', 'm8[ns]'): values = (mat * 1).astype('m8[ns]') elif typestr in ('category', ): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index e447a74b2b462..3fb04d94d94ec 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -349,15 +349,16 @@ def test_make_field_float(self): def 
test_make_field_datetime(self): data = [1., 2., 3.] - kinds = [pd.Series(pd.to_datetime(data), name='values'), - pd.to_datetime(data)] + kinds = [pd.Series(pd.to_datetime(data, unit='ns'), name='values'), + pd.to_datetime(data, unit='ns')] for kind in kinds: result = make_field(kind) expected = {"name": "values", "type": 'datetime'} assert result == expected - kinds = [pd.Series(pd.to_datetime(data, utc=True), name='values'), - pd.to_datetime(data, utc=True)] + kinds = [pd.Series(pd.to_datetime(data, unit='ns', utc=True), + name='values'), + pd.to_datetime(data, unit='ns', utc=True)] for kind in kinds: result = make_field(kind) expected = {"name": "values", "type": 'datetime', "tz": "UTC"} diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 671d4248818e4..6a937eac3dba6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -181,7 +181,7 @@ def _check_orient(df, orient, dtype=None, numpy=False, if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex( - unser.index.values.astype('i8') * 1e6) + unser.index.values.astype('i8') * 1e6, unit='ns') if orient == "records": # index is not captured in this orientation tm.assert_almost_equal(df.values, unser.values, @@ -832,7 +832,7 @@ def test_timedelta(self): result = pd.read_json(frame.to_json(date_unit='ns')) result['a'] = pd.to_timedelta(result.a, unit='ns') - result['c'] = pd.to_datetime(result.c) + result['c'] = pd.to_datetime(result.c, unit='ns') assert_frame_equal(frame, result) def test_mixed_timedelta_datetime(self): diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index d42e37048d87f..43473108de28d 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -1730,7 +1730,7 @@ def test_nanosecond_resample_error(self): start = 1443707890427 exp_start = 1443707890400 indx = pd.date_range( - start=pd.to_datetime(start), + start=pd.to_datetime(start, 
unit='ns'), periods=10, freq='100n' ) @@ -1739,7 +1739,7 @@ def test_nanosecond_resample_error(self): result = r.agg('mean') exp_indx = pd.date_range( - start=pd.to_datetime(exp_start), + start=pd.to_datetime(exp_start, unit='ns'), periods=10, freq='100n' )