From 73d58a9eb4dcd1b8e98d04aa7bb49bb5dfbda4b2 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Jun 2013 23:36:16 -0400 Subject: [PATCH 1/3] ENH: Add unit keyword to Timestamp and to_datetime to enable passing of integers or floats that are in an epoch unit of s, ms, us, ns (e.g. unix timestamps or epoch s, with fracional seconds allowed) (GH 3540) --- RELEASE.rst | 4 ++ pandas/src/inference.pyx | 4 +- pandas/src/offsets.pyx | 2 +- pandas/tseries/tests/test_timeseries.py | 43 +++++++++++++++ pandas/tseries/tools.py | 6 ++- pandas/tslib.pxd | 2 +- pandas/tslib.pyx | 70 ++++++++++++++++++------- 7 files changed, 105 insertions(+), 26 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 161047c478d88..0d94337ffea78 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -82,6 +82,9 @@ pandas 0.11.1 - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) - DatetimeIndexes no longer try to convert mixed-integer indexes during join operations (GH3877_) + - Add ``unit`` keyword to ``Timestamp`` and ``to_datetime`` to enable passing of + integers or floats that are in an epoch unit of ``s, ms, us, ns`` + (e.g. unix timestamps or epoch ``s``, with fracional seconds allowed) (GH3540_) **API Changes** @@ -264,6 +267,7 @@ pandas 0.11.1 .. _GH3499: https://github.com/pydata/pandas/issues/3499 .. _GH3495: https://github.com/pydata/pandas/issues/3495 .. _GH3492: https://github.com/pydata/pandas/issues/3492 +.. _GH3540: https://github.com/pydata/pandas/issues/3540 .. _GH3552: https://github.com/pydata/pandas/issues/3552 .. _GH3562: https://github.com/pydata/pandas/issues/3562 .. _GH3586: https://github.com/pydata/pandas/issues/3586 diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5343819b9fbfe..270fb01a42033 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -471,7 +471,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_float = 1 elif util.is_datetime64_object(val): if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None).value + idatetimes[i] = convert_to_tsobject(val, None, None).value seen_datetime = 1 else: seen_object = 1 @@ -493,7 +493,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif PyDateTime_Check(val) or util.is_datetime64_object(val): if convert_datetime: seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None).value + idatetimes[i] = convert_to_tsobject(val, None, None).value else: seen_object = 1 break diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx index 5868ca5210e33..1823edeb0a4d9 100644 --- a/pandas/src/offsets.pyx +++ b/pandas/src/offsets.pyx @@ -76,7 +76,7 @@ cdef class _Offset: cpdef anchor(self, object start=None): if start is not None: self.start = start - self.ts = convert_to_tsobject(self.start) + self.ts = convert_to_tsobject(self.start, None, None) self._setup() cdef _setup(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f5415a195db77..6efddb281d894 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -593,6 +593,14 @@ def test_frame_add_datetime64_col_other_units(self): self.assert_((tmp['dates'].values == ex_vals).all()) + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([ epoch + t for t in range(20) ]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in 
range(20) ]) + assert_series_equal(result,expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') @@ -2691,6 +2699,41 @@ def test_basics_nanos(self): self.assert_(stamp.microsecond == 0) self.assert_(stamp.nanosecond == 500) + def test_unit(self): + def check(val,unit=None,s=1,us=0): + stamp = Timestamp(val, unit=unit) + self.assert_(stamp.year == 2000) + self.assert_(stamp.month == 1) + self.assert_(stamp.day == 1) + self.assert_(stamp.hour == 1) + self.assert_(stamp.minute == 1) + self.assert_(stamp.second == s) + self.assert_(stamp.microsecond == us) + self.assert_(stamp.nanosecond == 0) + + val = Timestamp('20000101 01:01:01').value + + check(val) + check(val/1000L,unit='us') + check(val/1000000L,unit='ms') + check(val/1000000000L,unit='s') + + # get chopped + check((val+500000)/1000000000L,unit='s') + check((val+500000000)/1000000000L,unit='s') + check((val+500000)/1000000L,unit='ms') + + # ok + check((val+500000)/1000L,unit='us',us=500) + check((val+500000000)/1000000L,unit='ms',us=500000) + + # floats + check(val/1000.0 + 5,unit='us',us=5) + check(val/1000.0 + 5000,unit='us',us=5000) + check(val/1000000.0 + 0.5,unit='ms',us=500) + check(val/1000000.0 + 0.005,unit='ms',us=5) + check(val/1000000000.0 + 0.5,unit='s',us=500000) + def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = 1337299200000000000L diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 62ee19da6b845..46bcee6f907cf 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -50,7 +50,7 @@ def _maybe_get_tz(tz): def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False): + format=None, coerce=False, unit=None): """ Convert argument to datetime @@ -69,6 +69,8 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, format : string, default None strftime to parse time, eg "%d/%m/%Y" coerce : force errors to NaT (False by default) + unit : unit of the arg (s,ms,us,ns) denote the unit in epoch + (e.g. a unix timestamp) Returns ------- @@ -86,7 +88,7 @@ def _convert_f(arg): else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, - coerce=coerce) + coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd index 3e7a6ef615e00..a70f9883c5bb1 100644 --- a/pandas/tslib.pxd +++ b/pandas/tslib.pxd @@ -1,3 +1,3 @@ from numpy cimport ndarray, int64_t -cdef convert_to_tsobject(object, object) +cdef convert_to_tsobject(object, object, object) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index abec45b52a363..94279e61e440e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -131,21 +131,17 @@ class Timestamp(_Timestamp): note: by definition there cannot be any tz info on the ordinal itself """ return cls(datetime.fromordinal(ordinal),offset=offset,tz=tz) - def __new__(cls, object ts_input, object offset=None, tz=None): + def __new__(cls, object ts_input, object offset=None, tz=None, unit=None): cdef _TSObject ts cdef _Timestamp ts_base - if PyFloat_Check(ts_input): - # to do, do we want to support this, ie with fractional seconds? 
- raise TypeError("Cannot convert a float to datetime") - if util.is_string_object(ts_input): try: ts_input = parse_date(ts_input) except Exception: pass - ts = convert_to_tsobject(ts_input, tz) + ts = convert_to_tsobject(ts_input, tz, unit) if ts.value == NPY_NAT: return NaT @@ -311,7 +307,7 @@ class Timestamp(_Timestamp): if self.nanosecond != 0 and warn: print 'Warning: discarding nonzero nanoseconds' - ts = convert_to_tsobject(self, self.tzinfo) + ts = convert_to_tsobject(self, self.tzinfo, None) return datetime(ts.dts.year, ts.dts.month, ts.dts.day, ts.dts.hour, ts.dts.min, ts.dts.sec, @@ -530,7 +526,7 @@ cdef class _Timestamp(datetime): cdef: pandas_datetimestruct dts _TSObject ts - ts = convert_to_tsobject(self, self.tzinfo) + ts = convert_to_tsobject(self, self.tzinfo, None) dts = ts.dts return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, @@ -623,12 +619,13 @@ cpdef _get_utcoffset(tzinfo, obj): return tzinfo.utcoffset(obj) # helper to extract datetime and int64 from several different possibilities -cdef convert_to_tsobject(object ts, object tz): +cdef convert_to_tsobject(object ts, object tz, object unit): """ Extract datetime and int64 from any of: - - np.int64 + - np.int64 (with unit providing a possible modifier) - np.datetime64 - - python int or long object + - a float (with unit providing a possible modifier) + - python int or long object (with unit providing a possible modifier) - iso8601 string object - python datetime object - another timestamp object @@ -647,6 +644,11 @@ cdef convert_to_tsobject(object ts, object tz): obj.value = _get_datetime64_nanos(ts) pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): + ts = ts * cast_from_unit(unit,None) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + elif util.is_float_object(ts): + ts = cast_from_unit(unit,ts) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): @@ -699,7 +701,7 @@ cdef convert_to_tsobject(object ts, object tz): elif PyDate_Check(ts): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) - return convert_to_tsobject(ts, tz) + return convert_to_tsobject(ts, tz, None) else: raise ValueError("Could not construct Timestamp from argument %s" % type(ts)) @@ -804,7 +806,7 @@ def datetime_to_datetime64(ndarray[object] values): else: inferred_tz = _get_zone(val.tzinfo) - _ts = convert_to_tsobject(val, None) + _ts = convert_to_tsobject(val, None, None) iresult[i] = _ts.value _check_dts_bounds(iresult[i], &_ts.dts) else: @@ -819,7 +821,7 @@ def datetime_to_datetime64(ndarray[object] values): def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, - format=None, utc=None, coerce=False): + format=None, utc=None, coerce=False, unit=None): cdef: Py_ssize_t i, n = len(values) object val @@ -828,6 +830,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, pandas_datetimestruct dts bint utc_convert = bool(utc) _TSObject _ts + int64_t m = cast_from_unit(unit,None) from dateutil.parser import parse @@ -841,7 +844,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, elif PyDateTime_Check(val): if val.tzinfo is not None: if utc_convert: - _ts = convert_to_tsobject(val, None) + _ts = convert_to_tsobject(val, None, unit) iresult[i] = _ts.value _check_dts_bounds(iresult[i], &_ts.dts) else: @@ -861,7 +864,9 @@ def array_to_datetime(ndarray[object] values, raise_=False, 
dayfirst=False, # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: - iresult[i] = val + iresult[i] = val*m + elif util.is_float_object(val) and not coerce: + iresult[i] = cast_from_unit(unit,val) else: try: if len(val) == 0: @@ -1246,6 +1251,31 @@ cdef inline _get_datetime64_nanos(object val): else: return ival +cdef inline int64_t cast_from_unit(object unit, object ts): + """ return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p """ + p = 0 + if unit == 's': + m = 1000000000L + p = 6 + elif unit == 'ms': + m = 1000000L + p = 3 + elif unit == 'us': + m = 1000L + p = 0 + else: + m = 1L + + # just give me the unit back + if ts is None: + return m + + # cast the unit, multiply base/frace separately + # to avoid precision issues from float -> int + base = ts + frac = ts-base + return (base*m) + (round(frac,p)*m) def cast_to_nanoseconds(ndarray arr): cdef: @@ -1286,7 +1316,7 @@ def pydt_to_i8(object pydt): cdef: _TSObject ts - ts = convert_to_tsobject(pydt, None) + ts = convert_to_tsobject(pydt, None, None) return ts.value @@ -1784,7 +1814,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - ts = convert_to_tsobject(dtindex[i], None) + ts = convert_to_tsobject(dtindex[i], None, None) out[i] = ts_dayofweek(ts) return out @@ -1793,7 +1823,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): if dtindex[i] == NPY_NAT: out[i] = -1; continue pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None) + ts = convert_to_tsobject(dtindex[i], None, None) isleap = is_leapyear(dts.year) isleap_prev = is_leapyear(dts.year - 1) mo_off = _month_offset[isleap, dts.month - 1] @@ -1831,7 +1861,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): cdef inline int m8_weekday(int64_t val): - ts = convert_to_tsobject(val, None) + ts = convert_to_tsobject(val, None, None) return ts_dayofweek(ts) cdef int64_t DAY_NS = 86400000000000LL From 7e4ccbe9a40ff88d38dffc2d77c17f680267e8d9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Jun 2013 17:41:17 -0400 Subject: [PATCH 2/3] TST: disallow slicing a timeseries with floats TST: manage truediv in py3 for unit comparisons --- pandas/tseries/index.py | 3 +++ pandas/tseries/tests/test_timeseries.py | 15 +++++++++++---- pandas/tseries/tools.py | 4 ++-- pandas/tslib.pyx | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 51e657d1723b2..1cb986ee6cd7c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1204,6 +1204,9 @@ def slice_indexer(self, start=None, end=None, step=None): if isinstance(start, time) or isinstance(end, time): raise KeyError('Cannot mix time and non-time slice keys') + if isinstance(start, float) or isinstance(end, float): + raise TypeError('Cannot index datetime64 with float keys') + return Index.slice_indexer(self, start, end, step) def slice_locs(self, start=None, end=None): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 6efddb281d894..88dee987f4ba2 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2718,10 +2718,17 @@ def check(val,unit=None,s=1,us=0): check(val/1000000L,unit='ms') check(val/1000000000L,unit='s') - # get chopped - check((val+500000)/1000000000L,unit='s') - 
check((val+500000000)/1000000000L,unit='s') - check((val+500000)/1000000L,unit='ms') + # using truediv, so these are like floats + if py3compat.PY3: + check((val+500000)/1000000000L,unit='s',us=500) + check((val+500000000)/1000000000L,unit='s',us=500000) + check((val+500000)/1000000L,unit='ms',us=500) + + # get chopped in py2 + else: + check((val+500000)/1000000000L,unit='s') + check((val+500000000)/1000000000L,unit='s') + check((val+500000)/1000000L,unit='ms') # ok check((val+500000)/1000L,unit='us',us=500) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 46bcee6f907cf..90bc0beb8eb84 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -50,7 +50,7 @@ def _maybe_get_tz(tz): def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False, unit=None): + format=None, coerce=False, unit='ns'): """ Convert argument to datetime @@ -70,7 +70,7 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, strftime to parse time, eg "%d/%m/%Y" coerce : force errors to NaT (False by default) unit : unit of the arg (s,ms,us,ns) denote the unit in epoch - (e.g. a unix timestamp) + (e.g. a unix timestamp), which is an integer/float number Returns ------- diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 94279e61e440e..c2a3f429e60f7 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1254,7 +1254,6 @@ cdef inline _get_datetime64_nanos(object val): cdef inline int64_t cast_from_unit(object unit, object ts): """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ - p = 0 if unit == 's': m = 1000000000L p = 6 @@ -1266,6 +1265,7 @@ cdef inline int64_t cast_from_unit(object unit, object ts): p = 0 else: m = 1L + p = 0 # just give me the unit back if ts is None: From fbcd5abcadb37c612804aefc5255e2e99b009444 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 09:04:55 -0400 Subject: [PATCH 3/3] BUG: make sure that nan/none like values to Timestamp are returned as NaT --- pandas/tseries/tests/test_timeseries.py | 35 ++++++++++++++++++++++++- pandas/tslib.pyx | 32 +++++++++++++++------- 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 88dee987f4ba2..ac02dee335afc 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -38,6 +38,7 @@ import pandas.util.py3compat as py3compat from pandas.core.datetools import BDay import pandas.core.common as com +from pandas import concat from numpy.testing.decorators import slow @@ -171,7 +172,6 @@ def test_indexing_over_size_cutoff(self): def test_indexing_unordered(self): # GH 2437 - from pandas import concat rng = date_range(start='2011-01-01', end='2011-01-15') ts = Series(randn(len(rng)), index=rng) ts2 = concat([ts[0:4],ts[-4:],ts[4:-4]]) @@ -601,6 +601,26 @@ def test_to_datetime_unit(self): expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) assert_series_equal(result,expected) + s = Series([ epoch + t for t in range(20) ]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) + assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + 
assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + + s = concat([Series([ epoch + t for t in range(20) ]).astype(float),Series([np.nan])],ignore_index=True) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') @@ -2741,6 +2761,19 @@ def check(val,unit=None,s=1,us=0): check(val/1000000.0 + 0.005,unit='ms',us=5) check(val/1000000000.0 + 0.5,unit='s',us=500000) + # nan + result = Timestamp(np.nan) + self.assert_(result is NaT) + + result = Timestamp(None) + self.assert_(result is NaT) + + result = Timestamp(iNaT) + self.assert_(result is NaT) + + result = Timestamp(NaT) + self.assert_(result is NaT) + def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = 1337299200000000000L diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c2a3f429e60f7..ec11de7392680 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -640,17 +640,25 @@ cdef convert_to_tsobject(object ts, object tz, object unit): obj = _TSObject() - if is_datetime64_object(ts): + if ts is None or ts is NaT: + obj.value = NPY_NAT + elif is_datetime64_object(ts): obj.value = _get_datetime64_nanos(ts) pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): - ts = ts * cast_from_unit(unit,None) - obj.value = ts - pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + if ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = ts * cast_from_unit(unit,None) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_float_object(ts): - ts = cast_from_unit(unit,ts) - obj.value = ts - pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + if ts != ts or ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = cast_from_unit(unit,ts) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): if ts in _nat_strings: obj.value = NPY_NAT @@ -864,9 +872,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: - iresult[i] = val*m + if val == iNaT: + iresult[i] = iNaT + else: + iresult[i] = val*m elif util.is_float_object(val) and not coerce: - iresult[i] = cast_from_unit(unit,val) + if val != val or val == iNaT: + iresult[i] = iNaT + else: + iresult[i] = cast_from_unit(unit,val) else: try: if len(val) == 0:
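
For readers following along, here is a minimal usage sketch of the ``unit`` keyword this series adds. It assumes a pandas build with these patches applied; the epoch value and the expected timestamp come straight from the new test_to_datetime_unit test, and the print calls are only for illustration.

    from pandas import Series, Timestamp, to_datetime

    # integer epoch seconds -> datetime64[ns], as exercised in test_to_datetime_unit
    epoch = 1370745748
    s = Series([epoch + t for t in range(20)])
    result = to_datetime(s, unit='s')
    print(result[0])                            # 2013-06-09 02:42:28

    # floats keep their fractional part (rounded to what the unit can carry)
    print(Timestamp(epoch + 0.5, unit='s'))     # 2013-06-09 02:42:28.500000

    # the same instant expressed in other epoch units
    print(Timestamp(epoch * 1000, unit='ms'))
    print(Timestamp(epoch * 1000000, unit='us'))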
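
All of the unit handling funnels through the new cast_from_unit helper in tslib.pyx. Below is a pure-Python mirror of its logic, for illustration only: the function name, the int() coercion at the end, and the example values are this sketch's, not the Cython code's.

    def cast_from_unit_py(unit, ts=None):
        """Scale a value given in an epoch unit to nanoseconds, rounding any
        fractional part to the precision that unit can actually represent."""
        if unit == 's':
            m, p = 1000000000, 6
        elif unit == 'ms':
            m, p = 1000000, 3
        elif unit == 'us':
            m, p = 1000, 0
        else:                    # None or 'ns': value is already in nanoseconds
            m, p = 1, 0

        # with no value, just report the multiplier (the integer fast path in
        # array_to_datetime multiplies by this directly)
        if ts is None:
            return m

        # split integer and fractional parts so the (possibly huge) base is
        # scaled exactly and only the fraction goes through float rounding
        base = int(ts)
        frac = ts - base
        return base * m + int(round(frac, p) * m)

    print(cast_from_unit_py('s'))                 # 1000000000
    print(cast_from_unit_py('s', 1370745748.5))   # 1370745748500000000
    print(cast_from_unit_py('ms', 1.5))           # 1500000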
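
The second patch also tightens DatetimeIndex.slice_indexer so that float slice keys raise instead of silently falling through, and reworks the unit tests because ``/`` is true division on Python 3: the quotients become floats and keep their fractional microseconds rather than being chopped. A quick illustration of the new guard, using a throwaway index of this sketch's choosing:

    from pandas import date_range

    rng = date_range('2013-01-01', periods=10)
    try:
        rng.slice_indexer(1.5, 3.5)
    except TypeError as e:
        print(e)        # Cannot index datetime64 with float keys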
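
Finally, the third patch makes missing inputs come back as NaT rather than raising, both in the Timestamp constructor and in the new float path of array_to_datetime. A sketch of the behaviour the added tests pin down; NaT is imported from pandas.tslib here, matching where this series defines it.

    import numpy as np
    from pandas import Series, Timestamp, to_datetime
    from pandas.tslib import NaT

    # scalar missing values construct NaT
    print(Timestamp(np.nan) is NaT)    # True
    print(Timestamp(None) is NaT)      # True
    print(Timestamp(NaT) is NaT)       # True

    # and a NaN in a float epoch column converts to NaT, not garbage
    s = Series([1370745748.0, 1370745749.0, np.nan])
    result = to_datetime(s, unit='s')
    print(result[2])                   # NaT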