diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 7136b15a7633a..44c200e13b877 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -252,7 +252,8 @@ Epoch Timestamps
 
 It's also possible to convert integer or float epoch times. The default unit
 for these is nanoseconds (since these are how ``Timestamp`` s are stored). However,
-often epochs are stored in another ``unit`` which can be specified:
+often epochs are stored in another ``unit`` which can be specified. These are computed
+from the starting point specified by the :ref:`origin parameter <timeseries.origin>`.
 
 Typical epoch stored units
 
@@ -276,6 +277,29 @@ These *work*, but the results may be unexpected.
 
    Epoch times will be rounded to the nearest nanosecond.
 
+.. _timeseries.origin:
+
+Using the Origin Parameter
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.0
+
+Using the ``origin`` parameter, one can specify an alternative starting point for the
+creation of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date:
+
+.. ipython:: python
+
+   pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
+
+The default is ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``,
+commonly called the 'unix epoch' or POSIX time.
+
+.. ipython:: python
+
+   pd.to_datetime([1, 2, 3], unit='D')
+
 .. _timeseries.daterange:
 
 Generating Ranges of Timestamps
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 781a912555e14..84e6bd99e5ebd 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -27,7 +27,6 @@ Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~
 
-
 .. _whatsnew_0200.enhancements.dataio_dtype:
 
 ``dtype`` keyword for data IO
@@ -55,6 +54,27 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
    pd.read_fwf(StringIO(data)).dtypes
    pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes
 
+.. _whatsnew_0200.enhancements.datetime_origin:
+
+to_datetime has gained an origin parameter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``pd.to_datetime`` has gained a new parameter, ``origin``, to define a reference date
+from which to compute the resulting ``DatetimeIndex``. (:issue:`11276`, :issue:`11745`)
+
+For example, to use 1960-01-01 as the starting date:
+
+.. ipython:: python
+
+   pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
+
+The default is ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``,
+commonly called the 'unix epoch' or POSIX time.
+
+.. ipython:: python
+
+   pd.to_datetime([1, 2, 3], unit='D')
+
 .. _whatsnew_0200.enhancements.groupby_access:
 
 Groupby Enhancements
@@ -317,7 +337,7 @@ Other Enhancements
 - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`)
 - Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`)
 - Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`)
-
+- Enabled floor division for ``Timedelta`` and ``TimedeltaIndex`` (:issue:`15828`)
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
 - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
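To make the documented semantics concrete: each numeric value is interpreted as a count of ``unit`` steps offset from ``origin``. A minimal sketch of that equivalence, assuming a pandas build that includes this patch (this snippet is illustration, not part of the patch):

    import pandas as pd

    # origin + value * unit, spelled out with Timedelta arithmetic
    origin = pd.Timestamp('1960-01-01')
    manual = [origin + pd.Timedelta(v, unit='D') for v in [1, 2, 3]]

    # the same conversion through the new keyword
    result = pd.to_datetime([1, 2, 3], unit='D', origin=origin)
    assert list(result) == manual  # 1960-01-02, 1960-01-03, 1960-01-04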
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 5aa8e15d0d087..cc1439711c1d4 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -3073,6 +3073,7 @@ class Timedelta(_Timedelta):
         return np.timedelta64(self.value, 'ns')
 
     def _validate_ops_compat(self, other):
+        # return True if we are compatible with the operation
         if _checknull_with_nat(other):
             return True
 
@@ -3179,11 +3180,41 @@ class Timedelta(_Timedelta):
     __div__ = __truediv__
     __rdiv__ = __rtruediv__
 
-    def _not_implemented(self, *args, **kwargs):
-        return NotImplemented
+    def __floordiv__(self, other):
+
+        if hasattr(other, 'dtype'):
+
+            # work with i8
+            other = other.astype('m8[ns]').astype('i8')
+
+            return self.value // other
 
-    __floordiv__ = _not_implemented
-    __rfloordiv__ = _not_implemented
+        # integers only
+        if is_integer_object(other):
+            return Timedelta(self.value // other, unit='ns')
+
+        if not self._validate_ops_compat(other):
+            return NotImplemented
+
+        other = Timedelta(other)
+        if other is NaT:
+            return np.nan
+        return self.value // other.value
+
+    def __rfloordiv__(self, other):
+        if hasattr(other, 'dtype'):
+
+            # work with i8
+            other = other.astype('m8[ns]').astype('i8')
+            return other // self.value
+
+        if not self._validate_ops_compat(other):
+            return NotImplemented
+
+        other = Timedelta(other)
+        if other is NaT:
+            return NaT
+        return other.value // self.value
 
 
 def _op_unary_method(func, name):
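The two methods above give ``Timedelta`` floor-division semantics mirroring ``__truediv__``: an integer divisor scales the ``Timedelta``, a timedelta-like divisor yields an integer count, and array operands are handled element-wise on the i8 (nanosecond) values. A quick sketch, assuming a build with this patch applied:

    import numpy as np
    import pandas as pd

    td = pd.Timedelta(hours=37)

    # integer divisor -> scaled Timedelta
    td // 2                          # Timedelta('0 days 18:30:00')

    # timedelta-like divisor -> integer count
    td // pd.Timedelta(hours=1)      # 37
    td // np.timedelta64(1, 'D')     # 1

    # ndarray divisor -> element-wise division on the i8 values
    td // np.array([1, 2], dtype='m8[h]')  # array([37, 18])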
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index 512a3e1c38629..02630c76abb93 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -1515,3 +1515,120 @@ def test_normalize_date():
     result = normalize_date(value)
 
     assert (result == datetime(2012, 9, 7))
+
+
+@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns'])
+def units(request):
+    return request.param
+
+
+@pytest.fixture
+def epoch_1960():
+    # for origin as 1960-01-01
+    return Timestamp('1960-01-01')
+
+
+@pytest.fixture
+def units_from_epochs():
+    return list(range(5))
+
+
+@pytest.fixture(params=[epoch_1960(), epoch_1960().to_datetime(),
+                        epoch_1960().to_datetime64(),
+                        str(epoch_1960())])
+def epochs(request):
+    return request.param
+
+
+@pytest.fixture
+def julian_dates():
+    return pd.date_range('2014-1-1', periods=10).to_julian_date().values
+
+
+class TestOrigin(object):
+
+    def test_to_basic(self, julian_dates):
+        # gh-11276, gh-11745
+        # for origin as julian
+
+        result = Series(pd.to_datetime(
+            julian_dates, unit='D', origin='julian'))
+        expected = Series(pd.to_datetime(
+            julian_dates - pd.Timestamp(0).to_julian_date(), unit='D'))
+        assert_series_equal(result, expected)
+
+        result = Series(pd.to_datetime(
+            [0, 1, 2], unit='D', origin='unix'))
+        expected = Series([Timestamp('1970-01-01'),
+                           Timestamp('1970-01-02'),
+                           Timestamp('1970-01-03')])
+        assert_series_equal(result, expected)
+
+        # default
+        result = Series(pd.to_datetime(
+            [0, 1, 2], unit='D'))
+        expected = Series([Timestamp('1970-01-01'),
+                           Timestamp('1970-01-02'),
+                           Timestamp('1970-01-03')])
+        assert_series_equal(result, expected)
+
+    def test_julian_round_trip(self):
+        result = pd.to_datetime(2456658, origin='julian', unit='D')
+        assert result.to_julian_date() == 2456658
+
+        # out-of-bounds
+        with pytest.raises(ValueError):
+            pd.to_datetime(1, origin="julian", unit='D')
+
+    def test_invalid_unit(self, units, julian_dates):
+
+        # checking for invalid combination of origin='julian' and unit != D
+        if units != 'D':
+            with pytest.raises(ValueError):
+                pd.to_datetime(julian_dates, unit=units, origin='julian')
+
+    def test_invalid_origin(self):
+
+        # need to have a numeric specified
+        with pytest.raises(ValueError):
+            pd.to_datetime("2005-01-01", origin="1960-01-01")
+
+        with pytest.raises(ValueError):
+            pd.to_datetime("2005-01-01", origin="1960-01-01", unit='D')
+
+    def test_epoch(self, units, epochs, epoch_1960, units_from_epochs):
+
+        expected = Series(
+            [pd.Timedelta(x, unit=units) +
+             epoch_1960 for x in units_from_epochs])
+
+        result = Series(pd.to_datetime(
+            units_from_epochs, unit=units, origin=epochs))
+        assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("origin, exc",
+                             [('random_string', ValueError),
+                              ('epoch', ValueError),
+                              ('13-24-1990', ValueError),
+                              (datetime(1, 1, 1), tslib.OutOfBoundsDatetime)])
+    def test_invalid_origins(self, origin, exc, units, units_from_epochs):
+
+        with pytest.raises(exc):
+            pd.to_datetime(units_from_epochs, unit=units,
+                           origin=origin)
+
+    def test_processing_order(self):
+        # make sure we handle out-of-bounds *before*
+        # constructing the dates
+
+        result = pd.to_datetime(200 * 365, unit='D')
+        expected = Timestamp('2169-11-13 00:00:00')
+        assert result == expected
+
+        result = pd.to_datetime(200 * 365, unit='D', origin='1870-01-01')
+        expected = Timestamp('2069-11-13 00:00:00')
+        assert result == expected
+
+        result = pd.to_datetime(300 * 365, unit='D', origin='1870-01-01')
+        expected = Timestamp('2169-10-20 00:00:00')
+        assert result == expected
diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py
index 2e9f11297dc83..36aac8cafecc1 100644
--- a/pandas/tests/indexes/timedeltas/test_ops.py
+++ b/pandas/tests/indexes/timedeltas/test_ops.py
@@ -284,6 +284,12 @@ def test_ops_compat(self):
             result = rng / offset
             tm.assert_index_equal(result, expected, exact=False)
 
+        # floor divide
+        expected = Int64Index((np.arange(10) + 1) * 12, name='foo')
+        for offset in offsets:
+            result = rng // offset
+            tm.assert_index_equal(result, expected, exact=False)
+
         # divide with nats
         rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo')
         expected = Float64Index([12, np.nan, 24], name='foo')
@@ -867,10 +873,12 @@ def test_ops(self):
         self.assertEqual(td * 2, Timedelta(20, unit='d'))
         self.assertTrue((td * pd.NaT) is pd.NaT)
         self.assertEqual(td / 2, Timedelta(5, unit='d'))
+        self.assertEqual(td // 2, Timedelta(5, unit='d'))
         self.assertEqual(abs(td), td)
         self.assertEqual(abs(-td), td)
         self.assertEqual(td / td, 1)
         self.assertTrue((td / pd.NaT) is np.nan)
+        self.assertTrue((td // pd.NaT) is np.nan)
 
         # invert
         self.assertEqual(-td, Timedelta('-10d'))
@@ -878,9 +886,6 @@ def test_ops(self):
         self.assertEqual(-1 * td, Timedelta('-10d'))
         self.assertEqual(abs(-td), Timedelta('10d'))
 
-        # invalid
-        self.assertRaises(TypeError, lambda: Timedelta(11, unit='d') // 2)
-
         # invalid multiply with another timedelta
         self.assertRaises(TypeError, lambda: td * td)
 
@@ -991,7 +996,7 @@ class Other:
         self.assertTrue(td.__sub__(other) is NotImplemented)
         self.assertTrue(td.__truediv__(other) is NotImplemented)
         self.assertTrue(td.__mul__(other) is NotImplemented)
-        self.assertTrue(td.__floordiv__(td) is NotImplemented)
+        self.assertTrue(td.__floordiv__(other) is NotImplemented)
 
     def test_ops_error_str(self):
         # GH 13624
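The julian tests pivot on ``Timestamp(0).to_julian_date()`` (2440587.5, the unix epoch). Julian day numbers are noon-based, which is why the round trip below lands at midday rather than midnight; a hand check, assuming the patch is applied:

    import pandas as pd

    ts = pd.to_datetime(2456658, origin='julian', unit='D')
    ts                    # Timestamp('2013-12-31 12:00:00')
    ts.to_julian_date()   # 2456658.0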
diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py
index c2b895925b685..c22d1d2329fba 100644
--- a/pandas/tests/scalar/test_timedelta.py
+++ b/pandas/tests/scalar/test_timedelta.py
@@ -216,6 +216,7 @@ def test_conversion(self):
 
     def test_freq_conversion(self):
 
+        # truediv
         td = Timedelta('1 days 2 hours 3 ns')
         result = td / np.timedelta64(1, 'D')
         self.assertEqual(result, td.value / float(86400 * 1e9))
@@ -224,6 +225,15 @@ def test_freq_conversion(self):
         result = td / np.timedelta64(1, 'ns')
         self.assertEqual(result, td.value)
 
+        # floordiv
+        td = Timedelta('1 days 2 hours 3 ns')
+        result = td // np.timedelta64(1, 'D')
+        self.assertEqual(result, 1)
+        result = td // np.timedelta64(1, 's')
+        self.assertEqual(result, 93600)
+        result = td // np.timedelta64(1, 'ns')
+        self.assertEqual(result, td.value)
+
     def test_fields(self):
         def check(value):
             # that we are int/long like
diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py
index 5d062dd38f9fc..d0f373fcc5a45 100644
--- a/pandas/tseries/tdi.py
+++ b/pandas/tseries/tdi.py
@@ -326,7 +326,7 @@ def _add_delta(self, delta):
     def _evaluate_with_timedelta_like(self, other, op, opstr):
 
         # allow division by a timedelta
-        if opstr in ['__div__', '__truediv__']:
+        if opstr in ['__div__', '__truediv__', '__floordiv__']:
             if _is_convertible_to_td(other):
                 other = Timedelta(other)
                 if isnull(other):
@@ -334,7 +334,10 @@ def _evaluate_with_timedelta_like(self, other, op, opstr):
                         "division by pd.NaT not implemented")
 
                 i8 = self.asi8
-                result = i8 / float(other.value)
+                if opstr in ['__floordiv__']:
+                    result = i8 // other.value
+                else:
+                    result = op(i8, float(other.value))
                 result = self._maybe_mask_results(result, convert='float64')
                 return Index(result, name=self.name, copy=False)
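The new branch in ``_evaluate_with_timedelta_like`` keeps floor division on the integer i8 values instead of routing through floats; ``_maybe_mask_results`` still promotes to float64 only when a ``NaT`` has to become ``NaN``. Roughly, with this patch applied:

    import pandas as pd

    tdi = pd.TimedeltaIndex(['1 days', '2 days', '3 days'])

    # stays integral: i8 // i8, no float round-trip
    tdi // pd.Timedelta(hours=12)   # Int64Index([2, 4, 6])

    # a NaT forces float64 so the mask can hold NaN
    tdi = pd.TimedeltaIndex(['1 days', pd.NaT, '2 days'])
    tdi // pd.Timedelta(hours=2)    # Float64Index([12.0, nan, 24.0])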
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index 5dc9746c6d6f9..d0f1671f9e309 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -9,7 +9,11 @@
     is_datetime64_dtype,
     is_datetime64tz_dtype,
     is_integer_dtype,
-    is_list_like)
+    is_integer,
+    is_float,
+    is_list_like,
+    is_scalar,
+    is_numeric_dtype)
 from pandas.types.generic import (ABCIndexClass, ABCSeries,
                                   ABCDataFrame)
 from pandas.types.missing import notnull
@@ -177,7 +181,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
 
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True,
-                unit=None, infer_datetime_format=False):
+                unit=None, infer_datetime_format=False, origin='unix'):
     """
     Convert argument to datetime.
 
@@ -229,13 +233,27 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 
         - If False, allow the format to match anywhere in the target string.
 
     unit : string, default 'ns'
-        unit of the arg (D,s,ms,us,ns) denote the unit in epoch
-        (e.g. a unix timestamp), which is an integer/float number.
+        unit of the arg (D,s,ms,us,ns) denotes the unit, which is an
+        integer or float number. This is computed relative to the origin.
+        For example, with unit='ms' and origin='unix' (the default), this
+        would calculate the number of milliseconds from the unix epoch start.
     infer_datetime_format : boolean, default False
         If True and no `format` is given, attempt to infer the format of the
         datetime strings, and if it can be inferred, switch to a faster
         method of parsing them. In some cases this can increase the parsing
         speed by ~5-10x.
+    origin : scalar, default is 'unix'
+        Define the reference date. Numeric values are parsed as the number
+        of units (defined by `unit`) since this reference date.
+
+        - If 'unix' (or POSIX time), origin is set to 1970-01-01.
+        - If 'julian', unit must be 'D', and origin is set to the beginning of
+          the Julian Calendar. Julian day number 0 is assigned to the day
+          starting at noon on January 1, 4713 BC.
+        - If Timestamp convertible, origin is set to the Timestamp identified
+          by origin.
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -297,8 +315,15 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 
     >>> %timeit pd.to_datetime(s,infer_datetime_format=False)
     1 loop, best of 3: 471 ms per loop
 
-    """
+    Using a non-unix epoch origin:
+
+    >>> pd.to_datetime([1, 2, 3], unit='D',
+                       origin=pd.Timestamp('1960-01-01'))
+    0    1960-01-02
+    1    1960-01-03
+    2    1960-01-04
+    """
     from pandas.tseries.index import DatetimeIndex
 
     tz = 'utc' if utc else None
@@ -410,21 +435,77 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
                 raise e
 
     if arg is None:
-        return arg
-    elif isinstance(arg, tslib.Timestamp):
-        return arg
+        return None
+
+    # handle origin
+    if origin == 'julian':
+
+        original = arg
+        j0 = tslib.Timestamp(0).to_julian_date()
+        if unit != 'D':
+            raise ValueError("unit must be 'D' for origin='julian'")
+        try:
+            arg = arg - j0
+        except TypeError:
+            raise ValueError("incompatible 'arg' type for given "
+                             "'origin'='julian'")
+
+        # preemptively check this for a nice range
+        j_max = tslib.Timestamp.max.to_julian_date() - j0
+        j_min = tslib.Timestamp.min.to_julian_date() - j0
+        if np.any(arg > j_max) or np.any(arg < j_min):
+            raise tslib.OutOfBoundsDatetime(
+                "{original} is Out of Bounds for "
+                "origin='julian'".format(original=original))
+
+    elif origin not in ['unix', 'julian']:
+
+        # arg must be a numeric
+        original = arg
+        if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
+                is_numeric_dtype(np.asarray(arg))):
+            raise ValueError(
+                "'{arg}' is not compatible with origin='{origin}'; "
+                "it must be numeric with a unit specified".format(
+                    arg=arg,
+                    origin=origin))
+
+        # we are going to offset back to unix / epoch time
+        try:
+            offset = tslib.Timestamp(origin) - tslib.Timestamp(0)
+        except tslib.OutOfBoundsDatetime:
+            raise tslib.OutOfBoundsDatetime(
+                "origin {} is Out of Bounds".format(origin))
+        except ValueError:
+            raise ValueError("origin {} cannot be converted "
+                             "to a Timestamp".format(origin))
+
+        # convert the offset to the unit of the arg
+        # this should be lossless in terms of precision
+        offset = offset // tslib.Timedelta(1, unit=unit)
+
+        # scalars & ndarray-like can handle the addition
+        if is_list_like(arg) and not isinstance(
+                arg, (ABCSeries, ABCIndexClass, np.ndarray)):
+            arg = np.asarray(arg)
+        arg = arg + offset
+
+    if isinstance(arg, tslib.Timestamp):
+        result = arg
     elif isinstance(arg, ABCSeries):
         from pandas import Series
         values = _convert_listlike(arg._values, False, format)
-        return Series(values, index=arg.index, name=arg.name)
+        result = Series(values, index=arg.index, name=arg.name)
     elif isinstance(arg, (ABCDataFrame, MutableMapping)):
-        return _assemble_from_unit_mappings(arg, errors=errors)
+        result = _assemble_from_unit_mappings(arg, errors=errors)
     elif isinstance(arg, ABCIndexClass):
-        return _convert_listlike(arg, box, format, name=arg.name)
+        result = _convert_listlike(arg, box, format, name=arg.name)
     elif is_list_like(arg):
-        return _convert_listlike(arg, box, format)
+        result = _convert_listlike(arg, box, format)
+    else:
+        result = _convert_listlike(np.array([arg]), box, format)[0]
 
-    return _convert_listlike(np.array([arg]), box, format)[0]
+    return result
 
 
 # mappings for assembling units
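The non-unix ``origin`` path above boils down to: express ``origin`` as a whole number of ``unit`` steps from the unix epoch, shift the incoming values by that offset, then run the ordinary epoch conversion. A standalone sketch of that arithmetic (mirroring, not calling, the patch internals):

    import pandas as pd

    unit = 'D'
    values = [1, 2, 3]

    # offset of the origin from the unix epoch, in whole `unit`s;
    # the floor division relies on the Timedelta.__floordiv__ added above
    offset = pd.Timestamp('1960-01-01') - pd.Timestamp(0)
    offset = offset // pd.Timedelta(1, unit=unit)   # -3653

    shifted = [v + offset for v in values]
    pd.to_datetime(shifted, unit=unit)
    # same result as pd.to_datetime(values, unit='D', origin='1960-01-01')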