From 47723c3d24bbee3ee203341f338da4a05d70bfc2 Mon Sep 17 00:00:00 2001 From: James Santucci Date: Sat, 8 Oct 2016 12:07:39 -0400 Subject: [PATCH 1/4] Convert float freqstrs to ints at finer resolution Passing `'0.5min'` as a frequency string should generate 30 second intervals, rather than five minute intervals. By recursively increasing resolution until one is found for which the frequency is an integer, this commit ensures that that's the case for resolutions from days to microseconds. Fixes #8419 --- doc/source/whatsnew/v0.19.1.txt | 3 +- pandas/src/period.pyx | 28 ++++---- pandas/tseries/frequencies.py | 87 ++++++++++++++++++++---- pandas/tseries/tests/test_frequencies.py | 50 ++++++++++++-- pandas/tseries/tests/test_tslib.py | 12 ++-- 5 files changed, 140 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a81ab6ed0311c..30ee7615f3394 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -80,4 +80,5 @@ Bug Fixes - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` - is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file + is not scalar and ``values`` is not specified (:issue:`14380`) +- Bug in multiple offset aliases with decimal points regarded as ints (e.g. 
0.5s as 5s) (:issue:`8419`) diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 5565f25937394..2d92b9f192328 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2 cdef int64_t NPY_NAT = util.get_nat() -cdef int US_RESO = frequencies.US_RESO -cdef int MS_RESO = frequencies.MS_RESO -cdef int S_RESO = frequencies.S_RESO -cdef int T_RESO = frequencies.T_RESO -cdef int H_RESO = frequencies.H_RESO -cdef int D_RESO = frequencies.D_RESO +cdef int RESO_US = frequencies.RESO_US +cdef int RESO_MS = frequencies.RESO_MS +cdef int RESO_SEC = frequencies.RESO_SEC +cdef int RESO_MIN = frequencies.RESO_MIN +cdef int RESO_HR = frequencies.RESO_HR +cdef int RESO_DAY = frequencies.RESO_DAY cdef extern from "period_helper.h": ctypedef struct date_info: @@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - int reso = D_RESO, curr_reso + int reso = RESO_DAY, curr_reso if tz is not None: tz = maybe_get_tz(tz) @@ -535,20 +535,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef inline int _reso_stamp(pandas_datetimestruct *dts): if dts.us != 0: if dts.us % 1000 == 0: - return MS_RESO - return US_RESO + return RESO_MS + return RESO_US elif dts.sec != 0: - return S_RESO + return RESO_SEC elif dts.min != 0: - return T_RESO + return RESO_MIN elif dts.hour != 0: - return H_RESO - return D_RESO + return RESO_HR + return RESO_DAY cdef _reso_local(ndarray[int64_t] stamps, object tz): cdef: Py_ssize_t n = len(stamps) - int reso = D_RESO, curr_reso + int reso = RESO_DAY, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index ac094c1f545f3..c9151b07a9498 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -38,32 +38,52 @@ class FreqGroup(object): FR_NS = 12000 -US_RESO = 0 -MS_RESO = 1 -S_RESO = 2 -T_RESO = 
3 -H_RESO = 4 -D_RESO = 5 +RESO_NS = 0 +RESO_US = 1 +RESO_MS = 2 +RESO_SEC = 3 +RESO_MIN = 4 +RESO_HR = 5 +RESO_DAY = 6 class Resolution(object): # defined in period.pyx # note that these are different from freq codes - RESO_US = US_RESO - RESO_MS = MS_RESO - RESO_SEC = S_RESO - RESO_MIN = T_RESO - RESO_HR = H_RESO - RESO_DAY = D_RESO + RESOS = [RESO_NS, RESO_US, RESO_MS, RESO_SEC, RESO_MIN, RESO_HR, RESO_DAY] _reso_str_map = { + RESO_NS: 'nanosecond', RESO_US: 'microsecond', RESO_MS: 'millisecond', RESO_SEC: 'second', RESO_MIN: 'minute', RESO_HR: 'hour', - RESO_DAY: 'day'} + RESO_DAY: 'day' + } + + # factor to multiply a value by to convert it to the next finer grained + # resolution + _reso_mult_map = { + RESO_NS: None, + RESO_US: 1000, + RESO_MS: 1000, + RESO_SEC: 1000, + RESO_MIN: 60, + RESO_HR: 60, + RESO_DAY: 24 + } + + _reso_str_bump_map = { + 'D': 'H', + 'H': 'T', + 'T': 'S', + 'S': 'L', + 'L': 'U', + 'U': 'N', + 'N': None + } _str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)]) @@ -160,6 +180,36 @@ def get_reso_from_freq(cls, freq): """ return cls.get_reso(cls.get_str_from_freq(freq)) + @classmethod + def get_stride_from_decimal(cls, value, freq): + """ + Convert freq with decimal stride into a higher freq with integer stride + + Example + ------- + >>> Resolution.get_stride_from_decimal(1.5, 'T') + (90, 'S') + + >>> Resolution.get_stride_from_decimal(1.04, 'H') + (3744, 'S') + + >>> Resolution.get_stride_from_decimal(1, 'D') + (1, 'D') + """ + + if np.isclose(value % 1, 0): + return int(value), freq + else: + start_reso = cls.get_reso_from_freq(freq) + if start_reso == 0: + raise ValueError( + "Could not convert to integer offset at any resolution" + ) + + next_value = cls._reso_mult_map[start_reso] * value + next_name = cls._reso_str_bump_map[freq] + return cls.get_stride_from_decimal(next_value, next_name) + def get_to_timestamp_base(base): """ @@ -472,12 +522,17 @@ def to_offset(freq): splitted[2::4]): if sep != '' and not 
sep.isspace(): raise ValueError('separator must be spaces') - offset = get_offset(name) + prefix = _lite_rule_alias.get(name) or name if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 if not stride: stride = 1 + if prefix in Resolution._reso_str_bump_map.keys(): + stride, name = Resolution.get_stride_from_decimal( + float(stride), prefix + ) stride = int(stride) + offset = get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: delta = offset @@ -493,7 +548,9 @@ def to_offset(freq): # hack to handle WOM-1MON -opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)') +opattern = re.compile( + r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' +) def _base_and_stride(freqstr): diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 5ba98f15aed8d..4823ef5109abe 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -39,6 +39,21 @@ def test_to_offset_multiple(self): expected = offsets.Hour(3) assert (result == expected) + freqstr = '2h 20.5min' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(8430) + assert (result == expected) + + freqstr = '1.5min' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(90) + assert (result == expected) + + freqstr = '0.5S' + result = frequencies.to_offset(freqstr) + expected = offsets.Milli(500) + assert (result == expected) + freqstr = '15l500u' result = frequencies.to_offset(freqstr) expected = offsets.Micro(15500) @@ -49,6 +64,16 @@ def test_to_offset_multiple(self): expected = offsets.Milli(10075) assert (result == expected) + freqstr = '1s0.25ms' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(1000250) + assert (result == expected) + + freqstr = '1s0.25L' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(1000250) + assert (result == expected) + freqstr = '2800N' result = 
frequencies.to_offset(freqstr) expected = offsets.Nano(2800) @@ -107,10 +132,8 @@ def test_to_offset_invalid(self): frequencies.to_offset('-2-3U') with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'): frequencies.to_offset('-2D:3H') - - # ToDo: Must be fixed in #8419 - with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'): - frequencies.to_offset('.5S') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 1.5.0S'): + frequencies.to_offset('1.5.0S') # split offsets with spaces are valid assert frequencies.to_offset('2D 3H') == offsets.Hour(51) @@ -379,6 +402,25 @@ def test_freq_to_reso(self): result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq))) self.assertEqual(freq, result) + def test_resolution_bumping(self): + Reso = frequencies.Resolution + + self.assertEqual(Reso.get_stride_from_decimal(1.5, 'T'), (90, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(62.4, 'T'), (3744, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(1.04, 'H'), (3744, 'S')) + self.assertEqual(Reso.get_stride_from_decimal(1, 'D'), (1, 'D')) + self.assertEqual(Reso.get_stride_from_decimal(0.342931, 'H'), + (1234551600, 'U')) + self.assertEqual(Reso.get_stride_from_decimal(1.2345, 'D'), + (106660800, 'L')) + + with self.assertRaises(ValueError): + Reso.get_stride_from_decimal(0.5, 'N') + + # too much precision in the input can prevent conversion to an integer stride at any resolution + with self.assertRaises(ValueError): + Reso.get_stride_from_decimal(0.3429324798798269273987982, 'H') + def test_get_freq_code(self): # freqstr self.assertEqual(frequencies.get_freq_code('A'), diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 21cfe84f153fa..93b79a9bc699e 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -14,7 +14,7 @@ from pandas.tseries.index import date_range, DatetimeIndex from pandas.tseries.frequencies import ( get_freq, - US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO + RESO_US, RESO_MS, RESO_SEC, 
RESO_HR, RESO_DAY, RESO_MIN ) import pandas.tseries.tools as tools import pandas.tseries.offsets as offsets @@ -1527,11 +1527,11 @@ def test_resolution(self): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], - [D_RESO, D_RESO, - D_RESO, D_RESO, - H_RESO, T_RESO, - S_RESO, MS_RESO, - US_RESO]): + [RESO_DAY, RESO_DAY, + RESO_DAY, RESO_DAY, + RESO_HR, RESO_MIN, + RESO_SEC, RESO_MS, + RESO_US]): for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Eastern']: idx = date_range(start='2013-04-01', periods=30, freq=freq, From 01afe8ae010de3285fd36b84ee42b6247184088d Mon Sep 17 00:00:00 2001 From: James Santucci Date: Thu, 27 Oct 2016 10:58:07 -0400 Subject: [PATCH 2/4] Bring back resolution definitions and move whatnew to 0.20 --- doc/source/whatsnew/v0.19.1.txt | 1 - doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tseries/frequencies.py | 9 ++++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 30ee7615f3394..15994ddaa2ed5 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -81,4 +81,3 @@ Bug Fixes - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) -- Bug in multiple offset aliases with decimal points regarded as ints (e.g. 0.5s as 5s) (:issue:`8419`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 7fa9991138fba..4ca5186c7a4b6 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -30,6 +30,7 @@ New features Other enhancements ^^^^^^^^^^^^^^^^^^ +- Multiple offset aliases with decimal points no longer regarded as ints (e.g. 
0.5s as 5s) (:issue:`8419`) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c9151b07a9498..bef1d141a18e0 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -49,9 +49,12 @@ class FreqGroup(object): class Resolution(object): - # defined in period.pyx - # note that these are different from freq codes - RESOS = [RESO_NS, RESO_US, RESO_MS, RESO_SEC, RESO_MIN, RESO_HR, RESO_DAY] + RESO_US = RESO_US + RESO_MS = RESO_MS + RESO_SEC = RESO_SEC + RESO_MIN = RESO_MIN + RESO_HR = RESO_HR + RESO_DAY = RESO_DAY _reso_str_map = { RESO_NS: 'nanosecond', From 65a516347afafa0f45cc266761e3ae27a8c0079f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Dec 2016 14:37:37 +0100 Subject: [PATCH 3/4] reword whatsnew --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index df50db40ee9da..5cc9d575521f3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -53,7 +53,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- Multiple offset aliases with decimal points no longer regarded as ints (e.g. 0.5s as 5s) (:issue:`8419`) +- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). 
This allows differentiation between errors due to lack From 80b6d5adc38e1e4f708398b6e228e9ef73360e88 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Dec 2016 17:06:21 +0100 Subject: [PATCH 4/4] latest comments --- pandas/tseries/frequencies.py | 11 +++++++++++ pandas/tseries/tests/test_frequencies.py | 1 + 2 files changed, 12 insertions(+) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index bef1d141a18e0..e0c602bf5a037 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -188,6 +188,17 @@ def get_stride_from_decimal(cls, value, freq): """ Convert freq with decimal stride into a higher freq with integer stride + Parameters + ---------- + value : integer or float + freq : string + Frequency string + + Raises + ------ + ValueError + If the float cannot be converted to an integer at any resolution. + Example ------- >>> Resolution.get_stride_from_decimal(1.5, 'T') diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 4823ef5109abe..dfb7b26371d7a 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -403,6 +403,7 @@ def test_freq_to_reso(self): self.assertEqual(freq, result) def test_resolution_bumping(self): + # GH 14378 Reso = frequencies.Resolution self.assertEqual(Reso.get_stride_from_decimal(1.5, 'T'), (90, 'S'))