Skip to content

Commit c386806

Browse files
jisantucischurov
authored and committed
BUG: Convert float freqstrs to ints at finer resolution (pandas-dev#14378)
Passing `'0.5min'` as a frequency string should generate 30-second intervals, rather than five-minute intervals. By recursively increasing resolution until one is found for which the frequency is an integer, this commit ensures that this is the case for resolutions from days to microseconds. Fixes pandas-dev#8419
1 parent e58b66b commit c386806

File tree

6 files changed

+159
-42
lines changed

6 files changed

+159
-42
lines changed

doc/source/whatsnew/v0.19.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,4 @@ Bug Fixes
5858
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
5959
- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
6060
- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
61-
is not scalar and ``values`` is not specified (:issue:`14380`)
61+
is not scalar and ``values`` is not specified (:issue:`14380`)

doc/source/whatsnew/v0.20.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ Other enhancements
5252

5353
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
5454

55+
56+
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
57+
5558
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
5659
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
5760
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`

pandas/src/period.pyx

+14-14
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2
4545

4646
cdef int64_t NPY_NAT = util.get_nat()
4747

48-
cdef int US_RESO = frequencies.US_RESO
49-
cdef int MS_RESO = frequencies.MS_RESO
50-
cdef int S_RESO = frequencies.S_RESO
51-
cdef int T_RESO = frequencies.T_RESO
52-
cdef int H_RESO = frequencies.H_RESO
53-
cdef int D_RESO = frequencies.D_RESO
48+
cdef int RESO_US = frequencies.RESO_US
49+
cdef int RESO_MS = frequencies.RESO_MS
50+
cdef int RESO_SEC = frequencies.RESO_SEC
51+
cdef int RESO_MIN = frequencies.RESO_MIN
52+
cdef int RESO_HR = frequencies.RESO_HR
53+
cdef int RESO_DAY = frequencies.RESO_DAY
5454

5555
cdef extern from "period_helper.h":
5656
ctypedef struct date_info:
@@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
516516
cdef:
517517
Py_ssize_t i, n = len(stamps)
518518
pandas_datetimestruct dts
519-
int reso = D_RESO, curr_reso
519+
int reso = RESO_DAY, curr_reso
520520

521521
if tz is not None:
522522
tz = maybe_get_tz(tz)
@@ -535,20 +535,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
535535
cdef inline int _reso_stamp(pandas_datetimestruct *dts):
536536
if dts.us != 0:
537537
if dts.us % 1000 == 0:
538-
return MS_RESO
539-
return US_RESO
538+
return RESO_MS
539+
return RESO_US
540540
elif dts.sec != 0:
541-
return S_RESO
541+
return RESO_SEC
542542
elif dts.min != 0:
543-
return T_RESO
543+
return RESO_MIN
544544
elif dts.hour != 0:
545-
return H_RESO
546-
return D_RESO
545+
return RESO_HR
546+
return RESO_DAY
547547

548548
cdef _reso_local(ndarray[int64_t] stamps, object tz):
549549
cdef:
550550
Py_ssize_t n = len(stamps)
551-
int reso = D_RESO, curr_reso
551+
int reso = RESO_DAY, curr_reso
552552
ndarray[int64_t] trans, deltas, pos
553553
pandas_datetimestruct dts
554554

pandas/tseries/frequencies.py

+88-17
Original file line numberDiff line numberDiff line change
@@ -38,32 +38,55 @@ class FreqGroup(object):
3838
FR_NS = 12000
3939

4040

41-
US_RESO = 0
42-
MS_RESO = 1
43-
S_RESO = 2
44-
T_RESO = 3
45-
H_RESO = 4
46-
D_RESO = 5
41+
RESO_NS = 0
42+
RESO_US = 1
43+
RESO_MS = 2
44+
RESO_SEC = 3
45+
RESO_MIN = 4
46+
RESO_HR = 5
47+
RESO_DAY = 6
4748

4849

4950
class Resolution(object):
5051

51-
# defined in period.pyx
52-
# note that these are different from freq codes
53-
RESO_US = US_RESO
54-
RESO_MS = MS_RESO
55-
RESO_SEC = S_RESO
56-
RESO_MIN = T_RESO
57-
RESO_HR = H_RESO
58-
RESO_DAY = D_RESO
52+
RESO_US = RESO_US
53+
RESO_MS = RESO_MS
54+
RESO_SEC = RESO_SEC
55+
RESO_MIN = RESO_MIN
56+
RESO_HR = RESO_HR
57+
RESO_DAY = RESO_DAY
5958

6059
_reso_str_map = {
60+
RESO_NS: 'nanosecond',
6161
RESO_US: 'microsecond',
6262
RESO_MS: 'millisecond',
6363
RESO_SEC: 'second',
6464
RESO_MIN: 'minute',
6565
RESO_HR: 'hour',
66-
RESO_DAY: 'day'}
66+
RESO_DAY: 'day'
67+
}
68+
69+
# factor to multiply a value by to convert it to the next finer grained
70+
# resolution
71+
_reso_mult_map = {
72+
RESO_NS: None,
73+
RESO_US: 1000,
74+
RESO_MS: 1000,
75+
RESO_SEC: 1000,
76+
RESO_MIN: 60,
77+
RESO_HR: 60,
78+
RESO_DAY: 24
79+
}
80+
81+
_reso_str_bump_map = {
82+
'D': 'H',
83+
'H': 'T',
84+
'T': 'S',
85+
'S': 'L',
86+
'L': 'U',
87+
'U': 'N',
88+
'N': None
89+
}
6790

6891
_str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)])
6992

@@ -160,6 +183,47 @@ def get_reso_from_freq(cls, freq):
160183
"""
161184
return cls.get_reso(cls.get_str_from_freq(freq))
162185

186+
@classmethod
187+
def get_stride_from_decimal(cls, value, freq):
188+
"""
189+
Convert freq with decimal stride into a higher freq with integer stride
190+
191+
Parameters
192+
----------
193+
value : integer or float
194+
freq : string
195+
Frequency string
196+
197+
Raises
198+
------
199+
ValueError
200+
If the float cannot be converted to an integer at any resolution.
201+
202+
Example
203+
-------
204+
>>> Resolution.get_stride_from_decimal(1.5, 'T')
205+
(90, 'S')
206+
207+
>>> Resolution.get_stride_from_decimal(1.04, 'H')
208+
(3744, 'S')
209+
210+
>>> Resolution.get_stride_from_decimal(1, 'D')
211+
(1, 'D')
212+
"""
213+
214+
if np.isclose(value % 1, 0):
215+
return int(value), freq
216+
else:
217+
start_reso = cls.get_reso_from_freq(freq)
218+
if start_reso == 0:
219+
raise ValueError(
220+
"Could not convert to integer offset at any resolution"
221+
)
222+
223+
next_value = cls._reso_mult_map[start_reso] * value
224+
next_name = cls._reso_str_bump_map[freq]
225+
return cls.get_stride_from_decimal(next_value, next_name)
226+
163227

164228
def get_to_timestamp_base(base):
165229
"""
@@ -472,12 +536,17 @@ def to_offset(freq):
472536
splitted[2::4]):
473537
if sep != '' and not sep.isspace():
474538
raise ValueError('separator must be spaces')
475-
offset = get_offset(name)
539+
prefix = _lite_rule_alias.get(name) or name
476540
if stride_sign is None:
477541
stride_sign = -1 if stride.startswith('-') else 1
478542
if not stride:
479543
stride = 1
544+
if prefix in Resolution._reso_str_bump_map.keys():
545+
stride, name = Resolution.get_stride_from_decimal(
546+
float(stride), prefix
547+
)
480548
stride = int(stride)
549+
offset = get_offset(name)
481550
offset = offset * int(np.fabs(stride) * stride_sign)
482551
if delta is None:
483552
delta = offset
@@ -493,7 +562,9 @@ def to_offset(freq):
493562

494563

495564
# hack to handle WOM-1MON
496-
opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)')
565+
opattern = re.compile(
566+
r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
567+
)
497568

498569

499570
def _base_and_stride(freqstr):

pandas/tseries/tests/test_frequencies.py

+47-4
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,21 @@ def test_to_offset_multiple(self):
3939
expected = offsets.Hour(3)
4040
assert (result == expected)
4141

42+
freqstr = '2h 20.5min'
43+
result = frequencies.to_offset(freqstr)
44+
expected = offsets.Second(8430)
45+
assert (result == expected)
46+
47+
freqstr = '1.5min'
48+
result = frequencies.to_offset(freqstr)
49+
expected = offsets.Second(90)
50+
assert (result == expected)
51+
52+
freqstr = '0.5S'
53+
result = frequencies.to_offset(freqstr)
54+
expected = offsets.Milli(500)
55+
assert (result == expected)
56+
4257
freqstr = '15l500u'
4358
result = frequencies.to_offset(freqstr)
4459
expected = offsets.Micro(15500)
@@ -49,6 +64,16 @@ def test_to_offset_multiple(self):
4964
expected = offsets.Milli(10075)
5065
assert (result == expected)
5166

67+
freqstr = '1s0.25ms'
68+
result = frequencies.to_offset(freqstr)
69+
expected = offsets.Micro(1000250)
70+
assert (result == expected)
71+
72+
freqstr = '1s0.25L'
73+
result = frequencies.to_offset(freqstr)
74+
expected = offsets.Micro(1000250)
75+
assert (result == expected)
76+
5277
freqstr = '2800N'
5378
result = frequencies.to_offset(freqstr)
5479
expected = offsets.Nano(2800)
@@ -107,10 +132,8 @@ def test_to_offset_invalid(self):
107132
frequencies.to_offset('-2-3U')
108133
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'):
109134
frequencies.to_offset('-2D:3H')
110-
111-
# ToDo: Must be fixed in #8419
112-
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'):
113-
frequencies.to_offset('.5S')
135+
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 1.5.0S'):
136+
frequencies.to_offset('1.5.0S')
114137

115138
# split offsets with spaces are valid
116139
assert frequencies.to_offset('2D 3H') == offsets.Hour(51)
@@ -379,6 +402,26 @@ def test_freq_to_reso(self):
379402
result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq)))
380403
self.assertEqual(freq, result)
381404

405+
def test_resolution_bumping(self):
406+
# GH 14378
407+
Reso = frequencies.Resolution
408+
409+
self.assertEqual(Reso.get_stride_from_decimal(1.5, 'T'), (90, 'S'))
410+
self.assertEqual(Reso.get_stride_from_decimal(62.4, 'T'), (3744, 'S'))
411+
self.assertEqual(Reso.get_stride_from_decimal(1.04, 'H'), (3744, 'S'))
412+
self.assertEqual(Reso.get_stride_from_decimal(1, 'D'), (1, 'D'))
413+
self.assertEqual(Reso.get_stride_from_decimal(0.342931, 'H'),
414+
(1234551600, 'U'))
415+
self.assertEqual(Reso.get_stride_from_decimal(1.2345, 'D'),
416+
(106660800, 'L'))
417+
418+
with self.assertRaises(ValueError):
419+
Reso.get_stride_from_decimal(0.5, 'N')
420+
421+
# too much precision in the input can prevent conversion to an integer stride at any resolution
422+
with self.assertRaises(ValueError):
423+
Reso.get_stride_from_decimal(0.3429324798798269273987982, 'H')
424+
382425
def test_get_freq_code(self):
383426
# freqstr
384427
self.assertEqual(frequencies.get_freq_code('A'),

pandas/tseries/tests/test_tslib.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pandas.tseries.index import date_range, DatetimeIndex
1515
from pandas.tseries.frequencies import (
1616
get_freq,
17-
US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO
17+
RESO_US, RESO_MS, RESO_SEC, RESO_HR, RESO_DAY, RESO_MIN
1818
)
1919
import pandas.tseries.tools as tools
2020
import pandas.tseries.offsets as offsets
@@ -1528,11 +1528,11 @@ def test_resolution(self):
15281528

15291529
for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T',
15301530
'S', 'L', 'U'],
1531-
[D_RESO, D_RESO,
1532-
D_RESO, D_RESO,
1533-
H_RESO, T_RESO,
1534-
S_RESO, MS_RESO,
1535-
US_RESO]):
1531+
[RESO_DAY, RESO_DAY,
1532+
RESO_DAY, RESO_DAY,
1533+
RESO_HR, RESO_MIN,
1534+
RESO_SEC, RESO_MS,
1535+
RESO_US]):
15361536
for tz in [None, 'Asia/Tokyo', 'US/Eastern',
15371537
'dateutil/US/Eastern']:
15381538
idx = date_range(start='2013-04-01', periods=30, freq=freq,

0 commit comments

Comments
 (0)