Skip to content

BUG: Convert float freqstrs to ints at finer resolution #14378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ Bug Fixes
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
- Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
is not scalar and ``values`` is not specified (:issue:`14380`)
is not scalar and ``values`` is not specified (:issue:`14380`)
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ Other enhancements

- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)


- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)

- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
Expand Down
28 changes: 14 additions & 14 deletions pandas/src/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@ cdef bint PY2 = version_info[0] == 2

cdef int64_t NPY_NAT = util.get_nat()

cdef int US_RESO = frequencies.US_RESO
cdef int MS_RESO = frequencies.MS_RESO
cdef int S_RESO = frequencies.S_RESO
cdef int T_RESO = frequencies.T_RESO
cdef int H_RESO = frequencies.H_RESO
cdef int D_RESO = frequencies.D_RESO
cdef int RESO_US = frequencies.RESO_US
cdef int RESO_MS = frequencies.RESO_MS
cdef int RESO_SEC = frequencies.RESO_SEC
cdef int RESO_MIN = frequencies.RESO_MIN
cdef int RESO_HR = frequencies.RESO_HR
cdef int RESO_DAY = frequencies.RESO_DAY

cdef extern from "period_helper.h":
ctypedef struct date_info:
Expand Down Expand Up @@ -516,7 +516,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
cdef:
Py_ssize_t i, n = len(stamps)
pandas_datetimestruct dts
int reso = D_RESO, curr_reso
int reso = RESO_DAY, curr_reso

if tz is not None:
tz = maybe_get_tz(tz)
Expand All @@ -535,20 +535,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
cdef inline int _reso_stamp(pandas_datetimestruct *dts):
if dts.us != 0:
if dts.us % 1000 == 0:
return MS_RESO
return US_RESO
return RESO_MS
return RESO_US
elif dts.sec != 0:
return S_RESO
return RESO_SEC
elif dts.min != 0:
return T_RESO
return RESO_MIN
elif dts.hour != 0:
return H_RESO
return D_RESO
return RESO_HR
return RESO_DAY

cdef _reso_local(ndarray[int64_t] stamps, object tz):
cdef:
Py_ssize_t n = len(stamps)
int reso = D_RESO, curr_reso
int reso = RESO_DAY, curr_reso
ndarray[int64_t] trans, deltas, pos
pandas_datetimestruct dts

Expand Down
105 changes: 88 additions & 17 deletions pandas/tseries/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,32 +38,55 @@ class FreqGroup(object):
FR_NS = 12000


US_RESO = 0
MS_RESO = 1
S_RESO = 2
T_RESO = 3
H_RESO = 4
D_RESO = 5
RESO_NS = 0
RESO_US = 1
RESO_MS = 2
RESO_SEC = 3
RESO_MIN = 4
RESO_HR = 5
RESO_DAY = 6


class Resolution(object):

# defined in period.pyx
# note that these are different from freq codes
RESO_US = US_RESO
RESO_MS = MS_RESO
RESO_SEC = S_RESO
RESO_MIN = T_RESO
RESO_HR = H_RESO
RESO_DAY = D_RESO
RESO_US = RESO_US
RESO_MS = RESO_MS
RESO_SEC = RESO_SEC
RESO_MIN = RESO_MIN
RESO_HR = RESO_HR
RESO_DAY = RESO_DAY

_reso_str_map = {
RESO_NS: 'nanosecond',
RESO_US: 'microsecond',
RESO_MS: 'millisecond',
RESO_SEC: 'second',
RESO_MIN: 'minute',
RESO_HR: 'hour',
RESO_DAY: 'day'}
RESO_DAY: 'day'
}

# factor to multiply a value by to convert it to the next finer grained
# resolution
_reso_mult_map = {
RESO_NS: None,
RESO_US: 1000,
RESO_MS: 1000,
RESO_SEC: 1000,
RESO_MIN: 60,
RESO_HR: 60,
RESO_DAY: 24
}

_reso_str_bump_map = {
'D': 'H',
'H': 'T',
'T': 'S',
'S': 'L',
'L': 'U',
'U': 'N',
'N': None
}

_str_reso_map = dict([(v, k) for k, v in compat.iteritems(_reso_str_map)])

Expand Down Expand Up @@ -160,6 +183,47 @@ def get_reso_from_freq(cls, freq):
"""
return cls.get_reso(cls.get_str_from_freq(freq))

@classmethod
def get_stride_from_decimal(cls, value, freq):
    """
    Convert freq with decimal stride into a higher freq with integer stride

    Parameters
    ----------
    value : integer or float
        Stride (multiple) applied to ``freq``.
    freq : string
        Frequency string

    Returns
    -------
    tuple of (int, string)
        The integer stride and the frequency string it applies to.
        ``freq`` is returned unchanged when ``value`` is already integral.

    Raises
    ------
    ValueError
        If the float cannot be converted to an integer at any resolution.

    Example
    -------
    >>> Resolution.get_stride_from_decimal(1.5, 'T')
    (90, 'S')

    >>> Resolution.get_stride_from_decimal(1.04, 'H')
    (3744, 'S')

    >>> Resolution.get_stride_from_decimal(1, 'D')
    (1, 'D')
    """
    # Compare against the *nearest* integer instead of testing
    # ``value % 1``: a product that float rounding leaves a hair below an
    # integer (e.g. 1.005 * 1000 == 1004.9999999999999) has a fractional
    # part close to 1, not 0, and would otherwise be bumped needlessly or
    # rejected. Subtracting first keeps the tolerance purely absolute.
    nearest = np.round(value)
    if np.isclose(value - nearest, 0):
        return int(nearest), freq

    start_reso = cls.get_reso_from_freq(freq)
    if start_reso == 0:
        # resolution code 0 is the finest one; there is nothing to bump to
        raise ValueError(
            "Could not convert to integer offset at any resolution"
        )

    # rescale the stride to the next finer resolution and try again
    next_value = cls._reso_mult_map[start_reso] * value
    next_name = cls._reso_str_bump_map[freq]
    return cls.get_stride_from_decimal(next_value, next_name)


def get_to_timestamp_base(base):
"""
Expand Down Expand Up @@ -472,12 +536,17 @@ def to_offset(freq):
splitted[2::4]):
if sep != '' and not sep.isspace():
raise ValueError('separator must be spaces')
offset = get_offset(name)
prefix = _lite_rule_alias.get(name) or name
if stride_sign is None:
stride_sign = -1 if stride.startswith('-') else 1
if not stride:
stride = 1
if prefix in Resolution._reso_str_bump_map.keys():
stride, name = Resolution.get_stride_from_decimal(
float(stride), prefix
)
stride = int(stride)
offset = get_offset(name)
offset = offset * int(np.fabs(stride) * stride_sign)
if delta is None:
delta = offset
Expand All @@ -493,7 +562,9 @@ def to_offset(freq):


# hack to handle WOM-1MON
opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)')
opattern = re.compile(
r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
)


def _base_and_stride(freqstr):
Expand Down
51 changes: 47 additions & 4 deletions pandas/tseries/tests/test_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,21 @@ def test_to_offset_multiple(self):
expected = offsets.Hour(3)
assert (result == expected)

freqstr = '2h 20.5min'
result = frequencies.to_offset(freqstr)
expected = offsets.Second(8430)
assert (result == expected)

freqstr = '1.5min'
result = frequencies.to_offset(freqstr)
expected = offsets.Second(90)
assert (result == expected)

freqstr = '0.5S'
result = frequencies.to_offset(freqstr)
expected = offsets.Milli(500)
assert (result == expected)

freqstr = '15l500u'
result = frequencies.to_offset(freqstr)
expected = offsets.Micro(15500)
Expand All @@ -49,6 +64,16 @@ def test_to_offset_multiple(self):
expected = offsets.Milli(10075)
assert (result == expected)

freqstr = '1s0.25ms'
result = frequencies.to_offset(freqstr)
expected = offsets.Micro(1000250)
assert (result == expected)

freqstr = '1s0.25L'
result = frequencies.to_offset(freqstr)
expected = offsets.Micro(1000250)
assert (result == expected)

freqstr = '2800N'
result = frequencies.to_offset(freqstr)
expected = offsets.Nano(2800)
Expand Down Expand Up @@ -107,10 +132,8 @@ def test_to_offset_invalid(self):
frequencies.to_offset('-2-3U')
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'):
frequencies.to_offset('-2D:3H')

# ToDo: Must be fixed in #8419
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'):
frequencies.to_offset('.5S')
with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 1.5.0S'):
frequencies.to_offset('1.5.0S')

# split offsets with spaces are valid
assert frequencies.to_offset('2D 3H') == offsets.Hour(51)
Expand Down Expand Up @@ -379,6 +402,26 @@ def test_freq_to_reso(self):
result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq)))
self.assertEqual(freq, result)

def test_resolution_bumping(self):
    # GH 14378
    convert = frequencies.Resolution.get_stride_from_decimal

    # (value, freq) inputs paired with the expected (stride, freq) result
    expected_by_input = [
        ((1.5, 'T'), (90, 'S')),
        ((62.4, 'T'), (3744, 'S')),
        ((1.04, 'H'), (3744, 'S')),
        ((1, 'D'), (1, 'D')),
        ((0.342931, 'H'), (1234551600, 'U')),
        ((1.2345, 'D'), (106660800, 'L')),
    ]
    for args, expected in expected_by_input:
        self.assertEqual(convert(*args), expected)

    # a fractional stride at 'N' has no finer resolution to bump to
    self.assertRaises(ValueError, convert, 0.5, 'N')

    # too much precision in the input can prevent conversion to an
    # integer stride at any resolution
    self.assertRaises(ValueError, convert, 0.3429324798798269273987982, 'H')

def test_get_freq_code(self):
# freqstr
self.assertEqual(frequencies.get_freq_code('A'),
Expand Down
12 changes: 6 additions & 6 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.tseries.index import date_range, DatetimeIndex
from pandas.tseries.frequencies import (
get_freq,
US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO
RESO_US, RESO_MS, RESO_SEC, RESO_HR, RESO_DAY, RESO_MIN
)
import pandas.tseries.tools as tools
import pandas.tseries.offsets as offsets
Expand Down Expand Up @@ -1528,11 +1528,11 @@ def test_resolution(self):

for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T',
'S', 'L', 'U'],
[D_RESO, D_RESO,
D_RESO, D_RESO,
H_RESO, T_RESO,
S_RESO, MS_RESO,
US_RESO]):
[RESO_DAY, RESO_DAY,
RESO_DAY, RESO_DAY,
RESO_HR, RESO_MIN,
RESO_SEC, RESO_MS,
RESO_US]):
for tz in [None, 'Asia/Tokyo', 'US/Eastern',
'dateutil/US/Eastern']:
idx = date_range(start='2013-04-01', periods=30, freq=freq,
Expand Down