Skip to content

BUG: Parsing week-freq Periods #50803

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1100,6 +1100,8 @@ Period
- Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`)
- Bug in adding a :class:`Period` object to an array of :class:`DateOffset` objects incorrectly raising ``TypeError`` (:issue:`50162`)
- Bug in :class:`Period` where passing a string with finer resolution than nanosecond would result in a ``KeyError`` instead of dropping the extra precision (:issue:`50417`)
- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`50803`)
-

Plotting
^^^^^^^^
Expand Down
28 changes: 13 additions & 15 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -316,21 +316,6 @@ def parse_datetime_string(
dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=&out_bestunit)

if dt.tzinfo is not None:
# dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
# bounds, which is invalid (can be constructed, but raises if we call
# str(dt)). Check that and raise here if necessary.
try:
dt.utcoffset()
except ValueError as err:
# offset must be a timedelta strictly between -timedelta(hours=24)
# and timedelta(hours=24)
raise ValueError(
f'Parsed string "{date_string}" gives an invalid tzoffset, '
"which must be between -timedelta(hours=24) and timedelta(hours=24)"
)

return dt


Expand Down Expand Up @@ -696,6 +681,19 @@ cdef datetime dateutil_parse(
elif res.tzoffset:
ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))

# dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
# bounds, which is invalid (can be constructed, but raises if we call
# str(ret)). Check that and raise here if necessary.
try:
ret.utcoffset()
except ValueError as err:
# offset must be a timedelta strictly between -timedelta(hours=24)
# and timedelta(hours=24)
raise ValueError(
f'Parsed string "{timestr}" gives an invalid tzoffset, '
"which must be between -timedelta(hours=24) and timedelta(hours=24)"
)

out_bestunit[0] = attrname_to_npy_unit[reso]
return ret

Expand Down
64 changes: 51 additions & 13 deletions pandas/_libs/tslibs/period.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

cimport numpy as cnp
from cpython.object cimport (
Py_EQ,
Expand Down Expand Up @@ -2591,20 +2593,31 @@ class Period(_Period):
value = value.upper()

freqstr = freq.rule_code if freq is not None else None
dt, reso = parse_datetime_string_with_reso(value, freqstr)
if reso == "nanosecond":
nanosecond = dt.nanosecond
if dt is NaT:
ordinal = NPY_NAT
try:
dt, reso = parse_datetime_string_with_reso(value, freqstr)
except ValueError as err:
match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value)
if match:
# Case that cannot be parsed (correctly) by our datetime
# parsing logic
dt, freq = _parse_weekly_str(value, freq)
else:
raise err

if freq is None and ordinal != NPY_NAT:
# Skip NaT, since it doesn't have a resolution
try:
freq = attrname_to_abbrevs[reso]
except KeyError:
raise ValueError(f"Invalid frequency or could not "
f"infer: {reso}")
freq = to_offset(freq)
else:
if reso == "nanosecond":
nanosecond = dt.nanosecond
if dt is NaT:
ordinal = NPY_NAT

if freq is None and ordinal != NPY_NAT:
# Skip NaT, since it doesn't have a resolution
try:
freq = attrname_to_abbrevs[reso]
except KeyError:
raise ValueError(f"Invalid frequency or could not "
f"infer: {reso}")
freq = to_offset(freq)

elif PyDateTime_Check(value):
dt = value
Expand Down Expand Up @@ -2664,3 +2677,28 @@ def validate_end_alias(how: str) -> str: # Literal["E", "S"]
if how not in {"S", "E"}:
raise ValueError("How must be one of S or E")
return how


cdef _parse_weekly_str(value, BaseOffset freq):
"""
Parse e.g. "2017-01-23/2017-01-29", which cannot be parsed by the general
datetime-parsing logic. This ensures that we can round-trip with
Period.__str__ with weekly freq.
"""
# GH#50803
start, end = value.split("/")
start = Timestamp(start)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to use Timestamp here?

IIUC from the last time I was in Period code, Timestamp sends strings down parse_datetime_string_with_reso.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could use datetime.strptime. either way is fine. Timestamp doesn't yet use parse_datetime_string_with_reso, but getting there is one of the goals this is aiming at. even when it does, that won't affect behavior here.

end = Timestamp(end)

if (end - start).days != 6:
# We are interested in cases where this is str(period)
# of a Week-freq period
raise ValueError("Could not parse as weekly-freq Period")

if freq is None:
day_name = end.day_name()[:3].upper()
freqstr = f"W-{day_name}"
freq = to_offset(freqstr)
# We _should_ have freq.is_on_offset(end)

return end, freq
13 changes: 13 additions & 0 deletions pandas/tests/scalar/period/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,19 @@ def test_period_cons_weekly(self, num, day):
assert result == expected
assert isinstance(result, Period)

def test_parse_week_str_roundstrip(self):
# GH#50803
per = Period("2017-01-23/2017-01-29")
assert per.freq.freqstr == "W-SUN"

per = Period("2017-01-24/2017-01-30")
assert per.freq.freqstr == "W-MON"

msg = "Could not parse as weekly-freq Period"
with pytest.raises(ValueError, match=msg):
# not 6 days apart
Period("2016-01-23/2017-01-29")

def test_period_from_ordinal(self):
p = Period("2011-01", freq="M")
res = Period._from_ordinal(p.ordinal, freq="M")
Expand Down