From 008e5aab4ec93c4da4050e084c5d975fd9455e9b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 17 Jan 2023 14:52:36 -0800 Subject: [PATCH 1/3] BUG: Parsing week-freq Periods --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/_libs/tslibs/period.pyx | 66 ++++++++++++++++++----- pandas/tests/scalar/period/test_period.py | 12 +++++ 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1894ce4ee12d9..57d4f2db68525 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1069,6 +1069,8 @@ Period - Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`) - Bug in adding a :class:`Period` object to an array of :class:`DateOffset` objects incorrectly raising ``TypeError`` (:issue:`50162`) - Bug in :class:`Period` where passing a string with finer resolution than nanosecond would result in a ``KeyError`` instead of dropping the extra precision (:issue:`50417`) +- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`??`) +- Plotting ^^^^^^^^ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 333728ad1198d..8107e467dad7b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,3 +1,5 @@ +import re + cimport numpy as cnp from cpython.object cimport ( Py_EQ, @@ -2591,20 +2593,30 @@ class Period(_Period): value = value.upper() freqstr = freq.rule_code if freq is not None else None - dt, reso = parse_datetime_string_with_reso(value, freqstr) - if reso == "nanosecond": - nanosecond = dt.nanosecond - if dt is NaT: - ordinal = NPY_NAT - if freq is None and ordinal != NPY_NAT: - # Skip NaT, since it doesn't have a resolution - try: - freq = attrname_to_abbrevs[reso] - except KeyError: - raise ValueError(f"Invalid frequency or could not " - f"infer: {reso}") - freq = to_offset(freq) + try: + dt, reso = parse_datetime_string_with_reso(value, freqstr) + except ValueError: + match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value) + if not match: + raise + # Case that cannot be parsed (correctly) by our datetime + # parsing logic + dt, freq = parse_weekly_str(value, freq) + else: + if reso == "nanosecond": + nanosecond = dt.nanosecond + if dt is NaT: + ordinal = NPY_NAT + + if freq is None and ordinal != NPY_NAT: + # Skip NaT, since it doesn't have a resolution + try: + freq = attrname_to_abbrevs[reso] + except KeyError: + raise ValueError(f"Invalid frequency or could not " + f"infer: {reso}") + freq = to_offset(freq) elif PyDateTime_Check(value): dt = value @@ -2664,3 +2676,31 @@ def validate_end_alias(how: str) -> str: # Literal["E", "S"] if how not in {"S", "E"}: raise ValueError("How must be one of S or E") return how + + +cdef parse_weekly_str(value, BaseOffset freq): + """ + Parse e.g. "2017-01-23/2017-01-29", which cannot be parsed by the general + datetime-parsing logic. This ensures that we can round-trip with + Period.__str__ with weekly freq. + """ + match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value) + if not match: + raise ValueError("Could not parse as weekly-freq Period") + + start, end = value.split("/") + start = Timestamp(start) + end = Timestamp(end) + + if (end - start).days != 6: + # We are interested in cases where this is str(period) + # of a Week-freq period + raise ValueError("Could not parse as weekly-freq Period") + + if freq is None: + day_name = end.day_name()[:3].upper() + freqstr = f"W-{day_name}" + freq = to_offset(freqstr) + # We _should_ have freq.is_on_offset(end) + + return end, freq diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 710a98ba6f005..5cb1e69c71840 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -399,6 +399,18 @@ def test_period_cons_weekly(self, num, day): assert result == expected assert isinstance(result, Period) + def test_parse_week_str_roundstrip(self): + per = Period("2017-01-23/2017-01-29") + assert per.freq.freqstr == "W-SUN" + + per = Period("2017-01-24/2017-01-30") + assert per.freq.freqstr == "W-MON" + + msg = "Could not parse as weekly-freq Period" + with pytest.raises(ValueError, match=msg): + # not 6 days apart + Period("2016-01-23/2017-01-29") + def test_period_from_ordinal(self): p = Period("2011-01", freq="M") res = Period._from_ordinal(p.ordinal, freq="M") From 5fdad9ae3549584c290ef1618ebc1775b6214988 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 17 Jan 2023 14:57:19 -0800 Subject: [PATCH 2/3] GH ref, remove redundant check --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/tslibs/period.pyx | 5 +---- pandas/tests/scalar/period/test_period.py | 1 + 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 57d4f2db68525..aac0a85ea961f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1069,7 +1069,7 @@ Period - Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`) - Bug in adding a :class:`Period` object to an array of :class:`DateOffset` objects incorrectly raising ``TypeError`` (:issue:`50162`) - Bug in :class:`Period` where passing a string with finer resolution than nanosecond would result in a ``KeyError`` instead of dropping the extra precision (:issue:`50417`) -- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`??`) +- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`50803`) - Plotting diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 8107e467dad7b..7dff4806f731f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2684,10 +2684,7 @@ cdef parse_weekly_str(value, BaseOffset freq): datetime-parsing logic. This ensures that we can round-trip with Period.__str__ with weekly freq. """ - match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value) - if not match: - raise ValueError("Could not parse as weekly-freq Period") - + # GH#50803 start, end = value.split("/") start = Timestamp(start) end = Timestamp(end) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 5cb1e69c71840..bb9a7dd9374b5 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -400,6 +400,7 @@ def test_period_cons_weekly(self, num, day): assert isinstance(result, Period) def test_parse_week_str_roundstrip(self): + # GH#50803 per = Period("2017-01-23/2017-01-29") assert per.freq.freqstr == "W-SUN" From a34c9bc7b5d4aa3467aa775092fee878aaee5a2b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 19 Jan 2023 06:56:34 -0800 Subject: [PATCH 3/3] handle week as fallback --- pandas/_libs/tslibs/parsing.pyx | 29 ++++++++++++++--------------- pandas/_libs/tslibs/period.pyx | 17 +++++++++-------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index b0918c5b690c2..33e400e487e38 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -289,21 +289,6 @@ def parse_datetime_string( dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, ignoretz=False) - - if dt.tzinfo is not None: - # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) - # bounds, which is invalid (can be constructed, but raises if we call - # str(dt)). Check that and raise here if necessary. - try: - dt.utcoffset() - except ValueError as err: - # offset must be a timedelta strictly between -timedelta(hours=24) - # and timedelta(hours=24) - raise ValueError( - f'Parsed string "{date_string}" gives an invalid tzoffset, ' - "which must be between -timedelta(hours=24) and timedelta(hours=24)" - ) - return dt @@ -656,6 +641,20 @@ cdef dateutil_parse( ret = ret.replace(tzinfo=_dateutil_tzutc()) elif res.tzoffset: ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) + + # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) + # bounds, which is invalid (can be constructed, but raises if we call + # str(ret)). Check that and raise here if necessary. + try: + ret.utcoffset() + except ValueError as err: + # offset must be a timedelta strictly between -timedelta(hours=24) + # and timedelta(hours=24) + raise ValueError( + f'Parsed string "{timestr}" gives an invalid tzoffset, ' + "which must be between -timedelta(hours=24) and timedelta(hours=24)" + ) + return ret, reso diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 7dff4806f731f..e36198da03d04 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2593,16 +2593,17 @@ class Period(_Period): value = value.upper() freqstr = freq.rule_code if freq is not None else None - try: dt, reso = parse_datetime_string_with_reso(value, freqstr) - except ValueError: + except ValueError as err: match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value) - if not match: - raise - # Case that cannot be parsed (correctly) by our datetime - # parsing logic - dt, freq = parse_weekly_str(value, freq) + if match: + # Case that cannot be parsed (correctly) by our datetime + # parsing logic + dt, freq = _parse_weekly_str(value, freq) + else: + raise err + else: if reso == "nanosecond": nanosecond = dt.nanosecond @@ -2678,7 +2679,7 @@ def validate_end_alias(how: str) -> str: # Literal["E", "S"] return how -cdef parse_weekly_str(value, BaseOffset freq): +cdef _parse_weekly_str(value, BaseOffset freq): """ Parse e.g. "2017-01-23/2017-01-29", which cannot be parsed by the general datetime-parsing logic. This ensures that we can round-trip with