Skip to content

Commit dba96f9

Browse files
authored
REF: standardize reso-return in parsing (#50914)
* REF: tighter typing in _parse_dateabbr_string
* cover bases
* REF: tighter typing in _parse_dateabbr_string
* cover bases
1 parent a82b8e2 commit dba96f9

File tree

3 files changed: +86 -45 lines changed

pandas/_libs/tslibs/dtypes.pxd

+1
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso)
1212

1313
cdef dict attrname_to_abbrevs
1414
cdef dict npy_unit_to_attrname
15+
cdef dict attrname_to_npy_unit
1516

1617
cdef enum c_FreqGroup:
1718
# Mirrors FreqGroup in the .pyx file

pandas/_libs/tslibs/dtypes.pyx

+1
Original file line number | Diff line number | Diff line change
@@ -435,3 +435,4 @@ cdef dict npy_unit_to_attrname = {
435435
NPY_DATETIMEUNIT.NPY_FR_us: "microsecond",
436436
NPY_DATETIMEUNIT.NPY_FR_ns: "nanosecond",
437437
}
438+
cdef dict attrname_to_npy_unit = {v: k for k, v in npy_unit_to_attrname.items()}

pandas/_libs/tslibs/parsing.pyx

+84-45
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,9 @@ from cpython.datetime cimport (
1515
timedelta,
1616
tzinfo,
1717
)
18+
1819
from datetime import timezone
20+
1921
from cpython.object cimport PyObject_Str
2022
from cython cimport Py_ssize_t
2123
from libc.string cimport strchr
@@ -52,18 +54,25 @@ from dateutil.tz import (
5254
from pandas._config import get_option
5355

5456
from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
55-
from pandas._libs.tslibs.dtypes cimport npy_unit_to_attrname
57+
from pandas._libs.tslibs.dtypes cimport (
58+
attrname_to_npy_unit,
59+
npy_unit_to_attrname,
60+
)
5661
from pandas._libs.tslibs.nattype cimport (
5762
c_NaT as NaT,
5863
c_nat_strings as nat_strings,
5964
)
65+
6066
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
67+
6168
from pandas._libs.tslibs.np_datetime cimport (
6269
NPY_DATETIMEUNIT,
6370
npy_datetimestruct,
6471
string_to_dts,
6572
)
73+
6674
from pandas._libs.tslibs.strptime import array_strptime
75+
6776
from pandas._libs.tslibs.util cimport (
6877
get_c_string_buf_and_size,
6978
is_array,
@@ -92,6 +101,14 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
92101
cdef:
93102
set _not_datelike_strings = {"a", "A", "m", "M", "p", "P", "t", "T"}
94103

104+
# _timestamp_units -> units that we round to nanos
105+
set _timestamp_units = {
106+
NPY_DATETIMEUNIT.NPY_FR_ns,
107+
NPY_DATETIMEUNIT.NPY_FR_ps,
108+
NPY_DATETIMEUNIT.NPY_FR_fs,
109+
NPY_DATETIMEUNIT.NPY_FR_as,
110+
}
111+
95112
# ----------------------------------------------------------------------
96113
cdef:
97114
const char* delimiters = " /-."
@@ -125,7 +142,7 @@ cdef int _parse_4digit(const char* s):
125142

126143

127144
cdef datetime _parse_delimited_date(
128-
str date_string, bint dayfirst, NPY_DATETIMEUNIT* creso
145+
str date_string, bint dayfirst, NPY_DATETIMEUNIT* out_bestunit
129146
):
130147
"""
131148
Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.
@@ -144,7 +161,7 @@ cdef datetime _parse_delimited_date(
144161
----------
145162
date_string : str
146163
dayfirst : bool
147-
creso : NPY_DATETIMEUNIT*
164+
out_bestunit : NPY_DATETIMEUNIT*
148165
For specifying identified resolution.
149166
150167
Returns:
@@ -163,28 +180,28 @@ cdef datetime _parse_delimited_date(
163180
month = _parse_2digit(buf)
164181
day = _parse_2digit(buf + 3)
165182
year = _parse_4digit(buf + 6)
166-
creso[0] = NPY_DATETIMEUNIT.NPY_FR_D
183+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
167184
can_swap = 1
168185
elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
169186
# parsing M?DD?YYYY and D?MM?YYYY dates
170187
month = _parse_1digit(buf)
171188
day = _parse_2digit(buf + 2)
172189
year = _parse_4digit(buf + 5)
173-
creso[0] = NPY_DATETIMEUNIT.NPY_FR_D
190+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
174191
can_swap = 1
175192
elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
176193
# parsing MM?D?YYYY and DD?M?YYYY dates
177194
month = _parse_2digit(buf)
178195
day = _parse_1digit(buf + 3)
179196
year = _parse_4digit(buf + 5)
180-
creso[0] = NPY_DATETIMEUNIT.NPY_FR_D
197+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
181198
can_swap = 1
182199
elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
183200
# parsing M?D?YYYY and D?M?YYYY dates
184201
month = _parse_1digit(buf)
185202
day = _parse_1digit(buf + 2)
186203
year = _parse_4digit(buf + 4)
187-
creso[0] = NPY_DATETIMEUNIT.NPY_FR_D
204+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
188205
can_swap = 1
189206
elif length == 7 and _is_delimiter(buf[2]):
190207
# parsing MM?YYYY dates
@@ -194,7 +211,7 @@ cdef datetime _parse_delimited_date(
194211
return None
195212
month = _parse_2digit(buf)
196213
year = _parse_4digit(buf + 3)
197-
creso[0] = NPY_DATETIMEUNIT.NPY_FR_M
214+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
198215
else:
199216
return None
200217

@@ -270,7 +287,8 @@ def parse_datetime_string(
270287

271288
cdef:
272289
datetime dt
273-
NPY_DATETIMEUNIT creso
290+
NPY_DATETIMEUNIT out_bestunit
291+
bint is_quarter = 0
274292

275293
if not _does_string_look_like_datetime(date_string):
276294
raise ValueError(f'Given date string "{date_string}" not likely a datetime')
@@ -281,21 +299,23 @@ def parse_datetime_string(
281299
yearfirst=yearfirst)
282300
return dt
283301

284-
dt = _parse_delimited_date(date_string, dayfirst, &creso)
302+
dt = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
285303
if dt is not None:
286304
return dt
287305

288306
try:
289-
dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq=None)
307+
dt = _parse_dateabbr_string(
308+
date_string, _DEFAULT_DATETIME, None, &out_bestunit, &is_quarter
309+
)
290310
return dt
291311
except DateParseError:
292312
raise
293313
except ValueError:
294314
pass
295315

296-
dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
297-
dayfirst=dayfirst, yearfirst=yearfirst,
298-
ignoretz=False)
316+
dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
317+
dayfirst=dayfirst, yearfirst=yearfirst,
318+
ignoretz=False, out_bestunit=&out_bestunit)
299319

300320
if dt.tzinfo is not None:
301321
# dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
@@ -361,26 +381,24 @@ def parse_datetime_string_with_reso(
361381
int out_local = 0
362382
int out_tzoffset
363383
tzinfo tz
384+
bint is_quarter = 0
364385

365386
if not _does_string_look_like_datetime(date_string):
366387
raise ValueError(f'Given date string "{date_string}" not likely a datetime')
367388

368-
parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
369-
if parsed is not None:
370-
reso = npy_unit_to_attrname[out_bestunit]
371-
return parsed, reso
372-
373389
# Try iso8601 first, as it handles nanoseconds
374390
string_to_dts_failed = string_to_dts(
375391
date_string, &dts, &out_bestunit, &out_local,
376392
&out_tzoffset, False
377393
)
378394
if not string_to_dts_failed:
379-
timestamp_units = {NPY_DATETIMEUNIT.NPY_FR_ns,
380-
NPY_DATETIMEUNIT.NPY_FR_ps,
381-
NPY_DATETIMEUNIT.NPY_FR_fs,
382-
NPY_DATETIMEUNIT.NPY_FR_as}
383-
if out_bestunit in timestamp_units:
395+
# Match Timestamp and drop picoseconds, femtoseconds, attoseconds
396+
# The new resolution will just be nano
397+
# GH#50417
398+
if out_bestunit in _timestamp_units:
399+
out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
400+
401+
if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
384402
# TODO: avoid circular import
385403
from pandas import Timestamp
386404
parsed = Timestamp(date_string)
@@ -392,25 +410,34 @@ def parse_datetime_string_with_reso(
392410
parsed = datetime_new(
393411
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
394412
)
395-
# Match Timestamp and drop picoseconds, femtoseconds, attoseconds
396-
# The new resolution will just be nano
397-
# GH 50417
398-
if out_bestunit in timestamp_units:
399-
out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
400413

401414
reso = npy_unit_to_attrname[out_bestunit]
402415
return parsed, reso
403416

417+
parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
418+
if parsed is not None:
419+
reso = npy_unit_to_attrname[out_bestunit]
420+
return parsed, reso
421+
404422
try:
405-
return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
423+
parsed = _parse_dateabbr_string(
424+
date_string, _DEFAULT_DATETIME, freq, &out_bestunit, &is_quarter
425+
)
406426
except DateParseError:
407427
raise
408428
except ValueError:
409429
pass
430+
else:
431+
if is_quarter:
432+
reso = "quarter"
433+
else:
434+
reso = npy_unit_to_attrname[out_bestunit]
435+
return parsed, reso
410436

411-
parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
412-
dayfirst=dayfirst, yearfirst=yearfirst,
413-
ignoretz=False)
437+
parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
438+
dayfirst=dayfirst, yearfirst=yearfirst,
439+
ignoretz=False, out_bestunit=&out_bestunit)
440+
reso = npy_unit_to_attrname[out_bestunit]
414441
return parsed, reso
415442

416443

@@ -461,8 +488,9 @@ cpdef bint _does_string_look_like_datetime(str py_string):
461488
return True
462489

463490

464-
cdef object _parse_dateabbr_string(str date_string, datetime default,
465-
str freq=None):
491+
cdef datetime _parse_dateabbr_string(str date_string, datetime default,
492+
str freq, NPY_DATETIMEUNIT* out_bestunit,
493+
bint* is_quarter):
466494
# special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
467495
cdef:
468496
datetime ret
@@ -472,7 +500,9 @@ cdef object _parse_dateabbr_string(str date_string, datetime default,
472500
const char* buf
473501

474502
if date_string in nat_strings:
475-
return NaT, ""
503+
# default to nanos, could also reasonably do NPY_FR_GENERIC
504+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_ns
505+
return NaT
476506

477507
date_string = date_string.upper()
478508
date_len = len(date_string)
@@ -481,7 +511,8 @@ cdef object _parse_dateabbr_string(str date_string, datetime default,
481511
# parse year only like 2000
482512
try:
483513
ret = default.replace(year=int(date_string))
484-
return ret, "year"
514+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_Y
515+
return ret
485516
except ValueError:
486517
pass
487518

@@ -534,7 +565,10 @@ cdef object _parse_dateabbr_string(str date_string, datetime default,
534565
f"freq: {freq}")
535566

536567
ret = default.replace(year=year, month=month)
537-
return ret, "quarter"
568+
# Monthly is as close as we can get to a non-existent NPY_FR_Q
569+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
570+
is_quarter[0] = 1
571+
return ret
538572

539573
except DateParseError:
540574
raise
@@ -547,15 +581,17 @@ cdef object _parse_dateabbr_string(str date_string, datetime default,
547581
month = int(date_string[4:6])
548582
try:
549583
ret = default.replace(year=year, month=month)
550-
return ret, "month"
584+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
585+
return ret
551586
except ValueError as err:
552587
# We can infer that none of the patterns below will match
553588
raise ValueError(f"Unable to parse {date_string}") from err
554589

555590
for pat in ["%Y-%m", "%b %Y", "%b-%Y"]:
556591
try:
557592
ret = datetime.strptime(date_string, pat)
558-
return ret, "month"
593+
out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
594+
return ret
559595
except ValueError:
560596
pass
561597

@@ -597,12 +633,13 @@ cpdef quarter_to_myear(int year, int quarter, str freq):
597633
return year, month
598634

599635

600-
cdef dateutil_parse(
636+
cdef datetime dateutil_parse(
601637
str timestr,
602638
datetime default,
603-
bint ignoretz=False,
604-
bint dayfirst=False,
605-
bint yearfirst=False,
639+
bint ignoretz,
640+
bint dayfirst,
641+
bint yearfirst,
642+
NPY_DATETIMEUNIT* out_bestunit
606643
):
607644
""" lifted from dateutil to get resolution"""
608645

@@ -658,7 +695,9 @@ cdef dateutil_parse(
658695
ret = ret.replace(tzinfo=_dateutil_tzutc())
659696
elif res.tzoffset:
660697
ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
661-
return ret, reso
698+
699+
out_bestunit[0] = attrname_to_npy_unit[reso]
700+
return ret
662701

663702

664703
# ----------------------------------------------------------------------

0 commit comments

Comments
 (0)