Skip to content

Commit bd21f6b

Browse files
authored
REF: array_strptime (#55750)
* CLN: remove unnecessary arg from parse_pydatetime * REF: strptime
1 parent 5180fee commit bd21f6b

File tree

5 files changed

+125
-87
lines changed

5 files changed

+125
-87
lines changed

pandas/_libs/tslib.pyx

+1-5
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ cpdef array_to_datetime(
477477

478478
elif PyDateTime_Check(val):
479479
tz_out = state.process_datetime(val, tz_out, utc_convert)
480-
iresult[i] = parse_pydatetime(val, &dts, utc_convert, creso=creso)
480+
iresult[i] = parse_pydatetime(val, &dts, creso=creso)
481481

482482
elif PyDate_Check(val):
483483
iresult[i] = pydate_to_dt64(val, &dts)
@@ -519,10 +519,6 @@ cpdef array_to_datetime(
519519
# store the UTC offsets in seconds instead
520520
nsecs = tz.utcoffset(None).total_seconds()
521521
out_tzoffset_vals.add(nsecs)
522-
# need to set seen_datetime_offset *after* the
523-
# potentially-raising timezone(timedelta(...)) call,
524-
# otherwise we can go down the is_same_offsets path
525-
# bc len(out_tzoffset_vals) == 0
526522
seen_datetime_offset = True
527523
else:
528524
# Add a marker for naive string, to track if we are

pandas/_libs/tslibs/conversion.pxd

-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,5 @@ cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)
5151
cdef int64_t parse_pydatetime(
5252
datetime val,
5353
npy_datetimestruct *dts,
54-
bint utc_convert,
5554
NPY_DATETIMEUNIT creso,
5655
) except? -1

pandas/_libs/tslibs/conversion.pyx

+2-9
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,6 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
666666
cdef int64_t parse_pydatetime(
667667
datetime val,
668668
npy_datetimestruct *dts,
669-
bint utc_convert,
670669
NPY_DATETIMEUNIT creso,
671670
) except? -1:
672671
"""
@@ -678,8 +677,6 @@ cdef int64_t parse_pydatetime(
678677
Element being processed.
679678
dts : *npy_datetimestruct
680679
Needed to use in pydatetime_to_dt64, which writes to it.
681-
utc_convert : bool
682-
Whether to convert/localize to UTC.
683680
creso : NPY_DATETIMEUNIT
684681
Resolution to store the result.
685682
@@ -692,12 +689,8 @@ cdef int64_t parse_pydatetime(
692689
int64_t result
693690

694691
if val.tzinfo is not None:
695-
if utc_convert:
696-
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
697-
result = _ts.value
698-
else:
699-
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
700-
result = _ts.value
692+
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
693+
result = _ts.value
701694
else:
702695
if isinstance(val, _Timestamp):
703696
result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value

pandas/_libs/tslibs/strptime.pyx

+120-72
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ from pandas._libs.tslibs.timestamps import Timestamp
8080

8181
cnp.import_array()
8282

83+
8384
cdef bint format_is_iso(f: str):
8485
"""
8586
Does format match the iso8601 set that can be handled by the C parser?
@@ -154,6 +155,77 @@ cdef dict _parse_code_table = {"y": 0,
154155
"u": 22}
155156

156157

158+
cdef _validate_fmt(str fmt):
159+
if "%W" in fmt or "%U" in fmt:
160+
if "%Y" not in fmt and "%y" not in fmt:
161+
raise ValueError("Cannot use '%W' or '%U' without day and year")
162+
if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
163+
raise ValueError("Cannot use '%W' or '%U' without day and year")
164+
elif "%Z" in fmt and "%z" in fmt:
165+
raise ValueError("Cannot parse both %Z and %z")
166+
elif "%j" in fmt and "%G" in fmt:
167+
raise ValueError("Day of the year directive '%j' is not "
168+
"compatible with ISO year directive '%G'. "
169+
"Use '%Y' instead.")
170+
elif "%G" in fmt and (
171+
"%V" not in fmt
172+
or not (
173+
"%A" in fmt
174+
or "%a" in fmt
175+
or "%w" in fmt
176+
or "%u" in fmt
177+
)
178+
):
179+
raise ValueError("ISO year directive '%G' must be used with "
180+
"the ISO week directive '%V' and a weekday "
181+
"directive '%A', '%a', '%w', or '%u'.")
182+
elif "%V" in fmt and "%Y" in fmt:
183+
raise ValueError("ISO week directive '%V' is incompatible with "
184+
"the year directive '%Y'. Use the ISO year "
185+
"'%G' instead.")
186+
elif "%V" in fmt and (
187+
"%G" not in fmt
188+
or not (
189+
"%A" in fmt
190+
or "%a" in fmt
191+
or "%w" in fmt
192+
or "%u" in fmt
193+
)
194+
):
195+
raise ValueError("ISO week directive '%V' must be used with "
196+
"the ISO year directive '%G' and a weekday "
197+
"directive '%A', '%a', '%w', or '%u'.")
198+
199+
200+
cdef _get_format_regex(str fmt):
201+
global _TimeRE_cache, _regex_cache
202+
with _cache_lock:
203+
if _getlang() != _TimeRE_cache.locale_time.lang:
204+
_TimeRE_cache = TimeRE()
205+
_regex_cache.clear()
206+
if len(_regex_cache) > _CACHE_MAX_SIZE:
207+
_regex_cache.clear()
208+
locale_time = _TimeRE_cache.locale_time
209+
format_regex = _regex_cache.get(fmt)
210+
if not format_regex:
211+
try:
212+
format_regex = _TimeRE_cache.compile(fmt)
213+
except KeyError, err:
214+
# KeyError raised when a bad format is found; can be specified as
215+
# \\, in which case it was a stray % but with a space after it
216+
bad_directive = err.args[0]
217+
if bad_directive == "\\":
218+
bad_directive = "%"
219+
del err
220+
raise ValueError(f"'{bad_directive}' is a bad directive "
221+
f"in format '{fmt}'")
222+
except IndexError:
223+
# IndexError only occurs when the format string is "%"
224+
raise ValueError(f"stray % in format '{fmt}'")
225+
_regex_cache[fmt] = format_regex
226+
return format_regex, locale_time
227+
228+
157229
cdef class DatetimeParseState:
158230
def __cinit__(self):
159231
self.found_tz = False
@@ -221,71 +293,8 @@ def array_strptime(
221293

222294
assert is_raise or is_ignore or is_coerce
223295

224-
if "%W" in fmt or "%U" in fmt:
225-
if "%Y" not in fmt and "%y" not in fmt:
226-
raise ValueError("Cannot use '%W' or '%U' without day and year")
227-
if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
228-
raise ValueError("Cannot use '%W' or '%U' without day and year")
229-
elif "%Z" in fmt and "%z" in fmt:
230-
raise ValueError("Cannot parse both %Z and %z")
231-
elif "%j" in fmt and "%G" in fmt:
232-
raise ValueError("Day of the year directive '%j' is not "
233-
"compatible with ISO year directive '%G'. "
234-
"Use '%Y' instead.")
235-
elif "%G" in fmt and (
236-
"%V" not in fmt
237-
or not (
238-
"%A" in fmt
239-
or "%a" in fmt
240-
or "%w" in fmt
241-
or "%u" in fmt
242-
)
243-
):
244-
raise ValueError("ISO year directive '%G' must be used with "
245-
"the ISO week directive '%V' and a weekday "
246-
"directive '%A', '%a', '%w', or '%u'.")
247-
elif "%V" in fmt and "%Y" in fmt:
248-
raise ValueError("ISO week directive '%V' is incompatible with "
249-
"the year directive '%Y'. Use the ISO year "
250-
"'%G' instead.")
251-
elif "%V" in fmt and (
252-
"%G" not in fmt
253-
or not (
254-
"%A" in fmt
255-
or "%a" in fmt
256-
or "%w" in fmt
257-
or "%u" in fmt
258-
)
259-
):
260-
raise ValueError("ISO week directive '%V' must be used with "
261-
"the ISO year directive '%G' and a weekday "
262-
"directive '%A', '%a', '%w', or '%u'.")
263-
264-
global _TimeRE_cache, _regex_cache
265-
with _cache_lock:
266-
if _getlang() != _TimeRE_cache.locale_time.lang:
267-
_TimeRE_cache = TimeRE()
268-
_regex_cache.clear()
269-
if len(_regex_cache) > _CACHE_MAX_SIZE:
270-
_regex_cache.clear()
271-
locale_time = _TimeRE_cache.locale_time
272-
format_regex = _regex_cache.get(fmt)
273-
if not format_regex:
274-
try:
275-
format_regex = _TimeRE_cache.compile(fmt)
276-
# KeyError raised when a bad format is found; can be specified as
277-
# \\, in which case it was a stray % but with a space after it
278-
except KeyError, err:
279-
bad_directive = err.args[0]
280-
if bad_directive == "\\":
281-
bad_directive = "%"
282-
del err
283-
raise ValueError(f"'{bad_directive}' is a bad directive "
284-
f"in format '{fmt}'")
285-
# IndexError only occurs when the format string is "%"
286-
except IndexError:
287-
raise ValueError(f"stray % in format '{fmt}'")
288-
_regex_cache[fmt] = format_regex
296+
_validate_fmt(fmt)
297+
format_regex, locale_time = _get_format_regex(fmt)
289298

290299
result = np.empty(n, dtype="M8[ns]")
291300
iresult = result.view("i8")
@@ -366,8 +375,10 @@ def array_strptime(
366375
raise ValueError(f"Time data {val} is not ISO8601 format")
367376

368377
tz = _parse_with_format(
369-
val, fmt, exact, format_regex, locale_time, &iresult[i]
378+
val, fmt, exact, format_regex, locale_time, &dts
370379
)
380+
iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
381+
check_dts_bounds(&dts)
371382
result_timezone[i] = tz
372383

373384
except (ValueError, OutOfBoundsDatetime) as ex:
@@ -391,10 +402,10 @@ def array_strptime(
391402

392403

393404
cdef tzinfo _parse_with_format(
394-
str val, str fmt, bint exact, format_regex, locale_time, int64_t* iresult
405+
str val, str fmt, bint exact, format_regex, locale_time, npy_datetimestruct* dts
395406
):
407+
# Based on https://github.com/python/cpython/blob/main/Lib/_strptime.py#L293
396408
cdef:
397-
npy_datetimestruct dts
398409
int year, month, day, minute, hour, second, weekday, julian
399410
int week_of_year, week_of_year_start, parse_code, ordinal
400411
int iso_week, iso_year
@@ -452,24 +463,32 @@ cdef tzinfo _parse_with_format(
452463
# value in the range of [00, 68] is in the century 2000, while
453464
# [69,99] is in the century 1900
454465
if year <= 68:
466+
# e.g. val='May 04'; fmt='%b %y'
455467
year += 2000
456468
else:
457469
year += 1900
470+
# TODO: not reached in tests 2023-10-28
458471
elif parse_code == 1:
472+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
459473
year = int(found_dict["Y"])
460474
elif parse_code == 2:
475+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
461476
month = int(found_dict["m"])
462477
# elif group_key == 'B':
463478
elif parse_code == 3:
479+
# e.g. val='30/December/2011'; fmt='%d/%B/%Y'
464480
month = locale_time.f_month.index(found_dict["B"].lower())
465481
# elif group_key == 'b':
466482
elif parse_code == 4:
483+
# e.g. val='30/Dec/2011 00:00:00'; fmt='%d/%b/%Y %H:%M:%S'
467484
month = locale_time.a_month.index(found_dict["b"].lower())
468485
# elif group_key == 'd':
469486
elif parse_code == 5:
487+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
470488
day = int(found_dict["d"])
471489
# elif group_key == 'H':
472490
elif parse_code == 6:
491+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
473492
hour = int(found_dict["H"])
474493
elif parse_code == 7:
475494
hour = int(found_dict["I"])
@@ -481,71 +500,101 @@ cdef tzinfo _parse_with_format(
481500
# 12 midnight == 12 AM == hour 0
482501
if hour == 12:
483502
hour = 0
503+
# TODO: not reached in tests 2023-10-28; the implicit `else`
504+
# branch is tested with e.g.
505+
# val='Tuesday 24 Aug 2021 01:30:48 AM'
506+
# fmt='%A %d %b %Y %I:%M:%S %p'
484507
elif ampm == locale_time.am_pm[1]:
485508
# We're in PM so we need to add 12 to the hour unless
486509
# we're looking at 12 noon.
487510
# 12 noon == 12 PM == hour 12
488511
if hour != 12:
512+
# e.g. val='01/10/2010 08:14 PM'; fmt='%m/%d/%Y %I:%M %p'
489513
hour += 12
514+
# TODO: the implicit `else` branch is not tested 2023-10-28
515+
# TODO: the implicit `else` branch is not reached 2023-10-28; possible?
490516
elif parse_code == 8:
517+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
491518
minute = int(found_dict["M"])
492519
elif parse_code == 9:
520+
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
493521
second = int(found_dict["S"])
494522
elif parse_code == 10:
523+
# e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f'
495524
s = found_dict["f"]
496525
# Pad to always return nanoseconds
497526
s += "0" * (9 - len(s))
498527
us = long(s)
499528
ns = us % 1000
500529
us = us // 1000
501530
elif parse_code == 11:
531+
# e.g. val='Tuesday 24 Aug 2021 01:30:48 AM'; fmt='%A %d %b %Y %I:%M:%S %p'
502532
weekday = locale_time.f_weekday.index(found_dict["A"].lower())
503533
elif parse_code == 12:
534+
# e.g. val='Tue 24 Aug 2021 01:30:48 AM'; fmt='%a %d %b %Y %I:%M:%S %p'
504535
weekday = locale_time.a_weekday.index(found_dict["a"].lower())
505536
elif parse_code == 13:
506537
weekday = int(found_dict["w"])
507538
if weekday == 0:
539+
# e.g. val='2013020'; fmt='%Y%U%w'
508540
weekday = 6
509541
else:
542+
# e.g. val='2009324'; fmt='%Y%W%w'
510543
weekday -= 1
511544
elif parse_code == 14:
545+
# e.g. val='2009164202000'; fmt='%Y%j%H%M%S'
512546
julian = int(found_dict["j"])
513547
elif parse_code == 15 or parse_code == 16:
514548
week_of_year = int(found_dict[group_key])
515549
if group_key == "U":
550+
# e.g. val='2013020'; fmt='%Y%U%w'
516551
# U starts week on Sunday.
517552
week_of_year_start = 6
518553
else:
554+
# e.g. val='2009324'; fmt='%Y%W%w'
519555
# W starts week on Monday.
520556
week_of_year_start = 0
521557
elif parse_code == 17:
558+
# e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
522559
tz = pytz.timezone(found_dict["Z"])
523560
elif parse_code == 19:
561+
# e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z'
524562
tz = parse_timezone_directive(found_dict["z"])
525563
elif parse_code == 20:
564+
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
526565
iso_year = int(found_dict["G"])
527566
elif parse_code == 21:
567+
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
528568
iso_week = int(found_dict["V"])
529569
elif parse_code == 22:
570+
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
530571
weekday = int(found_dict["u"])
531572
weekday -= 1
532573

533574
# If we know the wk of the year and what day of that wk, we can figure
534575
# out the Julian day of the year.
535576
if julian == -1 and weekday != -1:
536577
if week_of_year != -1:
578+
# e.g. val='2013020'; fmt='%Y%U%w'
537579
week_starts_Mon = week_of_year_start == 0
538580
julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
539581
week_starts_Mon)
540582
elif iso_year != -1 and iso_week != -1:
583+
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
541584
year, julian = _calc_julian_from_V(iso_year, iso_week,
542585
weekday + 1)
586+
# else:
587+
# # e.g. val='Thu Sep 25 2003'; fmt='%a %b %d %Y'
588+
# pass
589+
543590
# Cannot pre-calculate date() since can change in Julian
544591
# calculation and thus could have different value for the day of the wk
545592
# calculation.
546593
if julian == -1:
547594
# Need to add 1 to result since first day of the year is 1, not
548595
# 0.
596+
# We don't actually need ordinal/julian here, but need to raise
597+
# on e.g. val='2015-04-31'; fmt='%Y-%m-%d'
549598
ordinal = date(year, month, day).toordinal()
550599
julian = ordinal - date(year, 1, 1).toordinal() + 1
551600
else:
@@ -557,6 +606,9 @@ cdef tzinfo _parse_with_format(
557606
month = datetime_result.month
558607
day = datetime_result.day
559608
if weekday == -1:
609+
# We don't actually use weekday here, but need to do this in order to
610+
# raise on y/m/d combinations
611+
# TODO: not reached in tests 2023-10-28; necessary?
560612
weekday = date(year, month, day).weekday()
561613

562614
dts.year = year
@@ -567,10 +619,6 @@ cdef tzinfo _parse_with_format(
567619
dts.sec = second
568620
dts.us = us
569621
dts.ps = ns * 1000
570-
571-
iresult[0] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
572-
check_dts_bounds(&dts)
573-
574622
return tz
575623

576624

0 commit comments

Comments
 (0)