Skip to content

Commit 260c559

Browse files
author
MarcoGorelli
committed
wip
1 parent 113bdb3 commit 260c559

File tree

5 files changed

+347
-185
lines changed

5 files changed

+347
-185
lines changed

pandas/_libs/tslib.pyx

+98-12
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3939
pydatetime_to_dt64,
4040
string_to_dts,
4141
)
42+
from pandas._libs.tslibs.strptime cimport strptime
4243
from pandas._libs.util cimport (
4344
is_datetime64_object,
4445
is_float_object,
@@ -75,6 +76,19 @@ from pandas._libs.tslibs.timestamps import Timestamp
7576
from pandas._libs.missing cimport checknull_with_nat_and_na
7677
from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
7778

79+
from _thread import allocate_lock as _thread_allocate_lock
80+
81+
from _strptime import _getlang
82+
83+
from pandas._libs.tslibs.strptime import TimeRE
84+
85+
_cache_lock = _thread_allocate_lock()
86+
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
87+
# first!
88+
_TimeRE_cache = TimeRE()
89+
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
90+
_regex_cache = {}
91+
7892

7993
def _test_parse_iso8601(ts: str):
8094
"""
@@ -524,6 +538,41 @@ cpdef array_to_datetime(
524538
result = np.empty(n, dtype="M8[ns]")
525539
iresult = result.view("i8")
526540

541+
if format is not None and not require_iso8601:
542+
if "%W" in format or "%U" in format:
543+
if "%Y" not in format and "%y" not in format:
544+
raise ValueError("Cannot use '%W' or '%U' without day and year")
545+
if "%A" not in format and "%a" not in format and "%w" not in format:
546+
raise ValueError("Cannot use '%W' or '%U' without day and year")
547+
elif "%Z" in format and "%z" in format:
548+
raise ValueError("Cannot parse both %Z and %z")
549+
550+
global _TimeRE_cache, _regex_cache
551+
with _cache_lock:
552+
if _getlang() != _TimeRE_cache.locale_time.lang:
553+
_TimeRE_cache = TimeRE()
554+
_regex_cache.clear()
555+
if len(_regex_cache) > _CACHE_MAX_SIZE:
556+
_regex_cache.clear()
557+
locale_time = _TimeRE_cache.locale_time
558+
format_regex = _regex_cache.get(format)
559+
if not format_regex:
560+
try:
561+
format_regex = _TimeRE_cache.compile(format)
562+
# KeyError raised when a bad format is found; can be specified as
563+
# \\, in which case it was a stray % but with a space after it
564+
except KeyError, err:
565+
bad_directive = err.args[0]
566+
if bad_directive == "\\":
567+
bad_directive = "%"
568+
del err
569+
raise ValueError(f"'{bad_directive}' is a bad directive "
570+
f"in format '{format}'")
571+
# IndexError only occurs when the format string is "%"
572+
except IndexError:
573+
raise ValueError(f"stray % in format '{format}'")
574+
_regex_cache[format] = format_regex
575+
527576
try:
528577
for i in range(n):
529578
val = values[i]
@@ -556,17 +605,10 @@ cpdef array_to_datetime(
556605
seen_datetime = True
557606
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
558607

559-
elif is_integer_object(val) or is_float_object(val):
560-
if require_iso8601:
561-
if is_coerce:
562-
iresult[i] = NPY_NAT
563-
continue
564-
elif is_raise:
565-
raise ValueError(
566-
f"time data \"{val}\" at position {i} doesn't "
567-
f"match format \"{format}\""
568-
)
569-
return values, tz_out
608+
elif (
609+
(is_integer_object(val) or is_float_object(val))
610+
and format is None
611+
):
570612
# these must be ns unit by-definition
571613
seen_integer = True
572614

@@ -585,7 +627,15 @@ cpdef array_to_datetime(
585627
except OverflowError:
586628
iresult[i] = NPY_NAT
587629

588-
elif isinstance(val, str):
630+
elif (
631+
(is_integer_object(val) or is_float_object(val))
632+
or isinstance(val, str)
633+
):
634+
if not isinstance(val, str):
635+
if val != val or val == NPY_NAT:
636+
iresult[i] = NPY_NAT
637+
continue
638+
589639
# string
590640
if type(val) is not str:
591641
# GH#32264 np.str_ object
@@ -595,6 +645,42 @@ cpdef array_to_datetime(
595645
iresult[i] = NPY_NAT
596646
continue
597647

648+
if (
649+
format is not None
650+
and (
651+
not require_iso8601
652+
or (
653+
require_iso8601 and format == "%Y%m%d" and len(val) != 8
654+
)
655+
)
656+
and val not in ("today", "now")
657+
):
658+
try:
659+
_iresult, _tzinfo = strptime(
660+
val, format, exact, format_regex, locale_time, dts
661+
)
662+
except (ValueError, OverflowError):
663+
if is_coerce:
664+
iresult[i] = NPY_NAT
665+
continue
666+
elif is_raise:
667+
raise
668+
return values, tz_out
669+
value = tz_localize_to_utc_single(_iresult, _tzinfo)
670+
if _tzinfo is not None:
671+
found_tz = True
672+
tz_out = convert_timezone(
673+
_tzinfo,
674+
tz_out,
675+
found_naive,
676+
found_tz,
677+
utc_convert,
678+
)
679+
else:
680+
found_naive = True
681+
iresult[i] = value
682+
continue
683+
598684
string_to_dts_failed = string_to_dts(
599685
val, &dts, &out_bestunit, &out_local,
600686
&out_tzoffset, False, format, exact

pandas/_libs/tslibs/strptime.pxd

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct
2+
3+
4+
# Cython-level declaration of ``strptime`` so that other extension modules can
# ``cimport`` it; the signature here must stay in sync with the implementation
# in strptime.pyx.
cdef strptime(
5+
val,
6+
str fmt,
7+
bint exact,
8+
format_regex,
9+
locale_time,
10+
npy_datetimestruct dts,
11+
)

pandas/_libs/tslibs/strptime.pyx

+190
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,196 @@ cdef dict _parse_code_table = {"y": 0,
7171
"V": 21,
7272
"u": 22}
7373

74+
cdef strptime(
    val,
    str fmt,
    bint exact,
    format_regex,
    locale_time,
    npy_datetimestruct dts,
):
    """
    Parse one value against an already-compiled strptime pattern.

    Parameters
    ----------
    val
        Text to parse; it is matched with ``format_regex`` and sliced/measured
        like a string.
    fmt : str
        The format string. Used here only to build error messages.
    exact : bint
        If True, ``format_regex`` must match at the start of ``val`` and
        consume all of it. If False, the first match found anywhere in
        ``val`` is used and surrounding text is ignored.
    format_regex
        Compiled pattern exposing ``match``/``search``; its named groups must
        be keys of ``_parse_code_table``.
    locale_time
        Object providing the ``f_month``, ``a_month``, ``f_weekday``,
        ``a_weekday`` and ``am_pm`` lookup sequences.
    dts : npy_datetimestruct
        Scratch struct, received by value. It is filled with the parsed
        fields and then converted.

    Returns
    -------
    tuple
        ``(iresult, tz)`` where ``iresult`` is the value returned by
        ``npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)`` and ``tz`` is the
        timezone parsed from ``%Z``/``%z``, or None if neither was matched.

    Raises
    ------
    ValueError
        If ``val`` does not match the pattern, if text is left over in
        ``exact`` mode, or if the ISO directives (``%G``, ``%V``) are combined
        inconsistently.

    Notes
    -----
    Invalid calendar dates and out-of-range results are presumably reported
    by ``date(...)`` and ``check_dts_bounds`` respectively; neither is
    defined in this block, so the exception types are not stated here.
    """
    if exact:
        found = format_regex.match(val)
        if not found:
            raise ValueError(f"time data '{val}' does not match "
                             f"format '{fmt}' (match)")
        if len(val) != found.end():
            raise ValueError(f"unconverted data remains: {val[found.end():]}")
    else:
        # Non-exact mode: accept a match anywhere inside ``val``.
        found = format_regex.search(val)
        if not found:
            raise ValueError(f"time data {repr(val)} does not match format "
                             f"{repr(fmt)} (search)")

    # Defaults used for every field the format does not mention.
    year = 1900
    month = 1
    day = 1
    hour = 0
    minute = 0
    second = 0
    us = 0
    ns = 0
    tz = None

    # -1 marks "not supplied by the input" for the fields below.
    iso_year = -1
    iso_week = -1
    week_of_year = -1
    week_of_year_start = -1
    weekday = -1
    julian = -1

    found_dict = found.groupdict()
    for group_key in found_dict:
        # Directives not handled explicitly:
        #   c, x, X -- expanded into other directives by the pattern
        #   U, W    -- only meaningful together with a weekday (see below)
        parse_code = _parse_code_table[group_key]

        if parse_code == 0:
            # %y: per the Open Group strptime() specification, [00, 68]
            # belongs to the 2000s and [69, 99] to the 1900s.
            year = int(found_dict["y"])
            year += 2000 if year <= 68 else 1900
        elif parse_code == 1:
            year = int(found_dict["Y"])
        elif parse_code == 2:
            month = int(found_dict["m"])
        elif parse_code == 3:
            # %B: full month name
            month = locale_time.f_month.index(found_dict["B"].lower())
        elif parse_code == 4:
            # %b: abbreviated month name
            month = locale_time.a_month.index(found_dict["b"].lower())
        elif parse_code == 5:
            day = int(found_dict["d"])
        elif parse_code == 6:
            hour = int(found_dict["H"])
        elif parse_code == 7:
            # %I: 12-hour clock; a missing AM/PM marker is treated as AM.
            hour = int(found_dict["I"])
            ampm = found_dict.get("p", "").lower()
            if ampm in ("", locale_time.am_pm[0]):
                # 12 AM is midnight, i.e. hour 0.
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # 12 PM is noon and stays 12; every other PM hour shifts.
                if hour != 12:
                    hour += 12
        elif parse_code == 8:
            minute = int(found_dict["M"])
        elif parse_code == 9:
            second = int(found_dict["S"])
        elif parse_code == 10:
            # %f: right-pad to nine digits so the value is in nanoseconds,
            # then split into whole microseconds and leftover nanoseconds.
            fraction = found_dict["f"]
            fraction += "0" * (9 - len(fraction))
            total_ns = int(fraction)
            ns = total_ns % 1000
            us = total_ns // 1000
        elif parse_code == 11:
            weekday = locale_time.f_weekday.index(found_dict["A"].lower())
        elif parse_code == 12:
            weekday = locale_time.a_weekday.index(found_dict["a"].lower())
        elif parse_code == 13:
            # %w counts from Sunday == 0; convert to Monday == 0.
            weekday = int(found_dict["w"])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        elif parse_code == 14:
            julian = int(found_dict["j"])
        elif parse_code == 15 or parse_code == 16:
            week_of_year = int(found_dict[group_key])
            # %U weeks start on Sunday (6), %W weeks start on Monday (0).
            week_of_year_start = 6 if group_key == "U" else 0
        elif parse_code == 17:
            tz = pytz.timezone(found_dict["Z"])
        elif parse_code == 19:
            tz = parse_timezone_directive(found_dict["z"])
        elif parse_code == 20:
            iso_year = int(found_dict["G"])
        elif parse_code == 21:
            iso_week = int(found_dict["V"])
        elif parse_code == 22:
            # %u counts from Monday == 1; convert to Monday == 0.
            weekday = int(found_dict["u"])
            weekday -= 1

    # Never fall back to defaults for ISO week/year: they must be explicit.
    if iso_year != -1:
        if iso_week == -1 or weekday == -1:
            raise ValueError("ISO year directive '%G' must be used with "
                             "the ISO week directive '%V' and a weekday "
                             "directive '%A', '%a', '%w', or '%u'.")
        if julian != -1:
            raise ValueError("Day of the year directive '%j' is not "
                             "compatible with ISO year directive '%G'. "
                             "Use '%Y' instead.")
    elif year != -1 and week_of_year == -1 and iso_week != -1:
        if weekday == -1:
            raise ValueError("ISO week directive '%V' must be used with "
                             "the ISO year directive '%G' and a weekday "
                             "directive '%A', '%a', '%w', or '%u'.")
        else:
            raise ValueError("ISO week directive '%V' is incompatible with "
                             "the year directive '%Y'. Use the ISO year "
                             "'%G' instead.")

    # A week number plus a weekday is enough to derive the day of the year.
    if julian == -1 and weekday != -1:
        if week_of_year != -1:
            week_starts_monday = week_of_year_start == 0
            julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                              week_starts_monday)
        elif iso_year != -1 and iso_week != -1:
            year, julian = _calc_julian_from_V(iso_year, iso_week,
                                               weekday + 1)

    if julian == -1:
        # No day-of-year available: derive it from year/month/day. Building
        # the date also validates the combination. +1 because the first day
        # of the year is 1, not 0.
        ordinal = date(year, month, day).toordinal()
        julian = ordinal - date(year, 1, 1).toordinal() + 1
    else:
        # A day-of-year was given or derived: trust it and let it override
        # year/month/day.
        datetime_result = date.fromordinal(
            (julian - 1) + date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day

    dts.year = year
    dts.month = month
    dts.day = day
    dts.hour = hour
    dts.min = minute
    dts.sec = second
    dts.us = us
    dts.ps = ns * 1000

    iresult = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
    check_dts_bounds(&dts)
    return iresult, tz
263+
74264

75265
def array_strptime(
76266
ndarray[object] values,

0 commit comments

Comments
 (0)