Skip to content

REF: array_strptime #55750

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ cpdef array_to_datetime(

elif PyDateTime_Check(val):
tz_out = state.process_datetime(val, tz_out, utc_convert)
iresult[i] = parse_pydatetime(val, &dts, utc_convert, creso=creso)
iresult[i] = parse_pydatetime(val, &dts, creso=creso)

elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts)
Expand Down Expand Up @@ -519,10 +519,6 @@ cpdef array_to_datetime(
# store the UTC offsets in seconds instead
nsecs = tz.utcoffset(None).total_seconds()
out_tzoffset_vals.add(nsecs)
# need to set seen_datetime_offset *after* the
# potentially-raising timezone(timedelta(...)) call,
# otherwise we can go down the is_same_offsets path
# bc len(out_tzoffset_vals) == 0
seen_datetime_offset = True
else:
# Add a marker for naive string, to track if we are
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/tslibs/conversion.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,5 @@ cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)
cdef int64_t parse_pydatetime(
datetime val,
npy_datetimestruct *dts,
bint utc_convert,
NPY_DATETIMEUNIT creso,
) except? -1
11 changes: 2 additions & 9 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,6 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
cdef int64_t parse_pydatetime(
datetime val,
npy_datetimestruct *dts,
bint utc_convert,
NPY_DATETIMEUNIT creso,
) except? -1:
"""
Expand All @@ -687,8 +686,6 @@ cdef int64_t parse_pydatetime(
Element being processed.
dts : *npy_datetimestruct
Needed to use in pydatetime_to_dt64, which writes to it.
utc_convert : bool
Whether to convert/localize to UTC.
creso : NPY_DATETIMEUNIT
Resolution to store the the result.

Expand All @@ -701,12 +698,8 @@ cdef int64_t parse_pydatetime(
int64_t result

if val.tzinfo is not None:
if utc_convert:
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
result = _ts.value
else:
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
result = _ts.value
_ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso)
result = _ts.value
else:
if isinstance(val, _Timestamp):
result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value
Expand Down
192 changes: 120 additions & 72 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ from pandas._libs.tslibs.timestamps import Timestamp

cnp.import_array()


cdef bint format_is_iso(f: str):
"""
Does format match the iso8601 set that can be handled by the C parser?
Expand Down Expand Up @@ -154,6 +155,77 @@ cdef dict _parse_code_table = {"y": 0,
"u": 22}


cdef _validate_fmt(str fmt):
if "%W" in fmt or "%U" in fmt:
if "%Y" not in fmt and "%y" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
elif "%Z" in fmt and "%z" in fmt:
raise ValueError("Cannot parse both %Z and %z")
elif "%j" in fmt and "%G" in fmt:
raise ValueError("Day of the year directive '%j' is not "
"compatible with ISO year directive '%G'. "
"Use '%Y' instead.")
elif "%G" in fmt and (
"%V" not in fmt
or not (
"%A" in fmt
or "%a" in fmt
or "%w" in fmt
or "%u" in fmt
)
):
raise ValueError("ISO year directive '%G' must be used with "
"the ISO week directive '%V' and a weekday "
"directive '%A', '%a', '%w', or '%u'.")
elif "%V" in fmt and "%Y" in fmt:
raise ValueError("ISO week directive '%V' is incompatible with "
"the year directive '%Y'. Use the ISO year "
"'%G' instead.")
elif "%V" in fmt and (
"%G" not in fmt
or not (
"%A" in fmt
or "%a" in fmt
or "%w" in fmt
or "%u" in fmt
)
):
raise ValueError("ISO week directive '%V' must be used with "
"the ISO year directive '%G' and a weekday "
"directive '%A', '%a', '%w', or '%u'.")


cdef _get_format_regex(str fmt):
global _TimeRE_cache, _regex_cache
with _cache_lock:
if _getlang() != _TimeRE_cache.locale_time.lang:
_TimeRE_cache = TimeRE()
_regex_cache.clear()
if len(_regex_cache) > _CACHE_MAX_SIZE:
_regex_cache.clear()
locale_time = _TimeRE_cache.locale_time
format_regex = _regex_cache.get(fmt)
if not format_regex:
try:
format_regex = _TimeRE_cache.compile(fmt)
except KeyError, err:
# KeyError raised when a bad format is found; can be specified as
# \\, in which case it was a stray % but with a space after it
bad_directive = err.args[0]
if bad_directive == "\\":
bad_directive = "%"
del err
raise ValueError(f"'{bad_directive}' is a bad directive "
f"in format '{fmt}'")
except IndexError:
# IndexError only occurs when the format string is "%"
raise ValueError(f"stray % in format '{fmt}'")
_regex_cache[fmt] = format_regex
return format_regex, locale_time


cdef class DatetimeParseState:
def __cinit__(self):
self.found_tz = False
Expand Down Expand Up @@ -221,71 +293,8 @@ def array_strptime(

assert is_raise or is_ignore or is_coerce

if "%W" in fmt or "%U" in fmt:
if "%Y" not in fmt and "%y" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
elif "%Z" in fmt and "%z" in fmt:
raise ValueError("Cannot parse both %Z and %z")
elif "%j" in fmt and "%G" in fmt:
raise ValueError("Day of the year directive '%j' is not "
"compatible with ISO year directive '%G'. "
"Use '%Y' instead.")
elif "%G" in fmt and (
"%V" not in fmt
or not (
"%A" in fmt
or "%a" in fmt
or "%w" in fmt
or "%u" in fmt
)
):
raise ValueError("ISO year directive '%G' must be used with "
"the ISO week directive '%V' and a weekday "
"directive '%A', '%a', '%w', or '%u'.")
elif "%V" in fmt and "%Y" in fmt:
raise ValueError("ISO week directive '%V' is incompatible with "
"the year directive '%Y'. Use the ISO year "
"'%G' instead.")
elif "%V" in fmt and (
"%G" not in fmt
or not (
"%A" in fmt
or "%a" in fmt
or "%w" in fmt
or "%u" in fmt
)
):
raise ValueError("ISO week directive '%V' must be used with "
"the ISO year directive '%G' and a weekday "
"directive '%A', '%a', '%w', or '%u'.")

global _TimeRE_cache, _regex_cache
with _cache_lock:
if _getlang() != _TimeRE_cache.locale_time.lang:
_TimeRE_cache = TimeRE()
_regex_cache.clear()
if len(_regex_cache) > _CACHE_MAX_SIZE:
_regex_cache.clear()
locale_time = _TimeRE_cache.locale_time
format_regex = _regex_cache.get(fmt)
if not format_regex:
try:
format_regex = _TimeRE_cache.compile(fmt)
# KeyError raised when a bad format is found; can be specified as
# \\, in which case it was a stray % but with a space after it
except KeyError, err:
bad_directive = err.args[0]
if bad_directive == "\\":
bad_directive = "%"
del err
raise ValueError(f"'{bad_directive}' is a bad directive "
f"in format '{fmt}'")
# IndexError only occurs when the format string is "%"
except IndexError:
raise ValueError(f"stray % in format '{fmt}'")
_regex_cache[fmt] = format_regex
_validate_fmt(fmt)
format_regex, locale_time = _get_format_regex(fmt)

result = np.empty(n, dtype="M8[ns]")
iresult = result.view("i8")
Expand Down Expand Up @@ -366,8 +375,10 @@ def array_strptime(
raise ValueError(f"Time data {val} is not ISO8601 format")

tz = _parse_with_format(
val, fmt, exact, format_regex, locale_time, &iresult[i]
val, fmt, exact, format_regex, locale_time, &dts
)
iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
check_dts_bounds(&dts)
result_timezone[i] = tz

except (ValueError, OutOfBoundsDatetime) as ex:
Expand All @@ -391,10 +402,10 @@ def array_strptime(


cdef tzinfo _parse_with_format(
str val, str fmt, bint exact, format_regex, locale_time, int64_t* iresult
str val, str fmt, bint exact, format_regex, locale_time, npy_datetimestruct* dts
):
# Based on https://github.com/python/cpython/blob/main/Lib/_strptime.py#L293
cdef:
npy_datetimestruct dts
int year, month, day, minute, hour, second, weekday, julian
int week_of_year, week_of_year_start, parse_code, ordinal
int iso_week, iso_year
Expand Down Expand Up @@ -452,24 +463,32 @@ cdef tzinfo _parse_with_format(
# value in the range of [00, 68] is in the century 2000, while
# [69,99] is in the century 1900
if year <= 68:
# e.g. val='May 04'; fmt='%b %y'
year += 2000
else:
year += 1900
# TODO: not reached in tests 2023-10-28
elif parse_code == 1:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
year = int(found_dict["Y"])
elif parse_code == 2:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
month = int(found_dict["m"])
# elif group_key == 'B':
elif parse_code == 3:
# e.g. val='30/December/2011'; fmt='%d/%B/%Y'
month = locale_time.f_month.index(found_dict["B"].lower())
# elif group_key == 'b':
elif parse_code == 4:
# e.g. val='30/Dec/2011 00:00:00'; fmt='%d/%b/%Y %H:%M:%S'
month = locale_time.a_month.index(found_dict["b"].lower())
# elif group_key == 'd':
elif parse_code == 5:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
day = int(found_dict["d"])
# elif group_key == 'H':
elif parse_code == 6:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
hour = int(found_dict["H"])
elif parse_code == 7:
hour = int(found_dict["I"])
Expand All @@ -481,71 +500,101 @@ cdef tzinfo _parse_with_format(
# 12 midnight == 12 AM == hour 0
if hour == 12:
hour = 0
# TODO: not reached in tests 2023-10-28; the implicit `else`
# branch is tested with e.g.
# val='Tuesday 24 Aug 2021 01:30:48 AM'
# fmt='%A %d %b %Y %I:%M:%S %p'
elif ampm == locale_time.am_pm[1]:
# We're in PM so we need to add 12 to the hour unless
# we're looking at 12 noon.
# 12 noon == 12 PM == hour 12
if hour != 12:
# e.g. val='01/10/2010 08:14 PM'; fmt='%m/%d/%Y %I:%M %p'
hour += 12
# TODO: the implicit `else` branch is not tested 2023-10-28
# TODO: the implicit `else` branch is not reached 2023-10-28; possible?
elif parse_code == 8:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
minute = int(found_dict["M"])
elif parse_code == 9:
# e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S'
second = int(found_dict["S"])
elif parse_code == 10:
# e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f'
s = found_dict["f"]
# Pad to always return nanoseconds
s += "0" * (9 - len(s))
us = long(s)
ns = us % 1000
us = us // 1000
elif parse_code == 11:
# e.g val='Tuesday 24 Aug 2021 01:30:48 AM'; fmt='%A %d %b %Y %I:%M:%S %p'
weekday = locale_time.f_weekday.index(found_dict["A"].lower())
elif parse_code == 12:
# e.g. val='Tue 24 Aug 2021 01:30:48 AM'; fmt='%a %d %b %Y %I:%M:%S %p'
weekday = locale_time.a_weekday.index(found_dict["a"].lower())
elif parse_code == 13:
weekday = int(found_dict["w"])
if weekday == 0:
# e.g. val='2013020'; fmt='%Y%U%w'
weekday = 6
else:
# e.g. val='2009324'; fmt='%Y%W%w'
weekday -= 1
elif parse_code == 14:
# e.g. val='2009164202000'; fmt='%Y%j%H%M%S'
julian = int(found_dict["j"])
elif parse_code == 15 or parse_code == 16:
week_of_year = int(found_dict[group_key])
if group_key == "U":
# e.g. val='2013020'; fmt='%Y%U%w'
# U starts week on Sunday.
week_of_year_start = 6
else:
# e.g. val='2009324'; fmt='%Y%W%w'
# W starts week on Monday.
week_of_year_start = 0
elif parse_code == 17:
# e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z'
tz = pytz.timezone(found_dict["Z"])
elif parse_code == 19:
# e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z'
tz = parse_timezone_directive(found_dict["z"])
elif parse_code == 20:
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
iso_year = int(found_dict["G"])
elif parse_code == 21:
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
iso_week = int(found_dict["V"])
elif parse_code == 22:
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
weekday = int(found_dict["u"])
weekday -= 1

# If we know the wk of the year and what day of that wk, we can figure
# out the Julian day of the year.
if julian == -1 and weekday != -1:
if week_of_year != -1:
# e.g. val='2013020'; fmt='%Y%U%w'
week_starts_Mon = week_of_year_start == 0
julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
week_starts_Mon)
elif iso_year != -1 and iso_week != -1:
# e.g. val='2015-1-7'; fmt='%G-%V-%u'
year, julian = _calc_julian_from_V(iso_year, iso_week,
weekday + 1)
# else:
# # e.g. val='Thu Sep 25 2003'; fmt='%a %b %d %Y'
# pass

# Cannot pre-calculate date() since can change in Julian
# calculation and thus could have different value for the day of the wk
# calculation.
if julian == -1:
# Need to add 1 to result since first day of the year is 1, not
# 0.
# We don't actually need ordinal/julian here, but need to raise
# on e.g. val='2015-04-31'; fmt='%Y-%m-%d'
ordinal = date(year, month, day).toordinal()
julian = ordinal - date(year, 1, 1).toordinal() + 1
else:
Expand All @@ -557,6 +606,9 @@ cdef tzinfo _parse_with_format(
month = datetime_result.month
day = datetime_result.day
if weekday == -1:
# We don't actually use weekday here, but need to do this in order to
# raise on y/m/d combinations
# TODO: not reached in tests 2023-10-28; necessary?
weekday = date(year, month, day).weekday()

dts.year = year
Expand All @@ -567,10 +619,6 @@ cdef tzinfo _parse_with_format(
dts.sec = second
dts.us = us
dts.ps = ns * 1000

iresult[0] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
check_dts_bounds(&dts)

return tz


Expand Down
Loading