Skip to content

WIP Share paths 2 #50258

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 98 additions & 12 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ from pandas._libs.tslibs.np_datetime cimport (
pydatetime_to_dt64,
string_to_dts,
)
from pandas._libs.tslibs.strptime cimport strptime
from pandas._libs.util cimport (
is_datetime64_object,
is_float_object,
Expand Down Expand Up @@ -75,6 +76,19 @@ from pandas._libs.tslibs.timestamps import Timestamp
from pandas._libs.missing cimport checknull_with_nat_and_na
from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single

from _thread import allocate_lock as _thread_allocate_lock

from _strptime import _getlang

from pandas._libs.tslibs.strptime import TimeRE

# Lock guarding the two format-regex caches defined below.
_cache_lock = _thread_allocate_lock()
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
# first!
# TimeRE instance used to compile format strings into regexes.
# array_to_datetime replaces it (and empties _regex_cache) when _getlang() no
# longer equals _TimeRE_cache.locale_time.lang.
_TimeRE_cache = TimeRE()
# Size threshold for _regex_cache: array_to_datetime clears the whole cache
# once len(_regex_cache) exceeds this value, before looking up the format.
_CACHE_MAX_SIZE = 5
# Maps a format string to the regex returned by _TimeRE_cache.compile(format).
_regex_cache = {}


def _test_parse_iso8601(ts: str):
"""
Expand Down Expand Up @@ -524,6 +538,41 @@ cpdef array_to_datetime(
result = np.empty(n, dtype="M8[ns]")
iresult = result.view("i8")

if format is not None and not require_iso8601:
if "%W" in format or "%U" in format:
if "%Y" not in format and "%y" not in format:
raise ValueError("Cannot use '%W' or '%U' without day and year")
if "%A" not in format and "%a" not in format and "%w" not in format:
raise ValueError("Cannot use '%W' or '%U' without day and year")
elif "%Z" in format and "%z" in format:
raise ValueError("Cannot parse both %Z and %z")

global _TimeRE_cache, _regex_cache
with _cache_lock:
if _getlang() != _TimeRE_cache.locale_time.lang:
_TimeRE_cache = TimeRE()
_regex_cache.clear()
if len(_regex_cache) > _CACHE_MAX_SIZE:
_regex_cache.clear()
locale_time = _TimeRE_cache.locale_time
format_regex = _regex_cache.get(format)
if not format_regex:
try:
format_regex = _TimeRE_cache.compile(format)
# KeyError raised when a bad format is found; can be specified as
# \\, in which case it was a stray % but with a space after it
except KeyError, err:
bad_directive = err.args[0]
if bad_directive == "\\":
bad_directive = "%"
del err
raise ValueError(f"'{bad_directive}' is a bad directive "
f"in format '{format}'")
# IndexError only occurs when the format string is "%"
except IndexError:
raise ValueError(f"stray % in format '{format}'")
_regex_cache[format] = format_regex
Comment on lines +541 to +574
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

taken from

if fmt is not None:
if "%W" in fmt or "%U" in fmt:
if "%Y" not in fmt and "%y" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
elif "%Z" in fmt and "%z" in fmt:
raise ValueError("Cannot parse both %Z and %z")
global _TimeRE_cache, _regex_cache
with _cache_lock:
if _getlang() != _TimeRE_cache.locale_time.lang:
_TimeRE_cache = TimeRE()
_regex_cache.clear()
if len(_regex_cache) > _CACHE_MAX_SIZE:
_regex_cache.clear()
locale_time = _TimeRE_cache.locale_time
format_regex = _regex_cache.get(fmt)
if not format_regex:
try:
format_regex = _TimeRE_cache.compile(fmt)
# KeyError raised when a bad format is found; can be specified as
# \\, in which case it was a stray % but with a space after it
except KeyError, err:
bad_directive = err.args[0]
if bad_directive == "\\":
bad_directive = "%"
del err
raise ValueError(f"'{bad_directive}' is a bad directive "
f"in format '{fmt}'")
# IndexError only occurs when the format string is "%"
except IndexError:
raise ValueError(f"stray % in format '{fmt}'")
_regex_cache[fmt] = format_regex


try:
for i in range(n):
val = values[i]
Expand Down Expand Up @@ -556,17 +605,10 @@ cpdef array_to_datetime(
seen_datetime = True
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

elif is_integer_object(val) or is_float_object(val):
if require_iso8601:
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format \"{format}\""
)
return values, tz_out
elif (
(is_integer_object(val) or is_float_object(val))
and format is None
):
Comment on lines +608 to +611
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parsing integers/floats with format is allowed in the other path, so let's keep allowing it here

# these must be ns unit by-definition
seen_integer = True

Expand All @@ -585,7 +627,15 @@ cpdef array_to_datetime(
except OverflowError:
iresult[i] = NPY_NAT

elif isinstance(val, str):
elif (
(is_integer_object(val) or is_float_object(val))
or isinstance(val, str)
):
if not isinstance(val, str):
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
continue

# string
if type(val) is not str:
# GH#32264 np.str_ object
Expand All @@ -595,6 +645,42 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT
continue

if (
format is not None
and (
not require_iso8601
or (
require_iso8601 and format == "%Y%m%d" and len(val) != 8
)
Comment on lines +652 to +654
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fast ISO path handles %Y%m%d, but only if the date string is 8 digits long. So, if that's the case, we use the fast path; otherwise, we go down the slower strptime one.

)
and val not in ("today", "now")
):
try:
_iresult, _tzinfo = strptime(
val, format, exact, format_regex, locale_time, dts
)
except (ValueError, OverflowError):
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise
return values, tz_out
value = tz_localize_to_utc_single(_iresult, _tzinfo)
if _tzinfo is not None:
found_tz = True
tz_out = convert_timezone(
_tzinfo,
tz_out,
found_naive,
found_tz,
utc_convert,
)
else:
found_naive = True
Comment on lines +669 to +680
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the thing with _tzinfo here is that it might not be a fixed offset. E.g. it could be pytz.timezone('US/Pacific'). So, I don't think we can take the same approach as is done below in the ISO8601 fastpath of just saving the offset minutes from UTC to out_tzoffset

iresult[i] = value
continue

string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, format, exact
Expand Down
11 changes: 11 additions & 0 deletions pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct


# Declaration of the strptime helper implemented in strptime.pyx, so that
# other Cython modules (e.g. pandas/_libs/tslib.pyx) can cimport it.
# The signature here has to stay in sync with the implementation.
cdef strptime(
    val,
    str fmt,
    bint exact,
    format_regex,
    locale_time,
    npy_datetimestruct dts,
)
190 changes: 190 additions & 0 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,196 @@ cdef dict _parse_code_table = {"y": 0,
"V": 21,
"u": 22}

cdef strptime(
    val,
    str fmt,
    bint exact,
    format_regex,
    locale_time,
    npy_datetimestruct dts,
):
    """
    Parse a single value against a pre-compiled strptime-style format regex.

    Parameters
    ----------
    val
        The value to parse. It is handed to ``format_regex.match``/``search``
        and to ``len``, so it is presumably a ``str`` (confirm with callers).
    fmt : str
        The original format string. Only used in error messages here.
    exact : bint
        If true, ``val`` must match from its start and be consumed entirely;
        otherwise the pattern may match anywhere inside ``val``.
    format_regex
        Compiled pattern whose named groups are keyed by directive letter
        (the group names are looked up in ``_parse_code_table``).
    locale_time
        Object providing the ``f_month``, ``a_month``, ``f_weekday``,
        ``a_weekday`` and ``am_pm`` sequences used to resolve names.
    dts : npy_datetimestruct
        Scratch struct that is filled in and converted. It is declared by
        value in the signature, so the fields written here live in this
        call's copy.

    Returns
    -------
    tuple
        ``(iresult, tz)`` where ``iresult`` is the value returned by
        ``npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)`` and ``tz`` is
        ``None`` unless a ``%Z`` or ``%z`` group was matched.

    Raises
    ------
    ValueError
        If ``val`` does not match the format, if unconverted data remains
        in exact mode, or if the ISO directives (``%G``, ``%V``) are combined
        in an unsupported way. Helper calls (``.index``, ``date(...)``,
        ``check_dts_bounds``) may raise as well.
    """
    if exact:
        # Anchored match from the start of the string ...
        found = format_regex.match(val)
        if not found:
            raise ValueError(f"time data '{val}' does not match "
                             f"format '{fmt}' (match)")
        # ... which must also consume the whole string.
        if len(val) != found.end():
            raise ValueError(f"unconverted data remains: {val[found.end():]}")

    # search
    else:
        # Non-exact mode: the pattern may match anywhere in the string.
        found = format_regex.search(val)
        if not found:
            raise ValueError(f"time data {repr(val)} does not match format "
                             f"{repr(fmt)} (search)")

    # Defaults used for every component that the format does not supply.
    iso_year = -1
    year = 1900
    month = day = 1
    hour = minute = second = ns = us = 0
    tz = None
    # Default to -1 to signify that values not known; not critical to have,
    # though
    iso_week = week_of_year = -1
    week_of_year_start = -1
    # weekday and julian defaulted to -1 so as to signal need to calculate
    # values
    weekday = julian = -1
    found_dict = found.groupdict()
    for group_key in found_dict.iterkeys():
        # Directives not explicitly handled below:
        #   c, x, X
        #      handled by making out of other directives
        #   U, W
        #      worthless without day of the week
        parse_code = _parse_code_table[group_key]

        # %y
        if parse_code == 0:
            year = int(found_dict["y"])
            # Open Group specification for strptime() states that a %y
            # value in the range of [00, 68] is in the century 2000, while
            # [69,99] is in the century 1900
            if year <= 68:
                year += 2000
            else:
                year += 1900
        # %Y
        elif parse_code == 1:
            year = int(found_dict["Y"])
        # %m
        elif parse_code == 2:
            month = int(found_dict["m"])
        # %B: position of the matched name in locale_time.f_month
        elif parse_code == 3:
            month = locale_time.f_month.index(found_dict["B"].lower())
        # %b: position of the matched name in locale_time.a_month
        elif parse_code == 4:
            month = locale_time.a_month.index(found_dict["b"].lower())
        # %d
        elif parse_code == 5:
            day = int(found_dict["d"])
        # %H
        elif parse_code == 6:
            hour = int(found_dict["H"])
        # %I, combined with the %p group when one was matched
        elif parse_code == 7:
            hour = int(found_dict["I"])
            ampm = found_dict.get("p", "").lower()
            # If there was no AM/PM indicator, we'll treat this like AM
            if ampm in ("", locale_time.am_pm[0]):
                # We're in AM so the hour is correct unless we're
                # looking at 12 midnight.
                # 12 midnight == 12 AM == hour 0
                if hour == 12:
                    hour = 0
            elif ampm == locale_time.am_pm[1]:
                # We're in PM so we need to add 12 to the hour unless
                # we're looking at 12 noon.
                # 12 noon == 12 PM == hour 12
                if hour != 12:
                    hour += 12
        # %M
        elif parse_code == 8:
            minute = int(found_dict["M"])
        # %S
        elif parse_code == 9:
            second = int(found_dict["S"])
        # %f
        elif parse_code == 10:
            s = found_dict["f"]
            # Pad to always return nanoseconds
            s += "0" * (9 - len(s))
            # `us` temporarily holds the full nanosecond count; it is then
            # split into whole microseconds and the leftover nanoseconds.
            us = long(s)
            ns = us % 1000
            us = us // 1000
        # %A: position of the matched name in locale_time.f_weekday
        elif parse_code == 11:
            weekday = locale_time.f_weekday.index(found_dict["A"].lower())
        # %a: position of the matched name in locale_time.a_weekday
        elif parse_code == 12:
            weekday = locale_time.a_weekday.index(found_dict["a"].lower())
        # %w: remap so that 0 becomes 6 and every other value shifts down by
        # one, i.e. the Monday-based numbering that date.weekday() uses.
        elif parse_code == 13:
            weekday = int(found_dict["w"])
            if weekday == 0:
                weekday = 6
            else:
                weekday -= 1
        # %j
        elif parse_code == 14:
            julian = int(found_dict["j"])
        # %U / %W
        elif parse_code == 15 or parse_code == 16:
            week_of_year = int(found_dict[group_key])
            if group_key == "U":
                # U starts week on Sunday.
                week_of_year_start = 6
            else:
                # W starts week on Monday.
                week_of_year_start = 0
        # %Z
        elif parse_code == 17:
            tz = pytz.timezone(found_dict["Z"])
        # %z
        elif parse_code == 19:
            tz = parse_timezone_directive(found_dict["z"])
        # %G
        elif parse_code == 20:
            iso_year = int(found_dict["G"])
        # %V
        elif parse_code == 21:
            iso_week = int(found_dict["V"])
        # %u: shifted down by one to the same numbering as the other
        # weekday directives above.
        elif parse_code == 22:
            weekday = int(found_dict["u"])
            weekday -= 1

    # don't assume default values for ISO week/year
    if iso_year != -1:
        if iso_week == -1 or weekday == -1:
            raise ValueError("ISO year directive '%G' must be used with "
                             "the ISO week directive '%V' and a weekday "
                             "directive '%A', '%a', '%w', or '%u'.")
        if julian != -1:
            raise ValueError("Day of the year directive '%j' is not "
                             "compatible with ISO year directive '%G'. "
                             "Use '%Y' instead.")
    # NOTE(review): `year` defaults to 1900 above, so `year != -1` looks
    # always true here and this branch effectively rejects '%V' without '%G'.
    elif year != -1 and week_of_year == -1 and iso_week != -1:
        if weekday == -1:
            raise ValueError("ISO week directive '%V' must be used with "
                             "the ISO year directive '%G' and a weekday "
                             "directive '%A', '%a', '%w', or '%u'.")
        else:
            raise ValueError("ISO week directive '%V' is incompatible with "
                             "the year directive '%Y'. Use the ISO year "
                             "'%G' instead.")

    # If we know the wk of the year and what day of that wk, we can figure
    # out the Julian day of the year.
    if julian == -1 and weekday != -1:
        if week_of_year != -1:
            week_starts_Mon = week_of_year_start == 0
            julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
                                              week_starts_Mon)
        elif iso_year != -1 and iso_week != -1:
            # weekday is passed as weekday + 1 here, undoing the shift to
            # Monday-based-zero numbering applied while parsing.
            year, julian = _calc_julian_from_V(iso_year, iso_week,
                                               weekday + 1)
    # Cannot pre-calculate date() since can change in Julian
    # calculation and thus could have different value for the day of the wk
    # calculation.
    if julian == -1:
        # Need to add 1 to result since first day of the year is 1, not
        # 0.
        ordinal = date(year, month, day).toordinal()
        julian = ordinal - date(year, 1, 1).toordinal() + 1
    else:
        # Assume that if they bothered to include Julian day it will
        # be accurate.
        datetime_result = date.fromordinal(
            (julian - 1) + date(year, 1, 1).toordinal())
        year = datetime_result.year
        month = datetime_result.month
        day = datetime_result.day
    if weekday == -1:
        weekday = date(year, month, day).weekday()

    # Copy the resolved components into the datetime struct.
    dts.year = year
    dts.month = month
    dts.day = day
    dts.hour = hour
    dts.min = minute
    dts.sec = second
    dts.us = us
    # The leftover nanoseconds are stored in the `ps` field, scaled by 1000.
    dts.ps = ns * 1000

    iresult = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
    # NOTE(review): the bounds check runs after the conversion; presumably it
    # raises when the struct is outside the representable range -- confirm.
    check_dts_bounds(&dts)
    return iresult, tz


def array_strptime(
ndarray[object] values,
Comment on lines 265 to 266
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be possible to delete this completely now

Expand Down
Loading