WIP Share paths 2 #50258

MarcoGorelli · 2022-12-14T14:50:44Z

Attempt at addressing the many, many inconsistencies between the ISO8601 path and the non-ISO8601 path

Can we just get rid of array_strptime, and only keep the part that does pattern matching?

@jbrockmendel does the general idea look reasonable? Asking you'd previously commented "yikes" about this 😄 . If so, I'll clean it up and get it ready, just wanted to check that there's not something you know that I don't that would prevent this from being possible

MarcoGorelli · 2022-12-14T14:51:43Z

pandas/_libs/tslib.pyx

+    if format is not None and not require_iso8601:
+        if "%W" in format or "%U" in format:
+            if "%Y" not in format and "%y" not in format:
+                raise ValueError("Cannot use '%W' or '%U' without day and year")
+            if "%A" not in format and "%a" not in format and "%w" not in format:
+                raise ValueError("Cannot use '%W' or '%U' without day and year")
+        elif "%Z" in format and "%z" in format:
+            raise ValueError("Cannot parse both %Z and %z")
+
+        global _TimeRE_cache, _regex_cache
+        with _cache_lock:
+            if _getlang() != _TimeRE_cache.locale_time.lang:
+                _TimeRE_cache = TimeRE()
+                _regex_cache.clear()
+            if len(_regex_cache) > _CACHE_MAX_SIZE:
+                _regex_cache.clear()
+            locale_time = _TimeRE_cache.locale_time
+            format_regex = _regex_cache.get(format)
+            if not format_regex:
+                try:
+                    format_regex = _TimeRE_cache.compile(format)
+                # KeyError raised when a bad format is found; can be specified as
+                # \\, in which case it was a stray % but with a space after it
+                except KeyError, err:
+                    bad_directive = err.args[0]
+                    if bad_directive == "\\":
+                        bad_directive = "%"
+                    del err
+                    raise ValueError(f"'{bad_directive}' is a bad directive "
+                                     f"in format '{format}'")
+                # IndexError only occurs when the format string is "%"
+                except IndexError:
+                    raise ValueError(f"stray % in format '{format}'")
+                _regex_cache[format] = format_regex


taken from

pandas/pandas/_libs/tslibs/strptime.pyx

Lines 112 to 145 in 5a372d8

if fmt is not None:

if "%W" in fmt or "%U" in fmt:

if "%Y" not in fmt and "%y" not in fmt:

raise ValueError("Cannot use '%W' or '%U' without day and year")

if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:

raise ValueError("Cannot use '%W' or '%U' without day and year")

elif "%Z" in fmt and "%z" in fmt:

raise ValueError("Cannot parse both %Z and %z")

global _TimeRE_cache, _regex_cache

with _cache_lock:

if _getlang() != _TimeRE_cache.locale_time.lang:

_TimeRE_cache = TimeRE()

_regex_cache.clear()

if len(_regex_cache) > _CACHE_MAX_SIZE:

_regex_cache.clear()

locale_time = _TimeRE_cache.locale_time

format_regex = _regex_cache.get(fmt)

if not format_regex:

try:

format_regex = _TimeRE_cache.compile(fmt)

# KeyError raised when a bad format is found; can be specified as

# \\, in which case it was a stray % but with a space after it

except KeyError, err:

bad_directive = err.args[0]

if bad_directive == "\\":

bad_directive = "%"

del err

raise ValueError(f"'{bad_directive}' is a bad directive "

f"in format '{fmt}'")

# IndexError only occurs when the format string is "%"

except IndexError:

raise ValueError(f"stray % in format '{fmt}'")

_regex_cache[fmt] = format_regex

MarcoGorelli · 2022-12-14T14:52:25Z

pandas/_libs/tslib.pyx

+                elif (
+                    (is_integer_object(val) or is_float_object(val))
+                    and format is None
+                ):


parsing integers/floats with format is allowed in the other path, so let's keep allowing it here

MarcoGorelli · 2022-12-14T14:53:20Z

pandas/_libs/tslib.pyx

+                            or (
+                                require_iso8601 and format == "%Y%m%d" and len(val) != 8
+                            )


the fast ISO path handles %Y%m%d, but only if the date string is 8 digits long. So, if that's the case, we use the fastpath. Else, we down the slower strptime one

MarcoGorelli · 2022-12-14T14:54:01Z

pandas/_libs/tslibs/strptime.pyx

 def array_strptime(
    ndarray[object] values,


should be possible to delete this completely now

MarcoGorelli · 2022-12-14T14:55:53Z

pandas/tests/tools/test_to_datetime.py

-        "format,warning", [(None, UserWarning), ("H%:M%:S%", None)]
+        "format,warning", [(None, UserWarning), ("%H:%M:%S", None)]


H%:M%:S% isn't even a valid format, pretty sure the test was meant to use %H:%M:%S

MarcoGorelli · 2022-12-14T14:59:34Z

pandas/tests/tools/test_to_datetime.py

-            [
-                "%Y-%m-%d %H:%M:%S %z",
-                ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100"],
-                [
-                    Timestamp(
-                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=60))
-                    ),
-                    Timestamp(
-                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=-60))
-                    ),
-                ],
-            ],


it's odd to me that this passed in the first place. It didn't for pydatetime inputs:

>>> to_datetime([Timestamp("2010-01-01 12:00:00 +0100"), Timestamp("2010-01-01 12:00:00 -0100")]), format='%Y-%m-%d') [...] ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True

It was only allowed in the non-ISO path. I'd be OK with just adopting the ISO-path behaviour uniformly here

MarcoGorelli · 2022-12-14T15:00:11Z

pandas/tests/tools/test_to_datetime.py

-            [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
+            [20121231, 20141231, 99991231],


this test was never correct to begin with #50072

MarcoGorelli · 2022-12-14T15:00:42Z

pandas/tests/tools/test_to_datetime.py

-        result = to_datetime(ser2, format="%Y%m%d", cache=cache)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(ValueError, match=None):
+            to_datetime(ser2, format="%Y%m%d", cache=cache)


this wasn't right to begin with #50051

MarcoGorelli · 2022-12-14T15:05:24Z

pandas/_libs/tslib.pyx

+                        value = tz_localize_to_utc_single(_iresult, _tzinfo)
+                        if _tzinfo is not None:
+                            found_tz = True
+                            tz_out = convert_timezone(
+                                _tzinfo,
+                                tz_out,
+                                found_naive,
+                                found_tz,
+                                utc_convert,
+                            )
+                        else:
+                            found_naive = True


the thing with _tzinfo here is that it might not be a fixed offset. E.g. it could be pytz.timezone('US/Pacific'). So, I don't think we can take the same approach as is done below in the ISO8601 fastpath of just saving the offset minutes from UTC to out_tzoffset

jbrockmendel · 2022-12-14T17:04:13Z

Asking you'd previously commented "yikes" about this

IIRC the context for that is that to_datetime is the last big thing that needs to be updated to support non-nano, and AFAICT that is going to require adding non-trivial complexity to array_to_datetime. So the prospect of adding more from a different direction merited a "yikes". But that was and is speculative, so don't let that dissuade you here.

MarcoGorelli · 2022-12-18T19:56:54Z

doing this the other way round works better, so closing in favour of #50242

wip

260c559

MarcoGorelli force-pushed the share-paths-2 branch from aa6cc7c to 260c559 Compare December 14, 2022 14:56

MarcoGorelli commented Dec 14, 2022

View reviewed changes

MarcoGorelli mentioned this pull request Dec 14, 2022

BUG refactor datetime parsing and fix 8 bugs #50242

Merged

12 tasks

MarcoGorelli marked this pull request as draft December 15, 2022 09:12

MarcoGorelli closed this Dec 18, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

WIP Share paths 2 #50258

WIP Share paths 2 #50258

MarcoGorelli commented Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

MarcoGorelli Dec 14, 2022

jbrockmendel commented Dec 14, 2022

MarcoGorelli commented Dec 18, 2022

	if fmt is not None:
	if "%W" in fmt or "%U" in fmt:
	if "%Y" not in fmt and "%y" not in fmt:
	raise ValueError("Cannot use '%W' or '%U' without day and year")
	if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
	raise ValueError("Cannot use '%W' or '%U' without day and year")
	elif "%Z" in fmt and "%z" in fmt:
	raise ValueError("Cannot parse both %Z and %z")

	global _TimeRE_cache, _regex_cache
	with _cache_lock:
	if _getlang() != _TimeRE_cache.locale_time.lang:
	_TimeRE_cache = TimeRE()
	_regex_cache.clear()
	if len(_regex_cache) > _CACHE_MAX_SIZE:
	_regex_cache.clear()
	locale_time = _TimeRE_cache.locale_time
	format_regex = _regex_cache.get(fmt)
	if not format_regex:
	try:
	format_regex = _TimeRE_cache.compile(fmt)
	# KeyError raised when a bad format is found; can be specified as
	# \\, in which case it was a stray % but with a space after it
	except KeyError, err:
	bad_directive = err.args[0]
	if bad_directive == "\\":
	bad_directive = "%"
	del err
	raise ValueError(f"'{bad_directive}' is a bad directive "
	f"in format '{fmt}'")
	# IndexError only occurs when the format string is "%"
	except IndexError:
	raise ValueError(f"stray % in format '{fmt}'")
	_regex_cache[fmt] = format_regex

		"format,warning", [(None, UserWarning), ("H%:M%:S%", None)]
		"format,warning", [(None, UserWarning), ("%H:%M:%S", None)]

		[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
		[20121231, 20141231, 99991231],

WIP Share paths 2 #50258

WIP Share paths 2 #50258

Conversation

MarcoGorelli commented Dec 14, 2022

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jbrockmendel commented Dec 14, 2022

MarcoGorelli commented Dec 18, 2022