Skip to content

Commit 2f8fade

Browse files
author
MarcoGorelli
committed
share paths and fix bugs
1 parent f31da23 commit 2f8fade

File tree

10 files changed

+232
-162
lines changed

10 files changed

+232
-162
lines changed

doc/source/whatsnew/v2.0.0.rst

+8
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Performance improvements
768768
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
769769
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
770770
- Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`)
771+
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
771772

772773
.. ---------------------------------------------------------------------------
773774
.. _whatsnew_200.bug_fixes:
@@ -794,6 +795,13 @@ Datetimelike
794795
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how this is done for ISO8601 formats (:issue:`50251`)
795796
- Bug in :class:`Timestamp` was showing ``UserWarning``, which was not actionable by users, when parsing non-ISO8601 delimited date strings (:issue:`50232`)
796797
- Bug in :func:`to_datetime` was showing misleading ``ValueError`` when parsing dates with format containing ISO week directive and ISO weekday directive (:issue:`50308`)
798+
- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing string with decimal date with format ``'%Y%m%d'`` (:issue:`50051`)
799+
- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`)
800+
- Bug in :func:`to_datetime` was not returning input when parsing out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`)
801+
- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`)
802+
- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`)
803+
- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`)
804+
- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`)
797805
-
798806

799807
Timedelta

pandas/_libs/tslib.pyi

-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ def array_to_datetime(
2323
dayfirst: bool = ...,
2424
yearfirst: bool = ...,
2525
utc: bool = ...,
26-
require_iso8601: bool = ...,
27-
format: str | None = ...,
28-
exact: bool = ...,
2926
) -> tuple[np.ndarray, tzinfo | None]: ...
3027

3128
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

+3-61
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3838
pydatetime_to_dt64,
3939
string_to_dts,
4040
)
41+
from pandas._libs.tslibs.strptime cimport parse_today_now
4142
from pandas._libs.util cimport (
4243
is_datetime64_object,
4344
is_float_object,
@@ -409,9 +410,6 @@ cpdef array_to_datetime(
409410
bint dayfirst=False,
410411
bint yearfirst=False,
411412
bint utc=False,
412-
bint require_iso8601=False,
413-
format: str | None=None,
414-
bint exact=True,
415413
):
416414
"""
417415
Converts a 1D array of date-like values to a numpy array of either:
@@ -438,8 +436,6 @@ cpdef array_to_datetime(
438436
yearfirst parsing behavior when encountering datetime strings
439437
utc : bool, default False
440438
indicator whether the dates should be UTC
441-
require_iso8601 : bool, default False
442-
indicator whether the datetime string should be iso8601
443439
444440
Returns
445441
-------
@@ -510,16 +506,6 @@ cpdef array_to_datetime(
510506
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
511507

512508
elif is_integer_object(val) or is_float_object(val):
513-
if require_iso8601:
514-
if is_coerce:
515-
iresult[i] = NPY_NAT
516-
continue
517-
elif is_raise:
518-
raise ValueError(
519-
f"time data \"{val}\" at position {i} doesn't "
520-
f"match format \"{format}\""
521-
)
522-
return values, tz_out
523509
# these must be ns unit by-definition
524510
seen_integer = True
525511

@@ -550,25 +536,13 @@ cpdef array_to_datetime(
550536

551537
string_to_dts_failed = string_to_dts(
552538
val, &dts, &out_bestunit, &out_local,
553-
&out_tzoffset, False, format, exact
539+
&out_tzoffset, False, None, False
554540
)
555541
if string_to_dts_failed:
556542
# An error at this point is a _parsing_ error
557543
# specifically _not_ OutOfBoundsDatetime
558-
if _parse_today_now(val, &iresult[i], utc):
544+
if parse_today_now(val, &iresult[i], utc):
559545
continue
560-
elif require_iso8601:
561-
# if requiring iso8601 strings, skip trying
562-
# other formats
563-
if is_coerce:
564-
iresult[i] = NPY_NAT
565-
continue
566-
elif is_raise:
567-
raise ValueError(
568-
f"time data \"{val}\" at position {i} doesn't "
569-
f"match format \"{format}\""
570-
)
571-
return values, tz_out
572546

573547
try:
574548
py_dt = parse_datetime_string(val,
@@ -631,18 +605,6 @@ cpdef array_to_datetime(
631605
if is_coerce:
632606
iresult[i] = NPY_NAT
633607
continue
634-
elif require_iso8601 and isinstance(val, str):
635-
# GH#19382 for just-barely-OutOfBounds falling back to
636-
# dateutil parser will return incorrect result because
637-
# it will ignore nanoseconds
638-
if is_raise:
639-
640-
# Still raise OutOfBoundsDatetime,
641-
# as error message is informative.
642-
raise
643-
644-
assert is_ignore
645-
return values, tz_out
646608
raise
647609

648610
except OutOfBoundsDatetime:
@@ -801,26 +763,6 @@ cdef _array_to_datetime_object(
801763
return oresult, None
802764

803765

804-
cdef bint _parse_today_now(str val, int64_t* iresult, bint utc):
805-
# We delay this check for as long as possible
806-
# because it catches relatively rare cases
807-
808-
# Multiply by 1000 to convert to nanos, since these methods naturally have
809-
# microsecond resolution
810-
if val == "now":
811-
if utc:
812-
iresult[0] = Timestamp.utcnow().value * 1000
813-
else:
814-
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
815-
# Note using Timestamp.now() is faster than Timestamp("now")
816-
iresult[0] = Timestamp.now().value * 1000
817-
return True
818-
elif val == "today":
819-
iresult[0] = Timestamp.today().value * 1000
820-
return True
821-
return False
822-
823-
824766
def array_to_datetime_with_tz(ndarray values, tzinfo tz):
825767
"""
826768
Vectorized analogue to pd.Timestamp(value, tz=tz)

pandas/_libs/tslibs/parsing.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ def format_is_iso(f: str) -> bint:
846846
but must be consistent. Leading 0s in dates and times are optional.
847847
"""
848848
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
849-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
849+
excluded_formats = ["%Y%m"]
850850

851851
for date_sep in [" ", "/", "\\", "-", ".", ""]:
852852
for time_sep in [" ", "T"]:

pandas/_libs/tslibs/strptime.pxd

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from numpy cimport int64_t
2+
3+
4+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc)

pandas/_libs/tslibs/strptime.pyx

+59
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,43 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.timestamps cimport _Timestamp
4547
from pandas._libs.util cimport (
4648
is_datetime64_object,
4749
is_float_object,
4850
is_integer_object,
4951
)
52+
from pandas._libs.tslibs.timestamps import Timestamp
5053

5154
cnp.import_array()
5255

56+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
57+
# We delay this check for as long as possible
58+
# because it catches relatively rare cases
59+
60+
# Multiply by 1000 to convert to nanos, since these methods naturally have
61+
# microsecond resolution
62+
if val == "now":
63+
if utc:
64+
iresult[0] = Timestamp.utcnow().value * 1000
65+
else:
66+
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
67+
# Note using Timestamp.now() is faster than Timestamp("now")
68+
iresult[0] = Timestamp.now().value * 1000
69+
return True
70+
elif val == "today":
71+
iresult[0] = Timestamp.today().value * 1000
72+
return True
73+
return False
5374

5475
cdef dict _parse_code_table = {"y": 0,
5576
"Y": 1,
@@ -93,6 +114,7 @@ def array_strptime(
93114
exact : matches must be exact if True, search if False
94115
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
95116
"""
117+
from pandas._libs.tslibs.parsing import format_is_iso
96118

97119
cdef:
98120
Py_ssize_t i, n = len(values)
@@ -110,6 +132,9 @@ def array_strptime(
110132
bint found_naive = False
111133
bint found_tz = False
112134
tzinfo tz_out = None
135+
bint iso_format = fmt is not None and format_is_iso(fmt)
136+
NPY_DATETIMEUNIT out_bestunit
137+
int out_local = 0, out_tzoffset = 0
113138

114139
assert is_raise or is_ignore or is_coerce
115140

@@ -230,6 +255,40 @@ def array_strptime(
230255
else:
231256
val = str(val)
232257

258+
if iso_format:
259+
string_to_dts_failed = string_to_dts(
260+
val, &dts, &out_bestunit, &out_local,
261+
&out_tzoffset, False, fmt, exact
262+
)
263+
if not string_to_dts_failed:
264+
# No error reported by string_to_dts, pick back up
265+
# where we left off
266+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
267+
if out_local == 1:
268+
# out_tzoffset is the UTC offset in minutes;
269+
# build a fixed-offset timezone from it and
270+
# record it for this element
271+
tz = timezone(timedelta(minutes=out_tzoffset))
272+
result_timezone[i] = tz
273+
out_local = 0
274+
out_tzoffset = 0
275+
iresult[i] = value
276+
try:
277+
check_dts_bounds(&dts)
278+
except ValueError:
279+
if is_coerce:
280+
iresult[i] = NPY_NAT
281+
continue
282+
raise
283+
continue
284+
285+
if parse_today_now(val, &iresult[i], utc):
286+
continue
287+
288+
# Some ISO formats can't be parsed by string_to_dts
289+
# For example, 6-digit YYYYMD. So, if there's an error,
290+
# try the string-matching code below.
291+
233292
# exact matching
234293
if exact:
235294
found = format_regex.match(val)

pandas/core/arrays/datetimes.py

-7
Original file line numberDiff line numberDiff line change
@@ -2118,10 +2118,7 @@ def objects_to_datetime64ns(
21182118
yearfirst,
21192119
utc: bool = False,
21202120
errors: DateTimeErrorChoices = "raise",
2121-
require_iso8601: bool = False,
21222121
allow_object: bool = False,
2123-
format: str | None = None,
2124-
exact: bool = True,
21252122
):
21262123
"""
21272124
Convert data to array of timestamps.
@@ -2134,7 +2131,6 @@ def objects_to_datetime64ns(
21342131
utc : bool, default False
21352132
Whether to convert/localize timestamps to UTC.
21362133
errors : {'raise', 'ignore', 'coerce'}
2137-
require_iso8601 : bool, default False
21382134
allow_object : bool
21392135
Whether to return an object-dtype ndarray instead of raising if the
21402136
data contains more than one timezone.
@@ -2165,9 +2161,6 @@ def objects_to_datetime64ns(
21652161
utc=utc,
21662162
dayfirst=dayfirst,
21672163
yearfirst=yearfirst,
2168-
require_iso8601=require_iso8601,
2169-
format=format,
2170-
exact=exact,
21712164
)
21722165
result = result.reshape(data.shape, order=order)
21732166
except OverflowError as err:

0 commit comments

Comments
 (0)