Skip to content

Commit a65e8ab

Browse files
author
Marco Gorelli
committed
record format
1 parent 2be9661 commit a65e8ab

File tree

10 files changed

+212
-13
lines changed

10 files changed

+212
-13
lines changed

pandas/_libs/tslib.pyx

+35-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import warnings
2+
import re
23

34
cimport cython
45
from cpython.datetime cimport (
@@ -85,6 +86,8 @@ def _test_parse_iso8601(ts: str):
8586
_TSObject obj
8687
int out_local = 0, out_tzoffset = 0
8788
NPY_DATETIMEUNIT out_bestunit
89+
char inferred_format
90+
int format_len
8891

8992
obj = _TSObject()
9093

@@ -93,7 +96,7 @@ def _test_parse_iso8601(ts: str):
9396
elif ts == 'today':
9497
return Timestamp.now().normalize()
9598

96-
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
99+
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &format_len)
97100
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
98101
check_dts_bounds(&obj.dts)
99102
if out_local == 1:
@@ -449,6 +452,8 @@ cpdef array_to_datetime(
449452
bint utc=False,
450453
bint require_iso8601=False,
451454
bint allow_mixed=False,
455+
str format=None,
456+
bint exact=False,
452457
):
453458
"""
454459
Converts a 1D array of date-like values to a numpy array of either:
@@ -509,6 +514,8 @@ cpdef array_to_datetime(
509514
datetime py_dt
510515
tzinfo tz_out = None
511516
bint found_tz = False, found_naive = False
517+
char inferred_format[100]
518+
int format_len
512519

513520
# specify error conditions
514521
assert is_raise or is_ignore or is_coerce
@@ -568,6 +575,15 @@ cpdef array_to_datetime(
568575
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
569576

570577
elif is_integer_object(val) or is_float_object(val):
578+
if require_iso8601:
579+
if is_coerce:
580+
iresult[i] = NPY_NAT
581+
continue
582+
elif is_raise:
583+
raise ValueError(
584+
f"time data \"{val}\" at position {i} doesn't match format {format}"
585+
)
586+
return values, tz_out
571587
# these must be ns unit by-definition
572588
seen_integer = True
573589

@@ -598,7 +614,8 @@ cpdef array_to_datetime(
598614

599615
string_to_dts_failed = string_to_dts(
600616
val, &dts, &out_bestunit, &out_local,
601-
&out_tzoffset, False
617+
&out_tzoffset, False, inferred_format,
618+
&format_len,
602619
)
603620
if string_to_dts_failed:
604621
# An error at this point is a _parsing_ error
@@ -613,7 +630,7 @@ cpdef array_to_datetime(
613630
continue
614631
elif is_raise:
615632
raise ValueError(
616-
f"time data \"{val}\" at position {i} doesn't match format specified"
633+
f"time data \"{val}\" at position {i} doesn't match {format}"
617634
)
618635
return values, tz_out
619636

@@ -644,6 +661,21 @@ cpdef array_to_datetime(
644661
_ts = convert_datetime_to_tsobject(py_dt, None)
645662
iresult[i] = _ts.value
646663
if not string_to_dts_failed:
664+
if require_iso8601:
665+
guess = inferred_format[:format_len].decode('utf-8')
666+
if (
667+
(exact and format != guess)
668+
or
669+
(not exact and re.search(format, guess) is None)
670+
):
671+
if is_coerce:
672+
iresult[i] = NPY_NAT
673+
continue
674+
elif is_raise:
675+
raise ValueError(
676+
f"time data \"{val}\" at position {i} doesn't match format {format}"
677+
)
678+
return values, tz_out
647679
# No error reported by string_to_dts, pick back up
648680
# where we left off
649681
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)

pandas/_libs/tslibs/conversion.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,8 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
469469
datetime dt
470470
int64_t ival
471471
NPY_DATETIMEUNIT out_bestunit
472+
char inferred_format
473+
int format_len
472474

473475
if len(ts) == 0 or ts in nat_strings:
474476
ts = NaT
@@ -488,7 +490,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
488490
else:
489491
string_to_dts_failed = string_to_dts(
490492
ts, &dts, &out_bestunit, &out_local,
491-
&out_tzoffset, False
493+
&out_tzoffset, False, &inferred_format, &format_len
492494
)
493495
if not string_to_dts_failed:
494496
try:

pandas/_libs/tslibs/np_datetime.pxd

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ cdef int string_to_dts(
9595
int* out_local,
9696
int* out_tzoffset,
9797
bint want_exc,
98+
char *inferred_format,
99+
int *format_len,
98100
) except? -1
99101

100102
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)

pandas/_libs/tslibs/np_datetime.pyx

+6-2
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
5252
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
5353
npy_datetimestruct *out,
5454
NPY_DATETIMEUNIT *out_bestunit,
55-
int *out_local, int *out_tzoffset)
55+
int *out_local, int *out_tzoffset, char *inferred_format,
56+
int *format_len)
5657

5758

5859
# ----------------------------------------------------------------------
@@ -273,14 +274,17 @@ cdef inline int string_to_dts(
273274
int* out_local,
274275
int* out_tzoffset,
275276
bint want_exc,
277+
char *inferred_format,
278+
int *format_len
276279
) except? -1:
277280
cdef:
278281
Py_ssize_t length
279282
const char* buf
280283

281284
buf = get_c_string_buf_and_size(val, &length)
282285
return parse_iso_8601_datetime(buf, length, want_exc,
283-
dts, out_bestunit, out_local, out_tzoffset)
286+
dts, out_bestunit, out_local, out_tzoffset,
287+
inferred_format, format_len)
284288

285289

286290
cpdef ndarray astype_overflowsafe(

pandas/_libs/tslibs/parsing.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,8 @@ cdef parse_datetime_string_with_reso(
397397
NPY_DATETIMEUNIT out_bestunit
398398
int out_local
399399
int out_tzoffset
400+
char inferred_format
401+
int format_len
400402

401403
if not _does_string_look_like_datetime(date_string):
402404
raise ValueError(f'Given date string {date_string} not likely a datetime')
@@ -409,7 +411,7 @@ cdef parse_datetime_string_with_reso(
409411
# TODO: does this render some/all of parse_delimited_date redundant?
410412
string_to_dts_failed = string_to_dts(
411413
date_string, &dts, &out_bestunit, &out_local,
412-
&out_tzoffset, False
414+
&out_tzoffset, False, &inferred_format, &format_len
413415
)
414416
if not string_to_dts_failed:
415417
if dts.ps != 0 or out_local:

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+65-2
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ This file implements string parsing and creation for NumPy datetime.
6969
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
7070
npy_datetimestruct *out,
7171
NPY_DATETIMEUNIT *out_bestunit,
72-
int *out_local, int *out_tzoffset) {
72+
int *out_local, int *out_tzoffset,
73+
char *inferred_format, int *format_len) {
74+
int fmt_idx = 0;
7375
int year_leap = 0;
7476
int i, numdigits;
7577
const char *substr;
@@ -104,6 +106,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
104106
while (sublen > 0 && isspace(*substr)) {
105107
++substr;
106108
--sublen;
109+
inferred_format[fmt_idx] = ' ';
110+
++fmt_idx;
107111
}
108112

109113
/* Leading '-' sign for negative year */
@@ -125,6 +129,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
125129

126130
substr += 4;
127131
sublen -= 4;
132+
inferred_format[fmt_idx] = '%';
133+
++fmt_idx;
134+
inferred_format[fmt_idx] = 'Y';
135+
++fmt_idx;
128136
}
129137

130138
/* Negate the year if necessary */
@@ -156,6 +164,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
156164
ymd_sep = valid_ymd_sep[i];
157165
++substr;
158166
--sublen;
167+
inferred_format[fmt_idx] = ymd_sep;
168+
++fmt_idx;
159169
/* Cannot have trailing separator */
160170
if (sublen == 0 || !isdigit(*substr)) {
161171
goto parse_error;
@@ -183,6 +193,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
183193
goto error;
184194
}
185195

196+
inferred_format[fmt_idx] = '%';
197+
++fmt_idx;
198+
inferred_format[fmt_idx] = 'm';
199+
++fmt_idx;
200+
186201
/* Next character must be the separator, start of day, or end of string */
187202
if (sublen == 0) {
188203
bestunit = NPY_FR_M;
@@ -201,6 +216,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
201216
if (*substr != ymd_sep || sublen == 1) {
202217
goto parse_error;
203218
}
219+
inferred_format[fmt_idx] = *substr;
220+
++fmt_idx;
204221
++substr;
205222
--sublen;
206223
}
@@ -230,6 +247,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
230247
goto error;
231248
}
232249

250+
inferred_format[fmt_idx] = '%';
251+
++fmt_idx;
252+
inferred_format[fmt_idx] = 'd';
253+
++fmt_idx;
254+
233255
/* Next character must be a 'T', ' ', or end of string */
234256
if (sublen == 0) {
235257
if (out_local != NULL) {
@@ -242,6 +264,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
242264
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
243265
goto parse_error;
244266
}
267+
inferred_format[fmt_idx] = *substr;
268+
++fmt_idx;
245269
++substr;
246270
--sublen;
247271

@@ -269,6 +293,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
269293
}
270294
}
271295

296+
inferred_format[fmt_idx] = '%';
297+
++fmt_idx;
298+
inferred_format[fmt_idx] = 'H';
299+
++fmt_idx;
300+
272301
/* Next character must be a ':' or the end of the string */
273302
if (sublen == 0) {
274303
if (!hour_was_2_digits) {
@@ -279,6 +308,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
279308
}
280309

281310
if (*substr == ':') {
311+
inferred_format[fmt_idx] = ':';
312+
++fmt_idx;
282313
has_hms_sep = 1;
283314
++substr;
284315
--sublen;
@@ -315,6 +346,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
315346
goto parse_error;
316347
}
317348

349+
inferred_format[fmt_idx] = '%';
350+
++fmt_idx;
351+
inferred_format[fmt_idx] = 'M';
352+
++fmt_idx;
353+
318354
if (sublen == 0) {
319355
bestunit = NPY_FR_m;
320356
goto finish;
@@ -323,6 +359,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
323359
/* If we make it through this condition block, then the next
324360
* character is a digit. */
325361
if (has_hms_sep && *substr == ':') {
362+
inferred_format[fmt_idx] = ':';
363+
++fmt_idx;
326364
++substr;
327365
--sublen;
328366
/* Cannot have a trailing ':' */
@@ -356,15 +394,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
356394
goto parse_error;
357395
}
358396

397+
inferred_format[fmt_idx] = '%';
398+
++fmt_idx;
399+
inferred_format[fmt_idx] = 'S';
400+
++fmt_idx;
401+
359402
/* Next character may be a '.' indicating fractional seconds */
360403
if (sublen > 0 && *substr == '.') {
361404
++substr;
362405
--sublen;
406+
inferred_format[fmt_idx] = '.';
407+
++fmt_idx;
363408
} else {
364409
bestunit = NPY_FR_s;
365410
goto parse_timezone;
366411
}
367412

413+
inferred_format[fmt_idx] = '%';
414+
++fmt_idx;
415+
inferred_format[fmt_idx] = 'f';
416+
++fmt_idx;
417+
418+
368419
/* PARSE THE MICROSECONDS (0 to 6 digits) */
369420
numdigits = 0;
370421
for (i = 0; i < 6; ++i) {
@@ -430,6 +481,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
430481
while (sublen > 0 && isspace(*substr)) {
431482
++substr;
432483
--sublen;
484+
inferred_format[fmt_idx] = ' ';
485+
++fmt_idx;
433486
}
434487

435488
if (sublen == 0) {
@@ -439,6 +492,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
439492

440493
/* UTC specifier */
441494
if (*substr == 'Z') {
495+
inferred_format[fmt_idx] = '%';
496+
++fmt_idx;
497+
inferred_format[fmt_idx] = 'Z';
498+
++fmt_idx;
442499
/* "Z" should be equivalent to tz offset "+00:00" */
443500
if (out_local != NULL) {
444501
*out_local = 1;
@@ -455,6 +512,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
455512
--sublen;
456513
}
457514
} else if (*substr == '-' || *substr == '+') {
515+
inferred_format[fmt_idx] = '%';
516+
++fmt_idx;
517+
inferred_format[fmt_idx] = 'z';
518+
++fmt_idx;
458519
/* Time zone offset */
459520
int offset_neg = 0, offset_hour = 0, offset_minute = 0;
460521

@@ -533,11 +594,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
533594
*out_tzoffset = 60 * offset_hour + offset_minute;
534595
}
535596
}
536-
537597
/* Skip trailing whitespace */
538598
while (sublen > 0 && isspace(*substr)) {
539599
++substr;
540600
--sublen;
601+
inferred_format[fmt_idx] = ' ';
602+
++fmt_idx;
541603
}
542604

543605
if (sublen != 0) {
@@ -548,6 +610,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
548610
if (out_bestunit != NULL) {
549611
*out_bestunit = bestunit;
550612
}
613+
*format_len = fmt_idx;
551614
return 0;
552615

553616
parse_error:

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc,
5858
npy_datetimestruct *out,
5959
NPY_DATETIMEUNIT *out_bestunit,
6060
int *out_local,
61-
int *out_tzoffset);
61+
int *out_tzoffset, char *inferred_format,
62+
int *format_len);
6263

6364
/*
6465
* Provides a string length to use for converting datetime

0 commit comments

Comments
 (0)