Skip to content

Commit 3e90d43

Browse files
vnlitvinov authored and jreback committed
PERF: Parse certain dates in Cython instead of falling back to dateutil.parse (#25922)
1 parent 3face07 commit 3e90d43

File tree

5 files changed

+224
-6
lines changed

5 files changed

+224
-6
lines changed

asv_bench/benchmarks/io/csv.py

+18
Original file line numberDiff line numberDiff line change
@@ -251,4 +251,22 @@ def mem_parser_chunks(self):
251251
pass
252252

253253

254+
class ReadCSVParseSpecialDate(StringIORewind):
    """
    Benchmark ``read_csv`` date parsing of short delimited date strings.

    ``value`` selects the sample input: ``'mY'`` is a mix of month/year
    strings, ``'mdY'`` is a single three-field date string.
    """
    params = (['mY', 'mdY'],)
    # asv looks up parameter labels under ``param_names``; the previous
    # ``params_name`` spelling was silently ignored.
    param_names = ['value']
    objects = {
        'mY': '01-2019\n10-2019\n02/2000\n',
        'mdY': '12/02/2010\n'
    }

    def setup(self, value):
        # Repeat the sample rows so that the timed call parses a large input.
        count_elem = 10000
        data = self.objects[value] * count_elem
        self.StringIO_input = StringIO(data)

    def time_read_special_date(self, value):
        read_csv(self.data(self.StringIO_input), sep=',', header=None,
                 names=['Date'], parse_dates=['Date'])
270+
271+
254272
from ..pandas_vb_common import setup # noqa: F401

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ Performance Improvements
250250
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
251251
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
252252
- Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtype data (:issue:`25708`)
253+
- Improved performance of :meth:`read_csv` by much faster parsing of MM/YYYY and DD/MM/YYYY datetime formats (:issue:`25922`)
253254

254255
.. _whatsnew_0250.bug_fixes:
255256

pandas/_libs/src/headers/portable.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
// GH-23516 - works around locale perf issues
99
// from MUSL libc, MIT Licensed - see LICENSES
1010
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
11+
// Numeric value of the ASCII digit `c` as an int, or `default` when `c` is
// not a digit. `c` is expanded more than once, so pass a side-effect-free
// expression.
#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : (default))
1112
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
1213
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
1314
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))

pandas/_libs/tslibs/parsing.pyx

+110-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ import re
66
import time
77
from io import StringIO
88

9-
from cpython.datetime cimport datetime
9+
from libc.string cimport strchr
1010

11+
from cpython.datetime cimport datetime, datetime_new, import_datetime
12+
from cpython.version cimport PY_VERSION_HEX
13+
import_datetime()
1114

1215
import numpy as np
1316

@@ -24,6 +27,10 @@ from pandas._config import get_option
2427

2528
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
2629
from pandas._libs.tslibs.nattype import nat_strings, NaT
30+
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
31+
32+
cdef extern from "../src/headers/portable.h":
33+
int getdigit_ascii(char c, int default) nogil
2734

2835
# ----------------------------------------------------------------------
2936
# Constants
@@ -42,6 +49,99 @@ cdef:
4249
set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'}
4350

4451
# ----------------------------------------------------------------------
52+
# Characters accepted between the numeric fields of a delimited date.
cdef const char* delimiters = " /-."
# Upper bounds used to range-check the day and month fields.
cdef int MAX_DAYS_IN_MONTH = 31
cdef int MAX_MONTH = 12
55+
56+
57+
cdef inline bint _is_not_delimiter(const char ch):
    """
    Return True when `ch` is not one of the characters in `delimiters`.
    """
    # strchr treats the terminating NUL as part of the searched string, so
    # strchr(delimiters, 0) is non-NULL; reject NUL explicitly so that an
    # embedded NUL byte is never accepted as a date delimiter.
    if ch == 0:
        return True
    return strchr(delimiters, ch) == NULL
59+
60+
61+
cdef inline int _parse_2digit(const char* s):
    """
    Read two ASCII digits starting at `s` and return their decimal value.

    The fallback values handed to getdigit_ascii are large enough that a
    non-digit in either position makes the total negative.
    """
    cdef:
        int tens = getdigit_ascii(s[0], -10)
        int units = getdigit_ascii(s[1], -100)

    return tens * 10 + units
66+
67+
68+
cdef inline int _parse_4digit(const char* s):
    """
    Read four ASCII digits starting at `s` and return their decimal value.

    The fallback values handed to getdigit_ascii are large enough that a
    non-digit in any position makes the total negative.
    """
    cdef:
        int thousands = getdigit_ascii(s[0], -10)
        int hundreds = getdigit_ascii(s[1], -100)
        int tens = getdigit_ascii(s[2], -1000)
        int units = getdigit_ascii(s[3], -10000)

    return thousands * 1000 + hundreds * 100 + tens * 10 + units
75+
76+
77+
cdef inline object _parse_delimited_date(object date_string, bint dayfirst):
    """
    Fast path for all-numeric, delimiter-separated date strings.

    Two layouts are recognised: a 10-character string with two 2-digit
    fields and a 4-digit year (MM/DD/YYYY or DD/MM/YYYY), and a 7-character
    MM/YYYY string.

    With `dayfirst == False` the string is read as MM/DD/YYYY and is only
    reinterpreted as DD/MM/YYYY when the first field is greater than 12.
    With `dayfirst == True` the string is read as DD/MM/YYYY and is only
    reinterpreted as MM/DD/YYYY when the second field is greater than 12.

    Notes
    -----
    For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
    For MM/YYYY: delimiter can be a space or one of /-
    ('.' is refused because e.g. 10.2010 may just as well be a float).

    Parameters
    ----------
    date_string : str
    dayfirst : bint

    Returns
    -------
    datetime, resolution
        Resolution is 'day' for the three-field layout and 'month' for
        MM/YYYY. ``(None, None)`` is returned when the string has a
        different length, an unexpected delimiter, a non-digit in a numeric
        field, or a year below 1000.

    Raises
    ------
    DateParseError
        If the numeric fields cannot form a month/day pair in either order.
    """
    cdef:
        const char* buf
        Py_ssize_t length
        int day = 1, month = 1, year
        bint can_swap = 0

    buf = get_c_string_buf_and_size(date_string, &length)

    if length == 10:
        # parsing MM?DD?YYYY and DD?MM?YYYY dates
        if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]):
            return None, None
        month = _parse_2digit(buf)
        day = _parse_2digit(buf + 3)
        year = _parse_4digit(buf + 6)
        can_swap = 1
        reso = 'day'
    elif length == 7:
        # parsing MM?YYYY dates
        if buf[2] == b'.' or _is_not_delimiter(buf[2]):
            # we cannot reliably tell whether e.g. 10.2010 is a float
            # or a date, thus we refuse to parse it here
            return None, None
        month = _parse_2digit(buf)
        year = _parse_4digit(buf + 3)
        reso = 'month'
    else:
        return None, None

    if month < 0 or day < 0 or year < 1000:
        # a negative field means a non-digit was encountered, so the string
        # does not match either layout
        return None, None

    if (month < 1 or month > MAX_DAYS_IN_MONTH
            or day < 1 or day > MAX_DAYS_IN_MONTH
            or (month > MAX_MONTH and day > MAX_MONTH)):
        # neither ordering of the two fields gives a valid month/day pair
        raise DateParseError(
            "Invalid date specified ({}/{})".format(month, day))

    if can_swap and (month > MAX_MONTH or (dayfirst and day <= MAX_MONTH)):
        # the first field is the day: either it cannot be a month, or the
        # caller asked for day-first and the second field can be a month
        day, month = month, day

    if PY_VERSION_HEX >= 0x03060100:
        # In Python <= 3.6.0 there is no range checking for invalid dates
        # in C api, thus we call faster C version for 3.6.1 or newer
        return datetime_new(year, month, day, 0, 0, 0, 0, None), reso
    return datetime(year, month, day, 0, 0, 0, 0, None), reso
45145

46146

47147
def parse_datetime_string(date_string, freq=None, dayfirst=False,
@@ -66,6 +166,10 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False,
66166
yearfirst=yearfirst, **kwargs)
67167
return dt
68168

169+
dt, _ = _parse_delimited_date(date_string, dayfirst)
170+
if dt is not None:
171+
return dt
172+
69173
try:
70174
dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
71175
return dt
@@ -146,6 +250,10 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
146250
if not _does_string_look_like_datetime(date_string):
147251
raise ValueError('Given date string not likely a datetime.')
148252

253+
parsed, reso = _parse_delimited_date(date_string, dayfirst)
254+
if parsed is not None:
255+
return parsed, parsed, reso
256+
149257
try:
150258
return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
151259
except DateParseError:
@@ -279,7 +387,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
279387
except ValueError:
280388
pass
281389

282-
for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']:
390+
for pat in ['%Y-%m', '%b %Y', '%b-%Y']:
283391
try:
284392
ret = datetime.strptime(date_string, pat)
285393
return ret, ret, 'month'

pandas/tests/io/parser/test_parse_dates.py

+94-4
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,16 @@
88
from datetime import date, datetime
99
from io import StringIO
1010

11-
from dateutil.parser import parse
11+
from dateutil.parser import parse as du_parse
12+
from hypothesis import given, settings, strategies as st
1213
import numpy as np
1314
import pytest
1415
import pytz
1516

1617
from pandas._libs.tslib import Timestamp
1718
from pandas._libs.tslibs import parsing
18-
from pandas.compat import lrange
19+
from pandas._libs.tslibs.parsing import parse_datetime_string
20+
from pandas.compat import is_platform_windows, lrange
1921
from pandas.compat.numpy import np_array_datetime64_compat
2022

2123
import pandas as pd
@@ -26,6 +28,15 @@
2628
import pandas.io.date_converters as conv
2729
import pandas.io.parsers as parsers
2830

31+
# Default datetime handed to dateutil's parser in the hypothesis test below
_DEFAULT_DATETIME = datetime(1, 1, 1)

# Hypothesis strategy producing the datetimes that get formatted and parsed.
# NOTE(review): the lower bound on Windows presumably works around a
# platform limitation with early years -- confirm.
date_strategy = (
    st.datetimes(min_value=datetime(1900, 1, 1))
    if is_platform_windows()
    else st.datetimes()
)
39+
2940

3041
def test_separator_date_conflict(all_parsers):
3142
# Regression test for gh-4678
@@ -439,7 +450,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
439450
"""
440451
if "dayfirst" in kwargs:
441452
df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
442-
date_parser=lambda d: parse(d, **kwargs),
453+
date_parser=lambda d: du_parse(d, **kwargs),
443454
header=0, index_col=0, parse_dates=True,
444455
na_values=["NA"])
445456
exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
@@ -451,7 +462,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
451462
msg = "got an unexpected keyword argument 'day_first'"
452463
with pytest.raises(TypeError, match=msg):
453464
parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
454-
date_parser=lambda d: parse(d, **kwargs),
465+
date_parser=lambda d: du_parse(d, **kwargs),
455466
skiprows=[0], index_col=0, parse_dates=True,
456467
na_values=["NA"])
457468

@@ -849,3 +860,82 @@ def test_parse_timezone(all_parsers):
849860

850861
expected = DataFrame(expected_data)
851862
tm.assert_frame_equal(result, expected)
863+
864+
865+
@pytest.mark.parametrize("date_string", [
    "32/32/2019",
    "02/30/2019",
    "13/13/2019",
    "13/2019",
    "a3/11/2018",
    "10/11/2o17"
])
def test_invalid_parse_delimited_date(all_parsers, date_string):
    # Strings that look like delimited dates but are not valid ones must be
    # left untouched: the column stays object dtype with the raw text.
    result = all_parsers.read_csv(StringIO(date_string),
                                  header=None, parse_dates=[0])
    unparsed = DataFrame({0: [date_string]}, dtype="object")
    tm.assert_frame_equal(result, unparsed)
879+
880+
881+
@pytest.mark.parametrize("date_string,dayfirst,expected", [
    # %d/%m/%Y; first field > 12, so it has to be the day
    ("13/02/2019", False, datetime(2019, 2, 13)),
    ("13/02/2019", True, datetime(2019, 2, 13)),
    # %m/%d/%Y; second field > 12, so the fields are never swapped
    ("02/13/2019", False, datetime(2019, 2, 13)),
    ("02/13/2019", True, datetime(2019, 2, 13)),
    # %d/%m/%Y; ambiguous fields are swapped because dayfirst is True
    ("04/02/2019", True, datetime(2019, 2, 4))
])
def test_parse_delimited_date_swap(all_parsers, date_string,
                                   dayfirst, expected):
    # Check how day and month are assigned for each dayfirst setting.
    result = all_parsers.read_csv(StringIO(date_string), header=None,
                                  dayfirst=dayfirst, parse_dates=[0])
    expected_frame = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    tm.assert_frame_equal(result, expected_frame)
898+
899+
900+
def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
901+
msg, result = None, None
902+
try:
903+
result = call(date_string, **kwargs)
904+
except ValueError as er:
905+
msg = str(er)
906+
pass
907+
return msg, result
908+
909+
910+
@given(date_strategy)
@settings(deadline=None)
@pytest.mark.parametrize("delimiter", list(" -./"))
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize("date_format", [
    "%d %m %Y",
    "%m %d %Y",
    "%m %Y",
    "%Y %m %d",
    "%y %m %d",
    "%Y%m%d",
    "%y%m%d",
])
def test_hypothesis_delimited_date(date_format, dayfirst,
                                   delimiter, test_datetime):
    """
    Compare parse_datetime_string against dateutil on generated dates.

    For every generated datetime, format it with ``date_format`` (spaces
    replaced by ``delimiter``) and check that both parsers either return
    the same value or fail with the same ValueError message.
    """
    if date_format == "%m %Y" and delimiter == ".":
        # Implicit concatenation keeps source indentation out of the
        # message, unlike a backslash continuation inside the literal.
        pytest.skip("parse_datetime_string cannot reliably tell whether "
                    "e.g. %m.%Y is a float or a date, thus we skip it")
    date_string = test_datetime.strftime(date_format.replace(' ', delimiter))

    except_out_dateutil, result = _helper_hypothesis_delimited_date(
        parse_datetime_string, date_string,
        dayfirst=dayfirst)
    except_in_dateutil, expected = _helper_hypothesis_delimited_date(
        du_parse, date_string,
        default=_DEFAULT_DATETIME,
        dayfirst=dayfirst, yearfirst=False)

    assert except_out_dateutil == except_in_dateutil
    assert result == expected

0 commit comments

Comments
 (0)