diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c51fb09ad8671..36ba7c569d34f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -96,6 +96,35 @@ def time_read_csv(self, infer_datetime_format, format): infer_datetime_format=infer_datetime_format) +class ReadCSVConcatDatetime(StringIORewind): + + iso8601 = '%Y-%m-%d %H:%M:%S' + + def setup(self): + rng = date_range('1/1/2000', periods=50000, freq='S') + self.StringIO_input = StringIO('\n'.join( + rng.strftime(self.iso8601).tolist())) + + def time_read_csv(self): + read_csv(self.data(self.StringIO_input), + header=None, names=['foo'], parse_dates=['foo'], + infer_datetime_format=False) + + +class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + + params = (['nan', '0', ''],) + param_names = ['bad_date_value'] + + def setup(self, bad_date_value): + self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000) + + def time_read_csv(self, bad_date_value): + read_csv(self.data(self.StringIO_input), + header=None, names=['foo', 'bar'], parse_dates=['foo'], + infer_datetime_format=False) + + class ReadCSVSkipRows(BaseIO): fname = '__test__.csv' diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py new file mode 100644 index 0000000000000..493955d394443 --- /dev/null +++ b/asv_bench/benchmarks/io/parsers.py @@ -0,0 +1,34 @@ +import numpy as np + +from pandas._libs.tslibs.parsing import ( + _concat_date_cols, _does_string_look_like_datetime) + + +class DoesStringLookLikeDatetime(object): + + params = (['2Q2005', '0.0', '10000'],) + param_names = ['value'] + + def setup(self, value): + self.objects = [value] * 1000000 + + def time_check_datetimes(self, value): + for obj in self.objects: + _does_string_look_like_datetime(obj) + + +class ConcatDateCols(object): + + params = ([1234567890, 'AAAA'], [1, 2]) + param_names = ['value', 'dim'] + + def setup(self, value, dim): + count_elem = 10000 + if dim == 1: + self.object = 
(np.array([value] * count_elem),) + if dim == 2: + self.object = (np.array([value] * count_elem), + np.array([value] * count_elem)) + + def time_check_concat(self, value, dim): + _concat_date_cols(self.object) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 578e24009d35a..61d3a8f8ed517 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -252,6 +252,8 @@ Performance Improvements - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) +- Improved performance of :meth:`read_csv` by concatenating date columns faster, without the extra conversion to string for integer/float zero + and float NaN, and by checking more quickly whether a string could be a date (:issue:`25754`) .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7f66b93b58a1a..c09fb96eb9182 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -8,10 +8,9 @@ import warnings import cython from cython import Py_ssize_t -from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, - PyTuple_New, - Py_EQ, - PyObject_RichCompareBool) +from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyTuple_New, PyObject_Str, + Py_EQ, Py_SIZE, PyObject_RichCompareBool, + PyUnicode_Join, PyList_New) from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, @@ -23,10 +22,8 @@ cimport numpy as cnp from numpy cimport (ndarray, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, - int64_t, - float32_t, float64_t, - uint8_t, uint64_t, - complex128_t) + int64_t, float32_t, float64_t, + uint8_t, uint64_t, complex128_t) cnp.import_array() cdef extern from "numpy/arrayobject.h": diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 1c8bfe4b4bc20..068ad016459a8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,11 +7,19 @@ from io import StringIO from libc.string cimport strchr +import cython + +from cpython cimport PyObject_Str, PyUnicode_Join + from cpython.datetime cimport datetime, datetime_new, import_datetime from cpython.version cimport PY_VERSION_HEX import_datetime() import numpy as np +cimport numpy as cnp +from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, + PyArray_IterNew, flatiter, float64_t) +cnp.import_array() # dateutil compat from dateutil.tz import (tzoffset, @@ -26,11 +34,16 @@ from pandas._config import get_option from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS from pandas._libs.tslibs.nattype import nat_strings, NaT -from pandas._libs.tslibs.util cimport get_c_string_buf_and_size +from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size cdef extern from 
"../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil +cdef extern from "../src/parser/tokenizer.h": + double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing, int *error, int *maybe_int) + + # ---------------------------------------------------------------------- # Constants @@ -302,20 +315,48 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True +cpdef bint _does_string_look_like_datetime(object py_string): + """ + Checks whether given string is a datetime: it has to start with '0' or + be greater than or equal to 1000. - try: - if float(date_string) < 1000: - return False - except ValueError: - pass + Parameters + ---------- + py_string: object - if date_string in _not_datelike_strings: - return False + Returns + ------- + whether given string is a datetime + """ + cdef: + const char *buf + char *endptr = NULL + Py_ssize_t length = -1 + double converted_date + char first + int error = 0 + + buf = get_c_string_buf_and_size(py_string, &length) + if length >= 1: + first = buf[0] + if first == b'0': + # Strings starting with 0 are more consistent with a + # date-like string than a number + return True + elif py_string in _not_datelike_strings: + return False + else: + # xstrtod with such parameters copies behavior of python `float` + # cast; for example, " 35.e-1 " is valid string for this cast so, + # to call xstrtod correctly it is necessary to pass these params: + # b'.' 
- a dot is used as separator, b'e' - an exponential form of + # a float number can be used, b'\0' - not to use a thousand + # separator, 1 - skip extra spaces before and after, + converted_date = xstrtod(buf, &endptr, + b'.', b'e', b'\0', 1, &error, NULL) + # if there were no errors and the whole line was parsed, then ... + if error == 0 and endptr == buf + length: + return converted_date >= 1000 return True @@ -857,3 +898,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, return guessed_format else: return None + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline object convert_to_unicode(object item, + bint keep_trivial_numbers): + """ + Convert `item` to str. + + Parameters + ---------- + item : object + keep_trivial_numbers : bool + if True, then conversion (to string from integer/float zero) + is not performed + + Returns + ------- + str or int or float + """ + cdef: + float64_t float_item + + if keep_trivial_numbers: + if isinstance(item, int): + if item == 0: + return item + elif isinstance(item, float): + float_item = item + if float_item == 0.0 or float_item != float_item: + return item + + if not isinstance(item, str): + item = PyObject_Str(item) + + return item + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): + """ + Concatenates elements from numpy arrays in `date_cols` into strings. 
+ + Parameters + ---------- + date_cols : tuple of numpy arrays + keep_trivial_numbers : bool, default True + if True and len(date_cols) == 1, then + conversion (to string from integer/float zero) is not performed + + Returns + ------- + arr_of_rows : ndarray (dtype=object) + + Examples + -------- + >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) + >>> times=np.array(['11:20', '10:45'], dtype=object) + >>> result = _concat_date_cols((dates, times)) + >>> result + array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) + """ + cdef: + Py_ssize_t rows_count = 0, col_count = len(date_cols) + Py_ssize_t col_idx, row_idx + list list_to_join + cnp.ndarray[object] iters + object[::1] iters_view + flatiter it + cnp.ndarray[object] result + object[:] result_view + + if col_count == 0: + return np.zeros(0, dtype=object) + + if not all(is_array(array) for array in date_cols): + raise ValueError("not all elements from date_cols are numpy arrays") + + rows_count = min(len(array) for array in date_cols) + result = np.zeros(rows_count, dtype=object) + result_view = result + + if col_count == 1: + array = date_cols[0] + it = PyArray_IterNew(array) + for row_idx in range(rows_count): + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) + result_view[row_idx] = convert_to_unicode(item, + keep_trivial_numbers) + PyArray_ITER_NEXT(it) + else: + # create fixed size list - more efficient memory allocation + list_to_join = [None] * col_count + iters = np.zeros(col_count, dtype=object) + + # create memoryview of iters ndarray, that will contain some + # flatiter's for each array in `date_cols` - more efficient indexing + iters_view = iters + for col_idx, array in enumerate(date_cols): + iters_view[col_idx] = PyArray_IterNew(array) + + # array elements that are on the same line are converted to one string + for row_idx in range(rows_count): + for col_idx, array in enumerate(date_cols): + # this cast is needed, because we did not find a way + # to efficiently store 
`flatiter` type objects in ndarray + it = iters_view[col_idx] + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) + list_to_join[col_idx] = convert_to_unicode(item, False) + PyArray_ITER_NEXT(it) + result_view[row_idx] = PyUnicode_Join(' ', list_to_join) + + return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3fde2c2bf4dd..f25142fcfcf58 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3186,7 +3186,7 @@ def _make_date_converter(date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True): def converter(*date_cols): if date_parser is None: - strs = _concat_date_cols(date_cols) + strs = parsing._concat_date_cols(date_cols) try: return tools.to_datetime( @@ -3216,10 +3216,11 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - parsing.try_parse_dates(_concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst), + parsing.try_parse_dates( + parsing._concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst), cache=cache_dates, errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) @@ -3511,15 +3512,6 @@ def _get_col_names(colspec, columns): return colnames -def _concat_date_cols(date_cols): - if len(date_cols) == 1: - return np.array([str(x) for x in date_cols[0]], dtype=object) - - rs = np.array([' '.join(str(y) for y in x) - for x in zip(*date_cols)], dtype=object) - return rs - - class FixedWidthReader(BaseIterator): """ A reader of fixed-width lines. 
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 6c4dfe2ffa1fa..46353b5345018 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -19,12 +19,11 @@ from pandas.compat.numpy import np_array_datetime64_compat import pandas as pd -from pandas import DataFrame, DatetimeIndex, Index, MultiIndex +from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series from pandas.core.indexes.datetimes import date_range import pandas.util.testing as tm import pandas.io.date_converters as conv -import pandas.io.parsers as parsers # constant _DEFAULT_DATETIME = datetime(1, 1, 1) @@ -76,7 +75,7 @@ def date_parser(*date_cols): ------- parsed : Series """ - return parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + return parsing.try_parse_dates(parsing._concat_date_cols(date_cols)) result = parser.read_csv(StringIO(data), header=None, date_parser=date_parser, prefix="X", @@ -117,6 +116,18 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("container", [list, tuple, Index, Series]) +@pytest.mark.parametrize("dim", [1, 2]) +def test_concat_date_col_fail(container, dim): + msg = "not all elements from date_cols are numpy arrays" + value = "19990127" + + date_cols = tuple(container([value]) for _ in range(dim)) + + with pytest.raises(ValueError, match=msg): + parsing._concat_date_cols(date_cols) + + @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ diff --git a/setup.py b/setup.py index d121a54ded2a1..0dbf93ec925e0 100755 --- a/setup.py +++ b/setup.py @@ -634,7 +634,8 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): 'sources': np_datetime_sources}, '_libs.tslibs.parsing': { 'pyxfile': '_libs/tslibs/parsing', - 'include': []}, + 'depends': ['pandas/_libs/src/parser/tokenizer.h'], + 'sources': 
['pandas/_libs/src/parser/tokenizer.c']}, '_libs.tslibs.period': { 'pyxfile': '_libs/tslibs/period', 'include': ts_include, @@ -762,6 +763,9 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): extra_link_args=extra_link_args) extensions.append(_move_ext) +# ---------------------------------------------------------------------- + + # The build cache system does string matching below this point. # if you change something, be careful.