From 45dfa46410ab1c5198637de91d4d5e9f46866e5c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 17 Mar 2019 19:48:53 +0300 Subject: [PATCH 01/42] PERF: rewrited _concat_date_cols function on C with removing extra conversation for integer/float zero and float NaN; rewrited _does_string_look_like_datetime on C --- pandas/_libs/tslibs/parsing.pyx | 18 - .../_libs/tslibs/src/datetime/datehelpers.c | 428 ++++++++++++++++++ pandas/io/parsers.py | 18 +- setup.py | 18 + 4 files changed, 453 insertions(+), 29 deletions(-) create mode 100644 pandas/_libs/tslibs/src/datetime/datehelpers.c diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 1c8bfe4b4bc20..72de28c5ac54b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -302,24 +302,6 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True - - try: - if float(date_string) < 1000: - return False - except ValueError: - pass - - if date_string in _not_datelike_strings: - return False - - return True - - cdef inline object _parse_dateabbr_string(object date_string, object default, object freq): cdef: diff --git a/pandas/_libs/tslibs/src/datetime/datehelpers.c b/pandas/_libs/tslibs/src/datetime/datehelpers.c new file mode 100644 index 0000000000000..2a4ced54d753f --- /dev/null +++ b/pandas/_libs/tslibs/src/datetime/datehelpers.c @@ -0,0 +1,428 @@ +#include + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include + +#include + +#include "../../../src/inline_helper.h" +#include "../../../src/parser/tokenizer.h" + +#if PY_MAJOR_VERSION >= 3 + #define PY_STRING_CHECK(string) (PyUnicode_Check(string)) +#else + #define PY_STRING_CHECK(string) \ + (PyString_Check(string) || PyUnicode_Check(string)) +#endif + +int PANDAS_INLINE convert_and_set_item(PyObject *item, Py_ssize_t index, + PyArrayObject *result, + int keep_trivial_numbers) { + int needs_decref = 0, do_convert = 1; + if (item == NULL) { + return 0; + } + if (keep_trivial_numbers) { + // don't convert an integer if it's zero, + // don't convert a float if it's zero or NaN +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(item)) { + PyLongObject* v = (PyLongObject*)item; + switch (Py_SIZE(v)) { + case 0: + do_convert = 0; + break; + case 1: // fallthrough + case -1: + if (v->ob_digit[0] == 0) { + do_convert = 0; + } + } +#else + if (PyInt_CheckExact(item)) { + if (((PyIntObject*)item)->ob_ival == 0) do_convert = 0; +#endif + } else if (PyFloat_Check(item)) { + double v = PyFloat_AS_DOUBLE(item); + if (v == 0.0 || v != v) { + do_convert = 0; + } + } + } + + if (do_convert) { + if (!PY_STRING_CHECK(item)) { + PyObject *str_item = PyObject_Str(item); + if (str_item == NULL) { + return 0; + } + item = str_item; + needs_decref = 1; + } + } + if (PyArray_SETITEM(result, PyArray_GETPTR1(result, index), item) != 0) { + PyErr_SetString(PyExc_RuntimeError, "Cannot set resulting item"); + if (needs_decref) Py_DECREF(item); + return 0; + } + if (needs_decref) Py_DECREF(item); + return 1; +} + +static int put_object_as_string(PyObject* list, Py_ssize_t idx, + PyObject* item) { + if (!PY_STRING_CHECK(item)) { + PyObject* str_item = PyObject_Str(item); + if (str_item == NULL) { + return 0; + } + Py_DECREF(item); + item = str_item; + } + return (PyList_SetItem(list, idx, item) == 0) ? 1 : 0; +} + +static PyObject* free_arrays(PyObject** arrays, Py_ssize_t size) { + PyObject** item = arrays; + Py_ssize_t i; + for (i = 0; i < size; ++i, ++item) Py_DECREF(*item); + free(arrays); + return NULL; +} + +static PyObject* concat_date_cols(PyObject *self, PyObject *args, + PyObject *kwds) { + PyObject *sequence = NULL; + PyObject *py_keep_trivial_numbers = NULL; + PyArrayObject *result = NULL; + Py_ssize_t sequence_size = 0; + int keep_trivial_numbers; + char* kwlist[] = {"", "keep_trivial_numbers", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, + &sequence, &py_keep_trivial_numbers)) { + return NULL; + } + if (!PySequence_Check(sequence)) { + PyErr_SetString(PyExc_TypeError, "argument must be sequence"); + return NULL; + } + keep_trivial_numbers = (py_keep_trivial_numbers != NULL) ? \ + PyObject_IsTrue(py_keep_trivial_numbers) : 0; + + sequence_size = PySequence_Size(sequence); + if (sequence_size == -1) { + return NULL; + } else if (sequence_size == 0) { + npy_intp dims[1]; + dims[0] = 0; + result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); + return (PyObject*)result; + } else if (sequence_size == 1) { + PyObject* array = PySequence_GetItem(sequence, 0); + Py_ssize_t array_size; + if (array == NULL) { + return NULL; + } + + array_size = PySequence_Size(array); + if (array_size == -1) { + Py_DECREF(array); + return NULL; + } + + { + npy_intp dims[1]; + dims[0] = array_size; + result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); + if (result == NULL) { + Py_DECREF(array); + return NULL; + } + } + + if (PyArray_CheckExact(array)) { + PyArrayObject *ndarray = (PyArrayObject*)array; + Py_ssize_t i; + for (i = 0; i < array_size; ++i) { + PyObject *item = PyArray_GETITEM(ndarray, + PyArray_GETPTR1(ndarray, i)); + if (!convert_and_set_item(item, i, result, + keep_trivial_numbers)) { + Py_DECREF(result); + Py_DECREF(array); + Py_DECREF(item); + return NULL; + } + Py_DECREF(item); + } + } else { + PyObject* fast_array = PySequence_Fast(array, + "elements of input sequence must be sequence"); + Py_ssize_t i; + if (fast_array == NULL) { + Py_DECREF(result); + Py_DECREF(array); + // PySequence_Fast set message, which in second argument + return NULL; + } + + for (i = 0; i < array_size; ++i) { + PyObject* item = PySequence_Fast_GET_ITEM(fast_array, i); + if (!convert_and_set_item(item, i, result, + keep_trivial_numbers)) { + Py_DECREF(result); + Py_DECREF(array); + Py_DECREF(fast_array); + return NULL; + } + } + Py_DECREF(fast_array); + } + Py_DECREF(array); + return (PyObject*)result; + } else { + size_t mem_size = sizeof(PyObject*) * sequence_size; + PyObject **arrays = (PyObject**) malloc(mem_size); + PyObject *array = NULL; + PyObject **parray = NULL; + PyObject *fast_array = NULL; + PyObject *separator = NULL; + PyObject *item = NULL; + PyObject *list_to_join = NULL; + Py_ssize_t min_array_size = 0; + int all_numpy = 1; + Py_ssize_t i; + for (i = 0; i < sequence_size; ++i) { + array = PySequence_GetItem(sequence, i); + if (array == NULL) { + return free_arrays(arrays, i); + } + if (PyArray_CheckExact(array)) { + if (PyArray_NDIM((PyArrayObject*)array) != 1) { + PyErr_SetString(PyExc_ValueError, + "ndarrays must be 1-dimentional"); + return free_arrays(arrays, i); + } + } else { + all_numpy = 0; + } + arrays[i] = array; + } + + parray = arrays; + if (all_numpy) { + Py_ssize_t i; + for (i = 0; i < sequence_size; ++i, ++parray) { + Py_ssize_t array_size = PyArray_SIZE((PyArrayObject*)(*parray)); + + if (array_size < 0) { + return free_arrays(arrays, sequence_size); + } + + if (array_size < min_array_size || min_array_size == 0) { + min_array_size = array_size; + } + } + } else { + Py_ssize_t i; + for (i = 0; i < sequence_size; ++i, ++parray) { + Py_ssize_t array_size; + fast_array = PySequence_Fast(*parray, + "elements of input sequence must be sequence"); + array_size = (fast_array != NULL) ? \ + PySequence_Fast_GET_SIZE(fast_array) : -1; + + if (array_size < 0) { + Py_XDECREF(fast_array); + return free_arrays(arrays, sequence_size); + } + Py_DECREF(array); + arrays[i] = fast_array; + + if (array_size < min_array_size || min_array_size == 0) { + min_array_size = array_size; + } + } + } + + { + npy_intp dims[1]; + dims[0] = min_array_size; + result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); + if (result == NULL) { + return free_arrays(arrays, sequence_size); + } + } + + separator = PyUnicode_FromFormat(" "); + if (separator == NULL) { + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + list_to_join = PyList_New(sequence_size); + for (i = 0; i < min_array_size; ++i) { + PyObject *result_string = NULL; + parray = arrays; + if (all_numpy) { + Py_ssize_t j; + for (j = 0; j < sequence_size; ++j, ++parray) { + PyArrayObject* arr = (PyArrayObject*)(*parray); + item = PyArray_GETITEM(arr, PyArray_GETPTR1(arr, i)); + if (item == NULL) { + Py_DECREF(list_to_join); + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + if (!put_object_as_string(list_to_join, j, item)) { + Py_DECREF(item); + Py_DECREF(list_to_join); + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + } + } else { + Py_ssize_t j; + for (j = 0; j < sequence_size; ++j, ++parray) { + item = PySequence_Fast_GET_ITEM(*parray, i); + if (item == NULL) { + Py_DECREF(list_to_join); + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + Py_INCREF(item); + if (!put_object_as_string(list_to_join, j, item)) { + Py_DECREF(item); + Py_DECREF(list_to_join); + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + } + } + result_string = PyUnicode_Join(separator, list_to_join); + if (result_string == NULL) { + Py_DECREF(list_to_join); + Py_DECREF(result); + return free_arrays(arrays, sequence_size); + } + if (PyArray_SETITEM(result, PyArray_GETPTR1(result, i), + result_string) != 0) { + PyErr_SetString(PyExc_RuntimeError, + "Cannot set resulting item"); + Py_DECREF(list_to_join); + Py_DECREF(result); + Py_DECREF(result_string); + return free_arrays(arrays, sequence_size); + } + Py_DECREF(result_string); + } + Py_DECREF(list_to_join); + (void)free_arrays(arrays, sequence_size); + return (PyObject*)result; + } +} + +static char not_datelike[sizeof(char) * 256]; + +static PyObject* _does_string_look_like_datetime(PyObject* unused, + PyObject* arg) { + char *buf = NULL, *endptr = NULL; + Py_ssize_t length = -1; + double converted_date; + int error = 0; + int result = 1; + +#if PY_MAJOR_VERSION == 2 + if (!PyString_CheckExact(arg)) { + if (!PyUnicode_CheckExact(arg)) { + // arg is not a string, so it's certainly + // not a datetime-looking string + PyErr_SetString(PyExc_ValueError, + "_does_string_look_like_datetime expects a string"); + return NULL; + } + buf = PyUnicode_AS_DATA(arg); + length = (int)PyUnicode_GET_SIZE(arg); + } else { + if (PyString_AsStringAndSize(arg, &buf, &length) == -1) { + return NULL; + } + } +#else + if (!PyUnicode_CheckExact(arg) || !PyUnicode_IS_READY(arg)) { + PyErr_SetString(PyExc_ValueError, + "_does_string_look_like_datetime expects a string"); + return NULL; + } + buf = PyUnicode_DATA(arg); + length = PyUnicode_GET_LENGTH(arg); +#endif + + if (length >= 1) { + char first = *buf; + if (first == '0') { + result = 1; + } else if (length == 1 && not_datelike[Py_CHARMASK(first)]) { + result = 0; + } else { + converted_date = xstrtod(buf, &endptr, '.', 'e', '\0', 1); + if ((errno == 0) && (endptr == buf + length)) { + result = (converted_date >= 1000) ? 1 : 0; + } + } + } + + if (result) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyMethodDef module_methods[] = { + /* name from python, name in C-file, ..., __doc__ string of method */ + { + "concat_date_cols", (PyCFunction)concat_date_cols, + METH_VARARGS | METH_KEYWORDS, + "concatenates date cols and returns numpy array" + }, + { + "_does_string_look_like_datetime", _does_string_look_like_datetime, + METH_O, + "checks if string looks like a datetime" + }, + {NULL, NULL, 0, NULL} +}; + +#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "datehelpers", // name of module + "helpers for datetime structures manipulation", // module documentation + -1, // size of per-interpreter state of the module, + // or -1 if the module keeps state in global variables. + module_methods +}; +#define PY_DATEHELPERS_MODULE_INIT PyMODINIT_FUNC PyInit_datehelpers(void) +#define PY_MODULE_CREATE PyModule_Create(&moduledef) +#define PY_RETURN_MODULE return module +#else +#define PY_DATEHELPERS_MODULE_INIT void initdatehelpers(void) +#define PY_MODULE_CREATE Py_InitModule("datehelpers", module_methods) +#define PY_RETURN_MODULE +#endif + +PY_DATEHELPERS_MODULE_INIT { + PyObject *module = NULL; + import_array(); + + module = PY_MODULE_CREATE; + + memset(not_datelike, 0, sizeof(not_datelike)); + not_datelike['a'] = not_datelike['A'] = 1; + not_datelike['m'] = not_datelike['M'] = 1; + not_datelike['p'] = not_datelike['P'] = 1; + not_datelike['t'] = not_datelike['T'] = 1; + + PY_RETURN_MODULE; +} diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3fde2c2bf4dd..d884007725d6e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,6 +13,7 @@ import numpy as np +from pandas._libs.datehelpers import concat_date_cols as _concat_date_cols import pandas._libs.lib as lib import pandas._libs.ops as libops import pandas._libs.parsers as parsers @@ -3186,7 +3187,7 @@ def _make_date_converter(date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True): def converter(*date_cols): if date_parser is None: - strs = _concat_date_cols(date_cols) + strs = _concat_date_cols(date_cols, keep_trivial_numbers=True) try: return tools.to_datetime( @@ -3216,7 +3217,11 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - parsing.try_parse_dates(_concat_date_cols(date_cols), + parsing.try_parse_dates( + _concat_date_cols( + date_cols, + keep_trivial_numbers=True + ), parser=date_parser, dayfirst=dayfirst), cache=cache_dates, @@ -3511,15 +3516,6 @@ def _get_col_names(colspec, columns): return colnames -def _concat_date_cols(date_cols): - if len(date_cols) == 1: - return np.array([str(x) for x in date_cols[0]], dtype=object) - - rs = np.array([' '.join(str(y) for y in x) - for x in zip(*date_cols)], dtype=object) - return rs - - class FixedWidthReader(BaseIterator): """ A reader of fixed-width lines. diff --git a/setup.py b/setup.py index d121a54ded2a1..18b282d17a28b 100755 --- a/setup.py +++ b/setup.py @@ -243,6 +243,7 @@ def initialize_options(self): ujson_lib = pjoin(base, 'ujson', 'lib') self._clean_exclude = [pjoin(dt, 'np_datetime.c'), pjoin(dt, 'np_datetime_strings.c'), + pjoin(dt, 'datehelpers.c'), pjoin(parser, 'tokenizer.c'), pjoin(parser, 'io.c'), pjoin(ujson_python, 'ujson.c'), @@ -762,6 +763,23 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): extra_link_args=extra_link_args) extensions.append(_move_ext) +# ---------------------------------------------------------------------- +# datehelpers +datehelpers_sources = [ + 'pandas/_libs/tslibs/src/datetime/datehelpers.c', + 'pandas/_libs/src/parser/tokenizer.c' +] +datehelpers_ext = Extension('pandas._libs.datehelpers', + depends=[ + 'pandas/_libs/src/parser/tokenizer.h' + ], + sources=datehelpers_sources, + include_dirs=['pandas/_libs/src/klib/'], + extra_compile_args=extra_compile_args, + define_macros=macros) +extensions.append(datehelpers_ext) + + # The build cache system does string matching below this point. # if you change something, be careful. From 1531ec9caa9a7fc2f51f643b4585f1646b7bfabb Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 18 Mar 2019 14:40:40 +0300 Subject: [PATCH 02/42] perf bench for _concat_date_cols --- asv_bench/benchmarks/io/csv.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c51fb09ad8671..36ba7c569d34f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -96,6 +96,35 @@ def time_read_csv(self, infer_datetime_format, format): infer_datetime_format=infer_datetime_format) +class ReadCSVConcatDatetime(StringIORewind): + + iso8601 = '%Y-%m-%d %H:%M:%S' + + def setup(self): + rng = date_range('1/1/2000', periods=50000, freq='S') + self.StringIO_input = StringIO('\n'.join( + rng.strftime(self.iso8601).tolist())) + + def time_read_csv(self): + read_csv(self.data(self.StringIO_input), + header=None, names=['foo'], parse_dates=['foo'], + infer_datetime_format=False) + + +class ReadCSVConcatDatetimeBadDateValue(StringIORewind): + + params = (['nan', '0', ''],) + param_names = ['bad_date_value'] + + def setup(self, bad_date_value): + self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000) + + def time_read_csv(self, bad_date_value): + read_csv(self.data(self.StringIO_input), + header=None, names=['foo', 'bar'], parse_dates=['foo'], + infer_datetime_format=False) + + class ReadCSVSkipRows(BaseIO): fname = '__test__.csv' From 0756da952b04e74998f3aa4bf2cc4b47c357cff5 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 18 Mar 2019 09:55:03 -0500 Subject: [PATCH 03/42] Add benchmark for _does_string_look_like_datetime --- asv_bench/benchmarks/io/parsers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 asv_bench/benchmarks/io/parsers.py diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py new file mode 100644 index 0000000000000..2575521002528 --- /dev/null +++ b/asv_bench/benchmarks/io/parsers.py @@ -0,0 +1,20 @@ +from pandas._libs.tslibs.parsing import _does_string_look_like_datetime + + +class DoesStringLookLikeDatetime(object): + + params = (['2Q2005', '0.0', '10000'],) + param_names = ['value'] + + def setup(self, value): + self.objects = [value] * 1000000 + + def time_check_datetimes(self, value): + for obj in self.objects: + try: + _does_string_look_like_datetime(obj) + except ValueError: + pass + + +from ..pandas_vb_common import setup # noqa: F401 From 36b8bdb60d60dbccc95ad80f12dcaf4607870b0c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 19 Mar 2019 16:30:23 +0300 Subject: [PATCH 04/42] implemented _does_string_look_like_datetime in cython --- pandas/_libs/tslibs/parsing.pyx | 42 ++++++++++++ .../_libs/tslibs/src/datetime/datehelpers.c | 68 ------------------- setup.py | 4 +- 3 files changed, 45 insertions(+), 69 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 72de28c5ac54b..a2871ca353a23 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -6,6 +6,7 @@ import time from io import StringIO from libc.string cimport strchr +from cpython cimport PyUnicode_Check, PyBytes_Check, PyBytes_AsStringAndSize from cpython.datetime cimport datetime, datetime_new, import_datetime from cpython.version cimport PY_VERSION_HEX @@ -31,6 +32,21 @@ from pandas._libs.tslibs.util cimport get_c_string_buf_and_size cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil +cdef extern from "../src/parser/tokenizer.h": + double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, + int skip_trailing) + +cdef extern from *: + char* PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t* length) + +cdef inline bint get_string_data(object s, char **buf, Py_ssize_t *length): + if PyUnicode_Check(s): + buf[0] = PyUnicode_AsUTF8AndSize(s, length) + return buf[0] != NULL + if PyBytes_Check(s): + return PyBytes_AsStringAndSize(s, buf, length) == 0 + return False + # ---------------------------------------------------------------------- # Constants @@ -45,6 +61,8 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, cdef: set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} + set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} + # ---------------------------------------------------------------------- cdef: const char* delimiters = " /-." @@ -302,6 +320,30 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso +cpdef bint _does_string_look_like_datetime(object date_string): + cdef: + char *buf = NULL + char *endptr = NULL + Py_ssize_t length = -1 + double converted_date + char first + + if not get_string_data(date_string, &buf, &length): + return False + if length >= 1: + first = buf[0] + if first == '0': + return True + elif length == 1 and date_string in _not_datelike_strings: + return False + else: + converted_date = xstrtod(buf, &endptr, '.', 'e', '\0', 1) + if errno == 0 and endptr == buf + length: + return converted_date >= 1000 + + return True + + cdef inline object _parse_dateabbr_string(object date_string, object default, object freq): cdef: diff --git a/pandas/_libs/tslibs/src/datetime/datehelpers.c b/pandas/_libs/tslibs/src/datetime/datehelpers.c index 2a4ced54d753f..551631475b4f6 100644 --- a/pandas/_libs/tslibs/src/datetime/datehelpers.c +++ b/pandas/_libs/tslibs/src/datetime/datehelpers.c @@ -322,63 +322,6 @@ static PyObject* concat_date_cols(PyObject *self, PyObject *args, } } -static char not_datelike[sizeof(char) * 256]; - -static PyObject* _does_string_look_like_datetime(PyObject* unused, - PyObject* arg) { - char *buf = NULL, *endptr = NULL; - Py_ssize_t length = -1; - double converted_date; - int error = 0; - int result = 1; - -#if PY_MAJOR_VERSION == 2 - if (!PyString_CheckExact(arg)) { - if (!PyUnicode_CheckExact(arg)) { - // arg is not a string, so it's certainly - // not a datetime-looking string - PyErr_SetString(PyExc_ValueError, - "_does_string_look_like_datetime expects a string"); - return NULL; - } - buf = PyUnicode_AS_DATA(arg); - length = (int)PyUnicode_GET_SIZE(arg); - } else { - if (PyString_AsStringAndSize(arg, &buf, &length) == -1) { - return NULL; - } - } -#else - if (!PyUnicode_CheckExact(arg) || !PyUnicode_IS_READY(arg)) { - PyErr_SetString(PyExc_ValueError, - "_does_string_look_like_datetime expects a string"); - return NULL; - } - buf = PyUnicode_DATA(arg); - length = PyUnicode_GET_LENGTH(arg); -#endif - - if (length >= 1) { - char first = *buf; - if (first == '0') { - result = 1; - } else if (length == 1 && not_datelike[Py_CHARMASK(first)]) { - result = 0; - } else { - converted_date = xstrtod(buf, &endptr, '.', 'e', '\0', 1); - if ((errno == 0) && (endptr == buf + length)) { - result = (converted_date >= 1000) ? 1 : 0; - } - } - } - - if (result) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } -} - static PyMethodDef module_methods[] = { /* name from python, name in C-file, ..., __doc__ string of method */ { @@ -386,11 +329,6 @@ static PyMethodDef module_methods[] = { METH_VARARGS | METH_KEYWORDS, "concatenates date cols and returns numpy array" }, - { - "_does_string_look_like_datetime", _does_string_look_like_datetime, - METH_O, - "checks if string looks like a datetime" - }, {NULL, NULL, 0, NULL} }; @@ -418,11 +356,5 @@ PY_DATEHELPERS_MODULE_INIT { module = PY_MODULE_CREATE; - memset(not_datelike, 0, sizeof(not_datelike)); - not_datelike['a'] = not_datelike['A'] = 1; - not_datelike['m'] = not_datelike['M'] = 1; - not_datelike['p'] = not_datelike['P'] = 1; - not_datelike['t'] = not_datelike['T'] = 1; - PY_RETURN_MODULE; } diff --git a/setup.py b/setup.py index 18b282d17a28b..705fa0b24ddd4 100755 --- a/setup.py +++ b/setup.py @@ -635,7 +635,9 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): 'sources': np_datetime_sources}, '_libs.tslibs.parsing': { 'pyxfile': '_libs/tslibs/parsing', - 'include': []}, + 'include': common_include, + 'depends': ['pandas/_libs/src/parser/tokenizer.h'], + 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, '_libs.tslibs.period': { 'pyxfile': '_libs/tslibs/period', 'include': ts_include, From a9afbdbac2cee171f8facb362509b1d519655ad7 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 19 Mar 2019 18:25:46 +0300 Subject: [PATCH 05/42] new benchmark for _concat_date_cols func --- asv_bench/benchmarks/io/parsers.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 2575521002528..1e0010a000c34 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,4 +1,6 @@ from pandas._libs.tslibs.parsing import _does_string_look_like_datetime +from pandas.io.parsers import _concat_date_cols +import numpy as np class DoesStringLookLikeDatetime(object): @@ -17,4 +19,18 @@ def time_check_datetimes(self, value): pass -from ..pandas_vb_common import setup # noqa: F401 +class ConcatDateCols(object): + + params = ([1234567890, 'AAAA'], [1, 2]) + param_names = ['value', 'dim'] + + def setup(self, value, dim): + count_elem = 1000000 + if dim == 1: + self.object = (np.array([value] * count_elem),) + if dim == 2: + self.object = (np.array([value] * count_elem), + np.array([value] * count_elem)) + + def time_check_concat(self, value, dim): + _concat_date_cols(self.object) From ee1f32baba89b83c2af897dd01107613e887a0ec Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 20 Mar 2019 21:21:46 +0300 Subject: [PATCH 06/42] init cython version of _concat_date_cols --- pandas/_libs/lib.pyx | 145 ++++++++++++++++++++++++++++++++++++++++--- pandas/io/parsers.py | 3 +- 2 files changed, 139 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7f66b93b58a1a..7f54931ce4203 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -8,10 +8,11 @@ import warnings import cython from cython import Py_ssize_t -from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, - PyTuple_New, +from cpython cimport (PyErr_SetString, Py_INCREF, PyTuple_SET_ITEM, + PyTuple_New, PyObject_Str, PyList_SetItem, Py_EQ, - PyObject_RichCompareBool) + PyObject_RichCompareBool, + PyUnicode_Join, PyList_New) from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, @@ -20,12 +21,12 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_CheckExact, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, - flatiter, NPY_OBJECT, - int64_t, - float32_t, float64_t, - uint8_t, uint64_t, + flatiter, NPY_OBJECT, PyArray_SETITEM, + int64_t, PyArray_GETPTR1, + float32_t, float64_t, npy_intp, PyArray_NDIM, + uint8_t, uint64_t, PyArray_ZEROS, complex128_t) cnp.import_array() @@ -46,6 +47,9 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names +cdef extern from "Python.h": + object PyUnicode_FromFormat(const char *format, ...) + cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -2314,3 +2318,128 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +cdef inline int convert_and_set_item(object item, Py_ssize_t index, + object result, + int keep_trivial_numbers): + cdef: + int do_convert = 1 + object str_item + int int_item + double double_item + + if keep_trivial_numbers: + if isinstance(item, int): + int_item = item + if int_item == 0: + do_convert = 0 + elif isinstance(item, float): + double_item = item + if double_item == 0.0: + do_convert = 0 + + if do_convert: + if not isinstance(item, (str, bytes)): + str_item = PyObject_Str(item) + item = str_item + + if PyArray_SETITEM(result, PyArray_GETPTR1(result, index), item): + PyErr_SetString(RuntimeError, "Cannot set resulting item") + return 0 + + return 1 + + +cpdef int put_object_as_unicode(object list, Py_ssize_t idx, object item): + if not isinstance(item, str): + item = PyObject_Str(item) + Py_INCREF(item) + return 1 if PyList_SetItem(list, idx, item) == 0 else 0 + +cpdef object _concat_date_cols(object date_cols, + object keep_trivial_numbers=False): + cdef: + object sequence + int keep_numbers, all_numpy = 1 + Py_ssize_t sequence_size + Py_ssize_t array_size, min_array_size = 0 + Py_ssize_t i, j + object result, arrays + object array, fast_array, item + npy_intp dims[1] + object separator + object list_to_join, result_string + + sequence = date_cols + keep_numbers = keep_trivial_numbers + sequence_size = len(date_cols) + + if sequence_size == -1: + return None + elif sequence_size == 0: + return np.zeros(0, dtype=object) + elif sequence_size == 1: + array = sequence[0] + array_size = len(array) + dims[0] = array_size + result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0) + if PyArray_CheckExact(array): + for i in range(array_size): + item = PyArray_GETITEM(array, + PyArray_GETPTR1(array, i)) + if not convert_and_set_item(item, i, result, keep_numbers): + raise RuntimeError + else: + if not isinstance(array, (tuple, list)): + fast_array = tuple(array) + else: + fast_array = array + for i in range(array_size): + item = fast_array[i] + if not convert_and_set_item(item, i, result, keep_numbers): + raise RuntimeError + + return result + else: + arrays = list(sequence) + for i in range(sequence_size): + array = arrays[i] + if PyArray_CheckExact(array): + if PyArray_NDIM(array) != 1: + raise RuntimeError("ndarrays must be 1-dimentional") + elif not isinstance(array, (tuple, list)): + all_numpy = 0 + fast_array = tuple(array) + array = fast_array + else: + all_numpy = 0 + if len(array) < min_array_size or min_array_size == 0: + min_array_size = len(array) + dims[0] = min_array_size + result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0) + + separator = PyUnicode_FromFormat(" ") + list_to_join = PyList_New(sequence_size) + + for i in range(min_array_size): + if all_numpy: + for j in range(sequence_size): + array = arrays[j] + item = PyArray_GETITEM(array, PyArray_GETPTR1(array, i)) + if not put_object_as_unicode(list_to_join, j, item): + raise RuntimeError + else: + for j in range(sequence_size): + array = arrays[j] + item = array[i] + if not put_object_as_unicode(list_to_join, j, item): + raise RuntimeError + + result_string = PyUnicode_Join(separator, list_to_join) + + if (PyArray_SETITEM(result, PyArray_GETPTR1(result, i), + result_string) != 0): + raise RuntimeError("Cannot set resulting item") + + return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d884007725d6e..4e55a566723be 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,8 @@ import numpy as np -from pandas._libs.datehelpers import concat_date_cols as _concat_date_cols +#from pandas._libs.datehelpers import concat_date_cols as _concat_date_cols +from pandas._libs.lib import _concat_date_cols import pandas._libs.lib as lib import pandas._libs.ops as libops import pandas._libs.parsers as parsers From 84e1b007fae8aed241260389f7740db7e8662c29 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 20 Mar 2019 21:24:32 +0300 Subject: [PATCH 07/42] fix C version of _concat_date_cols --- pandas/_libs/tslibs/src/datetime/datehelpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/src/datetime/datehelpers.c b/pandas/_libs/tslibs/src/datetime/datehelpers.c index 551631475b4f6..174b88fe02468 100644 --- a/pandas/_libs/tslibs/src/datetime/datehelpers.c +++ b/pandas/_libs/tslibs/src/datetime/datehelpers.c @@ -237,7 +237,7 @@ static PyObject* concat_date_cols(PyObject *self, PyObject *args, Py_XDECREF(fast_array); return free_arrays(arrays, sequence_size); } - Py_DECREF(array); + Py_DECREF(*parray); arrays[i] = fast_array; if (array_size < min_array_size || min_array_size == 0) { From 2cf9f22d2bd139fec452117115aee4ef14a18bc8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 20 Mar 2019 21:27:37 +0300 Subject: [PATCH 08/42] added ConcatDateColsList benchmark --- asv_bench/benchmarks/io/parsers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 1e0010a000c34..a85dd79b6f3e0 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -34,3 +34,19 @@ def setup(self, value, dim): def time_check_concat(self, value, dim): _concat_date_cols(self.object) + +class ConcatDateColsList(object): + + params = ([1234567890, 'AAAA'], [1, 2]) + param_names = ['value', 'dim'] + + def setup(self, value, dim): + count_elem = 1000000 + if dim == 1: + self.object = ([value] * count_elem,) + if dim == 2: + self.object = ([value] * count_elem, + [value] * count_elem) + + def time_check_concat(self, value, dim): + _concat_date_cols(self.object) \ No newline at end of file From 28fd5f51f5119dff43521d1d4451c9c11ae8b7ac Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 21 Mar 2019 14:23:28 +0300 Subject: [PATCH 09/42] ready cython version, combined concat benchmarks --- asv_bench/benchmarks/io/parsers.py | 32 ++---- pandas/_libs/lib.pyx | 156 +++++++++-------------------- 2 files changed, 56 insertions(+), 132 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index a85dd79b6f3e0..fa06d1b24e436 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -21,32 +21,16 @@ def time_check_datetimes(self, value): class ConcatDateCols(object): - params = ([1234567890, 'AAAA'], [1, 2]) - param_names = ['value', 'dim'] + params = ([1234567890, 'AAAA'], [1, 2], [np.array, list]) + param_names = ['value', 'dim', 'container'] - def setup(self, value, dim): - count_elem = 1000000 + def setup(self, value, dim, container): + count_elem = 10000 if dim == 1: - self.object = (np.array([value] * count_elem),) + self.object = (container([value] * count_elem),) if dim == 2: - self.object = (np.array([value] * count_elem), - np.array([value] * count_elem)) + self.object = (container([value] * count_elem), + container([value] * count_elem)) - def time_check_concat(self, value, dim): + def time_check_concat(self, value, dim, container): _concat_date_cols(self.object) - -class ConcatDateColsList(object): - - params = ([1234567890, 'AAAA'], [1, 2]) - param_names = ['value', 'dim'] - - def setup(self, value, dim): - count_elem = 1000000 - if dim == 1: - self.object = ([value] * count_elem,) - if dim == 2: - self.object = ([value] * count_elem, - [value] * count_elem) - - def time_check_concat(self, value, dim): - _concat_date_cols(self.object) \ No newline at end of file diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7f54931ce4203..5e125b3335cb1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -8,10 +8,8 @@ import warnings import cython from cython import Py_ssize_t -from cpython cimport (PyErr_SetString, Py_INCREF, PyTuple_SET_ITEM, - PyTuple_New, PyObject_Str, PyList_SetItem, - Py_EQ, - PyObject_RichCompareBool, +from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyTuple_New, PyObject_Str, + Py_EQ, Py_SIZE, PyObject_RichCompareBool, PyUnicode_Join, PyList_New) from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, @@ -21,13 +19,11 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, PyArray_CheckExact, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, - flatiter, NPY_OBJECT, PyArray_SETITEM, - int64_t, PyArray_GETPTR1, - float32_t, float64_t, npy_intp, PyArray_NDIM, - uint8_t, uint64_t, PyArray_ZEROS, - complex128_t) + flatiter, NPY_OBJECT, + int64_t, float32_t, float64_t, + uint8_t, uint64_t, complex128_t) cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -47,9 +43,6 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names -cdef extern from "Python.h": - object PyUnicode_FromFormat(const char *format, ...) - cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -2320,126 +2313,73 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): return maybe_convert_objects(output) -cdef inline int convert_and_set_item(object item, Py_ssize_t index, - object result, - int keep_trivial_numbers): +cdef inline void convert_and_set_item(object item, Py_ssize_t index, + object[:] result, + bint keep_trivial_numbers): cdef: - int do_convert = 1 - object str_item - int int_item - double double_item + bint do_convert = 1 if keep_trivial_numbers: - if isinstance(item, int): - int_item = item - if int_item == 0: + if isinstance(item, int) and Py_SIZE(item) < 2: + if item == 0: do_convert = 0 elif isinstance(item, float): - double_item = item - if double_item == 0.0: + if item == 0.0: do_convert = 0 - if do_convert: - if not isinstance(item, (str, bytes)): - str_item = PyObject_Str(item) - item = str_item - - if PyArray_SETITEM(result, PyArray_GETPTR1(result, index), item): - PyErr_SetString(RuntimeError, "Cannot set resulting item") - return 0 + if do_convert and not isinstance(item, (str, bytes)): + item = PyObject_Str(item) - return 1 + result[index] = item -cpdef int put_object_as_unicode(object list, Py_ssize_t idx, object item): +cdef inline void put_object_as_unicode(object[:] lst, Py_ssize_t idx, + object item): if not isinstance(item, str): item = PyObject_Str(item) - Py_INCREF(item) - return 1 if PyList_SetItem(list, idx, item) == 0 else 0 + lst[idx] = item + cpdef object _concat_date_cols(object date_cols, object keep_trivial_numbers=False): cdef: - object sequence - int keep_numbers, all_numpy = 1 - Py_ssize_t sequence_size - Py_ssize_t array_size, min_array_size = 0 - Py_ssize_t i, j - object result, arrays - object array, fast_array, item - npy_intp dims[1] - object separator + bint keep_numbers + Py_ssize_t sequence_size, i, j + Py_ssize_t array_size, min_size + object result + object separator = " " object list_to_join, result_string + object[:] list_view + object[:] result_view + object[:] iterator + object[::] arrays - sequence = date_cols keep_numbers = keep_trivial_numbers sequence_size = len(date_cols) - if sequence_size == -1: - return None - elif sequence_size == 0: - return np.zeros(0, dtype=object) + if sequence_size == 0: + result = np.zeros(0, dtype=object) elif sequence_size == 1: - array = sequence[0] - array_size = len(array) - dims[0] = array_size - result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0) - if PyArray_CheckExact(array): - for i in range(array_size): - item = PyArray_GETITEM(array, - PyArray_GETPTR1(array, i)) - if not convert_and_set_item(item, i, result, keep_numbers): - raise RuntimeError - else: - if not isinstance(array, (tuple, list)): - fast_array = tuple(array) - else: - fast_array = array - for i in range(array_size): - item = fast_array[i] - if not convert_and_set_item(item, i, result, keep_numbers): - raise RuntimeError - - return result + iterator = date_cols[0] + array_size = len(iterator) + result = np.zeros(array_size, dtype=object) + result_view = result + for i in range(array_size): + convert_and_set_item(iterator[i], i, result_view, keep_numbers) else: - arrays = list(sequence) - for i in range(sequence_size): - array = arrays[i] - if PyArray_CheckExact(array): - if PyArray_NDIM(array) != 1: - raise RuntimeError("ndarrays must be 1-dimentional") - elif not isinstance(array, (tuple, list)): - all_numpy = 0 - fast_array = tuple(array) - array = fast_array - else: - all_numpy = 0 - if len(array) < min_array_size or min_array_size == 0: - min_array_size = len(array) - dims[0] = min_array_size - result = PyArray_ZEROS(1, dims, NPY_OBJECT, 0) + arrays = date_cols - separator = PyUnicode_FromFormat(" ") - list_to_join = PyList_New(sequence_size) + min_size = min([len(arr) for arr in date_cols]) + result = np.zeros(min_size, dtype=object) + result_view = result - for i in range(min_array_size): - if all_numpy: - for j in range(sequence_size): - array = arrays[j] - item = PyArray_GETITEM(array, PyArray_GETPTR1(array, i)) - if not put_object_as_unicode(list_to_join, j, item): - raise RuntimeError - else: - for j in range(sequence_size): - array = arrays[j] - item = array[i] - if not put_object_as_unicode(list_to_join, j, item): - raise RuntimeError + list_to_join = PyList_New(sequence_size) + list_view = list_to_join + for i in range(min_size): + for j in range(sequence_size): + put_object_as_unicode(list_view, j, arrays[j][i]) result_string = PyUnicode_Join(separator, list_to_join) + result_view[i] = result_string - if (PyArray_SETITEM(result, PyArray_GETPTR1(result, i), - result_string) != 0): - raise RuntimeError("Cannot set resulting item") - - return result + return result From 1f17cf974a91f8e3613246df60533480f47d8321 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 21 Mar 2019 14:30:40 +0300 Subject: [PATCH 10/42] added forgotten check for float NaN --- pandas/_libs/lib.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5e125b3335cb1..a05656b741bc5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2318,13 +2318,15 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, bint keep_trivial_numbers): cdef: bint do_convert = 1 + double double_item if keep_trivial_numbers: if isinstance(item, int) and Py_SIZE(item) < 2: if item == 0: do_convert = 0 elif isinstance(item, float): - if item == 0.0: + double_item = item + if double_item == 0.0 or double_item != double_item: do_convert = 0 if do_convert and not isinstance(item, (str, bytes)): From d1f8ce5093a44efd73b6e6466f31ed9533a0d9fd Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 21 Mar 2019 17:50:15 +0300 Subject: [PATCH 11/42] Cython version of _concat_date_cols works for all cases --- pandas/_libs/lib.pyx | 69 ++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a05656b741bc5..04a0bfd12e5e1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -19,7 +19,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_Check, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, float32_t, float64_t, @@ -2335,26 +2335,29 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, result[index] = item -cdef inline void put_object_as_unicode(object[:] lst, Py_ssize_t idx, +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, object item): if not isinstance(item, str): item = PyObject_Str(item) lst[idx] = item +@cython.wraparound(False) +@cython.boundscheck(False) cpdef object _concat_date_cols(object date_cols, object keep_trivial_numbers=False): cdef: bint keep_numbers Py_ssize_t sequence_size, i, j Py_ssize_t array_size, min_size - object result - object separator = " " - object list_to_join, result_string - object[:] list_view object[:] result_view - object[:] iterator - object[::] arrays + object[:,:] arrays_view + + object[:] obj_iter + int64_t[:] int_iter + float64_t[:] double_iter keep_numbers = keep_trivial_numbers sequence_size = len(date_cols) @@ -2362,26 +2365,54 @@ cpdef object _concat_date_cols(object date_cols, if sequence_size == 0: result = np.zeros(0, dtype=object) elif sequence_size == 1: - iterator = date_cols[0] - array_size = len(iterator) + array = date_cols[0] + array_size = len(array) result = np.zeros(array_size, dtype=object) result_view = result - for i in range(array_size): - convert_and_set_item(iterator[i], i, result_view, keep_numbers) + if PyArray_Check(array): + if array.dtype == np.int64: + int_iter = array + for i in range(array_size): + convert_and_set_item(int_iter[i], i, + result_view, keep_numbers) + elif array.dtype == np.float64: + double_iter = array + for i in range(array_size): + convert_and_set_item(double_iter[i], i, + result_view, keep_numbers) + else: + if array.dtype == object: + obj_iter = array + else: + obj_array = np.astype(object) + obj_iter = obj_array + for i in range(array_size): + convert_and_set_item(obj_iter[i], i, result_view, keep_numbers) + else: + for i, item in enumerate(array): + convert_and_set_item(item, i, result_view, keep_numbers) else: - arrays = date_cols - min_size = min([len(arr) for arr in date_cols]) + + arrays = np.zeros((len(date_cols), min_size), dtype=object) + for idx, array in enumerate(date_cols): + if PyArray_Check(array): + if array.dtype == object: + arrays[idx] = array + else: + arrays[idx] = array.astype(object) + else: + arrays[idx] = np.array(array, dtype=object) + arrays_view = arrays + result = np.zeros(min_size, dtype=object) result_view = result - list_to_join = PyList_New(sequence_size) - list_view = list_to_join + list_to_join = [None] * sequence_size for i in range(min_size): for j in range(sequence_size): - put_object_as_unicode(list_view, j, arrays[j][i]) - result_string = PyUnicode_Join(separator, list_to_join) - result_view[i] = result_string + put_object_as_unicode(list_to_join, j, arrays_view[j, i]) + result_view[i] = PyUnicode_Join(' ', list_to_join) return result From e44212c66b50d04c6f34d7f49575fdbb53722d6d Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Thu, 21 Mar 2019 18:21:31 +0300 Subject: [PATCH 12/42] Fix typo in _concat_date_cols --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 04a0bfd12e5e1..c576e0e0514a6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2384,7 +2384,7 @@ cpdef object _concat_date_cols(object date_cols, if array.dtype == object: obj_iter = array else: - obj_array = np.astype(object) + obj_array = array.astype(object) obj_iter = obj_array for i in range(array_size): convert_and_set_item(obj_iter[i], i, result_view, keep_numbers) From 6af73bf1a78cad4eec7c05c323269a56353651ba Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 22 Mar 2019 00:52:50 +0300 Subject: [PATCH 13/42] used flatiter for numpy array --- pandas/_libs/lib.pyx | 75 ++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c576e0e0514a6..d03c5075014d9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2346,18 +2346,22 @@ cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, @cython.wraparound(False) @cython.boundscheck(False) -cpdef object _concat_date_cols(object date_cols, +cpdef object _concat_date_cols(tuple date_cols, object keep_trivial_numbers=False): cdef: bint keep_numbers Py_ssize_t sequence_size, i, j - Py_ssize_t array_size, min_size + Py_ssize_t array_size, min_size = 0 object[:] result_view object[:,:] arrays_view - object[:] obj_iter - int64_t[:] int_iter - float64_t[:] double_iter + flatiter it + int all_numpy = 1 + cnp.ndarray[object] iters + object[::1] iters_view + object array + list list_to_join + keep_numbers = keep_trivial_numbers sequence_size = len(date_cols) @@ -2370,40 +2374,28 @@ cpdef object _concat_date_cols(object date_cols, result = np.zeros(array_size, dtype=object) result_view = result if PyArray_Check(array): - if array.dtype == np.int64: - int_iter = array - for i in range(array_size): - convert_and_set_item(int_iter[i], i, - result_view, keep_numbers) - elif array.dtype == np.float64: - double_iter = array - for i in range(array_size): - convert_and_set_item(double_iter[i], i, - result_view, keep_numbers) - else: - if array.dtype == object: - obj_iter = array - else: - obj_array = array.astype(object) - obj_iter = obj_array - for i in range(array_size): - convert_and_set_item(obj_iter[i], i, result_view, keep_numbers) - else: - for i, item in enumerate(array): + it = PyArray_IterNew(array) + for i in range(array_size): + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) convert_and_set_item(item, i, result_view, keep_numbers) + PyArray_ITER_NEXT(it) + else: + for i in range(array_size): + convert_and_set_item(array[i], i, result_view, keep_numbers) else: - min_size = min([len(arr) for arr in date_cols]) + for i in range(sequence_size): + array = date_cols[i] + if not PyArray_Check(array): + all_numpy = 0 + if len(array) < min_size or min_size == 0: + min_size = len(array) + + if all_numpy: + iters = np.zeros(sequence_size, dtype=object) + iters_view = iters + for i in range(sequence_size): + iters_view[i] = PyArray_IterNew(date_cols[i]) - arrays = np.zeros((len(date_cols), min_size), dtype=object) - for idx, array in enumerate(date_cols): - if PyArray_Check(array): - if array.dtype == object: - arrays[idx] = array - else: - arrays[idx] = array.astype(object) - else: - arrays[idx] = np.array(array, dtype=object) - arrays_view = arrays result = np.zeros(min_size, dtype=object) result_view = result @@ -2411,8 +2403,15 @@ cpdef object _concat_date_cols(object date_cols, list_to_join = [None] * sequence_size for i in range(min_size): - for j in range(sequence_size): - put_object_as_unicode(list_to_join, j, arrays_view[j, i]) + if all_numpy: + for j in range(sequence_size): + it = iters_view[j] + item = PyArray_GETITEM(date_cols[j], PyArray_ITER_DATA(it)) + put_object_as_unicode(list_to_join, j, item) + PyArray_ITER_NEXT(it) + else: + for j in range(sequence_size): + put_object_as_unicode(list_to_join, j, date_cols[j][i]) result_view[i] = PyUnicode_Join(' ', list_to_join) return result From d4305a968b7dbdfbb880b51e7e043a6adf3ae281 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 14:53:23 +0300 Subject: [PATCH 14/42] Fix Cython compilation issues --- pandas/_libs/tslibs/parsing.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index a2871ca353a23..87f194af85d03 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -332,12 +332,12 @@ cpdef bint _does_string_look_like_datetime(object date_string): return False if length >= 1: first = buf[0] - if first == '0': + if first == b'0': return True - elif length == 1 and date_string in _not_datelike_strings: + elif date_string in _not_datelike_strings: return False else: - converted_date = xstrtod(buf, &endptr, '.', 'e', '\0', 1) + converted_date = xstrtod(buf, &endptr, b'.', b'e', b'\0', 1) if errno == 0 and endptr == buf + length: return converted_date >= 1000 From fa3ae05f62d43c7fad78af93930674f9793156e3 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 15:35:58 +0300 Subject: [PATCH 15/42] Remove C version of _concat_date_cols --- .../_libs/tslibs/src/datetime/datehelpers.c | 360 ------------------ pandas/io/parsers.py | 1 - setup.py | 15 - 3 files changed, 376 deletions(-) delete mode 100644 pandas/_libs/tslibs/src/datetime/datehelpers.c diff --git a/pandas/_libs/tslibs/src/datetime/datehelpers.c b/pandas/_libs/tslibs/src/datetime/datehelpers.c deleted file mode 100644 index 174b88fe02468..0000000000000 --- a/pandas/_libs/tslibs/src/datetime/datehelpers.c +++ /dev/null @@ -1,360 +0,0 @@ -#include - -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#include - -#include - -#include "../../../src/inline_helper.h" -#include "../../../src/parser/tokenizer.h" - -#if PY_MAJOR_VERSION >= 3 - #define PY_STRING_CHECK(string) (PyUnicode_Check(string)) -#else - #define PY_STRING_CHECK(string) \ - (PyString_Check(string) || PyUnicode_Check(string)) -#endif - -int PANDAS_INLINE convert_and_set_item(PyObject *item, Py_ssize_t index, - PyArrayObject *result, - int keep_trivial_numbers) { - int needs_decref = 0, do_convert = 1; - if (item == NULL) { - return 0; - } - if (keep_trivial_numbers) { - // don't convert an integer if it's zero, - // don't convert a float if it's zero or NaN -#if PY_MAJOR_VERSION >= 3 - if (PyLong_Check(item)) { - PyLongObject* v = (PyLongObject*)item; - switch (Py_SIZE(v)) { - case 0: - do_convert = 0; - break; - case 1: // fallthrough - case -1: - if (v->ob_digit[0] == 0) { - do_convert = 0; - } - } -#else - if (PyInt_CheckExact(item)) { - if (((PyIntObject*)item)->ob_ival == 0) do_convert = 0; -#endif - } else if (PyFloat_Check(item)) { - double v = PyFloat_AS_DOUBLE(item); - if (v == 0.0 || v != v) { - do_convert = 0; - } - } - } - - if (do_convert) { - if (!PY_STRING_CHECK(item)) { - PyObject *str_item = PyObject_Str(item); - if (str_item == NULL) { - return 0; - } - item = str_item; - needs_decref = 1; - } - } - if (PyArray_SETITEM(result, PyArray_GETPTR1(result, index), item) != 0) { - PyErr_SetString(PyExc_RuntimeError, "Cannot set resulting item"); - if (needs_decref) Py_DECREF(item); - return 0; - } - if (needs_decref) Py_DECREF(item); - return 1; -} - -static int put_object_as_string(PyObject* list, Py_ssize_t idx, - PyObject* item) { - if (!PY_STRING_CHECK(item)) { - PyObject* str_item = PyObject_Str(item); - if (str_item == NULL) { - return 0; - } - Py_DECREF(item); - item = str_item; - } - return (PyList_SetItem(list, idx, item) == 0) ? 1 : 0; -} - -static PyObject* free_arrays(PyObject** arrays, Py_ssize_t size) { - PyObject** item = arrays; - Py_ssize_t i; - for (i = 0; i < size; ++i, ++item) Py_DECREF(*item); - free(arrays); - return NULL; -} - -static PyObject* concat_date_cols(PyObject *self, PyObject *args, - PyObject *kwds) { - PyObject *sequence = NULL; - PyObject *py_keep_trivial_numbers = NULL; - PyArrayObject *result = NULL; - Py_ssize_t sequence_size = 0; - int keep_trivial_numbers; - char* kwlist[] = {"", "keep_trivial_numbers", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist, - &sequence, &py_keep_trivial_numbers)) { - return NULL; - } - if (!PySequence_Check(sequence)) { - PyErr_SetString(PyExc_TypeError, "argument must be sequence"); - return NULL; - } - keep_trivial_numbers = (py_keep_trivial_numbers != NULL) ? \ - PyObject_IsTrue(py_keep_trivial_numbers) : 0; - - sequence_size = PySequence_Size(sequence); - if (sequence_size == -1) { - return NULL; - } else if (sequence_size == 0) { - npy_intp dims[1]; - dims[0] = 0; - result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); - return (PyObject*)result; - } else if (sequence_size == 1) { - PyObject* array = PySequence_GetItem(sequence, 0); - Py_ssize_t array_size; - if (array == NULL) { - return NULL; - } - - array_size = PySequence_Size(array); - if (array_size == -1) { - Py_DECREF(array); - return NULL; - } - - { - npy_intp dims[1]; - dims[0] = array_size; - result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); - if (result == NULL) { - Py_DECREF(array); - return NULL; - } - } - - if (PyArray_CheckExact(array)) { - PyArrayObject *ndarray = (PyArrayObject*)array; - Py_ssize_t i; - for (i = 0; i < array_size; ++i) { - PyObject *item = PyArray_GETITEM(ndarray, - PyArray_GETPTR1(ndarray, i)); - if (!convert_and_set_item(item, i, result, - keep_trivial_numbers)) { - Py_DECREF(result); - Py_DECREF(array); - Py_DECREF(item); - return NULL; - } - Py_DECREF(item); - } - } else { - PyObject* fast_array = PySequence_Fast(array, - "elements of input sequence must be sequence"); - Py_ssize_t i; - if (fast_array == NULL) { - Py_DECREF(result); - Py_DECREF(array); - // PySequence_Fast set message, which in second argument - return NULL; - } - - for (i = 0; i < array_size; ++i) { - PyObject* item = PySequence_Fast_GET_ITEM(fast_array, i); - if (!convert_and_set_item(item, i, result, - keep_trivial_numbers)) { - Py_DECREF(result); - Py_DECREF(array); - Py_DECREF(fast_array); - return NULL; - } - } - Py_DECREF(fast_array); - } - Py_DECREF(array); - return (PyObject*)result; - } else { - size_t mem_size = sizeof(PyObject*) * sequence_size; - PyObject **arrays = (PyObject**) malloc(mem_size); - PyObject *array = NULL; - PyObject **parray = NULL; - PyObject *fast_array = NULL; - PyObject *separator = NULL; - PyObject *item = NULL; - PyObject *list_to_join = NULL; - Py_ssize_t min_array_size = 0; - int all_numpy = 1; - Py_ssize_t i; - for (i = 0; i < sequence_size; ++i) { - array = PySequence_GetItem(sequence, i); - if (array == NULL) { - return free_arrays(arrays, i); - } - if (PyArray_CheckExact(array)) { - if (PyArray_NDIM((PyArrayObject*)array) != 1) { - PyErr_SetString(PyExc_ValueError, - "ndarrays must be 1-dimentional"); - return free_arrays(arrays, i); - } - } else { - all_numpy = 0; - } - arrays[i] = array; - } - - parray = arrays; - if (all_numpy) { - Py_ssize_t i; - for (i = 0; i < sequence_size; ++i, ++parray) { - Py_ssize_t array_size = PyArray_SIZE((PyArrayObject*)(*parray)); - - if (array_size < 0) { - return free_arrays(arrays, sequence_size); - } - - if (array_size < min_array_size || min_array_size == 0) { - min_array_size = array_size; - } - } - } else { - Py_ssize_t i; - for (i = 0; i < sequence_size; ++i, ++parray) { - Py_ssize_t array_size; - fast_array = PySequence_Fast(*parray, - "elements of input sequence must be sequence"); - array_size = (fast_array != NULL) ? \ - PySequence_Fast_GET_SIZE(fast_array) : -1; - - if (array_size < 0) { - Py_XDECREF(fast_array); - return free_arrays(arrays, sequence_size); - } - Py_DECREF(*parray); - arrays[i] = fast_array; - - if (array_size < min_array_size || min_array_size == 0) { - min_array_size = array_size; - } - } - } - - { - npy_intp dims[1]; - dims[0] = min_array_size; - result = (PyArrayObject*)PyArray_ZEROS(1, dims, NPY_OBJECT, 0); - if (result == NULL) { - return free_arrays(arrays, sequence_size); - } - } - - separator = PyUnicode_FromFormat(" "); - if (separator == NULL) { - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - list_to_join = PyList_New(sequence_size); - for (i = 0; i < min_array_size; ++i) { - PyObject *result_string = NULL; - parray = arrays; - if (all_numpy) { - Py_ssize_t j; - for (j = 0; j < sequence_size; ++j, ++parray) { - PyArrayObject* arr = (PyArrayObject*)(*parray); - item = PyArray_GETITEM(arr, PyArray_GETPTR1(arr, i)); - if (item == NULL) { - Py_DECREF(list_to_join); - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - if (!put_object_as_string(list_to_join, j, item)) { - Py_DECREF(item); - Py_DECREF(list_to_join); - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - } - } else { - Py_ssize_t j; - for (j = 0; j < sequence_size; ++j, ++parray) { - item = PySequence_Fast_GET_ITEM(*parray, i); - if (item == NULL) { - Py_DECREF(list_to_join); - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - Py_INCREF(item); - if (!put_object_as_string(list_to_join, j, item)) { - Py_DECREF(item); - Py_DECREF(list_to_join); - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - } - } - result_string = PyUnicode_Join(separator, list_to_join); - if (result_string == NULL) { - Py_DECREF(list_to_join); - Py_DECREF(result); - return free_arrays(arrays, sequence_size); - } - if (PyArray_SETITEM(result, PyArray_GETPTR1(result, i), - result_string) != 0) { - PyErr_SetString(PyExc_RuntimeError, - "Cannot set resulting item"); - Py_DECREF(list_to_join); - Py_DECREF(result); - Py_DECREF(result_string); - return free_arrays(arrays, sequence_size); - } - Py_DECREF(result_string); - } - Py_DECREF(list_to_join); - (void)free_arrays(arrays, sequence_size); - return (PyObject*)result; - } -} - -static PyMethodDef module_methods[] = { - /* name from python, name in C-file, ..., __doc__ string of method */ - { - "concat_date_cols", (PyCFunction)concat_date_cols, - METH_VARARGS | METH_KEYWORDS, - "concatenates date cols and returns numpy array" - }, - {NULL, NULL, 0, NULL} -}; - -#if PY_MAJOR_VERSION >= 3 -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "datehelpers", // name of module - "helpers for datetime structures manipulation", // module documentation - -1, // size of per-interpreter state of the module, - // or -1 if the module keeps state in global variables. - module_methods -}; -#define PY_DATEHELPERS_MODULE_INIT PyMODINIT_FUNC PyInit_datehelpers(void) -#define PY_MODULE_CREATE PyModule_Create(&moduledef) -#define PY_RETURN_MODULE return module -#else -#define PY_DATEHELPERS_MODULE_INIT void initdatehelpers(void) -#define PY_MODULE_CREATE Py_InitModule("datehelpers", module_methods) -#define PY_RETURN_MODULE -#endif - -PY_DATEHELPERS_MODULE_INIT { - PyObject *module = NULL; - import_array(); - - module = PY_MODULE_CREATE; - - PY_RETURN_MODULE; -} diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4e55a566723be..085abc60f06f9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,6 @@ import numpy as np -#from pandas._libs.datehelpers import concat_date_cols as _concat_date_cols from pandas._libs.lib import _concat_date_cols import pandas._libs.lib as lib import pandas._libs.ops as libops diff --git a/setup.py b/setup.py index 705fa0b24ddd4..2ad5cf5d919a3 100755 --- a/setup.py +++ b/setup.py @@ -243,7 +243,6 @@ def initialize_options(self): ujson_lib = pjoin(base, 'ujson', 'lib') self._clean_exclude = [pjoin(dt, 'np_datetime.c'), pjoin(dt, 'np_datetime_strings.c'), - pjoin(dt, 'datehelpers.c'), pjoin(parser, 'tokenizer.c'), pjoin(parser, 'io.c'), pjoin(ujson_python, 'ujson.c'), @@ -766,20 +765,6 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): extensions.append(_move_ext) # ---------------------------------------------------------------------- -# datehelpers -datehelpers_sources = [ - 'pandas/_libs/tslibs/src/datetime/datehelpers.c', - 'pandas/_libs/src/parser/tokenizer.c' -] -datehelpers_ext = Extension('pandas._libs.datehelpers', - depends=[ - 'pandas/_libs/src/parser/tokenizer.h' - ], - sources=datehelpers_sources, - include_dirs=['pandas/_libs/src/klib/'], - extra_compile_args=extra_compile_args, - define_macros=macros) -extensions.append(datehelpers_ext) # The build cache system does string matching below this point. From 49d66e0b5a54e8dd3d5a9175927ce6a9edeaad1d Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 15:42:56 +0300 Subject: [PATCH 16/42] Fix linting errors --- pandas/_libs/lib.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d03c5075014d9..46dd86523f84a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2353,7 +2353,7 @@ cpdef object _concat_date_cols(tuple date_cols, Py_ssize_t sequence_size, i, j Py_ssize_t array_size, min_size = 0 object[:] result_view - object[:,:] arrays_view + object[:, :] arrays_view flatiter it int all_numpy = 1 @@ -2362,7 +2362,6 @@ cpdef object _concat_date_cols(tuple date_cols, object array list list_to_join - keep_numbers = keep_trivial_numbers sequence_size = len(date_cols) @@ -2396,7 +2395,6 @@ cpdef object _concat_date_cols(tuple date_cols, for i in range(sequence_size): iters_view[i] = PyArray_IterNew(date_cols[i]) - result = np.zeros(min_size, dtype=object) result_view = result From 09e4da6cc2bcd01c04dcb14c512cf621ded7d82c Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 08:20:12 -0500 Subject: [PATCH 17/42] Try to speed up 1D list --- pandas/_libs/lib.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 46dd86523f84a..2316573305ef4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2313,6 +2313,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): return maybe_convert_objects(output) +@cython.wraparound(False) +@cython.boundscheck(False) cdef inline void convert_and_set_item(object item, Py_ssize_t index, object[:] result, bint keep_trivial_numbers): @@ -2353,13 +2355,11 @@ cpdef object _concat_date_cols(tuple date_cols, Py_ssize_t sequence_size, i, j Py_ssize_t array_size, min_size = 0 object[:] result_view - object[:, :] arrays_view flatiter it int all_numpy = 1 cnp.ndarray[object] iters object[::1] iters_view - object array list list_to_join keep_numbers = keep_trivial_numbers @@ -2379,8 +2379,8 @@ cpdef object _concat_date_cols(tuple date_cols, convert_and_set_item(item, i, result_view, keep_numbers) PyArray_ITER_NEXT(it) else: - for i in range(array_size): - convert_and_set_item(array[i], i, result_view, keep_numbers) + for i, item in enumerate(array): + convert_and_set_item(item, i, result_view, keep_numbers) else: for i in range(sequence_size): array = date_cols[i] From 67d9509d1f9ebffd4c112e038757f26aceb3e88a Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 08:44:03 -0500 Subject: [PATCH 18/42] Hopefully speed up 2D case --- pandas/_libs/lib.pyx | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2316573305ef4..c74e35bc1e255 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2382,8 +2382,7 @@ cpdef object _concat_date_cols(tuple date_cols, for i, item in enumerate(array): convert_and_set_item(item, i, result_view, keep_numbers) else: - for i in range(sequence_size): - array = date_cols[i] + for i, array in enumerate(date_cols): if not PyArray_Check(array): all_numpy = 0 if len(array) < min_size or min_size == 0: @@ -2392,24 +2391,26 @@ cpdef object _concat_date_cols(tuple date_cols, if all_numpy: iters = np.zeros(sequence_size, dtype=object) iters_view = iters - for i in range(sequence_size): - iters_view[i] = PyArray_IterNew(date_cols[i]) + for i, array in enumerate(date_cols): + iters_view[i] = PyArray_IterNew(array) result = np.zeros(min_size, dtype=object) result_view = result list_to_join = [None] * sequence_size - for i in range(min_size): - if all_numpy: - for j in range(sequence_size): + if all_numpy: + for i in range(min_size): + for j, array in enumerate(date_cols): it = iters_view[j] - item = PyArray_GETITEM(date_cols[j], PyArray_ITER_DATA(it)) + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) put_object_as_unicode(list_to_join, j, item) PyArray_ITER_NEXT(it) - else: - for j in range(sequence_size): - put_object_as_unicode(list_to_join, j, date_cols[j][i]) - result_view[i] = PyUnicode_Join(' ', list_to_join) + result_view[i] = PyUnicode_Join(' ', list_to_join) + else: + for i in range(min_size): + for j, array in enumerate(date_cols): + put_object_as_unicode(list_to_join, j, array[i]) + result_view[i] = PyUnicode_Join(' ', list_to_join) return result From f05564d619f5dd726dd41e275eb506f0af234523 Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 22 Mar 2019 09:06:04 -0500 Subject: [PATCH 19/42] Fix isort, retain some comments --- asv_bench/benchmarks/io/parsers.py | 4 +++- pandas/_libs/tslibs/parsing.pyx | 2 ++ pandas/io/parsers.py | 8 ++------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index fa06d1b24e436..f453705c5f859 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,6 +1,8 @@ +import numpy as np + from pandas._libs.tslibs.parsing import _does_string_look_like_datetime + from pandas.io.parsers import _concat_date_cols -import numpy as np class DoesStringLookLikeDatetime(object): diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 87f194af85d03..2c41c569693a7 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -333,6 +333,8 @@ cpdef bint _does_string_look_like_datetime(object date_string): if length >= 1: first = buf[0] if first == b'0': + # Strings starting with 0 are more consistent with a + # date-like string than a number return True elif date_string in _not_datelike_strings: return False diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 085abc60f06f9..2f8aa29162a24 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,8 +13,8 @@ import numpy as np -from pandas._libs.lib import _concat_date_cols import pandas._libs.lib as lib +from pandas._libs.lib import _concat_date_cols import pandas._libs.ops as libops import pandas._libs.parsers as parsers from pandas._libs.tslibs import parsing @@ -3217,11 +3217,7 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - parsing.try_parse_dates( - _concat_date_cols( - date_cols, - keep_trivial_numbers=True - ), + parsing.try_parse_dates(_concat_date_cols(date_cols), parser=date_parser, dayfirst=dayfirst), cache=cache_dates, From b9c96fdd10bfb76e9cab95bab49e026c7286ac2f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 25 Mar 2019 19:01:01 +0300 Subject: [PATCH 20/42] removed unnecessary common_include list with headers; some change code style --- pandas/_libs/lib.pyx | 29 +++++++++++++---------------- setup.py | 1 - 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c74e35bc1e255..e6f853258a2c1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2349,22 +2349,17 @@ cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, @cython.wraparound(False) @cython.boundscheck(False) cpdef object _concat_date_cols(tuple date_cols, - object keep_trivial_numbers=False): + bint keep_trivial_numbers=False): cdef: - bint keep_numbers - Py_ssize_t sequence_size, i, j + Py_ssize_t i, j, sequence_size = len(date_cols) Py_ssize_t array_size, min_size = 0 object[:] result_view - flatiter it int all_numpy = 1 cnp.ndarray[object] iters object[::1] iters_view list list_to_join - keep_numbers = keep_trivial_numbers - sequence_size = len(date_cols) - if sequence_size == 0: result = np.zeros(0, dtype=object) elif sequence_size == 1: @@ -2373,33 +2368,35 @@ cpdef object _concat_date_cols(tuple date_cols, result = np.zeros(array_size, dtype=object) result_view = result if PyArray_Check(array): + # for numpy array case use special api for performance it = PyArray_IterNew(array) for i in range(array_size): item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - convert_and_set_item(item, i, result_view, keep_numbers) + convert_and_set_item(item, i, result_view, keep_trivial_numbers) PyArray_ITER_NEXT(it) else: for i, item in enumerate(array): - convert_and_set_item(item, i, result_view, keep_numbers) + convert_and_set_item(item, i, result_view, keep_trivial_numbers) else: for i, array in enumerate(date_cols): if not PyArray_Check(array): all_numpy = 0 + # find min length for arrays in date_cols + # imitation python zip behavior if len(array) < min_size or min_size == 0: min_size = len(array) - if all_numpy: - iters = np.zeros(sequence_size, dtype=object) - iters_view = iters - for i, array in enumerate(date_cols): - iters_view[i] = PyArray_IterNew(array) - result = np.zeros(min_size, dtype=object) result_view = result - list_to_join = [None] * sequence_size if all_numpy: + # setup iterators + iters = np.zeros(sequence_size, dtype=object) + iters_view = iters + for i, array in enumerate(date_cols): + iters_view[i] = PyArray_IterNew(array) + # for numpy array case use special api for performance for i in range(min_size): for j, array in enumerate(date_cols): it = iters_view[j] diff --git a/setup.py b/setup.py index 2ad5cf5d919a3..0dbf93ec925e0 100755 --- a/setup.py +++ b/setup.py @@ -634,7 +634,6 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): 'sources': np_datetime_sources}, '_libs.tslibs.parsing': { 'pyxfile': '_libs/tslibs/parsing', - 'include': common_include, 'depends': ['pandas/_libs/src/parser/tokenizer.h'], 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, '_libs.tslibs.period': { From 6dc3c5127fe4e3f806e8baf446a85b9935f7a77b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Mar 2019 21:54:13 +0300 Subject: [PATCH 21/42] using util.is_array now; changed double to float64_t; fix docstring --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/_libs/lib.pyx | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 578e24009d35a..61d3a8f8ed517 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -252,6 +252,8 @@ Performance Improvements - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) +- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero + and float NaN; by faster checking the string for the possibility of being a date (:issue:`25754`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e6f853258a2c1..fd6217e400de8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -19,7 +19,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, PyArray_Check, +from numpy cimport (ndarray, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, float32_t, float64_t, @@ -2320,15 +2320,15 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, bint keep_trivial_numbers): cdef: bint do_convert = 1 - double double_item + float64_t float_item if keep_trivial_numbers: if isinstance(item, int) and Py_SIZE(item) < 2: if item == 0: do_convert = 0 elif isinstance(item, float): - double_item = item - if double_item == 0.0 or double_item != double_item: + float_item = item + if float_item == 0.0 or float_item != float_item: do_convert = 0 if do_convert and not isinstance(item, (str, bytes)): @@ -2367,7 +2367,7 @@ cpdef object _concat_date_cols(tuple date_cols, array_size = len(array) result = np.zeros(array_size, dtype=object) result_view = result - if PyArray_Check(array): + if util.is_array(array): # for numpy array case use special api for performance it = PyArray_IterNew(array) for i in range(array_size): @@ -2379,7 +2379,7 @@ cpdef object _concat_date_cols(tuple date_cols, convert_and_set_item(item, i, result_view, keep_trivial_numbers) else: for i, array in enumerate(date_cols): - if not PyArray_Check(array): + if not util.is_array(array): all_numpy = 0 # find min length for arrays in date_cols # imitation python zip behavior From 08c7f476e2733fe3a6687aa9ecd8198896cbb7c4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Mar 2019 00:57:30 +0300 Subject: [PATCH 22/42] split _concat_date_cols functionality --- pandas/_libs/lib.pyx | 117 +++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 50 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index fd6217e400de8..c7977bed86e9a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2348,66 +2348,83 @@ cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, @cython.wraparound(False) @cython.boundscheck(False) -cpdef object _concat_date_cols(tuple date_cols, - bint keep_trivial_numbers=False): +cdef void concat_date_cols_numpy(tuple date_cols, object[:] result_view, + Py_ssize_t min_size, + bint keep_trivial_numbers=False): cdef: Py_ssize_t i, j, sequence_size = len(date_cols) - Py_ssize_t array_size, min_size = 0 - object[:] result_view - flatiter it - int all_numpy = 1 + list list_to_join cnp.ndarray[object] iters object[::1] iters_view - list list_to_join + flatiter it - if sequence_size == 0: - result = np.zeros(0, dtype=object) - elif sequence_size == 1: + if sequence_size == 1: array = date_cols[0] - array_size = len(array) - result = np.zeros(array_size, dtype=object) - result_view = result - if util.is_array(array): - # for numpy array case use special api for performance - it = PyArray_IterNew(array) - for i in range(array_size): + it = PyArray_IterNew(array) + for i in range(min_size): + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) + convert_and_set_item(item, i, result_view, keep_trivial_numbers) + PyArray_ITER_NEXT(it) + else: + list_to_join = [None] * sequence_size + # setup iterators + iters = np.zeros(sequence_size, dtype=object) + iters_view = iters + for i, array in enumerate(date_cols): + iters_view[i] = PyArray_IterNew(array) + for i in range(min_size): + for j, array in enumerate(date_cols): + it = iters_view[j] item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - convert_and_set_item(item, i, result_view, keep_trivial_numbers) + put_object_as_unicode(list_to_join, j, item) PyArray_ITER_NEXT(it) - else: - for i, item in enumerate(array): - convert_and_set_item(item, i, result_view, keep_trivial_numbers) + result_view[i] = PyUnicode_Join(' ', list_to_join) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void concat_date_cols_sequence(tuple date_cols, object[:] result_view, + Py_ssize_t min_size, + bint keep_trivial_numbers=False): + cdef: + Py_ssize_t i, j, sequence_size = len(date_cols) + list list_to_join + + if sequence_size == 1: + for i, item in enumerate(date_cols[0]): + convert_and_set_item(item, i, result_view, keep_trivial_numbers) else: - for i, array in enumerate(date_cols): - if not util.is_array(array): - all_numpy = 0 - # find min length for arrays in date_cols - # imitation python zip behavior - if len(array) < min_size or min_size == 0: - min_size = len(array) - - result = np.zeros(min_size, dtype=object) - result_view = result list_to_join = [None] * sequence_size + for i in range(min_size): + for j, array in enumerate(date_cols): + put_object_as_unicode(list_to_join, j, array[i]) + result_view[i] = PyUnicode_Join(' ', list_to_join) - if all_numpy: - # setup iterators - iters = np.zeros(sequence_size, dtype=object) - iters_view = iters - for i, array in enumerate(date_cols): - iters_view[i] = PyArray_IterNew(array) - # for numpy array case use special api for performance - for i in range(min_size): - for j, array in enumerate(date_cols): - it = iters_view[j] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - put_object_as_unicode(list_to_join, j, item) - PyArray_ITER_NEXT(it) - result_view[i] = PyUnicode_Join(' ', list_to_join) - else: - for i in range(min_size): - for j, array in enumerate(date_cols): - put_object_as_unicode(list_to_join, j, array[i]) - result_view[i] = PyUnicode_Join(' ', list_to_join) +cpdef object _concat_date_cols(tuple date_cols, + bint keep_trivial_numbers=False): + cdef: + Py_ssize_t min_size = 0, sequence_size = len(date_cols) + cnp.ndarray[object] result + int all_numpy = 1 + + if sequence_size == 0: + return np.zeros(0, dtype=object) + + for i, array in enumerate(date_cols): + if not util.is_array(array): + all_numpy = 0 + # find min length for arrays in date_cols + # imitation python zip behavior + if len(array) < min_size or min_size == 0: + min_size = len(array) + + result = np.zeros(min_size, dtype=object) + if all_numpy: + # call special function to increase performance + concat_date_cols_numpy(date_cols, result, min_size, + keep_trivial_numbers) + else: + concat_date_cols_sequence(date_cols, result, min_size, + keep_trivial_numbers) return result From ba6b86a66794e99f8975af3f6133c12978c7dd7d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Mar 2019 15:37:00 +0300 Subject: [PATCH 23/42] added error parameter for xstrtod call --- pandas/_libs/tslibs/parsing.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 2c41c569693a7..38bafe8c447f0 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -34,7 +34,7 @@ cdef extern from "../src/headers/portable.h": cdef extern from "../src/parser/tokenizer.h": double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing) + int skip_trailing, int *error) cdef extern from *: char* PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t* length) @@ -327,6 +327,7 @@ cpdef bint _does_string_look_like_datetime(object date_string): Py_ssize_t length = -1 double converted_date char first + int error = 0 if not get_string_data(date_string, &buf, &length): return False @@ -339,8 +340,9 @@ cpdef bint _does_string_look_like_datetime(object date_string): elif date_string in _not_datelike_strings: return False else: - converted_date = xstrtod(buf, &endptr, b'.', b'e', b'\0', 1) - if errno == 0 and endptr == buf + length: + converted_date = xstrtod(buf, &endptr, + b'.', b'e', b'\0', 1, &error) + if error == 0 and endptr == buf + length: return converted_date >= 1000 return True From 14b9cad42a7bd7f40a3570ab8b5842ac81161c00 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Mar 2019 16:03:17 +0300 Subject: [PATCH 24/42] removed Py_SIZE; renamed indexes --- pandas/_libs/lib.pyx | 93 +++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c7977bed86e9a..ec35518d3c745 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2323,7 +2323,7 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, float64_t float_item if keep_trivial_numbers: - if isinstance(item, int) and Py_SIZE(item) < 2: + if isinstance(item, int): if item == 0: do_convert = 0 elif isinstance(item, float): @@ -2348,83 +2348,78 @@ cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, @cython.wraparound(False) @cython.boundscheck(False) -cdef void concat_date_cols_numpy(tuple date_cols, object[:] result_view, - Py_ssize_t min_size, - bint keep_trivial_numbers=False): +cdef void _concat_date_cols_numpy(tuple date_cols, object[:] result_view, + Py_ssize_t rows_count, Py_ssize_t col_count, + bint keep_trivial_numbers): cdef: - Py_ssize_t i, j, sequence_size = len(date_cols) + Py_ssize_t col_idx, row_idx list list_to_join cnp.ndarray[object] iters object[::1] iters_view flatiter it - if sequence_size == 1: + if col_count == 1: array = date_cols[0] it = PyArray_IterNew(array) - for i in range(min_size): + for row_idx in range(rows_count): item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - convert_and_set_item(item, i, result_view, keep_trivial_numbers) + convert_and_set_item(item, row_idx, result_view, + keep_trivial_numbers) PyArray_ITER_NEXT(it) else: - list_to_join = [None] * sequence_size + list_to_join = [None] * col_count # setup iterators - iters = np.zeros(sequence_size, dtype=object) + iters = np.zeros(col_count, dtype=object) iters_view = iters - for i, array in enumerate(date_cols): - iters_view[i] = PyArray_IterNew(array) - for i in range(min_size): - for j, array in enumerate(date_cols): - it = iters_view[j] + for col_idx, array in enumerate(date_cols): + iters_view[col_idx] = PyArray_IterNew(array) + for row_idx in range(rows_count): + for col_idx, array in enumerate(date_cols): + it = iters_view[col_idx] item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - put_object_as_unicode(list_to_join, j, item) + put_object_as_unicode(list_to_join, col_idx, item) PyArray_ITER_NEXT(it) - result_view[i] = PyUnicode_Join(' ', list_to_join) + result_view[row_idx] = PyUnicode_Join(' ', list_to_join) @cython.wraparound(False) @cython.boundscheck(False) -cdef void concat_date_cols_sequence(tuple date_cols, object[:] result_view, - Py_ssize_t min_size, - bint keep_trivial_numbers=False): +cdef void _concat_date_cols_sequence(tuple date_cols, object[:] result_view, + Py_ssize_t rows_count, + Py_ssize_t col_count, + bint keep_trivial_numbers): cdef: - Py_ssize_t i, j, sequence_size = len(date_cols) + Py_ssize_t col_idx, row_idx list list_to_join - if sequence_size == 1: - for i, item in enumerate(date_cols[0]): - convert_and_set_item(item, i, result_view, keep_trivial_numbers) + if col_count == 1: + for row_idx, item in enumerate(date_cols[0]): + convert_and_set_item(item, row_idx, result_view, + keep_trivial_numbers) else: - list_to_join = [None] * sequence_size - for i in range(min_size): - for j, array in enumerate(date_cols): - put_object_as_unicode(list_to_join, j, array[i]) - result_view[i] = PyUnicode_Join(' ', list_to_join) + list_to_join = [None] * col_count + for row_idx in range(rows_count): + for col_idx, array in enumerate(date_cols): + put_object_as_unicode(list_to_join, col_idx, array[row_idx]) + result_view[row_idx] = PyUnicode_Join(' ', list_to_join) -cpdef object _concat_date_cols(tuple date_cols, - bint keep_trivial_numbers=False): +def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): cdef: - Py_ssize_t min_size = 0, sequence_size = len(date_cols) + Py_ssize_t rows_count = 0, col_count = len(date_cols) cnp.ndarray[object] result - int all_numpy = 1 - if sequence_size == 0: + if col_count == 0: return np.zeros(0, dtype=object) - for i, array in enumerate(date_cols): - if not util.is_array(array): - all_numpy = 0 - # find min length for arrays in date_cols - # imitation python zip behavior - if len(array) < min_size or min_size == 0: - min_size = len(array) - - result = np.zeros(min_size, dtype=object) - if all_numpy: - # call special function to increase performance - concat_date_cols_numpy(date_cols, result, min_size, - keep_trivial_numbers) + rows_count = min(len(array) for array in date_cols) + + result = np.zeros(rows_count, dtype=object) + if all(util.is_array(array) for array in date_cols): + # call specialized function to increase performance + _concat_date_cols_numpy(date_cols, result, rows_count, col_count, + keep_trivial_numbers) else: - concat_date_cols_sequence(date_cols, result, min_size, - keep_trivial_numbers) + _concat_date_cols_sequence(date_cols, result, rows_count, col_count, + keep_trivial_numbers) return result From 4e9211b8ff8f564f69dbf0cdfc923bac6a4405ed Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Fri, 29 Mar 2019 18:55:36 +0300 Subject: [PATCH 25/42] Switch to helper method for getting C buffer of string object --- pandas/_libs/tslibs/parsing.pyx | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 38bafe8c447f0..fe1121f0efa66 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -6,7 +6,6 @@ import time from io import StringIO from libc.string cimport strchr -from cpython cimport PyUnicode_Check, PyBytes_Check, PyBytes_AsStringAndSize from cpython.datetime cimport datetime, datetime_new, import_datetime from cpython.version cimport PY_VERSION_HEX @@ -36,16 +35,6 @@ cdef extern from "../src/parser/tokenizer.h": double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error) -cdef extern from *: - char* PyUnicode_AsUTF8AndSize(object unicode, Py_ssize_t* length) - -cdef inline bint get_string_data(object s, char **buf, Py_ssize_t *length): - if PyUnicode_Check(s): - buf[0] = PyUnicode_AsUTF8AndSize(s, length) - return buf[0] != NULL - if PyBytes_Check(s): - return PyBytes_AsStringAndSize(s, buf, length) == 0 - return False # ---------------------------------------------------------------------- # Constants @@ -322,15 +311,14 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, cpdef bint _does_string_look_like_datetime(object date_string): cdef: - char *buf = NULL + const char *buf char *endptr = NULL Py_ssize_t length = -1 double converted_date char first int error = 0 - if not get_string_data(date_string, &buf, &length): - return False + buf = get_c_string_buf_and_size(date_string, &length) if length >= 1: first = buf[0] if first == b'0': From 0aefa7bccde6a57cbdecfe48c4d9560c1c8116b6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 31 Mar 2019 22:50:44 +0300 Subject: [PATCH 26/42] changed return type in _concat_date_cols_* functions from void to cnp.ndarray[object] --- pandas/_libs/lib.pyx | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ec35518d3c745..e47c890b93f3f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2348,15 +2348,21 @@ cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, @cython.wraparound(False) @cython.boundscheck(False) -cdef void _concat_date_cols_numpy(tuple date_cols, object[:] result_view, - Py_ssize_t rows_count, Py_ssize_t col_count, - bint keep_trivial_numbers): +cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, + Py_ssize_t rows_count, + Py_ssize_t col_count, + bint keep_trivial_numbers): cdef: Py_ssize_t col_idx, row_idx list list_to_join cnp.ndarray[object] iters object[::1] iters_view flatiter it + cnp.ndarray[object] result + object[:] result_view + + result = np.zeros(rows_count, dtype=object) + result_view = result if col_count == 1: array = date_cols[0] @@ -2381,16 +2387,23 @@ cdef void _concat_date_cols_numpy(tuple date_cols, object[:] result_view, PyArray_ITER_NEXT(it) result_view[row_idx] = PyUnicode_Join(' ', list_to_join) + return result + @cython.wraparound(False) @cython.boundscheck(False) -cdef void _concat_date_cols_sequence(tuple date_cols, object[:] result_view, - Py_ssize_t rows_count, - Py_ssize_t col_count, - bint keep_trivial_numbers): +cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, + Py_ssize_t rows_count, + Py_ssize_t col_count, + bint keep_trivial_numbers): cdef: Py_ssize_t col_idx, row_idx list list_to_join + cnp.ndarray[object] result + object[:] result_view + + result = np.zeros(rows_count, dtype=object) + result_view = result if col_count == 1: for row_idx, item in enumerate(date_cols[0]): @@ -2403,6 +2416,8 @@ cdef void _concat_date_cols_sequence(tuple date_cols, object[:] result_view, put_object_as_unicode(list_to_join, col_idx, array[row_idx]) result_view[row_idx] = PyUnicode_Join(' ', list_to_join) + return result + def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): cdef: @@ -2414,12 +2429,11 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) if all(util.is_array(array) for array in date_cols): # call specialized function to increase performance - _concat_date_cols_numpy(date_cols, result, rows_count, col_count, - keep_trivial_numbers) + result = _concat_date_cols_numpy(date_cols, rows_count, col_count, + keep_trivial_numbers) else: - _concat_date_cols_sequence(date_cols, result, rows_count, col_count, - keep_trivial_numbers) + result = _concat_date_cols_sequence(date_cols, rows_count, col_count, + keep_trivial_numbers) return result From a3a0a7776f2f25a2f88c5f038118a32c77242e5b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 31 Mar 2019 23:46:42 +0300 Subject: [PATCH 27/42] added doc-string to _concat_date_cols* functions --- pandas/_libs/lib.pyx | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e47c890b93f3f..cee664d8a84cd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2352,6 +2352,28 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, Py_ssize_t rows_count, Py_ssize_t col_count, bint keep_trivial_numbers): + """ + Concatenates `rows_count` elements from each `col_count` numpy arrays + in `date_cols` into strings. + + Note + ---- + This function speeds up concatenation for numpy arrays. + You also can use `_concat_date_cols_sequence` function. + + Parameters + ---------- + date_cols : tuple of numpy arrays + rows_count : Py_ssize_t + col_count : Py_ssize_t + keep_trivial_numbers : bool, default False + If True, then for the case of one sequence in `date_cols`, + conversion (to string from integer/float zero) is not performed + + Returns + ------- + arr_of_rows : 1-d numpy array + """ cdef: Py_ssize_t col_idx, row_idx list list_to_join @@ -2396,6 +2418,23 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, Py_ssize_t rows_count, Py_ssize_t col_count, bint keep_trivial_numbers): + """ + Concatenates `rows_count` elements from each `col_count` sequences + in `date_cols` into strings. + + Parameters + ---------- + date_cols : tuple of sequences + rows_count : Py_ssize_t + col_count : Py_ssize_t + keep_trivial_numbers : bool, default False + If True, then for the case of one sequence in `date_cols`, + conversion (to string from integer/float zero) is not performed + + Returns + ------- + arr_of_rows : 1-d numpy array + """ cdef: Py_ssize_t col_idx, row_idx list list_to_join @@ -2420,6 +2459,28 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): + """ + Concatenates elements from sequences in `date_cols` into strings. + + Parameters + ---------- + date_cols : tuple of sequences + keep_trivial_numbers : bool, default False + If True, then for the case of one sequence in `date_cols`, + conversion (to string from integer/float zero) is not performed + + Returns + ------- + arr_of_rows : 1-d numpy array + + Examples + -------- + >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) + >>> times=np.array(['11:20', '10:45'], dtype=object) + >>> result = _concat_date_cols((dates, times)) + >>> result + array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) + """ cdef: Py_ssize_t rows_count = 0, col_count = len(date_cols) cnp.ndarray[object] result From f1ae23cb4e3a135d244cc4015b1cadabf2343c47 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 1 Apr 2019 15:11:36 +0300 Subject: [PATCH 28/42] added doc-string for convert_and_set_item func; removed isinstance(item, bytes) check --- pandas/_libs/lib.pyx | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cee664d8a84cd..cf430303ecd3b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2318,6 +2318,17 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): cdef inline void convert_and_set_item(object item, Py_ssize_t index, object[:] result, bint keep_trivial_numbers): + """ + Convert `item` to str and set into result[index]. + + Parameters + ---------- + item : object + index : Py_ssize_t + keep_trivial_numbers : bool, default False + If `keep_trivial_numbers` is True, then conversion + (to string from integer/float zero) is not performed + """ cdef: bint do_convert = 1 float64_t float_item @@ -2331,7 +2342,7 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, if float_item == 0.0 or float_item != float_item: do_convert = 0 - if do_convert and not isinstance(item, (str, bytes)): + if do_convert and not isinstance(item, str): item = PyObject_Str(item) result[index] = item @@ -2483,7 +2494,6 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): """ cdef: Py_ssize_t rows_count = 0, col_count = len(date_cols) - cnp.ndarray[object] result if col_count == 0: return np.zeros(0, dtype=object) @@ -2492,9 +2502,8 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): if all(util.is_array(array) for array in date_cols): # call specialized function to increase performance - result = _concat_date_cols_numpy(date_cols, rows_count, col_count, - keep_trivial_numbers) + return _concat_date_cols_numpy(date_cols, rows_count, col_count, + keep_trivial_numbers) else: - result = _concat_date_cols_sequence(date_cols, rows_count, col_count, - keep_trivial_numbers) - return result + return _concat_date_cols_sequence(date_cols, rows_count, col_count, + keep_trivial_numbers) From 8797e5323368f5cca4ef06e4fec79488255100ed Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 1 Apr 2019 15:15:10 +0300 Subject: [PATCH 29/42] fix docstrings --- pandas/_libs/lib.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cf430303ecd3b..d8bfaacae1cbe 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2325,9 +2325,10 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, ---------- item : object index : Py_ssize_t - keep_trivial_numbers : bool, default False - If `keep_trivial_numbers` is True, then conversion - (to string from integer/float zero) is not performed + result : memoryview of 1-d ndarray + keep_trivial_numbers : bool + if True, then conversion (to string from integer/float zero) + is not performed """ cdef: bint do_convert = 1 @@ -2378,12 +2379,12 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, rows_count : Py_ssize_t col_count : Py_ssize_t keep_trivial_numbers : bool, default False - If True, then for the case of one sequence in `date_cols`, + if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed Returns ------- - arr_of_rows : 1-d numpy array + arr_of_rows : ndarray (dtype=object) """ cdef: Py_ssize_t col_idx, row_idx @@ -2439,12 +2440,12 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, rows_count : Py_ssize_t col_count : Py_ssize_t keep_trivial_numbers : bool, default False - If True, then for the case of one sequence in `date_cols`, + if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed Returns ------- - arr_of_rows : 1-d numpy array + arr_of_rows : ndarray (dtype=object) """ cdef: Py_ssize_t col_idx, row_idx @@ -2477,12 +2478,12 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): ---------- date_cols : tuple of sequences keep_trivial_numbers : bool, default False - If True, then for the case of one sequence in `date_cols`, + if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed Returns ------- - arr_of_rows : 1-d numpy array + arr_of_rows : ndarray (dtype=object) Examples -------- From 3bdb452999b8cf29f09ab78a89daccda4c11b2cb Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 22:09:48 +0300 Subject: [PATCH 30/42] currently only one conversion function is used - convert_to_unicode --- pandas/_libs/lib.pyx | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d8bfaacae1cbe..68304e6213f79 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2315,20 +2315,21 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): @cython.wraparound(False) @cython.boundscheck(False) -cdef inline void convert_and_set_item(object item, Py_ssize_t index, - object[:] result, +cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers): """ - Convert `item` to str and set into result[index]. + Convert `item` to str. Parameters ---------- item : object - index : Py_ssize_t - result : memoryview of 1-d ndarray keep_trivial_numbers : bool if True, then conversion (to string from integer/float zero) is not performed + + Returns + ------- + str """ cdef: bint do_convert = 1 @@ -2346,16 +2347,7 @@ cdef inline void convert_and_set_item(object item, Py_ssize_t index, if do_convert and not isinstance(item, str): item = PyObject_Str(item) - result[index] = item - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline void put_object_as_unicode(list lst, Py_ssize_t idx, - object item): - if not isinstance(item, str): - item = PyObject_Str(item) - lst[idx] = item + return item @cython.wraparound(False) @@ -2403,8 +2395,8 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, it = PyArray_IterNew(array) for row_idx in range(rows_count): item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - convert_and_set_item(item, row_idx, result_view, - keep_trivial_numbers) + result_view[row_idx] = convert_to_unicode(item, + keep_trivial_numbers) PyArray_ITER_NEXT(it) else: list_to_join = [None] * col_count @@ -2417,7 +2409,7 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, for col_idx, array in enumerate(date_cols): it = iters_view[col_idx] item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - put_object_as_unicode(list_to_join, col_idx, item) + list_to_join[col_idx] = convert_to_unicode(item, False) PyArray_ITER_NEXT(it) result_view[row_idx] = PyUnicode_Join(' ', list_to_join) @@ -2458,13 +2450,14 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, if col_count == 1: for row_idx, item in enumerate(date_cols[0]): - convert_and_set_item(item, row_idx, result_view, - keep_trivial_numbers) + result_view[row_idx] = convert_to_unicode(item, + keep_trivial_numbers) else: list_to_join = [None] * col_count for row_idx in range(rows_count): for col_idx, array in enumerate(date_cols): - put_object_as_unicode(list_to_join, col_idx, array[row_idx]) + list_to_join[col_idx] = convert_to_unicode(array[row_idx], + False) result_view[row_idx] = PyUnicode_Join(' ', list_to_join) return result From dcbcd9a4dbce3d0ce1e442dc32664f2ecb6500e2 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 22:41:00 +0300 Subject: [PATCH 31/42] added some comments in _concat_date_cols_numpy --- pandas/_libs/lib.pyx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 68304e6213f79..68dec7e85fb9b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2399,14 +2399,19 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, keep_trivial_numbers) PyArray_ITER_NEXT(it) else: + # create fixed size list - more effecient memory allocation list_to_join = [None] * col_count - # setup iterators iters = np.zeros(col_count, dtype=object) + # create memoryview of iters ndarray, that will contain some + # flatiter's for each array in `date_cols` - more effecient indexing iters_view = iters for col_idx, array in enumerate(date_cols): iters_view[col_idx] = PyArray_IterNew(array) + # array elements that are on the same line are converted to one string for row_idx in range(rows_count): for col_idx, array in enumerate(date_cols): + # this cast is needed, because we did not find a way + # to efficiently store `flatiter` type objects in ndarray it = iters_view[col_idx] item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) list_to_join[col_idx] = convert_to_unicode(item, False) @@ -2446,6 +2451,7 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, object[:] result_view result = np.zeros(rows_count, dtype=object) + # create memoryview of result ndarray - more effecient indexing result_view = result if col_count == 1: From 1d9c7b768cb84b25a8a78e64a519522b83591b3a Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 22:58:12 +0300 Subject: [PATCH 32/42] fix problem from rebase --- pandas/_libs/tslibs/parsing.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index fe1121f0efa66..4d17f8d2c6273 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -33,7 +33,7 @@ cdef extern from "../src/headers/portable.h": cdef extern from "../src/parser/tokenizer.h": double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *error) + int skip_trailing, int *error, int *maybe_int) # ---------------------------------------------------------------------- @@ -329,7 +329,7 @@ cpdef bint _does_string_look_like_datetime(object date_string): return False else: converted_date = xstrtod(buf, &endptr, - b'.', b'e', b'\0', 1, &error) + b'.', b'e', b'\0', 1, &error, NULL) if error == 0 and endptr == buf + length: return converted_date >= 1000 From b4fc8876e62234aaf75bbdc90ca54b86da46ae67 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 23:33:29 +0300 Subject: [PATCH 33/42] added some comments in _does_string_look_like_datetime --- pandas/_libs/tslibs/parsing.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 4d17f8d2c6273..ce1123670022b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -328,8 +328,15 @@ cpdef bint _does_string_look_like_datetime(object date_string): elif date_string in _not_datelike_strings: return False else: + # xstrtod with such paramaters copies behavior of python `float` + # cast; for example, " 35.e-1 " is valid string for this cast so, + # for correctly xstrtod call necessary to pass these params: + # b'.' - a dot is used as separator, b'e' - an exponential form of + # a float number can be used, b'\0' - not to use a thousand + # separator, 1 - skip extra spaces before and after, converted_date = xstrtod(buf, &endptr, b'.', b'e', b'\0', 1, &error, NULL) + # if there were no errors and the whole line was parsed, then ... if error == 0 and endptr == buf + length: return converted_date >= 1000 From 25ee2d27e7d2df1f27baee963ded627a22cf5ce5 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Apr 2019 23:47:23 +0300 Subject: [PATCH 34/42] changed default value of keep_trivial_numbers to true --- pandas/_libs/lib.pyx | 4 ++-- pandas/io/parsers.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 68dec7e85fb9b..675b059627036 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2469,14 +2469,14 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, return result -def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=False): +def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): """ Concatenates elements from sequences in `date_cols` into strings. Parameters ---------- date_cols : tuple of sequences - keep_trivial_numbers : bool, default False + keep_trivial_numbers : bool, default True if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2f8aa29162a24..939bb6ad287e2 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3187,7 +3187,7 @@ def _make_date_converter(date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True): def converter(*date_cols): if date_parser is None: - strs = _concat_date_cols(date_cols, keep_trivial_numbers=True) + strs = _concat_date_cols(date_cols) try: return tools.to_datetime( From 2046dcb0f99bbd76975fa9a1bcd119495243016e Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Mon, 29 Apr 2019 07:30:05 -0500 Subject: [PATCH 35/42] Remove not needed try..except in _does_string_look_like_datetime benchmark --- asv_bench/benchmarks/io/parsers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index f453705c5f859..46abb00a727da 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -15,10 +15,7 @@ def setup(self, value): def time_check_datetimes(self, value): for obj in self.objects: - try: - _does_string_look_like_datetime(obj) - except ValueError: - pass + _does_string_look_like_datetime(obj) class ConcatDateCols(object): From 28b66704992e0d4c6df5290792c0282f8232478e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 29 Apr 2019 16:10:06 +0300 Subject: [PATCH 36/42] upgraded doc-ststring; added some blank lines --- pandas/_libs/lib.pyx | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 675b059627036..586358750c3c5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2357,19 +2357,15 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, Py_ssize_t col_count, bint keep_trivial_numbers): """ - Concatenates `rows_count` elements from each `col_count` numpy arrays - in `date_cols` into strings. - - Note - ---- - This function speeds up concatenation for numpy arrays. - You also can use `_concat_date_cols_sequence` function. + Concatenates elements from numpy arrays into strings. Parameters ---------- date_cols : tuple of numpy arrays rows_count : Py_ssize_t + count of elements from arrays that will be concatenated col_count : Py_ssize_t + count of arrays whose elements will be concatenated keep_trivial_numbers : bool, default False if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed @@ -2377,6 +2373,11 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, Returns ------- arr_of_rows : ndarray (dtype=object) + + Notes + ----- + This function speeds up concatenation for numpy arrays. + You also can use `_concat_date_cols_sequence` function. """ cdef: Py_ssize_t col_idx, row_idx @@ -2402,11 +2403,13 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, # create fixed size list - more effecient memory allocation list_to_join = [None] * col_count iters = np.zeros(col_count, dtype=object) + # create memoryview of iters ndarray, that will contain some # flatiter's for each array in `date_cols` - more effecient indexing iters_view = iters for col_idx, array in enumerate(date_cols): iters_view[col_idx] = PyArray_IterNew(array) + # array elements that are on the same line are converted to one string for row_idx in range(rows_count): for col_idx, array in enumerate(date_cols): @@ -2428,14 +2431,15 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, Py_ssize_t col_count, bint keep_trivial_numbers): """ - Concatenates `rows_count` elements from each `col_count` sequences - in `date_cols` into strings. + Concatenates elements from sequences into strings. Parameters ---------- date_cols : tuple of sequences rows_count : Py_ssize_t + count of elements from sequences that will be concatenated col_count : Py_ssize_t + count of sequences whose elements will be concatenated keep_trivial_numbers : bool, default False if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed @@ -2451,6 +2455,7 @@ cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, object[:] result_view result = np.zeros(rows_count, dtype=object) + # create memoryview of result ndarray - more effecient indexing result_view = result From 30f70ab6a02e367f5cbbb28460c6ad5a5eddb570 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2019 12:43:43 +0300 Subject: [PATCH 37/42] removed '_concat_date_cols_sequence' func --- asv_bench/benchmarks/io/parsers.py | 16 ++++----- pandas/_libs/lib.pyx | 57 ++---------------------------- 2 files changed, 10 insertions(+), 63 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 46abb00a727da..8a54b8c044ec6 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -20,16 +20,16 @@ def time_check_datetimes(self, value): class ConcatDateCols(object): - params = ([1234567890, 'AAAA'], [1, 2], [np.array, list]) - param_names = ['value', 'dim', 'container'] + params = ([1234567890, 'AAAA'], [1, 2]) + param_names = ['value', 'dim'] - def setup(self, value, dim, container): - count_elem = 10000 + def setup(self, value, dim): + count_elem = 100000 if dim == 1: - self.object = (container([value] * count_elem),) + self.object = (np.array([value] * count_elem),) if dim == 2: - self.object = (container([value] * count_elem), - container([value] * count_elem)) + self.object = (np.array([value] * count_elem), + np.array([value] * count_elem)) - def time_check_concat(self, value, dim, container): + def time_check_concat(self, value, dim): _concat_date_cols(self.object) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 586358750c3c5..900230a5ebf89 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2424,59 +2424,9 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, return result -@cython.wraparound(False) -@cython.boundscheck(False) -cdef cnp.ndarray[object] _concat_date_cols_sequence(tuple date_cols, - Py_ssize_t rows_count, - Py_ssize_t col_count, - bint keep_trivial_numbers): - """ - Concatenates elements from sequences into strings. - - Parameters - ---------- - date_cols : tuple of sequences - rows_count : Py_ssize_t - count of elements from sequences that will be concatenated - col_count : Py_ssize_t - count of sequences whose elements will be concatenated - keep_trivial_numbers : bool, default False - if True and len(date_cols) == 1, then - conversion (to string from integer/float zero) is not performed - - Returns - ------- - arr_of_rows : ndarray (dtype=object) - """ - cdef: - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] result - object[:] result_view - - result = np.zeros(rows_count, dtype=object) - - # create memoryview of result ndarray - more effecient indexing - result_view = result - - if col_count == 1: - for row_idx, item in enumerate(date_cols[0]): - result_view[row_idx] = convert_to_unicode(item, - keep_trivial_numbers) - else: - list_to_join = [None] * col_count - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - list_to_join[col_idx] = convert_to_unicode(array[row_idx], - False) - result_view[row_idx] = PyUnicode_Join(' ', list_to_join) - - return result - - def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): """ - Concatenates elements from sequences in `date_cols` into strings. + Concatenates elements from numpy arrays in `date_cols` into strings. Parameters ---------- @@ -2506,9 +2456,6 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): rows_count = min(len(array) for array in date_cols) if all(util.is_array(array) for array in date_cols): - # call specialized function to increase performance return _concat_date_cols_numpy(date_cols, rows_count, col_count, keep_trivial_numbers) - else: - return _concat_date_cols_sequence(date_cols, rows_count, col_count, - keep_trivial_numbers) + raise ValueError("not all elements from date_cols are numpy arrays") From 3800c406f0292eb4580a17dfd4b5bbc0692e9d1f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2019 13:47:37 +0300 Subject: [PATCH 38/42] now only one function '_concat_date_cols' --- asv_bench/benchmarks/io/parsers.py | 2 +- pandas/_libs/lib.pyx | 70 ++++++++---------------------- 2 files changed, 20 insertions(+), 52 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 8a54b8c044ec6..6ee935e5ea51d 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -24,7 +24,7 @@ class ConcatDateCols(object): param_names = ['value', 'dim'] def setup(self, value, dim): - count_elem = 100000 + count_elem = 10000 if dim == 1: self.object = (np.array([value] * count_elem),) if dim == 2: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 900230a5ebf89..d61df351de9ea 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2352,21 +2352,14 @@ cdef inline object convert_to_unicode(object item, @cython.wraparound(False) @cython.boundscheck(False) -cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, - Py_ssize_t rows_count, - Py_ssize_t col_count, - bint keep_trivial_numbers): +def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): """ - Concatenates elements from numpy arrays into strings. + Concatenates elements from numpy arrays in `date_cols` into strings. Parameters ---------- date_cols : tuple of numpy arrays - rows_count : Py_ssize_t - count of elements from arrays that will be concatenated - col_count : Py_ssize_t - count of arrays whose elements will be concatenated - keep_trivial_numbers : bool, default False + keep_trivial_numbers : bool, default True if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed @@ -2374,12 +2367,16 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, ------- arr_of_rows : ndarray (dtype=object) - Notes - ----- - This function speeds up concatenation for numpy arrays. - You also can use `_concat_date_cols_sequence` function. + Examples + -------- + >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) + >>> times=np.array(['11:20', '10:45'], dtype=object) + >>> result = _concat_date_cols((dates, times)) + >>> result + array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) """ cdef: + Py_ssize_t rows_count = 0, col_count = len(date_cols) Py_ssize_t col_idx, row_idx list list_to_join cnp.ndarray[object] iters @@ -2388,6 +2385,14 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, cnp.ndarray[object] result object[:] result_view + if col_count == 0: + return np.zeros(0, dtype=object) + + + if not all(util.is_array(array) for array in date_cols): + raise ValueError("not all elements from date_cols are numpy arrays") + + rows_count = min(len(array) for array in date_cols) result = np.zeros(rows_count, dtype=object) result_view = result @@ -2422,40 +2427,3 @@ cdef cnp.ndarray[object] _concat_date_cols_numpy(tuple date_cols, result_view[row_idx] = PyUnicode_Join(' ', list_to_join) return result - - -def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): - """ - Concatenates elements from numpy arrays in `date_cols` into strings. - - Parameters - ---------- - date_cols : tuple of sequences - keep_trivial_numbers : bool, default True - if True and len(date_cols) == 1, then - conversion (to string from integer/float zero) is not performed - - Returns - ------- - arr_of_rows : ndarray (dtype=object) - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = _concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - - if col_count == 0: - return np.zeros(0, dtype=object) - - rows_count = min(len(array) for array in date_cols) - - if all(util.is_array(array) for array in date_cols): - return _concat_date_cols_numpy(date_cols, rows_count, col_count, - keep_trivial_numbers) - raise ValueError("not all elements from date_cols are numpy arrays") From b45df3f564c24d0a60fa9eb759d77a237b700161 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2019 12:19:21 +0300 Subject: [PATCH 39/42] removed 'do_convert' local var from 'convert_to_unicode' --- pandas/_libs/lib.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d61df351de9ea..526bf00f21c0c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2329,22 +2329,21 @@ cdef inline object convert_to_unicode(object item, Returns ------- - str + str or int or float """ cdef: - bint do_convert = 1 float64_t float_item if keep_trivial_numbers: if isinstance(item, int): if item == 0: - do_convert = 0 + return item elif isinstance(item, float): float_item = item if float_item == 0.0 or float_item != float_item: - do_convert = 0 + return item - if do_convert and not isinstance(item, str): + if not isinstance(item, str): item = PyObject_Str(item) return item @@ -2388,7 +2387,6 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): if col_count == 0: return np.zeros(0, dtype=object) - if not all(util.is_array(array) for array in date_cols): raise ValueError("not all elements from date_cols are numpy arrays") From 43dffec444a0a5451fe402822cda70fa27d53ba9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2019 12:54:50 +0300 Subject: [PATCH 40/42] replaced '_concat_date_cols' and 'convert_to_unicode' from lib.pyx to parsing.pyx --- asv_bench/benchmarks/io/parsers.py | 5 +- pandas/_libs/lib.pyx | 114 ------------------- pandas/_libs/tslibs/parsing.pyx | 124 ++++++++++++++++++++- pandas/io/parsers.py | 11 +- pandas/tests/io/parser/test_parse_dates.py | 2 +- 5 files changed, 131 insertions(+), 125 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 6ee935e5ea51d..493955d394443 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,8 +1,7 @@ import numpy as np -from pandas._libs.tslibs.parsing import _does_string_look_like_datetime - -from pandas.io.parsers import _concat_date_cols +from pandas._libs.tslibs.parsing import ( + _concat_date_cols, _does_string_look_like_datetime) class DoesStringLookLikeDatetime(object): diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 526bf00f21c0c..c09fb96eb9182 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2311,117 +2311,3 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline object convert_to_unicode(object item, - bint keep_trivial_numbers): - """ - Convert `item` to str. - - Parameters - ---------- - item : object - keep_trivial_numbers : bool - if True, then conversion (to string from integer/float zero) - is not performed - - Returns - ------- - str or int or float - """ - cdef: - float64_t float_item - - if keep_trivial_numbers: - if isinstance(item, int): - if item == 0: - return item - elif isinstance(item, float): - float_item = item - if float_item == 0.0 or float_item != float_item: - return item - - if not isinstance(item, str): - item = PyObject_Str(item) - - return item - - -@cython.wraparound(False) -@cython.boundscheck(False) -def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): - """ - Concatenates elements from numpy arrays in `date_cols` into strings. - - Parameters - ---------- - date_cols : tuple of numpy arrays - keep_trivial_numbers : bool, default True - if True and len(date_cols) == 1, then - conversion (to string from integer/float zero) is not performed - - Returns - ------- - arr_of_rows : ndarray (dtype=object) - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = _concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] iters - object[::1] iters_view - flatiter it - cnp.ndarray[object] result - object[:] result_view - - if col_count == 0: - return np.zeros(0, dtype=object) - - if not all(util.is_array(array) for array in date_cols): - raise ValueError("not all elements from date_cols are numpy arrays") - - rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) - result_view = result - - if col_count == 1: - array = date_cols[0] - it = PyArray_IterNew(array) - for row_idx in range(rows_count): - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - result_view[row_idx] = convert_to_unicode(item, - keep_trivial_numbers) - PyArray_ITER_NEXT(it) - else: - # create fixed size list - more effecient memory allocation - list_to_join = [None] * col_count - iters = np.zeros(col_count, dtype=object) - - # create memoryview of iters ndarray, that will contain some - # flatiter's for each array in `date_cols` - more effecient indexing - iters_view = iters - for col_idx, array in enumerate(date_cols): - iters_view[col_idx] = PyArray_IterNew(array) - - # array elements that are on the same line are converted to one string - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - # this cast is needed, because we did not find a way - # to efficiently store `flatiter` type objects in ndarray - it = iters_view[col_idx] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - list_to_join[col_idx] = convert_to_unicode(item, False) - PyArray_ITER_NEXT(it) - result_view[row_idx] = PyUnicode_Join(' ', list_to_join) - - return result diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ce1123670022b..5f8a817db053e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,11 +7,19 @@ from io import StringIO from libc.string cimport strchr +import cython + +from cpython cimport PyObject_Str, PyUnicode_Join + from cpython.datetime cimport datetime, datetime_new, import_datetime from cpython.version cimport PY_VERSION_HEX import_datetime() import numpy as np +cimport numpy as cnp +from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, + PyArray_IterNew, flatiter, float64_t) +cnp.import_array() # dateutil compat from dateutil.tz import (tzoffset, @@ -26,7 +34,7 @@ from pandas._config import get_option from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS from pandas._libs.tslibs.nattype import nat_strings, NaT -from pandas._libs.tslibs.util cimport get_c_string_buf_and_size +from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil @@ -880,3 +888,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, return guessed_format else: return None + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline object convert_to_unicode(object item, + bint keep_trivial_numbers): + """ + Convert `item` to str. + + Parameters + ---------- + item : object + keep_trivial_numbers : bool + if True, then conversion (to string from integer/float zero) + is not performed + + Returns + ------- + str or int or float + """ + cdef: + float64_t float_item + + if keep_trivial_numbers: + if isinstance(item, int): + if item == 0: + return item + elif isinstance(item, float): + float_item = item + if float_item == 0.0 or float_item != float_item: + return item + + if not isinstance(item, str): + item = PyObject_Str(item) + + return item + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): + """ + Concatenates elements from numpy arrays in `date_cols` into strings. + + Parameters + ---------- + date_cols : tuple of numpy arrays + keep_trivial_numbers : bool, default True + if True and len(date_cols) == 1, then + conversion (to string from integer/float zero) is not performed + + Returns + ------- + arr_of_rows : ndarray (dtype=object) + + Examples + -------- + >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) + >>> times=np.array(['11:20', '10:45'], dtype=object) + >>> result = _concat_date_cols((dates, times)) + >>> result + array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) + """ + cdef: + Py_ssize_t rows_count = 0, col_count = len(date_cols) + Py_ssize_t col_idx, row_idx + list list_to_join + cnp.ndarray[object] iters + object[::1] iters_view + flatiter it + cnp.ndarray[object] result + object[:] result_view + + if col_count == 0: + return np.zeros(0, dtype=object) + + if not all(is_array(array) for array in date_cols): + raise ValueError("not all elements from date_cols are numpy arrays") + + rows_count = min(len(array) for array in date_cols) + result = np.zeros(rows_count, dtype=object) + result_view = result + + if col_count == 1: + array = date_cols[0] + it = PyArray_IterNew(array) + for row_idx in range(rows_count): + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) + result_view[row_idx] = convert_to_unicode(item, + keep_trivial_numbers) + PyArray_ITER_NEXT(it) + else: + # create fixed size list - more effecient memory allocation + list_to_join = [None] * col_count + iters = np.zeros(col_count, dtype=object) + + # create memoryview of iters ndarray, that will contain some + # flatiter's for each array in `date_cols` - more effecient indexing + iters_view = iters + for col_idx, array in enumerate(date_cols): + iters_view[col_idx] = PyArray_IterNew(array) + + # array elements that are on the same line are converted to one string + for row_idx in range(rows_count): + for col_idx, array in enumerate(date_cols): + # this cast is needed, because we did not find a way + # to efficiently store `flatiter` type objects in ndarray + it = iters_view[col_idx] + item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) + list_to_join[col_idx] = convert_to_unicode(item, False) + PyArray_ITER_NEXT(it) + result_view[row_idx] = PyUnicode_Join(' ', list_to_join) + + return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 939bb6ad287e2..f25142fcfcf58 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -14,7 +14,6 @@ import numpy as np import pandas._libs.lib as lib -from pandas._libs.lib import _concat_date_cols import pandas._libs.ops as libops import pandas._libs.parsers as parsers from pandas._libs.tslibs import parsing @@ -3187,7 +3186,7 @@ def _make_date_converter(date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True): def converter(*date_cols): if date_parser is None: - strs = _concat_date_cols(date_cols) + strs = parsing._concat_date_cols(date_cols) try: return tools.to_datetime( @@ -3217,10 +3216,10 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - parsing.try_parse_dates(_concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst), - cache=cache_dates, + parsing.try_parse_dates( + parsing._concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst), errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 6c4dfe2ffa1fa..709ca3e686229 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -76,7 +76,7 @@ def date_parser(*date_cols): ------- parsed : Series """ - return parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + return parsing.try_parse_dates(parsing._concat_date_cols(date_cols)) result = parser.read_csv(StringIO(data), header=None, date_parser=date_parser, prefix="X", From c06a662fe681c9d7c68d1a9289e93e466fc94311 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 7 May 2019 14:43:31 +0300 Subject: [PATCH 41/42] added 'test_concat_date_col_fail' test --- pandas/_libs/tslibs/parsing.pyx | 2 -- pandas/tests/io/parser/test_parse_dates.py | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5f8a817db053e..18d55999aaa38 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -58,8 +58,6 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, cdef: set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} - set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} - # ---------------------------------------------------------------------- cdef: const char* delimiters = " /-." diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 709ca3e686229..46353b5345018 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -19,12 +19,11 @@ from pandas.compat.numpy import np_array_datetime64_compat import pandas as pd -from pandas import DataFrame, DatetimeIndex, Index, MultiIndex +from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series from pandas.core.indexes.datetimes import date_range import pandas.util.testing as tm import pandas.io.date_converters as conv -import pandas.io.parsers as parsers # constant _DEFAULT_DATETIME = datetime(1, 1, 1) @@ -117,6 +116,18 @@ def date_parser(*date_cols): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("container", [list, tuple, Index, Series]) +@pytest.mark.parametrize("dim", [1, 2]) +def test_concat_date_col_fail(container, dim): + msg = "not all elements from date_cols are numpy arrays" + value = "19990127" + + date_cols = tuple(container([value]) for _ in range(dim)) + + with pytest.raises(ValueError, match=msg): + parsing._concat_date_cols(date_cols) + + @pytest.mark.parametrize("keep_date_col", [True, False]) def test_multiple_date_col(all_parsers, keep_date_col): data = """\ From 5dda33c5469bb9024db468638fb96101beafb5da Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 10 May 2019 00:11:33 +0300 Subject: [PATCH 42/42] added doc-string to '_does_string_look_like_datetime' func; changed 'date_string' -> 'py_string' --- pandas/_libs/tslibs/parsing.pyx | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 18d55999aaa38..068ad016459a8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -315,7 +315,19 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso -cpdef bint _does_string_look_like_datetime(object date_string): +cpdef bint _does_string_look_like_datetime(object py_string): + """ + Checks whether given string is a datetime: it has to start with '0' or + be greater than 1000. + + Parameters + ---------- + py_string: object + + Returns + ------- + whether given string is a datetime + """ cdef: const char *buf char *endptr = NULL @@ -324,14 +336,14 @@ cpdef bint _does_string_look_like_datetime(object date_string): char first int error = 0 - buf = get_c_string_buf_and_size(date_string, &length) + buf = get_c_string_buf_and_size(py_string, &length) if length >= 1: first = buf[0] if first == b'0': # Strings starting with 0 are more consistent with a # date-like string than a number return True - elif date_string in _not_datelike_strings: + elif py_string in _not_datelike_strings: return False else: # xstrtod with such paramaters copies behavior of python `float`