diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7a9db38957e94..a6512080eb428 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -187,9 +187,11 @@ cdef extern from "parser/tokenizer.h": int64_t skipfooter # pick one, depending on whether the converter requires GIL float64_t (*double_converter_nogil)(const char *, char **, - char, char, char, int, int *) nogil + char, char, char, + int, int *, int *) nogil float64_t (*double_converter_withgil)(const char *, char **, - char, char, char, int) + char, char, char, + int, int *, int *) # error handling char *warn_msg @@ -237,12 +239,15 @@ cdef extern from "parser/tokenizer.h": uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) nogil - float64_t xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *error) nogil - float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *error) nogil - float64_t round_trip(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) nogil + float64_t xstrtod(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) nogil + float64_t precise_xstrtod(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) nogil + float64_t round_trip(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) nogil int to_boolean(const char *item, uint8_t *val) nogil @@ -1737,7 +1742,8 @@ cdef _try_double(parser_t *parser, int64_t col, assert parser.double_converter_withgil != NULL error = _try_double_nogil(parser, parser.double_converter_withgil, col, line_start, line_end, na_filter, na_hashset, use_na_flist, @@ -1751,7 +1757,7 @@ cdef _try_double(parser_t *parser, int64_t col, cdef inline int _try_double_nogil(parser_t *parser, 
float64_t (*double_converter)( const char *, char **, char, - char, char, int, int *) nogil, + char, char, int, int *, int *) nogil, int col, int line_start, int line_end, bint na_filter, kh_str_starts_t *na_hashset, bint use_na_flist, @@ -1780,7 +1786,7 @@ cdef inline int _try_double_nogil(parser_t *parser, else: data[0] = double_converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, - 1, &error) + 1, &error, NULL) if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or @@ -1800,7 +1806,8 @@ cdef inline int _try_double_nogil(parser_t *parser, for i in range(lines): COLITER_NEXT(it, word) data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1, &error) + parser.sci, parser.thousands, + 1, &error, NULL) if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 6fcd2ed0a9ea0..1db1878a8a773 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -10,21 +10,19 @@ The full license is in the LICENSE file, distributed with this software. 
#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_ #define PANDAS__LIBS_SRC_PARSE_HELPER_H_ -#include #include #include "inline_helper.h" #include "headers/portable.h" - -static double xstrtod(const char *p, char **q, char decimal, char sci, - int skip_trailing, int *maybe_int); +#include "parser/tokenizer.h" int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int) { char *p_end = NULL; + int error = 0; - *p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int); + *p_value = xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); - return (errno == 0) && (!*p_end); + return (error == 0) && (!*p_end); } #if PY_VERSION_HEX < 0x02060000 @@ -82,61 +80,8 @@ int floatify(PyObject *str, double *result, int *maybe_int) { PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data); Py_XDECREF(tmp); return -1; - - /* - #if PY_VERSION_HEX >= 0x03000000 - return PyFloat_FromString(str); - #else - return PyFloat_FromString(str, NULL); - #endif - */ } -// --------------------------------------------------------------------------- -// Implementation of xstrtod - -// -// strtod.c -// -// Convert string to double -// -// Copyright (C) 2002 Michael Ringgaard. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// 3. Neither the name of the project nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND -// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -// SUCH DAMAGE. -// -// ----------------------------------------------------------------------- -// Modifications by Warren Weckesser, March 2011: -// * Rename strtod() to xstrtod(). -// * Added decimal and sci arguments. -// * Skip trailing spaces. -// * Commented out the other functions. -// - PANDAS_INLINE void lowercase(char *p) { for (; *p; ++p) *p = tolower_ascii(*p); } @@ -145,130 +90,4 @@ PANDAS_INLINE void uppercase(char *p) { for (; *p; ++p) *p = toupper_ascii(*p); } -static double xstrtod(const char *str, char **endptr, char decimal, char sci, - int skip_trailing, int *maybe_int) { - double number; - int exponent; - int negative; - char *p = (char *)str; - double p10; - int n; - int num_digits; - int num_decimals; - - errno = 0; - *maybe_int = 1; - - // Skip leading whitespace - while (isspace(*p)) p++; - - // Handle optional sign - negative = 0; - switch (*p) { - case '-': - negative = 1; // Fall through to increment position - case '+': - p++; - } - - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - - // Process string of digits - while (isdigit_ascii(*p)) { - number = number * 10. 
+ (*p - '0'); - p++; - num_digits++; - } - - // Process decimal part - if (*p == decimal) { - *maybe_int = 0; - p++; - - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } - - if (num_digits == 0) { - errno = ERANGE; - return 0.0; - } - - // Correct for sign - if (negative) number = -number; - - // Process an exponent string - if (toupper_ascii(*p) == toupper_ascii(sci)) { - *maybe_int = 0; - - // Handle optional sign - negative = 0; - switch (*++p) { - case '-': - negative = 1; // Fall through to increment pos - case '+': - p++; - } - - // Process string of digits - num_digits = 0; - n = 0; - while (isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } - - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) p--; - } - - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - errno = ERANGE; - return HUGE_VAL; - } - - // Scale the result - p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) { - if (n & 1) { - if (exponent < 0) - number /= p10; - else - number *= p10; - } - n >>= 1; - p10 *= p10; - } - - if (number == HUGE_VAL) { - errno = ERANGE; - } - - if (skip_trailing) { - // Skip trailing whitespace - while (isspace_ascii(*p)) p++; - } - - if (endptr) *endptr = p; - - return number; -} - #endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_ diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 712e12829a937..3a6c7ba07a444 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1544,7 +1544,7 @@ int main(int argc, char *argv[]) { const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; double xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error) { + char tsep, int skip_trailing, int *error, int *maybe_int) { double number; unsigned 
int i_number = 0; int exponent; @@ -1555,7 +1555,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, int num_digits; int num_decimals; - + if (maybe_int != NULL) *maybe_int = 1; // Skip leading whitespace. while (isspace_ascii(*p)) p++; @@ -1595,6 +1595,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process decimal part. if (*p == decimal) { + if (maybe_int != NULL) *maybe_int = 0; p++; while (isdigit_ascii(*p)) { @@ -1617,6 +1618,8 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process an exponent string. if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) *maybe_int = 0; + // Handle optional sign. negative = 0; switch (*++p) { @@ -1674,12 +1677,12 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (endptr) *endptr = p; - return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error) { +double precise_xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) { double number; int exponent; int negative; @@ -1688,6 +1691,8 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, int num_decimals; int max_digits = 17; int n; + + if (maybe_int != NULL) *maybe_int = 1; // Cache powers of 10 in memory. static double e[] = { 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, @@ -1754,6 +1759,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process decimal part if (*p == decimal) { + if (maybe_int != NULL) *maybe_int = 0; p++; while (num_digits < max_digits && isdigit_ascii(*p)) { @@ -1779,6 +1785,8 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process an exponent string. 
if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) *maybe_int = 0; + // Handle optional sign negative = 0; switch (*++p) { @@ -1832,8 +1840,11 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } double round_trip(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing) { + int skip_trailing, int *error, int *maybe_int) { double r = PyOS_string_to_double(p, q, 0); + if (maybe_int != NULL) *maybe_int = 0; + if (PyErr_Occurred() != NULL) *error = -1; + else if (r == Py_HUGE_VAL) *error = -1; /* overflow: callers only test error != 0; storing the double Py_HUGE_VAL in an int is undefined behaviour */ PyErr_Clear(); return r; } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 70d7e4a3aff40..2ee4281d2f349 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -260,11 +260,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *error); -double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *error); + int skip_trailing, int *error, int *maybe_int); +double precise_xstrtod(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int); double round_trip(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing); + int skip_trailing, int *error, int *maybe_int); int to_boolean(const char *item, uint8_t *val); #endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ diff --git a/setup.py b/setup.py index 1dca7fa77219f..2bf0b7eee22f3 100755 --- a/setup.py +++ b/setup.py @@ -558,7 +558,8 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): '_libs.lib': { 'pyxfile': '_libs/lib', 'include': common_include + ts_include, - 'depends': lib_depends + tseries_depends}, + 'depends': 
lib_depends + tseries_depends, + 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, '_libs.missing': { 'pyxfile': '_libs/missing', 'include': common_include + ts_include,