From 9d9b50562fb39a5bb85d3cb19ab6cecec0e6b06e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Jul 2018 10:46:36 -0700 Subject: [PATCH 1/3] Cleanup imports, implement bits of numpy_helper in util.pxd --- pandas/_libs/groupby.pyx | 13 ++--- pandas/_libs/index.pyx | 10 ++-- pandas/_libs/interval.pyx | 19 +++--- pandas/_libs/intervaltree.pxi.in | 3 - pandas/_libs/lib.pyx | 2 +- pandas/_libs/parsers.pyx | 4 +- pandas/_libs/src/numpy_helper.h | 18 ------ pandas/_libs/src/util.pxd | 27 ++++++++- pandas/_libs/tslib.pyx | 36 ++++++------ pandas/_libs/tslibs/offsets.pyx | 2 - pandas/_libs/tslibs/parsing.pyx | 22 +++---- pandas/_libs/tslibs/period.pyx | 8 +-- pandas/_libs/tslibs/resolution.pyx | 92 +++++++++++++++--------------- 13 files changed, 126 insertions(+), 130 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f5f9c06a7e4c2..5e4a431caca00 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,21 +1,20 @@ # -*- coding: utf-8 -*- # cython: profile=False -cimport numpy as cnp -import numpy as np - cimport cython +from cython cimport Py_ssize_t -cnp.import_array() +from libc.stdlib cimport malloc, free +import numpy as np from numpy cimport (ndarray, double_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float32_t, float64_t) -from libc.stdlib cimport malloc, free from util cimport numeric, get_nat + from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers @@ -74,8 +73,8 @@ cdef inline float64_t kth_smallest_c(float64_t* a, double_t x, t l = 0 - m = n -1 - while (l=0.27.3 + NPY_DATETIME, NPY_TIMEDELTA) cnp.import_array() -cdef extern from "numpy/arrayobject.h": - # These can be cimported directly from numpy in cython>=0.27.3 - cdef enum NPY_TYPES: - NPY_DATETIME - NPY_TIMEDELTA cimport util diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 4129132251682..b0d8ce9e4b237 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,17 +1,22 @@ -cimport numpy as cnp -import numpy as np +# -*- coding: utf-8 -*- +import numbers + +from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, + PyObject_RichCompare) -cimport util cimport cython -import cython +from cython cimport Py_ssize_t + +import numpy as np from numpy cimport ndarray + + +cimport util + from tslibs import Timestamp from tslibs.timezones cimport tz_compare -from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, - PyObject_RichCompare) -import numbers _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 9ed76242a95c3..8a369797d0308 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -13,9 +13,6 @@ import numpy as np cimport cython from cython cimport Py_ssize_t -cimport numpy as cnp -cnp.import_array() - from hashtable cimport Int64Vector, Int64VectorData diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 23aebc85e6300..172117f7d8059 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -753,4 +753,4 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, return result -include "inference.pyx" +include "src/inference.pyx" diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a24e2cdd99f6f..57355886a56a2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -924,7 +924,7 @@ cdef class TextReader: status = tokenize_nrows(self.parser, nrows) if self.parser.warn_msg != NULL: - print >> sys.stderr, self.parser.warn_msg + print(self.parser.warn_msg) free(self.parser.warn_msg) self.parser.warn_msg = NULL @@ -952,7 +952,7 @@ cdef class TextReader: status = tokenize_all_rows(self.parser) if self.parser.warn_msg != NULL: - print >> sys.stderr, self.parser.warn_msg + print(self.parser.warn_msg) free(self.parser.warn_msg) self.parser.warn_msg = NULL diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index 5cfa51dc8a0be..f409fec44890d 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -30,24 +30,6 @@ PANDAS_INLINE PyObject* get_value_1d(PyArrayObject* ap, Py_ssize_t i) { return PyArray_Scalar(item, PyArray_DESCR(ap), (PyObject*)ap); } -// returns ASCII or UTF8 (py3) view on python str -// python object owns memory, should not be freed -PANDAS_INLINE const char* get_c_string(PyObject* obj) { -#if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_AsUTF8(obj); -#else - return PyString_AsString(obj); -#endif -} - -PANDAS_INLINE PyObject* char_to_string(const char* data) { -#if PY_VERSION_HEX >= 0x03000000 - return PyUnicode_FromString(data); -#else - return PyString_FromString(data); -#endif -} - void set_array_not_contiguous(PyArrayObject* ao) { ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS); diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index a8cd78016665f..728eb63dc836c 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -4,6 +4,9 @@ cnp.import_array() cimport cpython from cpython cimport PyTypeObject +from cpython.string cimport PyString_FromString, PyString_AsString + +DEF PY3 = bytes != str cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil @@ -14,6 +17,8 @@ cdef extern from "Python.h": bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + char* PyUnicode_AsUTF8(object unicode) + object PyUnicode_FromString(const char* u) nogil cdef extern from "numpy/arrayobject.h": @@ -69,8 +74,6 @@ cdef extern from "numpy_helper.h": int assign_value_1d(ndarray, Py_ssize_t, object) except -1 cnp.int64_t get_nat() object get_value_1d(ndarray, Py_ssize_t) - char *get_c_string(object) except NULL - object char_to_string(char*) ctypedef fused numeric: cnp.int8_t @@ -101,6 +104,26 @@ cdef extern from "headers/stdint.h": enum: INT64_MIN +cdef inline const char* get_c_string(object obj) except NULL: + """ + returns ASCII or UTF8 (py3) view on python str + python object owns memory, should not be freed + """ + # TODO: this docstring is copied verbatim from version that was + # directly in numpy_helper.C; is it still accurate? + IF PY3: + return PyUnicode_AsUTF8(obj) + ELSE: + return PyString_AsString(obj) + + +cdef inline object char_to_string(const char* data): + IF PY3: + return PyUnicode_FromString(data) + ELSE: + return PyString_FromString(data) + + cdef inline object get_value_at(ndarray arr, object loc): cdef: Py_ssize_t i, sz diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 281e497945c5f..1d44af6b81992 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,24 +1,29 @@ # -*- coding: utf-8 -*- # cython: profile=False +cimport cython +from cython cimport Py_ssize_t + +from cpython cimport PyFloat_Check, PyUnicode_Check + +from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, + PyDateTime_CheckExact, + PyDateTime_IMPORT, + timedelta, datetime, date, time) +# import datetime C API +PyDateTime_IMPORT + cimport numpy as cnp from numpy cimport int64_t, ndarray, float64_t import numpy as np cnp.import_array() +import pytz -from cpython cimport PyFloat_Check, PyUnicode_Check from util cimport (is_integer_object, is_float_object, is_string_object, is_datetime64_object) -from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, - PyDateTime_CheckExact, - PyDateTime_IMPORT, - timedelta, datetime, date, time) -# import datetime C API -PyDateTime_IMPORT - from tslibs.np_datetime cimport (check_dts_bounds, pandas_datetimestruct, @@ -30,13 +35,6 @@ from tslibs.np_datetime import OutOfBoundsDatetime from tslibs.parsing import parse_datetime_string -cimport cython -from cython cimport Py_ssize_t - - -import pytz - - from tslibs.timedeltas cimport cast_from_unit from tslibs.timezones cimport (is_utc, is_tzlocal, is_fixed_offset, treat_tz_as_pytz, get_dst_info) @@ -54,7 +52,8 @@ from tslibs.timestamps cimport (create_timestamp_from_ts, _NS_UPPER_BOUND, _NS_LOWER_BOUND) from tslibs.timestamps import Timestamp -cdef bint PY2 = str == bytes + +DEF PY2 = str == bytes cdef inline object create_datetime_from_ts( @@ -556,8 +555,9 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue - if PyUnicode_Check(val) and PY2: - val = val.encode('utf-8') + if PY2: + if PyUnicode_Check(val): + val = val.encode('utf-8') try: _string_to_dts(val, &dts, &out_local, &out_tzoffset) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 27b7f03358a3a..094a37b210516 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -334,8 +334,6 @@ class _BaseOffset(object): # other is not a DateOffset object return False - return self._params == other._params - def __ne__(self, other): return not self == other diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index f5048d32e826b..580d155f87fa8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -10,7 +10,7 @@ cimport cython from cython cimport Py_ssize_t -from datetime import datetime +from cpython.datetime cimport datetime import time import numpy as np @@ -37,7 +37,7 @@ from dateutil.parser import DEFAULTPARSER from dateutil.parser import parse as du_parse from ccalendar import MONTH_NUMBERS -from nattype import nat_strings +from nattype import nat_strings, NaT # ---------------------------------------------------------------------- # Constants @@ -54,9 +54,6 @@ cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') cdef set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} -NAT_SENTINEL = object() -# This allows us to reference NaT without having to import it - # ---------------------------------------------------------------------- @@ -136,9 +133,6 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): res = parse_datetime_string_with_reso(arg, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst) - if res[0] is NAT_SENTINEL: - from pandas._libs.tslib import NaT - res = (NaT,) + res[1:] return res @@ -206,7 +200,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, # should be NaT??? if date_string in nat_strings: - return NAT_SENTINEL, NAT_SENTINEL, '' + return NaT, NaT, '' date_string = date_string.upper() date_len = len(date_string) @@ -407,7 +401,7 @@ def try_parse_dates(ndarray[object] values, parser=None, # EAFP here try: - for i from 0 <= i < n: + for i in range(n): if values[i] == '': result[i] = np.nan else: @@ -419,7 +413,7 @@ def try_parse_dates(ndarray[object] values, parser=None, parse_date = parser try: - for i from 0 <= i < n: + for i in range(n): if values[i] == '': result[i] = np.nan else: @@ -459,7 +453,7 @@ def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, else: parse_time = time_parser - for i from 0 <= i < n: + for i in range(n): d = parse_date(str(dates[i])) t = parse_time(str(times[i])) result[i] = datetime(d.year, d.month, d.day, @@ -479,7 +473,7 @@ def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, raise ValueError('Length of years/months/days must all be equal') result = np.empty(n, dtype='O') - for i from 0 <= i < n: + for i in range(n): result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) return result @@ -505,7 +499,7 @@ def try_parse_datetime_components(ndarray[object] years, raise ValueError('Length of all datetime components must be equal') result = np.empty(n, dtype='O') - for i from 0 <= i < n: + for i in range(n): float_secs = float(seconds[i]) secs = int(float_secs) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 0ec5d25beeeb9..2ce1008d0ffb3 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -46,14 +46,14 @@ from conversion cimport tz_convert_utc_to_tzlocal from frequencies cimport (get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str, get_rule_month) -from parsing import parse_time_string, NAT_SENTINEL +from parsing import parse_time_string from resolution import Resolution from nattype import nat_strings, NaT, iNaT from nattype cimport _nat_scalar_rules, NPY_NAT, is_null_datetimelike from offsets cimport to_offset from offsets import _Tick -cdef bint PY2 = str == bytes +DEF PY2 = str == bytes cdef extern from "period_helper.h": @@ -729,7 +729,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): result = result.replace(str_extra_fmts[i], repl) - if PY2: + IF PY2: result = result.decode('utf-8', 'ignore') return result @@ -1820,7 +1820,7 @@ class Period(_Period): value = str(value) value = value.upper() dt, _, reso = parse_time_string(value, freq) - if dt is NAT_SENTINEL: + if dt is NaT: ordinal = iNaT if freq is None: diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index a53d794b48cfa..b0ffb7e5c4fa5 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -367,6 +367,50 @@ cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): return result +cdef object month_position_check(fields, weekdays): + cdef: + int32_t daysinmonth, y, m, d + bint calendar_end = True + bint business_end = True + bint calendar_start = True + bint business_start = True + bint cal + int32_t[:] years + int32_t[:] months + int32_t[:] days + + years = fields['Y'] + months = fields['M'] + days = fields['D'] + + for y, m, d, wd in zip(years, months, days, weekdays): + if calendar_start: + calendar_start &= d == 1 + if business_start: + business_start &= d == 1 or (d <= 3 and wd == 0) + + if calendar_end or business_end: + daysinmonth = get_days_in_month(y, m) + cal = d == daysinmonth + if calendar_end: + calendar_end &= cal + if business_end: + business_end &= cal or (daysinmonth - d < 3 and wd == 4) + elif not calendar_start and not business_start: + break + + if calendar_end: + return 'ce' + elif business_end: + return 'be' + elif calendar_start: + return 'cs' + elif business_start: + return 'bs' + else: + return None + + cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 @@ -475,52 +519,8 @@ cdef class _FrequencyInferer(object): def rep_stamp(self): return Timestamp(self.values[0]) - cdef month_position_check(self): - # TODO: cythonize this, very slow - cdef: - int32_t daysinmonth, y, m, d - bint calendar_end = True - bint business_end = True - bint calendar_start = True - bint business_start = True - bint cal - int32_t[:] years - int32_t[:] months - int32_t[:] days - - fields = self.fields - years = fields['Y'] - months = fields['M'] - days = fields['D'] - weekdays = self.index.dayofweek - - for y, m, d, wd in zip(years, months, days, weekdays): - - if calendar_start: - calendar_start &= d == 1 - if business_start: - business_start &= d == 1 or (d <= 3 and wd == 0) - - if calendar_end or business_end: - daysinmonth = get_days_in_month(y, m) - cal = d == daysinmonth - if calendar_end: - calendar_end &= cal - if business_end: - business_end &= cal or (daysinmonth - d < 3 and wd == 4) - elif not calendar_start and not business_start: - break - - if calendar_end: - return 'ce' - elif business_end: - return 'be' - elif calendar_start: - return 'cs' - elif business_start: - return 'bs' - else: - return None + cdef object month_position_check(self): + return month_position_check(self.fields, self.index.dayofweek) @cache_readonly def mdiffs(self): From e81c5193e630055fbd220acff20a9af1eafbdf85 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Jul 2018 11:22:09 -0700 Subject: [PATCH 2/3] revert removal of import_array to fix segfault --- pandas/_libs/intervaltree.pxi.in | 3 +++ pandas/_libs/tslibs/resolution.pyx | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 8a369797d0308..9ed76242a95c3 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -13,6 +13,9 @@ import numpy as np cimport cython from cython cimport Py_ssize_t +cimport numpy as cnp +cnp.import_array() + from hashtable cimport Int64Vector, Int64VectorData diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index b0ffb7e5c4fa5..5f085ff135d93 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -339,10 +339,6 @@ class Resolution(object): # ---------------------------------------------------------------------- # Frequency Inference - -# TODO: this is non performant logic here (and duplicative) and this -# simply should call unique_1d directly -# plus no reason to depend on khash directly cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): cdef: Py_ssize_t i, n = len(arr) From 003babfa4ea299f4e46d380a2e1bf16948b44fca Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 12 Jul 2018 12:13:36 -0700 Subject: [PATCH 3/3] revert to ancient print notation since it broke tests --- pandas/_libs/parsers.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 57355886a56a2..a24e2cdd99f6f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -924,7 +924,7 @@ cdef class TextReader: status = tokenize_nrows(self.parser, nrows) if self.parser.warn_msg != NULL: - print(self.parser.warn_msg) + print >> sys.stderr, self.parser.warn_msg free(self.parser.warn_msg) self.parser.warn_msg = NULL @@ -952,7 +952,7 @@ cdef class TextReader: status = tokenize_all_rows(self.parser) if self.parser.warn_msg != NULL: - print(self.parser.warn_msg) + print >> sys.stderr, self.parser.warn_msg free(self.parser.warn_msg) self.parser.warn_msg = NULL