From 216cc820cabe0c729bae7a2c925d4bc5465c0d78 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 6 Aug 2018 10:08:46 -0700 Subject: [PATCH] merge inference into lib.pyx --- pandas/_libs/lib.pyx | 1567 ++++++++++++++++++++++++++++++- pandas/_libs/src/inference.pyx | 1566 ------------------------------ pandas/_libs/src/parse_helper.h | 1 + setup.py | 15 +- 4 files changed, 1541 insertions(+), 1608 deletions(-) delete mode 100644 pandas/_libs/src/inference.pyx diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4cc119a700ca0..ab7e480f39088 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,21 +1,12 @@ # -*- coding: utf-8 -*- -# cython: profile=False +from decimal import Decimal +import sys + cimport cython from cython cimport Py_ssize_t -import numpy as np -cimport numpy as cnp -from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, - PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, - flatiter, NPY_OBJECT, - int64_t, - float32_t, float64_t, - uint8_t, uint64_t, - complex128_t) -cnp.import_array() - from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, - PyList_Check, PyFloat_Check, + PyList_Check, PyString_Check, PyBytes_Check, PyUnicode_Check, @@ -28,13 +19,63 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT) PyDateTime_IMPORT -from tslib import NaT, array_to_datetime -from missing cimport checknull +import numpy as np +cimport numpy as cnp +from numpy cimport (ndarray, PyArray_NDIM, PyArray_GETITEM, + PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + flatiter, NPY_OBJECT, + int64_t, + float32_t, float64_t, + uint8_t, uint64_t, + complex128_t) +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + # cython's numpy.dtype specification is incorrect, which leads to + # errors in issubclass(self.dtype.type, np.bool_), so we directly + # include the correct version + # https://github.com/cython/cython/issues/2022 + + ctypedef class numpy.dtype [object PyArray_Descr]: + # Use PyDataType_* macros when possible, however there are no macros + # for accessing some of the fields, so some are defined. Please + # ask on cython-dev if you need more. + cdef int type_num + cdef int itemsize "elsize" + cdef char byteorder + cdef object fields + cdef tuple names + + +cdef extern from "src/parse_helper.h": + int floatify(object, double *result, int *maybe_int) except -1 cimport util +from util cimport (_checknull, + UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN) + +from tslib import array_to_datetime +from tslibs.nattype import NaT +from tslibs.conversion cimport convert_to_tsobject +from tslibs.timedeltas cimport convert_to_timedelta64 +from tslibs.timezones cimport get_timezone, tz_compare + +from missing cimport (checknull, + is_null_datetime64, is_null_timedelta64, is_null_period) + + +# constants that will be compared to potentially arbitrarily large +# python int +cdef object oINT64_MAX = INT64_MAX +cdef object oINT64_MIN = INT64_MIN +cdef object oUINT64_MAX = UINT64_MAX + cdef int64_t NPY_NAT = util.get_nat() -from util cimport is_array, _checknull +iNaT = util.get_nat() + +cdef bint PY2 = sys.version_info[0] == 2 +cdef double nan = np.NaN def values_from_object(object o): @@ -57,7 +98,7 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): cdef int64_t s = 0 n = len(arr) - for i from 0 <= i < n: + for i in range(n): s += arr[i].__sizeof__() return s @@ -138,10 +179,10 @@ def fast_unique_multiple(list arrays): dict table = {} object val, stub = 0 - for i from 0 <= i < k: + for i in range(k): buf = arrays[i] n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -165,10 +206,10 @@ def fast_unique_multiple_list(list lists, bint sort=True): dict table = {} object val, stub = 0 - for i from 0 <= i < k: + for i in range(k): buf = lists[i] n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -208,7 +249,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): for buf in gen: n = len(buf) - for j from 0 <= j < n: + for j in range(n): val = buf[j] if val not in table: table[val] = stub @@ -595,7 +636,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, ndarray[float64_t, ndim=2] out n, k = ( values).shape - assert(n == len(mask)) + assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=np.float64) @@ -617,7 +658,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, ndarray[object, ndim=2] out n, k = ( values).shape - assert(n == len(mask)) + assert (n == len(mask)) out = np.empty((mask.sum(), k), dtype=object) @@ -663,21 +704,21 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, Py_ssize_t i, j, k, n ndarray[int64_t, ndim=2] counts - assert(axis == 0 or axis == 1) + assert (axis == 0 or axis == 1) n, k = ( mask).shape if axis == 0: counts = np.zeros((max_bin, k), dtype='i8') with nogil: - for i from 0 <= i < n: - for j from 0 <= j < k: + for i in range(n): + for j in range(k): counts[labels[i], j] += mask[i, j] else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: - for i from 0 <= i < n: - for j from 0 <= j < k: + for i in range(n): + for j in range(k): counts[i, labels[j]] += mask[i, j] return counts @@ -753,4 +794,1470 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, return result -include "src/inference.pyx" +# core.common import for fast inference checks + +cpdef bint is_float(object obj): + return util.is_float_object(obj) + + +cpdef bint is_integer(object obj): + return util.is_integer_object(obj) + + +cpdef bint is_bool(object obj): + return util.is_bool_object(obj) + + +cpdef bint is_complex(object obj): + return util.is_complex_object(obj) + + +cpdef bint is_decimal(object obj): + return isinstance(obj, Decimal) + + +cpdef bint is_interval(object obj): + return getattr(obj, '_typ', '_typ') == 'interval' + + +cpdef bint is_period(object val): + """ Return a boolean if this is a Period object """ + return util.is_period_object(val) + + +cdef inline bint is_offset(object val): + return getattr(val, '_typ', '_typ') == 'dateoffset' + + +_TYPE_MAP = { + 'categorical': 'categorical', + 'category': 'categorical', + 'int8': 'integer', + 'int16': 'integer', + 'int32': 'integer', + 'int64': 'integer', + 'i': 'integer', + 'uint8': 'integer', + 'uint16': 'integer', + 'uint32': 'integer', + 'uint64': 'integer', + 'u': 'integer', + 'float32': 'floating', + 'float64': 'floating', + 'f': 'floating', + 'complex128': 'complex', + 'c': 'complex', + 'string': 'string' if PY2 else 'bytes', + 'S': 'string' if PY2 else 'bytes', + 'unicode': 'unicode' if PY2 else 'string', + 'U': 'unicode' if PY2 else 'string', + 'bool': 'boolean', + 'b': 'boolean', + 'datetime64[ns]': 'datetime64', + 'M': 'datetime64', + 'timedelta64[ns]': 'timedelta64', + 'm': 'timedelta64', +} + +# types only exist on certain platform +try: + np.float128 + _TYPE_MAP['float128'] = 'floating' +except AttributeError: + pass +try: + np.complex256 + _TYPE_MAP['complex256'] = 'complex' +except AttributeError: + pass +try: + np.float16 + _TYPE_MAP['float16'] = 'floating' +except AttributeError: + pass + + +cdef class Seen(object): + """ + Class for keeping track of the types of elements + encountered when trying to perform type conversions. + """ + + cdef: + bint int_ # seen_int + bint bool_ # seen_bool + bint null_ # seen_null + bint uint_ # seen_uint (unsigned integer) + bint sint_ # seen_sint (signed integer) + bint float_ # seen_float + bint object_ # seen_object + bint complex_ # seen_complex + bint datetime_ # seen_datetime + bint coerce_numeric # coerce data to numeric + bint timedelta_ # seen_timedelta + bint datetimetz_ # seen_datetimetz + + def __cinit__(self, bint coerce_numeric=0): + """ + Initialize a Seen instance. + + Parameters + ---------- + coerce_numeric : bint, default 0 + Whether or not to force conversion to a numeric data type if + initial methods to convert to numeric fail. + """ + self.int_ = 0 + self.bool_ = 0 + self.null_ = 0 + self.uint_ = 0 + self.sint_ = 0 + self.float_ = 0 + self.object_ = 0 + self.complex_ = 0 + self.datetime_ = 0 + self.timedelta_ = 0 + self.datetimetz_ = 0 + self.coerce_numeric = coerce_numeric + + cdef inline bint check_uint64_conflict(self) except -1: + """ + Check whether we can safely convert a uint64 array to a numeric dtype. + + There are two cases when conversion to numeric dtype with a uint64 + array is not safe (and will therefore not be performed) + + 1) A NaN element is encountered. + + uint64 cannot be safely cast to float64 due to truncation issues + at the extreme ends of the range. + + 2) A negative number is encountered. + + There is no numerical dtype that can hold both negative numbers + and numbers greater than INT64_MAX. Hence, at least one number + will be improperly cast if we convert to a numeric dtype. + + Returns + ------- + return_values : bool + Whether or not we should return the original input array to avoid + data truncation. + + Raises + ------ + ValueError : uint64 elements were detected, and at least one of the + two conflict cases was also detected. However, we are + trying to force conversion to a numeric dtype. + """ + return (self.uint_ and (self.null_ or self.sint_) + and not self.coerce_numeric) + + cdef inline saw_null(self): + """ + Set flags indicating that a null value was encountered. + """ + self.null_ = 1 + self.float_ = 1 + + cdef saw_int(self, object val): + """ + Set flags indicating that an integer value was encountered. + + In addition to setting a flag that an integer was seen, we + also set two flags depending on the type of integer seen: + + 1) sint_ : a negative (signed) number in the + range of [-2**63, 0) was encountered + 2) uint_ : a positive number in the range of + [2**63, 2**64) was encountered + + Parameters + ---------- + val : Python int + Value with which to set the flags. + """ + self.int_ = 1 + self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) + self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) + + @property + def numeric_(self): + return self.complex_ or self.float_ or self.int_ + + @property + def is_bool(self): + return not (self.datetime_ or self.numeric_ or self.timedelta_) + + @property + def is_float_or_complex(self): + return not (self.bool_ or self.datetime_ or self.timedelta_) + + +cdef _try_infer_map(v): + """ if its in our map, just return the dtype """ + cdef: + object attr, val + for attr in ['name', 'kind', 'base']: + val = getattr(v.dtype, attr) + if val in _TYPE_MAP: + return _TYPE_MAP[val] + return None + + +def infer_dtype(object value, bint skipna=False): + """ + Efficiently infer the type of a passed val, or list-like + array of values. Return a string describing the type. + + Parameters + ---------- + value : scalar, list, ndarray, or pandas type + skipna : bool, default False + Ignore NaN values when inferring the type. The default of ``False`` + will be deprecated in a later version of pandas. + + .. versionadded:: 0.21.0 + + Returns + ------- + string describing the common type of the input data. + Results can include: + + - string + - unicode + - bytes + - floating + - integer + - mixed-integer + - mixed-integer-float + - decimal + - complex + - categorical + - boolean + - datetime64 + - datetime + - date + - timedelta64 + - timedelta + - time + - period + - mixed + + Raises + ------ + TypeError if ndarray-like but cannot infer the dtype + + Notes + ----- + - 'mixed' is the catchall for anything that is not otherwise + specialized + - 'mixed-integer-float' are floats and integers + - 'mixed-integer' are integers mixed with non-integers + + Examples + -------- + >>> infer_dtype(['foo', 'bar']) + 'string' + + >>> infer_dtype(['a', np.nan, 'b'], skipna=True) + 'string' + + >>> infer_dtype(['a', np.nan, 'b'], skipna=False) + 'mixed' + + >>> infer_dtype([b'foo', b'bar']) + 'bytes' + + >>> infer_dtype([1, 2, 3]) + 'integer' + + >>> infer_dtype([1, 2, 3.5]) + 'mixed-integer-float' + + >>> infer_dtype([1.0, 2.0, 3.5]) + 'floating' + + >>> infer_dtype(['a', 1]) + 'mixed-integer' + + >>> infer_dtype([Decimal(1), Decimal(2.0)]) + 'decimal' + + >>> infer_dtype([True, False]) + 'boolean' + + >>> infer_dtype([True, False, np.nan]) + 'mixed' + + >>> infer_dtype([pd.Timestamp('20130101')]) + 'datetime' + + >>> infer_dtype([datetime.date(2013, 1, 1)]) + 'date' + + >>> infer_dtype([np.datetime64('2013-01-01')]) + 'datetime64' + + >>> infer_dtype([datetime.timedelta(0, 1, 1)]) + 'timedelta' + + >>> infer_dtype(pd.Series(list('aabc')).astype('category')) + 'categorical' + """ + cdef: + Py_ssize_t i, n + object val + ndarray values + bint seen_pdnat = False + bint seen_val = False + + if util.is_array(value): + values = value + elif hasattr(value, 'dtype'): + # this will handle ndarray-like + # e.g. categoricals + try: + values = getattr(value, '_values', getattr(value, 'values', value)) + except: + value = _try_infer_map(value) + if value is not None: + return value + + # its ndarray like but we can't handle + raise ValueError("cannot infer type for {typ}" + .format(typ=type(value))) + + else: + if not PyList_Check(value): + value = list(value) + from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike) + values = construct_1d_object_array_from_listlike(value) + + values = getattr(values, 'values', values) + val = _try_infer_map(values) + if val is not None: + return val + + if values.dtype != np.object_: + values = values.astype('O') + + # make contiguous + values = values.ravel() + + n = len(values) + if n == 0: + return 'empty' + + # try to use a valid value + for i in range(n): + val = util.get_value_1d(values, i) + + # do not use is_nul_datetimelike to keep + # np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(val): + pass + elif val is NaT: + seen_pdnat = True + else: + seen_val = True + break + + # if all values are nan/NaT + if seen_val is False and seen_pdnat is True: + return 'datetime' + # float/object nan is handled in latter logic + + if util.is_datetime64_object(val): + if is_datetime64_array(values): + return 'datetime64' + elif is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + elif is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + elif util.is_integer_object(val): + # a timedelta will show true here as well + if is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + if is_integer_array(values): + return 'integer' + elif is_integer_float_array(values): + return 'mixed-integer-float' + elif is_timedelta_or_timedelta64_array(values): + return 'timedelta' + return 'mixed-integer' + + elif PyDateTime_Check(val): + if is_datetime_array(values): + return 'datetime' + + elif PyDate_Check(val): + if is_date_array(values, skipna=skipna): + return 'date' + + elif PyTime_Check(val): + if is_time_array(values, skipna=skipna): + return 'time' + + elif is_decimal(val): + return 'decimal' + + elif util.is_float_object(val): + if is_float_array(values): + return 'floating' + elif is_integer_float_array(values): + return 'mixed-integer-float' + + elif util.is_bool_object(val): + if is_bool_array(values, skipna=skipna): + return 'boolean' + + elif PyString_Check(val): + if is_string_array(values, skipna=skipna): + return 'string' + + elif PyUnicode_Check(val): + if is_unicode_array(values, skipna=skipna): + return 'unicode' + + elif PyBytes_Check(val): + if is_bytes_array(values, skipna=skipna): + return 'bytes' + + elif is_period(val): + if is_period_array(values): + return 'period' + + elif is_interval(val): + if is_interval_array(values): + return 'interval' + + for i in range(n): + val = util.get_value_1d(values, i) + if (util.is_integer_object(val) and + not util.is_timedelta64_object(val) and + not util.is_datetime64_object(val)): + return 'mixed-integer' + + return 'mixed' + + +cpdef object infer_datetimelike_array(object arr): + """ + infer if we have a datetime or timedelta array + - date: we have *only* date and maybe strings, nulls + - datetime: we have *only* datetimes and maybe strings, nulls + - timedelta: we have *only* timedeltas and maybe strings, nulls + - nat: we do not have *any* date, datetimes or timedeltas, but do have + at least a NaT + - mixed: other objects (strings, a mix of tz-aware and tz-naive, or + actual objects) + + Parameters + ---------- + arr : object array + + Returns + ------- + string: {datetime, timedelta, date, nat, mixed} + """ + + cdef: + Py_ssize_t i, n = len(arr) + bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 + bint seen_tz_aware = 0, seen_tz_naive = 0 + bint seen_nat = 0 + list objs = [] + object v + + for i in range(n): + v = arr[i] + if util.is_string_object(v): + objs.append(v) + + if len(objs) == 3: + break + + elif util._checknull(v): + # nan or None + pass + elif v is NaT: + seen_nat = 1 + elif PyDateTime_Check(v): + # datetime + seen_datetime = 1 + + # disambiguate between tz-naive and tz-aware + if v.tzinfo is None: + seen_tz_naive = 1 + else: + seen_tz_aware = 1 + + if seen_tz_naive and seen_tz_aware: + return 'mixed' + elif util.is_datetime64_object(v): + # np.datetime64 + seen_datetime = 1 + elif PyDate_Check(v): + seen_date = 1 + elif is_timedelta(v) or util.is_timedelta64_object(v): + # timedelta, or timedelta64 + seen_timedelta = 1 + else: + return 'mixed' + + if seen_date and not (seen_datetime or seen_timedelta): + return 'date' + elif seen_datetime and not seen_timedelta: + return 'datetime' + elif seen_timedelta and not seen_datetime: + return 'timedelta' + elif seen_nat: + return 'nat' + + # short-circuit by trying to + # actually convert these strings + # this is for performance as we don't need to try + # convert *every* string array + if len(objs): + try: + array_to_datetime(objs, errors='raise') + return 'datetime' + except: + pass + + # we are *not* going to infer from strings + # for timedelta as too much ambiguity + + return 'mixed' + + +cdef inline bint is_timedelta(object o): + return PyDelta_Check(o) or util.is_timedelta64_object(o) + + +cdef class Validator: + + cdef: + Py_ssize_t n + dtype dtype + bint skipna + + def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + bint skipna=False): + self.n = n + self.dtype = dtype + self.skipna = skipna + + cdef bint validate(self, ndarray values) except -1: + if not self.n: + return False + + if self.is_array_typed(): + return True + elif self.dtype.type_num == NPY_OBJECT: + if self.skipna: + return self._validate_skipna(values) + else: + return self._validate(values) + else: + return False + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef bint _validate(self, ndarray values) except -1: + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid(values[i]): + return False + + return self.finalize_validate() + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef bint _validate_skipna(self, ndarray values) except -1: + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid_skipna(values[i]): + return False + + return self.finalize_validate_skipna() + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) + + cdef bint is_valid_skipna(self, object value) except -1: + return self.is_valid(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError( + '{typ} child class must define is_value_typed' + .format(typ=type(self).__name__)) + + cdef bint is_valid_null(self, object value) except -1: + return util._checknull(value) + + cdef bint is_array_typed(self) except -1: + return False + + cdef inline bint finalize_validate(self): + return True + + cdef bint finalize_validate_skipna(self): + # TODO(phillipc): Remove the existing validate methods and replace them + # with the skipna versions upon full deprecation of skipna=False + return True + + +cdef class BoolValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_bool_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bool_) + + +cpdef bint is_bool_array(ndarray values, bint skipna=False): + cdef: + BoolValidator validator = BoolValidator(len(values), + values.dtype, + skipna=skipna) + return validator.validate(values) + + +cdef class IntegerValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) + + +cpdef bint is_integer_array(ndarray values): + cdef: + IntegerValidator validator = IntegerValidator(len(values), + values.dtype) + return validator.validate(values) + + +cdef class IntegerFloatValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) or util.is_float_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) + + +cdef bint is_integer_float_array(ndarray values): + cdef: + IntegerFloatValidator validator = IntegerFloatValidator(len(values), + values.dtype) + return validator.validate(values) + + +cdef class FloatValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_float_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.floating) + + +cpdef bint is_float_array(ndarray values): + cdef: + FloatValidator validator = FloatValidator(len(values), values.dtype) + return validator.validate(values) + + +cdef class StringValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return PyString_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.str_) + + +cpdef bint is_string_array(ndarray values, bint skipna=False): + cdef: + StringValidator validator = StringValidator(len(values), + values.dtype, + skipna=skipna) + return validator.validate(values) + + +cdef class UnicodeValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return PyUnicode_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.unicode_) + + +cdef bint is_unicode_array(ndarray values, bint skipna=False): + cdef: + UnicodeValidator validator = UnicodeValidator(len(values), + values.dtype, + skipna=skipna) + return validator.validate(values) + + +cdef class BytesValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return PyBytes_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bytes_) + + +cdef bint is_bytes_array(ndarray values, bint skipna=False): + cdef: + BytesValidator validator = BytesValidator(len(values), values.dtype, + skipna=skipna) + return validator.validate(values) + + +cdef class TemporalValidator(Validator): + cdef: + Py_ssize_t generic_null_count + + def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + bint skipna=False): + self.n = n + self.dtype = dtype + self.skipna = skipna + self.generic_null_count = 0 + + cdef inline bint is_valid(self, object value) except -1: + return self.is_value_typed(value) or self.is_valid_null(value) + + cdef bint is_valid_null(self, object value) except -1: + raise NotImplementedError( + '{typ} child class must define is_valid_null' + .format(typ=type(self).__name__)) + + cdef inline bint is_valid_skipna(self, object value) except -1: + cdef: + bint is_typed_null = self.is_valid_null(value) + bint is_generic_null = util._checknull(value) + self.generic_null_count += is_typed_null and is_generic_null + return self.is_value_typed(value) or is_typed_null or is_generic_null + + cdef inline bint finalize_validate_skipna(self): + return self.generic_null_count != self.n + + +cdef class DatetimeValidator(TemporalValidator): + cdef bint is_value_typed(self, object value) except -1: + return PyDateTime_Check(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_datetime64(value) + + +cpdef bint is_datetime_array(ndarray values): + cdef: + DatetimeValidator validator = DatetimeValidator(len(values), + skipna=True) + return validator.validate(values) + + +cdef class Datetime64Validator(DatetimeValidator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_datetime64_object(value) + + +cpdef bint is_datetime64_array(ndarray values): + cdef: + Datetime64Validator validator = Datetime64Validator(len(values), + skipna=True) + return validator.validate(values) + + +cpdef bint is_datetime_with_singletz_array(ndarray values): + """ + Check values have the same tzinfo attribute. + Doesn't check values are datetime-like types. + """ + cdef: + Py_ssize_t i, j, n = len(values) + object base_val, base_tz, val, tz + + if n == 0: + return False + + for i in range(n): + base_val = values[i] + if base_val is not NaT: + base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) + + for j in range(i, n): + val = values[j] + if val is not NaT: + tz = getattr(val, 'tzinfo', None) + if not tz_compare(base_tz, tz): + return False + break + + return True + + +cdef class TimedeltaValidator(TemporalValidator): + cdef bint is_value_typed(self, object value) except -1: + return PyDelta_Check(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_timedelta64(value) + + +cpdef bint is_timedelta_array(ndarray values): + cdef: + TimedeltaValidator validator = TimedeltaValidator(len(values), + skipna=True) + return validator.validate(values) + + +cdef class Timedelta64Validator(TimedeltaValidator): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_timedelta64_object(value) + + +cpdef bint is_timedelta64_array(ndarray values): + cdef: + Timedelta64Validator validator = Timedelta64Validator(len(values), + skipna=True) + return validator.validate(values) + + +cdef class AnyTimedeltaValidator(TimedeltaValidator): + cdef inline bint is_value_typed(self, object value) except -1: + return is_timedelta(value) + + +cpdef bint is_timedelta_or_timedelta64_array(ndarray values): + """ infer with timedeltas and/or nat/none """ + cdef: + AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), + skipna=True) + return validator.validate(values) + + +cdef class DateValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return PyDate_Check(value) + + +cpdef bint is_date_array(ndarray values, bint skipna=False): + cdef: + DateValidator validator = DateValidator(len(values), skipna=skipna) + return validator.validate(values) + + +cdef class TimeValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return PyTime_Check(value) + + +cpdef bint is_time_array(ndarray values, bint skipna=False): + cdef: + TimeValidator validator = TimeValidator(len(values), skipna=skipna) + return validator.validate(values) + + +cdef class PeriodValidator(TemporalValidator): + cdef inline bint is_value_typed(self, object value) except -1: + return is_period(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_period(value) + + +cpdef bint is_period_array(ndarray values): + cdef: + PeriodValidator validator = PeriodValidator(len(values), skipna=True) + return validator.validate(values) + + +cdef class IntervalValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return is_interval(value) + + +cpdef bint is_interval_array(ndarray values): + cdef: + IntervalValidator validator = IntervalValidator(len(values), + skipna=True) + return validator.validate(values) + + +@cython.boundscheck(False) +@cython.wraparound(False) +def maybe_convert_numeric(ndarray[object] values, set na_values, + bint convert_empty=True, bint coerce_numeric=False): + """ + Convert object array to a numeric array if possible. + + Parameters + ---------- + values : ndarray + Array of object elements to convert. + na_values : set + Set of values that should be interpreted as NaN. + convert_empty : bool, default True + If an empty array-like object is encountered, whether to interpret + that element as NaN or not. If set to False, a ValueError will be + raised if such an element is encountered and 'coerce_numeric' is False. + coerce_numeric : bool, default False + If initial attempts to convert to numeric have failed, whether to + force conversion to numeric via alternative methods or by setting the + element to NaN. Otherwise, an Exception will be raised when such an + element is encountered. + + This boolean also has an impact on how conversion behaves when a + numeric array has no suitable numerical dtype to return (i.e. uint64, + int32, uint8). If set to False, the original object array will be + returned. Otherwise, a ValueError will be raised. + + Returns + ------- + numeric_array : array of converted object values to numerical ones + """ + + if len(values) == 0: + return np.array([], dtype='i8') + + # fastpath for ints - try to convert all based on first value + cdef: + object val = values[0] + + if util.is_integer_object(val): + try: + maybe_ints = values.astype('i8') + if (maybe_ints == values).all(): + return maybe_ints + except (ValueError, OverflowError, TypeError): + pass + + # otherwise, iterate and do full infererence + cdef: + int status, maybe_int + Py_ssize_t i, n = values.size + Seen seen = Seen(coerce_numeric) + ndarray[float64_t] floats = np.empty(n, dtype='f8') + ndarray[complex128_t] complexes = np.empty(n, dtype='c16') + ndarray[int64_t] ints = np.empty(n, dtype='i8') + ndarray[uint64_t] uints = np.empty(n, dtype='u8') + ndarray[uint8_t] bools = np.empty(n, dtype='u1') + float64_t fval + + for i in range(n): + val = values[i] + + if val.__hash__ is not None and val in na_values: + seen.saw_null() + floats[i] = complexes[i] = nan + elif util.is_float_object(val): + fval = val + if fval != fval: + seen.null_ = True + + floats[i] = complexes[i] = fval + seen.float_ = True + elif util.is_integer_object(val): + floats[i] = complexes[i] = val + + val = int(val) + seen.saw_int(val) + + if val >= 0: + if val <= oUINT64_MAX: + uints[i] = val + else: + seen.float_ = True + + if val <= oINT64_MAX: + ints[i] = val + + if seen.sint_ and seen.uint_: + seen.float_ = True + + elif util.is_bool_object(val): + floats[i] = uints[i] = ints[i] = bools[i] = val + seen.bool_ = True + elif val is None: + seen.saw_null() + floats[i] = complexes[i] = nan + elif hasattr(val, '__len__') and len(val) == 0: + if convert_empty or seen.coerce_numeric: + seen.saw_null() + floats[i] = complexes[i] = nan + else: + raise ValueError('Empty string encountered') + elif util.is_complex_object(val): + complexes[i] = val + seen.complex_ = True + elif is_decimal(val): + floats[i] = complexes[i] = val + seen.float_ = True + else: + try: + status = floatify(val, &fval, &maybe_int) + + if fval in na_values: + seen.saw_null() + floats[i] = complexes[i] = nan + else: + if fval != fval: + seen.null_ = True + + floats[i] = fval + + if maybe_int: + as_int = int(val) + + if as_int in na_values: + seen.saw_null() + else: + seen.saw_int(as_int) + + if not (seen.float_ or as_int in na_values): + if as_int < oINT64_MIN or as_int > oUINT64_MAX: + raise ValueError('Integer out of range.') + + if as_int >= 0: + uints[i] = as_int + if as_int <= oINT64_MAX: + ints[i] = as_int + + seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) + else: + seen.float_ = True + except (TypeError, ValueError) as e: + if not seen.coerce_numeric: + raise type(e)(str(e) + ' at position {pos}'.format(pos=i)) + elif "uint64" in str(e): # Exception from check functions. + raise + seen.saw_null() + floats[i] = nan + + if seen.check_uint64_conflict(): + return values + + if seen.complex_: + return complexes + elif seen.float_: + return floats + elif seen.int_: + if seen.uint_: + return uints + else: + return ints + elif seen.bool_: + return bools.view(np.bool_) + elif seen.uint_: + return uints + return ints + + +@cython.boundscheck(False) +@cython.wraparound(False) +def maybe_convert_objects(ndarray[object] objects, bint try_float=0, + bint safe=0, bint convert_datetime=0, + bint convert_timedelta=0): + """ + Type inference function-- convert object array to proper dtype + """ + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[complex128_t] complexes + ndarray[int64_t] ints + ndarray[uint64_t] uints + ndarray[uint8_t] bools + ndarray[int64_t] idatetimes + ndarray[int64_t] itimedeltas + Seen seen = Seen() + object val + float64_t fval, fnan + + n = len(objects) + + floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') + ints = np.empty(n, dtype='i8') + uints = np.empty(n, dtype='u8') + bools = np.empty(n, dtype=np.uint8) + + if convert_datetime: + datetimes = np.empty(n, dtype='M8[ns]') + idatetimes = datetimes.view(np.int64) + + if convert_timedelta: + timedeltas = np.empty(n, dtype='m8[ns]') + itimedeltas = timedeltas.view(np.int64) + + fnan = np.nan + + for i in range(n): + val = objects[i] + + if val is None: + seen.null_ = 1 + floats[i] = complexes[i] = fnan + elif val is NaT: + if convert_datetime: + idatetimes[i] = iNaT + seen.datetime_ = 1 + if convert_timedelta: + itimedeltas[i] = iNaT + seen.timedelta_ = 1 + if not (convert_datetime or convert_timedelta): + seen.object_ = 1 + elif util.is_bool_object(val): + seen.bool_ = 1 + bools[i] = val + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen.float_ = 1 + elif util.is_datetime64_object(val): + if convert_datetime: + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value + seen.datetime_ = 1 + else: + seen.object_ = 1 + break + elif is_timedelta(val): + if convert_timedelta: + itimedeltas[i] = convert_to_timedelta64(val, 'ns') + seen.timedelta_ = 1 + else: + seen.object_ = 1 + break + elif util.is_integer_object(val): + seen.int_ = 1 + floats[i] = val + complexes[i] = val + if not seen.null_: + seen.saw_int(int(val)) + + if ((seen.uint_ and seen.sint_) or + val > oUINT64_MAX or val < oINT64_MIN): + seen.object_ = 1 + break + + if seen.uint_: + uints[i] = val + elif seen.sint_: + ints[i] = val + else: + uints[i] = val + ints[i] = val + + elif util.is_complex_object(val): + complexes[i] = val + seen.complex_ = 1 + elif PyDateTime_Check(val) or util.is_datetime64_object(val): + + # if we have an tz's attached then return the objects + if convert_datetime: + if getattr(val, 'tzinfo', None) is not None: + seen.datetimetz_ = 1 + break + else: + seen.datetime_ = 1 + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value + else: + seen.object_ = 1 + break + elif try_float and not util.is_string_object(val): + # this will convert Decimal objects + try: + floats[i] = float(val) + complexes[i] = complex(val) + seen.float_ = 1 + except Exception: + seen.object_ = 1 + break + else: + seen.object_ = 1 + break + + # we try to coerce datetime w/tz but must all have the same tz + if seen.datetimetz_: + if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: + from pandas import DatetimeIndex + return DatetimeIndex(objects) + seen.object_ = 1 + + if not seen.object_: + if not safe: + if seen.null_: + if seen.is_float_or_complex: + if seen.complex_: + return complexes + elif seen.float_ or seen.int_: + return floats + else: + if not seen.bool_: + if seen.datetime_: + if not seen.numeric_: + return datetimes + elif seen.timedelta_: + if not seen.numeric_: + return timedeltas + else: + if seen.complex_: + return complexes + elif seen.float_: + return floats + elif seen.int_: + if seen.uint_: + return uints + else: + return ints + elif seen.is_bool: + return bools.view(np.bool_) + + else: + # don't cast int to float, etc. + if seen.null_: + if seen.is_float_or_complex: + if seen.complex_: + if not seen.int_: + return complexes + elif seen.float_: + if not seen.int_: + return floats + else: + if not seen.bool_: + if seen.datetime_: + if not seen.numeric_: + return datetimes + elif seen.timedelta_: + if not seen.numeric_: + return timedeltas + else: + if seen.complex_: + if not seen.int_: + return complexes + elif seen.float_: + if not seen.int_: + return floats + elif seen.int_: + if seen.uint_: + return uints + else: + return ints + elif seen.is_bool: + return bools.view(np.bool_) + + return objects + + +def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, + bint convert=1): + """ + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + """ + cdef: + Py_ssize_t i, n + ndarray[object] result + object val + + n = len(arr) + result = np.empty(n, dtype=object) + for i in range(n): + if mask[i]: + val = util.get_value_at(arr, i) + else: + val = f(util.get_value_at(arr, i)) + + if util.is_array(val) and PyArray_NDIM(val) == 0: + # unbox 0-dim arrays, GH#690 + # TODO: is there a faster way to unbox? + # item_from_zerodim? + val = val.item() + + result[i] = val + + if convert: + return maybe_convert_objects(result, + try_float=0, + convert_datetime=0, + convert_timedelta=0) + + return result + + +def map_infer(ndarray arr, object f, bint convert=1): + """ + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + """ + cdef: + Py_ssize_t i, n + ndarray[object] result + object val + + n = len(arr) + result = np.empty(n, dtype=object) + for i in range(n): + val = f(util.get_value_at(arr, i)) + + if util.is_array(val) and PyArray_NDIM(val) == 0: + # unbox 0-dim arrays, GH#690 + # TODO: is there a faster way to unbox? + # item_from_zerodim? + val = val.item() + + result[i] = val + + if convert: + return maybe_convert_objects(result, + try_float=0, + convert_datetime=0, + convert_timedelta=0) + + return result + + +def to_object_array(list rows, int min_width=0): + """ + Convert a list of lists into an object array. + + Parameters + ---------- + rows : 2-d array (N, K) + A list of lists to be converted into an array + min_width : int + The minimum width of the object array. If a list + in `rows` contains fewer than `width` elements, + the remaining elements in the corresponding row + will all be `NaN`. + + Returns + ------- + obj_array : numpy array of the object dtype + """ + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + list row + + n = len(rows) + + k = min_width + for i in range(n): + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i in range(n): + row = rows[i] + + for j in range(len(row)): + result[i, j] = row[j] + + return result + + +def tuples_to_object_array(ndarray[object] tuples): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple tup + + n = len(tuples) + k = len(tuples[0]) + result = np.empty((n, k), dtype=object) + for i in range(n): + tup = tuples[i] + for j in range(k): + result[i, j] = tup[j] + + return result + + +def to_object_array_tuples(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple row + + n = len(rows) + + k = 0 + for i in range(n): + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + try: + for i in range(n): + row = rows[i] + for j in range(len(row)): + result[i, j] = row[j] + except Exception: + # upcast any subclasses to tuple + for i in range(n): + row = tuple(rows[i]) + for j in range(len(row)): + result[i, j] = row[j] + + return result + + +def fast_multiget(dict mapping, ndarray keys, default=np.nan): + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + keys = getattr(keys, 'values', keys) + + for i in range(n): + val = util.get_value_1d(keys, i) + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx deleted file mode 100644 index 8cf69057a7e74..0000000000000 --- a/pandas/_libs/src/inference.pyx +++ /dev/null @@ -1,1566 +0,0 @@ -import sys -from decimal import Decimal -cimport util -cimport cython -from tslibs.nattype import NaT -from tslibs.conversion cimport convert_to_tsobject -from tslibs.timedeltas cimport convert_to_timedelta64 -from tslibs.timezones cimport get_timezone, tz_compare - -iNaT = util.get_nat() - -cdef bint PY2 = sys.version_info[0] == 2 -cdef double nan = np.NaN - -cdef extern from "numpy/arrayobject.h": - # cython's numpy.dtype specification is incorrect, which leads to - # errors in issubclass(self.dtype.type, np.bool_), so we directly - # include the correct version - # https://github.com/cython/cython/issues/2022 - - ctypedef class numpy.dtype [object PyArray_Descr]: - # Use PyDataType_* macros when possible, however there are no macros - # for accessing some of the fields, so some are defined. Please - # ask on cython-dev if you need more. - cdef int type_num - cdef int itemsize "elsize" - cdef char byteorder - cdef object fields - cdef tuple names - -from missing cimport is_null_datetime64, is_null_timedelta64, is_null_period - -from util cimport UINT8_MAX, UINT64_MAX, INT64_MAX, INT64_MIN - -# core.common import for fast inference checks - -cpdef bint is_float(object obj): - return util.is_float_object(obj) - - -cpdef bint is_integer(object obj): - return util.is_integer_object(obj) - - -cpdef bint is_bool(object obj): - return util.is_bool_object(obj) - - -cpdef bint is_complex(object obj): - return util.is_complex_object(obj) - - -cpdef bint is_decimal(object obj): - return isinstance(obj, Decimal) - - -cpdef bint is_interval(object obj): - return getattr(obj, '_typ', '_typ') == 'interval' - - -cpdef bint is_period(object val): - """ Return a boolean if this is a Period object """ - return util.is_period_object(val) - -cdef inline bint is_offset(object val): - return getattr(val, '_typ', '_typ') == 'dateoffset' - -_TYPE_MAP = { - 'categorical': 'categorical', - 'category': 'categorical', - 'int8': 'integer', - 'int16': 'integer', - 'int32': 'integer', - 'int64': 'integer', - 'i': 'integer', - 'uint8': 'integer', - 'uint16': 'integer', - 'uint32': 'integer', - 'uint64': 'integer', - 'u': 'integer', - 'float32': 'floating', - 'float64': 'floating', - 'f': 'floating', - 'complex128': 'complex', - 'c': 'complex', - 'string': 'string' if PY2 else 'bytes', - 'S': 'string' if PY2 else 'bytes', - 'unicode': 'unicode' if PY2 else 'string', - 'U': 'unicode' if PY2 else 'string', - 'bool': 'boolean', - 'b': 'boolean', - 'datetime64[ns]': 'datetime64', - 'M': 'datetime64', - 'timedelta64[ns]': 'timedelta64', - 'm': 'timedelta64', -} - -# types only exist on certain platform -try: - np.float128 - _TYPE_MAP['float128'] = 'floating' -except AttributeError: - pass -try: - np.complex256 - _TYPE_MAP['complex256'] = 'complex' -except AttributeError: - pass -try: - np.float16 - _TYPE_MAP['float16'] = 'floating' -except AttributeError: - pass - - -cdef class Seen(object): - """ - Class for keeping track of the types of elements - encountered when trying to perform type conversions. - """ - - cdef: - bint int_ # seen_int - bint bool_ # seen_bool - bint null_ # seen_null - bint uint_ # seen_uint (unsigned integer) - bint sint_ # seen_sint (signed integer) - bint float_ # seen_float - bint object_ # seen_object - bint complex_ # seen_complex - bint datetime_ # seen_datetime - bint coerce_numeric # coerce data to numeric - bint timedelta_ # seen_timedelta - bint datetimetz_ # seen_datetimetz - - def __cinit__(self, bint coerce_numeric=0): - """ - Initialize a Seen instance. - - Parameters - ---------- - coerce_numeric : bint, default 0 - Whether or not to force conversion to a numeric data type if - initial methods to convert to numeric fail. - """ - self.int_ = 0 - self.bool_ = 0 - self.null_ = 0 - self.uint_ = 0 - self.sint_ = 0 - self.float_ = 0 - self.object_ = 0 - self.complex_ = 0 - self.datetime_ = 0 - self.timedelta_ = 0 - self.datetimetz_ = 0 - self.coerce_numeric = coerce_numeric - - cdef inline bint check_uint64_conflict(self) except -1: - """ - Check whether we can safely convert a uint64 array to a numeric dtype. - - There are two cases when conversion to numeric dtype with a uint64 - array is not safe (and will therefore not be performed) - - 1) A NaN element is encountered. - - uint64 cannot be safely cast to float64 due to truncation issues - at the extreme ends of the range. - - 2) A negative number is encountered. - - There is no numerical dtype that can hold both negative numbers - and numbers greater than INT64_MAX. Hence, at least one number - will be improperly cast if we convert to a numeric dtype. - - Returns - ------- - return_values : bool - Whether or not we should return the original input array to avoid - data truncation. - - Raises - ------ - ValueError : uint64 elements were detected, and at least one of the - two conflict cases was also detected. However, we are - trying to force conversion to a numeric dtype. - """ - return (self.uint_ and (self.null_ or self.sint_) - and not self.coerce_numeric) - - cdef inline saw_null(self): - """ - Set flags indicating that a null value was encountered. - """ - self.null_ = 1 - self.float_ = 1 - - cdef saw_int(self, object val): - """ - Set flags indicating that an integer value was encountered. - - In addition to setting a flag that an integer was seen, we - also set two flags depending on the type of integer seen: - - 1) sint_ : a negative (signed) number in the - range of [-2**63, 0) was encountered - 2) uint_ : a positive number in the range of - [2**63, 2**64) was encountered - - Parameters - ---------- - val : Python int - Value with which to set the flags. - """ - self.int_ = 1 - self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) - self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) - - @property - def numeric_(self): - return self.complex_ or self.float_ or self.int_ - - @property - def is_bool(self): - return not (self.datetime_ or self.numeric_ or self.timedelta_) - - @property - def is_float_or_complex(self): - return not (self.bool_ or self.datetime_ or self.timedelta_) - - -cdef _try_infer_map(v): - """ if its in our map, just return the dtype """ - cdef: - object attr, val - for attr in ['name', 'kind', 'base']: - val = getattr(v.dtype, attr) - if val in _TYPE_MAP: - return _TYPE_MAP[val] - return None - - -def infer_dtype(object value, bint skipna=False): - """ - Efficiently infer the type of a passed val, or list-like - array of values. Return a string describing the type. - - Parameters - ---------- - value : scalar, list, ndarray, or pandas type - skipna : bool, default False - Ignore NaN values when inferring the type. The default of ``False`` - will be deprecated in a later version of pandas. - - .. versionadded:: 0.21.0 - - Returns - ------- - string describing the common type of the input data. - Results can include: - - - string - - unicode - - bytes - - floating - - integer - - mixed-integer - - mixed-integer-float - - decimal - - complex - - categorical - - boolean - - datetime64 - - datetime - - date - - timedelta64 - - timedelta - - time - - period - - mixed - - Raises - ------ - TypeError if ndarray-like but cannot infer the dtype - - Notes - ----- - - 'mixed' is the catchall for anything that is not otherwise - specialized - - 'mixed-integer-float' are floats and integers - - 'mixed-integer' are integers mixed with non-integers - - Examples - -------- - >>> infer_dtype(['foo', 'bar']) - 'string' - - >>> infer_dtype(['a', np.nan, 'b'], skipna=True) - 'string' - - >>> infer_dtype(['a', np.nan, 'b'], skipna=False) - 'mixed' - - >>> infer_dtype([b'foo', b'bar']) - 'bytes' - - >>> infer_dtype([1, 2, 3]) - 'integer' - - >>> infer_dtype([1, 2, 3.5]) - 'mixed-integer-float' - - >>> infer_dtype([1.0, 2.0, 3.5]) - 'floating' - - >>> infer_dtype(['a', 1]) - 'mixed-integer' - - >>> infer_dtype([Decimal(1), Decimal(2.0)]) - 'decimal' - - >>> infer_dtype([True, False]) - 'boolean' - - >>> infer_dtype([True, False, np.nan]) - 'mixed' - - >>> infer_dtype([pd.Timestamp('20130101')]) - 'datetime' - - >>> infer_dtype([datetime.date(2013, 1, 1)]) - 'date' - - >>> infer_dtype([np.datetime64('2013-01-01')]) - 'datetime64' - - >>> infer_dtype([datetime.timedelta(0, 1, 1)]) - 'timedelta' - - >>> infer_dtype(pd.Series(list('aabc')).astype('category')) - 'categorical' - """ - cdef: - Py_ssize_t i, n - object val - ndarray values - bint seen_pdnat = False - bint seen_val = False - - if util.is_array(value): - values = value - elif hasattr(value, 'dtype'): - - # this will handle ndarray-like - # e.g. categoricals - try: - values = getattr(value, '_values', getattr( - value, 'values', value)) - except: - value = _try_infer_map(value) - if value is not None: - return value - - # its ndarray like but we can't handle - raise ValueError("cannot infer type for {0}".format(type(value))) - - else: - if not PyList_Check(value): - value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) - values = construct_1d_object_array_from_listlike(value) - - values = getattr(values, 'values', values) - val = _try_infer_map(values) - if val is not None: - return val - - if values.dtype != np.object_: - values = values.astype('O') - - # make contiguous - values = values.ravel() - - n = len(values) - if n == 0: - return 'empty' - - # try to use a valid value - for i in range(n): - val = util.get_value_1d(values, i) - - # do not use is_nul_datetimelike to keep - # np.datetime64('nat') and np.timedelta64('nat') - if util._checknull(val): - pass - elif val is NaT: - seen_pdnat = True - else: - seen_val = True - break - - # if all values are nan/NaT - if seen_val is False and seen_pdnat is True: - return 'datetime' - # float/object nan is handled in latter logic - - if util.is_datetime64_object(val): - if is_datetime64_array(values): - return 'datetime64' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' - - elif is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' - - elif util.is_integer_object(val): - # a timedelta will show true here as well - if is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' - - if is_integer_array(values): - return 'integer' - elif is_integer_float_array(values): - return 'mixed-integer-float' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' - return 'mixed-integer' - - elif is_datetime(val): - if is_datetime_array(values): - return 'datetime' - - elif is_date(val): - if is_date_array(values, skipna=skipna): - return 'date' - - elif is_time(val): - if is_time_array(values, skipna=skipna): - return 'time' - - elif is_decimal(val): - return 'decimal' - - elif util.is_float_object(val): - if is_float_array(values): - return 'floating' - elif is_integer_float_array(values): - return 'mixed-integer-float' - - elif util.is_bool_object(val): - if is_bool_array(values, skipna=skipna): - return 'boolean' - - elif PyString_Check(val): - if is_string_array(values, skipna=skipna): - return 'string' - - elif PyUnicode_Check(val): - if is_unicode_array(values, skipna=skipna): - return 'unicode' - - elif PyBytes_Check(val): - if is_bytes_array(values, skipna=skipna): - return 'bytes' - - elif is_period(val): - if is_period_array(values): - return 'period' - - elif is_interval(val): - if is_interval_array(values): - return 'interval' - - for i in range(n): - val = util.get_value_1d(values, i) - if (util.is_integer_object(val) and - not util.is_timedelta64_object(val) and - not util.is_datetime64_object(val)): - return 'mixed-integer' - - return 'mixed' - - -cpdef object infer_datetimelike_array(object arr): - """ - infer if we have a datetime or timedelta array - - date: we have *only* date and maybe strings, nulls - - datetime: we have *only* datetimes and maybe strings, nulls - - timedelta: we have *only* timedeltas and maybe strings, nulls - - nat: we do not have *any* date, datetimes or timedeltas, but do have - at least a NaT - - mixed: other objects (strings, a mix of tz-aware and tz-naive, or - actual objects) - - Parameters - ---------- - arr : object array - - Returns - ------- - string: {datetime, timedelta, date, nat, mixed} - - """ - - cdef: - Py_ssize_t i, n = len(arr) - bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 - bint seen_tz_aware = 0, seen_tz_naive = 0 - bint seen_nat = 0 - list objs = [] - object v - - for i in range(n): - v = arr[i] - if util.is_string_object(v): - objs.append(v) - - if len(objs) == 3: - break - - elif util._checknull(v): - # nan or None - pass - elif v is NaT: - seen_nat = 1 - elif is_datetime(v): - # datetime - seen_datetime = 1 - - # disambiguate between tz-naive and tz-aware - if v.tzinfo is None: - seen_tz_naive = 1 - else: - seen_tz_aware = 1 - - if seen_tz_naive and seen_tz_aware: - return 'mixed' - elif util.is_datetime64_object(v): - # np.datetime64 - seen_datetime = 1 - elif is_date(v): - seen_date = 1 - elif is_timedelta(v) or util.is_timedelta64_object(v): - # timedelta, or timedelta64 - seen_timedelta = 1 - else: - return 'mixed' - - if seen_date and not (seen_datetime or seen_timedelta): - return 'date' - elif seen_datetime and not seen_timedelta: - return 'datetime' - elif seen_timedelta and not seen_datetime: - return 'timedelta' - elif seen_nat: - return 'nat' - - # short-circuit by trying to - # actually convert these strings - # this is for performance as we don't need to try - # convert *every* string array - if len(objs): - try: - array_to_datetime(objs, errors='raise') - return 'datetime' - except: - pass - - # we are *not* going to infer from strings - # for timedelta as too much ambiguity - - return 'mixed' - - -cdef inline bint is_datetime(object o): - return PyDateTime_Check(o) - -cdef inline bint is_date(object o): - return PyDate_Check(o) - -cdef inline bint is_time(object o): - return PyTime_Check(o) - -cdef inline bint is_timedelta(object o): - return PyDelta_Check(o) or util.is_timedelta64_object(o) - - -cdef class Validator: - - cdef: - Py_ssize_t n - dtype dtype - bint skipna - - def __cinit__( - self, - Py_ssize_t n, - dtype dtype=np.dtype(np.object_), - bint skipna=False - ): - self.n = n - self.dtype = dtype - self.skipna = skipna - - cdef bint validate(self, ndarray values) except -1: - if not self.n: - return False - - if self.is_array_typed(): - return True - elif self.dtype.type_num == NPY_OBJECT: - if self.skipna: - return self._validate_skipna(values) - else: - return self._validate(values) - else: - return False - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef bint _validate(self, ndarray values) except -1: - cdef: - Py_ssize_t i - Py_ssize_t n = self.n - - for i in range(n): - if not self.is_valid(values[i]): - return False - - return self.finalize_validate() - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef bint _validate_skipna(self, ndarray values) except -1: - cdef: - Py_ssize_t i - Py_ssize_t n = self.n - - for i in range(n): - if not self.is_valid_skipna(values[i]): - return False - - return self.finalize_validate_skipna() - - cdef bint is_valid(self, object value) except -1: - return self.is_value_typed(value) - - cdef bint is_valid_skipna(self, object value) except -1: - return self.is_valid(value) or self.is_valid_null(value) - - cdef bint is_value_typed(self, object value) except -1: - raise NotImplementedError( - '{} child class must define is_value_typed'.format( - type(self).__name__ - ) - ) - - cdef bint is_valid_null(self, object value) except -1: - return util._checknull(value) - - cdef bint is_array_typed(self) except -1: - return False - - cdef inline bint finalize_validate(self): - return True - - cdef bint finalize_validate_skipna(self): - # TODO(phillipc): Remove the existing validate methods and replace them - # with the skipna versions upon full deprecation of skipna=False - return True - - -cdef class BoolValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_bool_object(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bool_) - - -cpdef bint is_bool_array(ndarray values, bint skipna=False): - cdef: - BoolValidator validator = BoolValidator( - len(values), - values.dtype, - skipna=skipna - ) - return validator.validate(values) - - -cdef class IntegerValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_integer_object(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) - - -cpdef bint is_integer_array(ndarray values): - cdef: - IntegerValidator validator = IntegerValidator( - len(values), - values.dtype, - ) - return validator.validate(values) - - -cdef class IntegerFloatValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_integer_object(value) or util.is_float_object(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) - - -cdef bint is_integer_float_array(ndarray values): - cdef: - IntegerFloatValidator validator = IntegerFloatValidator( - len(values), - values.dtype, - ) - return validator.validate(values) - - -cdef class FloatValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_float_object(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.floating) - - -cpdef bint is_float_array(ndarray values): - cdef FloatValidator validator = FloatValidator(len(values), values.dtype) - return validator.validate(values) - - -cdef class StringValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return PyString_Check(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.str_) - - -cpdef bint is_string_array(ndarray values, bint skipna=False): - cdef: - StringValidator validator = StringValidator( - len(values), - values.dtype, - skipna=skipna, - ) - return validator.validate(values) - - -cdef class UnicodeValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return PyUnicode_Check(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.unicode_) - - -cdef bint is_unicode_array(ndarray values, bint skipna=False): - cdef: - UnicodeValidator validator = UnicodeValidator( - len(values), - values.dtype, - skipna=skipna, - ) - return validator.validate(values) - - -cdef class BytesValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return PyBytes_Check(value) - - cdef inline bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bytes_) - - -cdef bint is_bytes_array(ndarray values, bint skipna=False): - cdef: - BytesValidator validator = BytesValidator( - len(values), - values.dtype, - skipna=skipna - ) - return validator.validate(values) - - -cdef class TemporalValidator(Validator): - - cdef Py_ssize_t generic_null_count - - def __cinit__( - self, - Py_ssize_t n, - dtype dtype=np.dtype(np.object_), - bint skipna=False - ): - self.n = n - self.dtype = dtype - self.skipna = skipna - self.generic_null_count = 0 - - cdef inline bint is_valid(self, object value) except -1: - return self.is_value_typed(value) or self.is_valid_null(value) - - cdef bint is_valid_null(self, object value) except -1: - raise NotImplementedError( - '{} child class must define is_valid_null'.format( - type(self).__name__ - ) - ) - - cdef inline bint is_valid_skipna(self, object value) except -1: - cdef: - bint is_typed_null = self.is_valid_null(value) - bint is_generic_null = util._checknull(value) - self.generic_null_count += is_typed_null and is_generic_null - return self.is_value_typed(value) or is_typed_null or is_generic_null - - cdef inline bint finalize_validate_skipna(self): - return self.generic_null_count != self.n - - -cdef class DatetimeValidator(TemporalValidator): - - cdef bint is_value_typed(self, object value) except -1: - return is_datetime(value) - - cdef inline bint is_valid_null(self, object value) except -1: - return is_null_datetime64(value) - - -cpdef bint is_datetime_array(ndarray values): - cdef: - DatetimeValidator validator = DatetimeValidator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cdef class Datetime64Validator(DatetimeValidator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_datetime64_object(value) - - -cpdef bint is_datetime64_array(ndarray values): - cdef: - Datetime64Validator validator = Datetime64Validator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cpdef bint is_datetime_with_singletz_array(ndarray values): - """ - Check values have the same tzinfo attribute. - Doesn't check values are datetime-like types. - """ - - cdef Py_ssize_t i, j, n = len(values) - cdef object base_val, base_tz, val, tz - - if n == 0: - return False - - for i in range(n): - base_val = values[i] - if base_val is not NaT: - base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) - - for j in range(i, n): - val = values[j] - if val is not NaT: - tz = getattr(val, 'tzinfo', None) - if not tz_compare(base_tz, tz): - return False - break - - return True - - -cdef class TimedeltaValidator(TemporalValidator): - - cdef bint is_value_typed(self, object value) except -1: - return PyDelta_Check(value) - - cdef inline bint is_valid_null(self, object value) except -1: - return is_null_timedelta64(value) - - -cpdef bint is_timedelta_array(ndarray values): - cdef: - TimedeltaValidator validator = TimedeltaValidator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cdef class Timedelta64Validator(TimedeltaValidator): - - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_timedelta64_object(value) - - -cpdef bint is_timedelta64_array(ndarray values): - cdef: - Timedelta64Validator validator = Timedelta64Validator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cdef class AnyTimedeltaValidator(TimedeltaValidator): - - cdef inline bint is_value_typed(self, object value) except -1: - return is_timedelta(value) - - -cpdef bint is_timedelta_or_timedelta64_array(ndarray values): - """ infer with timedeltas and/or nat/none """ - cdef: - AnyTimedeltaValidator validator = AnyTimedeltaValidator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cdef class DateValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return is_date(value) - - -cpdef bint is_date_array(ndarray values, bint skipna=False): - cdef DateValidator validator = DateValidator(len(values), skipna=skipna) - return validator.validate(values) - - -cdef class TimeValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return is_time(value) - - -cpdef bint is_time_array(ndarray values, bint skipna=False): - cdef TimeValidator validator = TimeValidator(len(values), skipna=skipna) - return validator.validate(values) - - -cdef class PeriodValidator(TemporalValidator): - - cdef inline bint is_value_typed(self, object value) except -1: - return is_period(value) - - cdef inline bint is_valid_null(self, object value) except -1: - return is_null_period(value) - - -cpdef bint is_period_array(ndarray values): - cdef PeriodValidator validator = PeriodValidator(len(values), skipna=True) - return validator.validate(values) - - -cdef class IntervalValidator(Validator): - - cdef inline bint is_value_typed(self, object value) except -1: - return is_interval(value) - - -cpdef bint is_interval_array(ndarray values): - cdef: - IntervalValidator validator = IntervalValidator( - len(values), - skipna=True, - ) - return validator.validate(values) - - -cdef extern from "parse_helper.h": - int floatify(object, double *result, int *maybe_int) except -1 - -# constants that will be compared to potentially arbitrarily large -# python int -cdef object oINT64_MAX = INT64_MAX -cdef object oINT64_MIN = INT64_MIN -cdef object oUINT64_MAX = UINT64_MAX - - -@cython.boundscheck(False) -@cython.wraparound(False) -def maybe_convert_numeric(ndarray[object] values, set na_values, - bint convert_empty=True, bint coerce_numeric=False): - """ - Convert object array to a numeric array if possible. - - Parameters - ---------- - values : ndarray - Array of object elements to convert. - na_values : set - Set of values that should be interpreted as NaN. - convert_empty : bool, default True - If an empty array-like object is encountered, whether to interpret - that element as NaN or not. If set to False, a ValueError will be - raised if such an element is encountered and 'coerce_numeric' is False. - coerce_numeric : bool, default False - If initial attempts to convert to numeric have failed, whether to - force conversion to numeric via alternative methods or by setting the - element to NaN. Otherwise, an Exception will be raised when such an - element is encountered. - - This boolean also has an impact on how conversion behaves when a - numeric array has no suitable numerical dtype to return (i.e. uint64, - int32, uint8). If set to False, the original object array will be - returned. Otherwise, a ValueError will be raised. - - Returns - ------- - numeric_array : array of converted object values to numerical ones - """ - - if len(values) == 0: - return np.array([], dtype='i8') - - # fastpath for ints - try to convert all based on first value - cdef object val = values[0] - - if util.is_integer_object(val): - try: - maybe_ints = values.astype('i8') - if (maybe_ints == values).all(): - return maybe_ints - except (ValueError, OverflowError, TypeError): - pass - - # otherwise, iterate and do full infererence - cdef: - int status, maybe_int - Py_ssize_t i, n = values.size - Seen seen = Seen(coerce_numeric) - ndarray[float64_t] floats = np.empty(n, dtype='f8') - ndarray[complex128_t] complexes = np.empty(n, dtype='c16') - ndarray[int64_t] ints = np.empty(n, dtype='i8') - ndarray[uint64_t] uints = np.empty(n, dtype='u8') - ndarray[uint8_t] bools = np.empty(n, dtype='u1') - float64_t fval - - for i in range(n): - val = values[i] - - if val.__hash__ is not None and val in na_values: - seen.saw_null() - floats[i] = complexes[i] = nan - elif util.is_float_object(val): - fval = val - if fval != fval: - seen.null_ = True - - floats[i] = complexes[i] = fval - seen.float_ = True - elif util.is_integer_object(val): - floats[i] = complexes[i] = val - - val = int(val) - seen.saw_int(val) - - if val >= 0: - if val <= oUINT64_MAX: - uints[i] = val - else: - seen.float_ = True - - if val <= oINT64_MAX: - ints[i] = val - - if seen.sint_ and seen.uint_: - seen.float_ = True - - elif util.is_bool_object(val): - floats[i] = uints[i] = ints[i] = bools[i] = val - seen.bool_ = True - elif val is None: - seen.saw_null() - floats[i] = complexes[i] = nan - elif hasattr(val, '__len__') and len(val) == 0: - if convert_empty or seen.coerce_numeric: - seen.saw_null() - floats[i] = complexes[i] = nan - else: - raise ValueError('Empty string encountered') - elif util.is_complex_object(val): - complexes[i] = val - seen.complex_ = True - elif is_decimal(val): - floats[i] = complexes[i] = val - seen.float_ = True - else: - try: - status = floatify(val, &fval, &maybe_int) - - if fval in na_values: - seen.saw_null() - floats[i] = complexes[i] = nan - else: - if fval != fval: - seen.null_ = True - - floats[i] = fval - - if maybe_int: - as_int = int(val) - - if as_int in na_values: - seen.saw_null() - else: - seen.saw_int(as_int) - - if not (seen.float_ or as_int in na_values): - if as_int < oINT64_MIN or as_int > oUINT64_MAX: - raise ValueError('Integer out of range.') - - if as_int >= 0: - uints[i] = as_int - if as_int <= oINT64_MAX: - ints[i] = as_int - - seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) - else: - seen.float_ = True - except (TypeError, ValueError) as e: - if not seen.coerce_numeric: - raise type(e)(str(e) + ' at position {}'.format(i)) - elif "uint64" in str(e): # Exception from check functions. - raise - seen.saw_null() - floats[i] = nan - - if seen.check_uint64_conflict(): - return values - - if seen.complex_: - return complexes - elif seen.float_: - return floats - elif seen.int_: - if seen.uint_: - return uints - else: - return ints - elif seen.bool_: - return bools.view(np.bool_) - elif seen.uint_: - return uints - return ints - - -@cython.boundscheck(False) -@cython.wraparound(False) -def maybe_convert_objects(ndarray[object] objects, bint try_float=0, - bint safe=0, bint convert_datetime=0, - bint convert_timedelta=0): - """ - Type inference function-- convert object array to proper dtype - """ - cdef: - Py_ssize_t i, n - ndarray[float64_t] floats - ndarray[complex128_t] complexes - ndarray[int64_t] ints - ndarray[uint64_t] uints - ndarray[uint8_t] bools - ndarray[int64_t] idatetimes - ndarray[int64_t] itimedeltas - Seen seen = Seen() - object val - float64_t fval, fnan - - n = len(objects) - - floats = np.empty(n, dtype='f8') - complexes = np.empty(n, dtype='c16') - ints = np.empty(n, dtype='i8') - uints = np.empty(n, dtype='u8') - bools = np.empty(n, dtype=np.uint8) - - if convert_datetime: - datetimes = np.empty(n, dtype='M8[ns]') - idatetimes = datetimes.view(np.int64) - - if convert_timedelta: - timedeltas = np.empty(n, dtype='m8[ns]') - itimedeltas = timedeltas.view(np.int64) - - fnan = np.nan - - for i from 0 <= i < n: - val = objects[i] - - if val is None: - seen.null_ = 1 - floats[i] = complexes[i] = fnan - elif val is NaT: - if convert_datetime: - idatetimes[i] = iNaT - seen.datetime_ = 1 - if convert_timedelta: - itimedeltas[i] = iNaT - seen.timedelta_ = 1 - if not (convert_datetime or convert_timedelta): - seen.object_ = 1 - elif util.is_bool_object(val): - seen.bool_ = 1 - bools[i] = val - elif util.is_float_object(val): - floats[i] = complexes[i] = val - seen.float_ = 1 - elif util.is_datetime64_object(val): - if convert_datetime: - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value - seen.datetime_ = 1 - else: - seen.object_ = 1 - break - elif is_timedelta(val): - if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, 'ns') - seen.timedelta_ = 1 - else: - seen.object_ = 1 - break - elif util.is_integer_object(val): - seen.int_ = 1 - floats[i] = val - complexes[i] = val - if not seen.null_: - seen.saw_int(int(val)) - - if ((seen.uint_ and seen.sint_) or - val > oUINT64_MAX or val < oINT64_MIN): - seen.object_ = 1 - break - - if seen.uint_: - uints[i] = val - elif seen.sint_: - ints[i] = val - else: - uints[i] = val - ints[i] = val - - elif util.is_complex_object(val): - complexes[i] = val - seen.complex_ = 1 - elif PyDateTime_Check(val) or util.is_datetime64_object(val): - - # if we have an tz's attached then return the objects - if convert_datetime: - if getattr(val, 'tzinfo', None) is not None: - seen.datetimetz_ = 1 - break - else: - seen.datetime_ = 1 - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value - else: - seen.object_ = 1 - break - elif try_float and not util.is_string_object(val): - # this will convert Decimal objects - try: - floats[i] = float(val) - complexes[i] = complex(val) - seen.float_ = 1 - except Exception: - seen.object_ = 1 - break - else: - seen.object_ = 1 - break - - # we try to coerce datetime w/tz but must all have the same tz - if seen.datetimetz_: - if len({getattr(val, 'tzinfo', None) for val in objects}) == 1: - from pandas import DatetimeIndex - return DatetimeIndex(objects) - seen.object_ = 1 - - if not seen.object_: - if not safe: - if seen.null_: - if seen.is_float_or_complex: - if seen.complex_: - return complexes - elif seen.float_ or seen.int_: - return floats - else: - if not seen.bool_: - if seen.datetime_: - if not seen.numeric_: - return datetimes - elif seen.timedelta_: - if not seen.numeric_: - return timedeltas - else: - if seen.complex_: - return complexes - elif seen.float_: - return floats - elif seen.int_: - if seen.uint_: - return uints - else: - return ints - elif seen.is_bool: - return bools.view(np.bool_) - - else: - # don't cast int to float, etc. - if seen.null_: - if seen.is_float_or_complex: - if seen.complex_: - if not seen.int_: - return complexes - elif seen.float_: - if not seen.int_: - return floats - else: - if not seen.bool_: - if seen.datetime_: - if not seen.numeric_: - return datetimes - elif seen.timedelta_: - if not seen.numeric_: - return timedeltas - else: - if seen.complex_: - if not seen.int_: - return complexes - elif seen.float_: - if not seen.int_: - return floats - elif seen.int_: - if seen.uint_: - return uints - else: - return ints - elif seen.is_bool: - return bools.view(np.bool_) - - return objects - - -def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, - bint convert=1): - """ - Substitute for np.vectorize with pandas-friendly dtype inference - - Parameters - ---------- - arr : ndarray - f : function - - Returns - ------- - mapped : ndarray - """ - cdef: - Py_ssize_t i, n - ndarray[object] result - object val - - n = len(arr) - result = np.empty(n, dtype=object) - for i in range(n): - if mask[i]: - val = util.get_value_at(arr, i) - else: - val = f(util.get_value_at(arr, i)) - - # unbox 0-dim arrays, GH #690 - if is_array(val) and PyArray_NDIM(val) == 0: - # is there a faster way to unbox? - val = val.item() - - result[i] = val - - if convert: - return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) - - return result - - -def map_infer(ndarray arr, object f, bint convert=1): - """ - Substitute for np.vectorize with pandas-friendly dtype inference - - Parameters - ---------- - arr : ndarray - f : function - - Returns - ------- - mapped : ndarray - """ - cdef: - Py_ssize_t i, n - ndarray[object] result - object val - - n = len(arr) - result = np.empty(n, dtype=object) - for i in range(n): - val = f(util.get_value_at(arr, i)) - - # unbox 0-dim arrays, GH #690 - if is_array(val) and PyArray_NDIM(val) == 0: - # is there a faster way to unbox? - val = val.item() - - result[i] = val - - if convert: - return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) - - return result - - -def to_object_array(list rows, int min_width=0): - """ - Convert a list of lists into an object array. - - Parameters - ---------- - rows : 2-d array (N, K) - A list of lists to be converted into an array - min_width : int - The minimum width of the object array. If a list - in `rows` contains fewer than `width` elements, - the remaining elements in the corresponding row - will all be `NaN`. - - Returns - ------- - obj_array : numpy array of the object dtype - """ - cdef: - Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result - list row - - n = len(rows) - - k = min_width - for i from 0 <= i < n: - tmp = len(rows[i]) - if tmp > k: - k = tmp - - result = np.empty((n, k), dtype=object) - - for i from 0 <= i < n: - row = rows[i] - - for j from 0 <= j < len(row): - result[i, j] = row[j] - - return result - - -def tuples_to_object_array(ndarray[object] tuples): - cdef: - Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result - tuple tup - - n = len(tuples) - k = len(tuples[0]) - result = np.empty((n, k), dtype=object) - for i in range(n): - tup = tuples[i] - for j in range(k): - result[i, j] = tup[j] - - return result - - -def to_object_array_tuples(list rows): - cdef: - Py_ssize_t i, j, n, k, tmp - ndarray[object, ndim=2] result - tuple row - - n = len(rows) - - k = 0 - for i from 0 <= i < n: - tmp = len(rows[i]) - if tmp > k: - k = tmp - - result = np.empty((n, k), dtype=object) - - try: - for i in range(n): - row = rows[i] - for j from 0 <= j < len(row): - result[i, j] = row[j] - except Exception: - # upcast any subclasses to tuple - for i in range(n): - row = tuple(rows[i]) - for j from 0 <= j < len(row): - result[i, j] = row[j] - - return result - - -def fast_multiget(dict mapping, ndarray keys, default=np.nan): - cdef: - Py_ssize_t i, n = len(keys) - object val - ndarray[object] output = np.empty(n, dtype='O') - - if n == 0: - # kludge, for Series - return np.empty(0, dtype='f8') - - keys = getattr(keys, 'values', keys) - - for i in range(n): - val = util.get_value_1d(keys, i) - if val in mapping: - output[i] = mapping[val] - else: - output[i] = default - - return maybe_convert_objects(output) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 6dd8b66eab33d..d17d9166ea3ee 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -12,6 +12,7 @@ The full license is in the LICENSE file, distributed with this software. #include #include +#include "helper.h" #include "headers/portable.h" static double xstrtod(const char *p, char **q, char decimal, char sci, diff --git a/setup.py b/setup.py index 3289f1e99b87f..8d93882acfe19 100755 --- a/setup.py +++ b/setup.py @@ -484,26 +484,17 @@ def maybe_cythonize(extensions, *args, **kwargs): return extensions -lib_depends = ['inference'] - - def srcpath(name=None, suffix='.pyx', subdir='src'): return pjoin('pandas', subdir, name + suffix) -if suffix == '.pyx': - lib_depends = [srcpath(f, suffix='.pyx', subdir='_libs/src') - for f in lib_depends] -else: - lib_depends = [] - common_include = ['pandas/_libs/src/klib', 'pandas/_libs/src'] ts_include = ['pandas/_libs/tslibs/src'] -lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h', - 'pandas/_libs/src/parse_helper.h', - 'pandas/_libs/src/compat_helper.h'] +lib_depends = ['pandas/_libs/src/numpy_helper.h', + 'pandas/_libs/src/parse_helper.h', + 'pandas/_libs/src/compat_helper.h'] np_datetime_headers = [ 'pandas/_libs/tslibs/src/datetime/np_datetime.h',