From 33386195a574f6c3610628d414b65d049b257d05 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 24 Dec 2021 12:38:19 -0800 Subject: [PATCH 1/4] PERF/WIP: avoid copies in lib.infer_dtype --- pandas/_libs/lib.pyx | 85 +++++++++++-------- pandas/conftest.py | 1 + pandas/core/arrays/floating.py | 8 +- pandas/core/arrays/integer.py | 12 +-- .../arrays/floating/test_construction.py | 1 + .../tests/arrays/integer/test_construction.py | 1 + pandas/tests/dtypes/test_inference.py | 28 +++++- 7 files changed, 81 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8f9016e726f1e..d13aa6e2b60fc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -97,7 +97,6 @@ from pandas._libs.missing cimport ( is_matching_na, is_null_datetime64, is_null_timedelta64, - isnaobj, ) from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( @@ -1420,6 +1419,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: ndarray values bint seen_pdnat = False bint seen_val = False + flatiter it if util.is_array(value): values = value @@ -1457,24 +1457,22 @@ def infer_dtype(value: object, skipna: bool = True) -> str: # This should not be reached values = values.astype(object) - # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup - values = values.ravel(order="K") - - if skipna: - values = values[~isnaobj(values)] - n = cnp.PyArray_SIZE(values) if n == 0: return "empty" # Iterate until we find our first valid value. We will use this # value to decide which of the is_foo_array functions to call. 
+ it = PyArray_IterNew(values) for i in range(n): - val = values[i] + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) + PyArray_ITER_NEXT(it) # do not use checknull to keep # np.datetime64('nat') and np.timedelta64('nat') - if val is None or util.is_nan(val): + if val is None or util.is_nan(val) or val is C_NA: pass elif val is NaT: seen_pdnat = True @@ -1486,23 +1484,25 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if seen_val is False and seen_pdnat is True: return "datetime" # float/object nan is handled in latter logic + if seen_val is False and skipna: + return "empty" if util.is_datetime64_object(val): - if is_datetime64_array(values): + if is_datetime64_array(values, skipna=skipna): return "datetime64" elif is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): + if is_timedelta_or_timedelta64_array(values, skipna=skipna): return "timedelta" elif util.is_integer_object(val): # ordering matters here; this check must come after the is_timedelta # check otherwise numpy timedelta64 objects would come through here - if is_integer_array(values): + if is_integer_array(values, skipna=skipna): return "integer" - elif is_integer_float_array(values): - if is_integer_na_array(values): + elif is_integer_float_array(values, skipna=skipna): + if is_integer_na_array(values, skipna=skipna): return "integer-na" else: return "mixed-integer-float" @@ -1523,7 +1523,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "time" elif is_decimal(val): - if is_decimal_array(values): + if is_decimal_array(values, skipna=skipna): return "decimal" elif util.is_complex_object(val): @@ -1533,8 +1533,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str: elif util.is_float_object(val): if is_float_array(values): return "floating" - elif is_integer_float_array(values): - if is_integer_na_array(values): + elif 
is_integer_float_array(values, skipna=skipna): + if is_integer_na_array(values, skipna=skipna): return "integer-na" else: return "mixed-integer-float" @@ -1552,15 +1552,18 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "bytes" elif is_period_object(val): - if is_period_array(values): + if is_period_array(values, skipna=skipna): return "period" elif is_interval(val): if is_interval_array(values): return "interval" + cnp.PyArray_ITER_RESET(it) for i in range(n): - val = values[i] + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) + PyArray_ITER_NEXT(it) + if util.is_integer_object(val): return "mixed-integer" @@ -1789,10 +1792,11 @@ cdef class IntegerValidator(Validator): # Note: only python-exposed for tests -cpdef bint is_integer_array(ndarray values): +cpdef bint is_integer_array(ndarray values, bint skipna=True): cdef: IntegerValidator validator = IntegerValidator(len(values), - values.dtype) + values.dtype, + skipna=skipna) return validator.validate(values) @@ -1803,10 +1807,10 @@ cdef class IntegerNaValidator(Validator): or (util.is_nan(value) and util.is_float_object(value))) -cdef bint is_integer_na_array(ndarray values): +cdef bint is_integer_na_array(ndarray values, bint skipna=True): cdef: IntegerNaValidator validator = IntegerNaValidator(len(values), - values.dtype) + values.dtype, skipna=skipna) return validator.validate(values) @@ -1819,10 +1823,11 @@ cdef class IntegerFloatValidator(Validator): return issubclass(self.dtype.type, np.integer) -cdef bint is_integer_float_array(ndarray values): +cdef bint is_integer_float_array(ndarray values, bint skipna=True): cdef: IntegerFloatValidator validator = IntegerFloatValidator(len(values), - values.dtype) + values.dtype, + skipna=skipna) return validator.validate(values) @@ -1866,9 +1871,11 @@ cdef class DecimalValidator(Validator): return is_decimal(value) -cdef bint is_decimal_array(ndarray values): +cdef bint is_decimal_array(ndarray values, bint skipna=False): cdef: - 
DecimalValidator validator = DecimalValidator(len(values), values.dtype) + DecimalValidator validator = DecimalValidator( + len(values), values.dtype, skipna=skipna + ) return validator.validate(values) @@ -1882,7 +1889,7 @@ cdef class StringValidator(Validator): cdef bint is_valid_null(self, object value) except -1: # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA + return value is C_NA or value is None or util.is_nan(value) cpdef bint is_string_array(ndarray values, bint skipna=False): @@ -1967,10 +1974,10 @@ cdef class Datetime64Validator(DatetimeValidator): # Note: only python-exposed for tests -cpdef bint is_datetime64_array(ndarray values): +cpdef bint is_datetime64_array(ndarray values, bint skipna=True): cdef: Datetime64Validator validator = Datetime64Validator(len(values), - skipna=True) + skipna=skipna) return validator.validate(values) @@ -1982,10 +1989,10 @@ cdef class AnyDatetimeValidator(DatetimeValidator): ) -cdef bint is_datetime_or_datetime64_array(ndarray values): +cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True): cdef: AnyDatetimeValidator validator = AnyDatetimeValidator(len(values), - skipna=True) + skipna=skipna) return validator.validate(values) @@ -2039,13 +2046,13 @@ cdef class AnyTimedeltaValidator(TimedeltaValidator): # Note: only python-exposed for tests -cpdef bint is_timedelta_or_timedelta64_array(ndarray values): +cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True): """ Infer with timedeltas and/or nat/none. 
""" cdef: AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), - skipna=True) + skipna=skipna) return validator.validate(values) @@ -2075,7 +2082,8 @@ cpdef bint is_time_array(ndarray values, bint skipna=False): return validator.validate(values) -cdef bint is_period_array(ndarray[object] values): +# FIXME: actually use skipna +cdef bint is_period_array(ndarray[object] values, bint skipna=True): """ Is this an ndarray of Period objects (or NaT) with a single `freq`? """ @@ -2083,12 +2091,17 @@ cdef bint is_period_array(ndarray[object] values): Py_ssize_t i, n = len(values) int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND object val + flatiter it if len(values) == 0: return False + it = PyArray_IterNew(values) for i in range(n): - val = values[i] + # The PyArray_GETITEM and PyArray_ITER_NEXT are faster + # equivalents to `val = values[i]` + val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) + PyArray_ITER_NEXT(it) if is_period_object(val): if dtype_code == -10000: diff --git a/pandas/conftest.py b/pandas/conftest.py index be28dbe35fcb2..07b828824e26c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1521,6 +1521,7 @@ def any_numpy_dtype(request): _any_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), ("string", ["a", pd.NA, "c"]), + ("mixed", ["a", pd.NaT, "c"]), # pd.NaT not considered valid by is_string_array ("bytes", [b"a", np.nan, b"c"]), ("empty", [np.nan, np.nan, np.nan]), ("empty", []), diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 4c868747fa930..319db7ccbd80e 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -131,13 +131,7 @@ def coerce_to_array( inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass - elif inferred_type not in [ - "floating", - "integer", - "mixed-integer", - "integer-na", - "mixed-integer-float", - ]: + elif inferred_type == "boolean": raise TypeError(f"{values.dtype} cannot be converted to a 
FloatingDtype") elif is_bool_dtype(values) and is_float_dtype(dtype): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3587575503d33..69f241cfa55e3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -181,16 +181,8 @@ def coerce_to_array( inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass - elif inferred_type not in [ - "floating", - "integer", - "mixed-integer", - "integer-na", - "mixed-integer-float", - "string", - "unicode", - ]: - raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + elif inferred_type == "boolean": + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 169b23c31f863..a4f36c0168fda 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -137,6 +137,7 @@ def test_to_array_error(values): "cannot be converted to a FloatingDtype", "values must be a 1D list-like", "Cannot pass scalar", + r"float\(\) argument must be a string or a number, not 'dict'", ] ) with pytest.raises((TypeError, ValueError), match=msg): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index e5fd4977ec2b8..fa61f23fc5d64 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -139,6 +139,7 @@ def test_to_integer_array_error(values): r"invalid literal for int\(\) with base 10:", r"values must be a 1D list-like", r"Cannot pass scalar", + r"int\(\) argument must be a string", ] ) with pytest.raises((ValueError, TypeError), match=msg): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 
7953d650636be..1696d5b835247 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1134,10 +1134,20 @@ def test_unicode(self): # This could also return "string" or "mixed-string" assert result == "mixed" + # even though we use skipna, we are only skipping those NAs that are + # considered matching by is_string_array arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) assert result == "string" + arr = ["a", pd.NA, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "string" + + arr = ["a", pd.NaT, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + arr = ["a", "c"] result = lib.infer_dtype(arr, skipna=False) assert result == "string" @@ -1542,10 +1552,24 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid for string array, just NA - assert not lib.is_string_array( + # we allow NaN/None in the StringArray constructor, so it's allowed here + assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + + # But not e.g. 
datetimelike or Decimal NAs + assert not lib.is_string_array( + np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.datetime64("NaT")], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", Decimal("NaN")], dtype=object), skipna=True + ) assert not lib.is_string_array(np.array([1, 2])) From 3939517f97f654d4b995c84b1c38517168a35ab7 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 27 Dec 2021 12:16:36 -0800 Subject: [PATCH 2/4] update test --- pandas/tests/arrays/string_/test_string.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 7c3a8c691b786..5bc1416900099 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -266,15 +266,26 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - with pytest.raises(ValueError, match=msg): + if cls is pd.arrays.StringArray: + # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs + # for string dtype cls(np.array(["a", np.nan], dtype=object)) - - with pytest.raises(ValueError, match=msg): cls(np.array(["a", None], dtype=object)) + else: + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.nan], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", None], dtype=object)) with pytest.raises(ValueError, match=msg): cls(np.array(["a", pd.NaT], dtype=object)) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object)) + + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object)) + @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy, cls, request): From 132b6c4c3df2108068c5be8c85377af4ac66328b Mon Sep 17 
00:00:00 2001 From: Brock Date: Mon, 27 Dec 2021 21:27:01 -0800 Subject: [PATCH 3/4] update exception message --- pandas/tests/arrays/floating/test_construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 0503d61c101c4..7debf9dd257b1 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -137,7 +137,7 @@ def test_to_array_error(values): "cannot be converted to a FloatingDtype", "values must be a 1D list-like", "Cannot pass scalar", - r"float\(\) argument must be a string or a number, not 'dict'", + r"float\(\) argument must be a string or a (real )?number, not 'dict'", ] ) with pytest.raises((TypeError, ValueError), match=msg): From 994b5cd4fa1cf05623ce55194aec9698ad5a4d08 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 28 Dec 2021 10:21:08 -0800 Subject: [PATCH 4/4] is_period_array compat --- pandas/_libs/lib.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d13aa6e2b60fc..7bc1f5d559441 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2083,21 +2083,23 @@ cpdef bint is_time_array(ndarray values, bint skipna=False): # FIXME: actually use skipna -cdef bint is_period_array(ndarray[object] values, bint skipna=True): +cdef bint is_period_array(ndarray values, bint skipna=True): """ Is this an ndarray of Period objects (or NaT) with a single `freq`? """ + # values should be object-dtype, but ndarray[object] assumes 1D, while + # this _may_ be 2D. cdef: - Py_ssize_t i, n = len(values) + Py_ssize_t i, N = values.size int dtype_code = -10000 # i.e. 
c_FreqGroup.FR_UND object val flatiter it - if len(values) == 0: + if N == 0: return False it = PyArray_IterNew(values) - for i in range(n): + for i in range(N): # The PyArray_GETITEM and PyArray_ITER_NEXT are faster # equivalents to `val = values[i]` val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))