From c12aeade6329d1870fa961911d55ee16ad3269ea Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 19 Dec 2021 11:26:40 -0800 Subject: [PATCH 1/3] PERF: avoid allocating memory in ensure_datetime64ns --- pandas/_libs/tslibs/conversion.pyx | 33 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index fe9fa4169c547..a6dc8cc16b229 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -227,12 +227,7 @@ def ensure_datetime64ns(arr: ndarray, copy: bool = True): dtype = arr.dtype arr = arr.astype(dtype.newbyteorder("<")) - ivalues = arr.view(np.int64).ravel("K") - - result = np.empty_like(arr, dtype=DT64NS_DTYPE) - iresult = result.ravel("K").view(np.int64) - - if len(iresult) == 0: + if arr.size == 0: result = arr.view(DT64NS_DTYPE) if copy: result = result.copy() @@ -245,17 +240,23 @@ def ensure_datetime64ns(arr: ndarray, copy: bool = True): raise ValueError("datetime64/timedelta64 must have a unit specified") if unit == NPY_FR_ns: + # Check this before allocating result for perf, might save some memory if copy: - arr = arr.copy() - result = arr - else: - for i in range(n): - if ivalues[i] != NPY_NAT: - pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) - iresult[i] = dtstruct_to_dt64(&dts) - check_dts_bounds(&dts) - else: - iresult[i] = NPY_NAT + return arr.copy() + return arr + + ivalues = arr.view(np.int64).ravel("K") + + result = np.empty_like(arr, dtype=DT64NS_DTYPE) + iresult = result.ravel("K").view(np.int64) + + for i in range(n): + if ivalues[i] != NPY_NAT: + pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) + iresult[i] = dtstruct_to_dt64(&dts) + check_dts_bounds(&dts) + else: + iresult[i] = NPY_NAT return result From ea11a2b5374326d9e8072217eeab48f06d29a564 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 19 Dec 2021 11:38:17 -0800 Subject: [PATCH 2/3] PERF: avoid copies in infer_dtype --- 
pandas/_libs/lib.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f527882a9dc9d..5a4ba199c98df 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1440,25 +1440,27 @@ def infer_dtype(value: object, skipna: bool = True) -> str: from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) - # make contiguous - # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup - values = values.ravel(order="K") - val = _try_infer_map(values.dtype) if val is not None: + # Anything other than object-dtype should return here. return val - if values.dtype != np.object_: - values = values.astype("O") + if values.descr.type_num != NPY_OBJECT: + # i.e. values.dtype != np.object_ + # This should not be reached + values = values.astype(object) if skipna: + # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup + values = values.ravel(order="K") values = values[~isnaobj(values)] - n = len(values) + n = cnp.PyArray_SIZE(values) if n == 0: return "empty" - # try to use a valid value + # Iterate until we find our first valid value. We will use this + # value to decide which of the is_foo_array functions to call. 
for i in range(n): val = values[i] From a7bd2bf210c2e86c6a8bddac17c7d5e5c2f2e83b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 20 Dec 2021 08:34:42 -0800 Subject: [PATCH 3/3] restore unconditional ravel --- pandas/_libs/lib.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5a4ba199c98df..985ddf37a8b5c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1450,9 +1450,10 @@ def infer_dtype(value: object, skipna: bool = True) -> str: # This should not be reached values = values.astype(object) + # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup + values = values.ravel(order="K") + if skipna: - # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup - values = values.ravel(order="K") values = values[~isnaobj(values)] n = cnp.PyArray_SIZE(values)