From cc8497eaf3cefb67e823b3f0e2e8d86ce3e3731b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 11:19:15 -0400 Subject: [PATCH 1/5] faster ensure_string_array --- pandas/_libs/lib.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0c0610f72044e..f7f4a9f676fdf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -776,6 +776,7 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) bint already_copied = True + ndarray[object] newarr if hasattr(arr, "to_numpy"): @@ -800,8 +801,9 @@ cpdef ndarray[object] ensure_string_array( # short-circuit, all elements are str return result + newarr = np.asarray(arr, dtype=object) for i in range(n): - val = arr[i] + val = newarr[i] if isinstance(val, str): continue From 7a823afa436e68501159316b6aab61ce57e1a845 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 12:23:43 -0400 Subject: [PATCH 2/5] faster map_infer_mask --- pandas/_libs/lib.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f7f4a9f676fdf..910e6ad972857 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2833,9 +2833,14 @@ NoDefault = Literal[_NoDefault.no_default] @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, - object na_value=no_default, cnp.dtype dtype=np.dtype(object) - ) -> np.ndarray: +def map_infer_mask( + ndarray[object] arr, + object f, + const uint8_t[:] mask, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> np.ndarray: """ Substitute for np.vectorize with pandas-friendly dtype inference. From c2f4cd155ee5eeebc5917f35952658335389d2d2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 13:44:16 -0400 Subject: [PATCH 3/5] non-optimized float path --- pandas/_libs/lib.pyx | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 910e6ad972857..65126e18627a4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -801,6 +801,29 @@ cpdef ndarray[object] ensure_string_array( # short-circuit, all elements are str return result + from pandas.core.dtypes.common import is_float_dtype + if is_float_dtype(arr.dtype): # non-optimized path + print("going non-optimal route") + for i in range(n): + val = arr[i] + + if not already_copied: + result = result.copy() + already_copied = True + + if not checknull(val): + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = f"{val}" + + return result + newarr = np.asarray(arr, dtype=object) for i in range(n): val = newarr[i] From d9062bf893e01e7c2dbef95d79d7abcdd0ed6870 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 14:40:22 -0400 Subject: [PATCH 4/5] remove print --- pandas/_libs/lib.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 65126e18627a4..bee90e9df18b3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -803,7 +803,6 @@ cpdef ndarray[object] ensure_string_array( from pandas.core.dtypes.common import is_float_dtype if is_float_dtype(arr.dtype): # non-optimized path - print("going non-optimal route") for i in range(n): val = arr[i] From affc3490285a0a23adb6cfe3c0f431b1063ce5e4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 18:01:10 -0400 Subject: [PATCH 5/5] feedback --- pandas/_libs/lib.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bee90e9df18b3..23c8066b973f8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -801,8 +801,7 @@ cpdef ndarray[object] ensure_string_array( # short-circuit, all elements are str return result - from pandas.core.dtypes.common import is_float_dtype - if is_float_dtype(arr.dtype): # non-optimized path + if arr.dtype.kind == "f": # non-optimized path for i in range(n): val = arr[i]