-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: improve perf of conversion to string arrays #43805
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Some formatting issue with numpy floats. Closing temporarily. |
Looks like this happens bc in the cython version we're using Could do this optimization with a note to revert it once we use the newer cython? |
Yeah, wasn't aware of that. Do you have any estimate if the perf. improvement from that and when it's released? I 'd be ok with not doing this PR if cython 3 solves this and it will be released soon(-ish). |
Not off the top of my head. Another option would be to use
Not really clear. There have been 9 alpha releases since Apr 2020, I don't know what the blocker/timeline is. |
New version. Unfortunately >>> x = np.float16(0.1)
>>> str(x)
'0.1'
>>> f"{x}"
'0.0999755859375' This version guards against this, at the cost of slight complexity This is slightly slower now: >>> int_arr = np.arange(1_000_000)
>>> %timeit pd.Series(int_arr, dtype="string")
850 ms ± 8.65 ms per loop # master
455 ms ± 8.65 ms per loop # first commit in thsi PR
514 ms ± 8.65 ms per loop # second commit
>>> ser = pd.Series(int_arr)
>>> %timeit ser.astype("string")
890 ms ± 8.65 ms per loop # master
505 ms ± 8.65 ms per loop # first commit in this PR
535 ms ± 8.65 ms per loop # second commit Using |
seems reasonble |
well that is surprising. Any potential explanations in the generated C code? |
I can look into this when Im at the computer again later today. |
Here's the diff in I'm not proficient in C or Cython-generated C files,, but can you see what makes the difference? diff --git a/lib.c b/lib.c
index 82a90b5..7e0ad24 100644
--- a/lib.c
+++ b/lib.c
@@ -26,8 +26,8 @@
],
"include_dirs": [
"pandas\\_libs",
- ".\\pandas\\_libs",
".\\pandas\\_libs\\tslibs",
+ ".\\pandas\\_libs",
"pandas/_libs/src/klib",
"C:\\Users\\TP\\Miniconda3\\envs\\pandas-dev\\lib\\site-packages\\numpy\\core\\include"
],
@@ -2668,28 +2668,6 @@ static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int eq
/* UnicodeEquals.proto */
static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals);
-/* PyObjectFormatSimple.proto */
-#if CYTHON_COMPILING_IN_PYPY
- #define __Pyx_PyObject_FormatSimple(s, f) (\
- likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
- PyObject_Format(s, f))
-#elif PY_MAJOR_VERSION < 3
- #define __Pyx_PyObject_FormatSimple(s, f) (\
- likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
- likely(PyString_CheckExact(s)) ? PyUnicode_FromEncodedObject(s, NULL, "strict") :\
- PyObject_Format(s, f))
-#elif CYTHON_USE_TYPE_SLOTS
- #define __Pyx_PyObject_FormatSimple(s, f) (\
- likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
- likely(PyLong_CheckExact(s)) ? PyLong_Type.tp_str(s) :\
- likely(PyFloat_CheckExact(s)) ? PyFloat_Type.tp_str(s) :\
- PyObject_Format(s, f))
-#else
- #define __Pyx_PyObject_FormatSimple(s, f) (\
- likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
- PyObject_Format(s, f))
-#endif
-
/* ObjectGetItem.proto */
#if CYTHON_USE_TYPE_SLOTS
static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key);
@@ -2712,6 +2690,28 @@ static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
/* ImportFrom.proto */
static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+/* PyObjectFormatSimple.proto */
+#if CYTHON_COMPILING_IN_PYPY
+ #define __Pyx_PyObject_FormatSimple(s, f) (\
+ likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+ PyObject_Format(s, f))
+#elif PY_MAJOR_VERSION < 3
+ #define __Pyx_PyObject_FormatSimple(s, f) (\
+ likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+ likely(PyString_CheckExact(s)) ? PyUnicode_FromEncodedObject(s, NULL, "strict") :\
+ PyObject_Format(s, f))
+#elif CYTHON_USE_TYPE_SLOTS
+ #define __Pyx_PyObject_FormatSimple(s, f) (\
+ likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+ likely(PyLong_CheckExact(s)) ? PyLong_Type.tp_str(s) :\
+ likely(PyFloat_CheckExact(s)) ? PyFloat_Type.tp_str(s) :\
+ PyObject_Format(s, f))
+#else
+ #define __Pyx_PyObject_FormatSimple(s, f) (\
+ likely(PyUnicode_CheckExact(s)) ? (Py_INCREF(s), s) :\
+ PyObject_Format(s, f))
+#endif
+
/* None.proto */
static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname);
@@ -11297,7 +11297,7 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
* if not checknull(val):
* if not isinstance(val, np.floating): # <<<<<<<<<<<<<<
* # f"{val}" is faster than str(val)
- * result[i] = f"{val}"
+ * result[i] = PyObject_Str(val)
*/
__Pyx_GetModuleGlobalName(__pyx_t_6, __pyx_n_s_np); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 730, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_6);
@@ -11312,11 +11312,11 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
/* "pandas/_libs/lib.pyx":732
* if not isinstance(val, np.floating):
* # f"{val}" is faster than str(val)
- * result[i] = f"{val}" # <<<<<<<<<<<<<<
+ * result[i] = PyObject_Str(val) # <<<<<<<<<<<<<<
* else:
* # f"{val}" is not always equivalent to str(val) for floats
*/
- __pyx_t_7 = __Pyx_PyObject_FormatSimple(__pyx_v_val, __pyx_empty_unicode); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 732, __pyx_L1_error)
+ __pyx_t_7 = PyObject_Str(__pyx_v_val); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 732, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_7);
if (unlikely(__Pyx_SetItemInt(__pyx_v_result, __pyx_v_i, __pyx_t_7, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 0, 0) < 0)) __PYX_ERR(0, 732, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
@@ -11326,7 +11326,7 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
* if not checknull(val):
* if not isinstance(val, np.floating): # <<<<<<<<<<<<<<
* # f"{val}" is faster than str(val)
- * result[i] = f"{val}"
+ * result[i] = PyObject_Str(val)
*/
goto __pyx_L16;
}
@@ -11401,7 +11401,7 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
* if skipna:
* result[i] = val # <<<<<<<<<<<<<<
* else:
- * result[i] = f"{val}"
+ * result[i] = PyObject_Str(val)
*/
if (unlikely(__Pyx_SetItemInt(__pyx_v_result, __pyx_v_i, __pyx_v_val, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 0, 0) < 0)) __PYX_ERR(0, 740, __pyx_L1_error)
@@ -11418,12 +11418,12 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
/* "pandas/_libs/lib.pyx":742
* result[i] = val
* else:
- * result[i] = f"{val}" # <<<<<<<<<<<<<<
+ * result[i] = PyObject_Str(val) # <<<<<<<<<<<<<<
*
* return result
*/
/*else*/ {
- __pyx_t_7 = __Pyx_PyObject_FormatSimple(__pyx_v_val, __pyx_empty_unicode); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 742, __pyx_L1_error)
+ __pyx_t_7 = PyObject_Str(__pyx_v_val); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 742, __pyx_L1_error)
__Pyx_GOTREF(__pyx_t_7);
if (unlikely(__Pyx_SetItemInt(__pyx_v_result, __pyx_v_i, __pyx_t_7, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 0, 0) < 0)) __PYX_ERR(0, 742, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
@@ -11435,7 +11435,7 @@ static PyArrayObject *__pyx_f_6pandas_5_libs_3lib_ensure_string_array(PyObject *
}
/* "pandas/_libs/lib.pyx":744
- * result[i] = f"{val}"
+ * result[i] = PyObject_Str(val)
*
* return result # <<<<<<<<<<<<<<
*
@@ -59044,7 +59044,7 @@ if (!__Pyx_RefNanny) {
/* "pandas/_libs/lib.pyx":30
* from cython cimport floating
- *
+ * from cpython.object cimport PyObject_Str
* PyDateTime_IMPORT # <<<<<<<<<<<<<<
*
* import numpy as np |
The relevant lines in that diff are
So with the f-str it is using the C call
So it looks like |
thanks @topper-123 |
f"{val}"
is faster thanstr(val)
. Utilizing this in the tight string conversion loop inensure_string_array
gives a nice performance boost.The same improvement is given when we use
dtype=str
.