From 9c619182b14c644e4cc13a4d6f19952d8981e86e Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 16 Jun 2022 18:20:15 +0200 Subject: [PATCH 1/3] Speedup --- pandas/io/sas/sas.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 9fcef64e07133..bb7c8a1055856 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -426,8 +426,10 @@ cdef class Parser: jb += 1 elif column_types[j] == column_type_string: # string - string_chunk[js, current_row] = np.array(source[start:( - start + lngt)]).tobytes().rstrip(b"\x00 ") + # Skip trailing whitespace + while lngt > 0 and source[start+lngt-1] in b"\x00 ": + lngt -= 1 + string_chunk[js, current_row] = source[start:start+lngt] js += 1 self.current_row_on_page_index += 1 From 010a75b8d27f8a07dd80474a71fd9ee7271ed94a Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 17 Jun 2022 15:32:51 +0200 Subject: [PATCH 2/3] Fix --- pandas/io/sas/sas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index bb7c8a1055856..b46a4a780f919 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -429,7 +429,7 @@ cdef class Parser: # Skip trailing whitespace while lngt > 0 and source[start+lngt-1] in b"\x00 ": lngt -= 1 - string_chunk[js, current_row] = source[start:start+lngt] + string_chunk[js, current_row] = (&source[start])[:lngt] js += 1 self.current_row_on_page_index += 1 From e20cf659580949541607d3732fd3abfa04a51fb9 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 09:15:01 +0200 Subject: [PATCH 3/3] Review feedback --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/sas/sas.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7aa1c1e84aa09..e63107a8eb243 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -799,6 +799,7 @@ Performance improvements - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) +- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index ee252b08b8dc3..d8591c0b033a6 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -424,7 +424,8 @@ cdef class Parser: jb += 1 elif column_types[j] == column_type_string: # string - # Skip trailing whitespace + # Skip trailing whitespace. This is equivalent to calling + # .rstrip(b"\x00 ") but without Python call overhead. while lngt > 0 and source[start+lngt-1] in b"\x00 ": lngt -= 1 string_chunk[js, current_row] = (&source[start])[:lngt]