From df8e4d6498ee63fef64e4d7dd704b65f78e21e12 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 2 Aug 2020 20:03:39 +0100 Subject: [PATCH 1/9] REF: StringArray._from_sequence --- doc/source/whatsnew/v1.1.1.rst | 5 +++++ pandas/_libs/lib.pyx | 14 ++++++++++++++ pandas/core/arrays/string_.py | 21 ++++----------------- pandas/tests/arrays/string_/test_string.py | 14 +++++++++----- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 565b4a014bd0c..f1a2f3be5dca3 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -74,6 +74,11 @@ Categorical - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) - +**Strings** + +- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) + + .. --------------------------------------------------------------------------- .. 
_whatsnew_111.contributors: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5fa91ffee8ea8..d052e3eaeff03 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1698,6 +1698,20 @@ cpdef bint is_string_array(ndarray values, bint skipna=False): return validator.validate(values) +cpdef ndarray ensure_string_array(ndarray values, object na_value): + cdef: + Py_ssize_t i = 0, n = len(values) + + for i in range(n): + val = values[i] + if not checknull(val): + values[i] = str(val) + else: + values[i] = na_value + + return values + + cdef class BytesValidator(Validator): cdef inline bint is_value_typed(self, object value) except -1: return isinstance(value, bytes) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bb55c3cdea45c..e7a89a3174ac0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -177,11 +177,10 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): values = extract_array(values) - skip_validation = isinstance(values, type(self)) super().__init__(values, copy=copy) self._dtype = StringDtype() - if not skip_validation: + if not isinstance(values, type(self)): self._validate() def _validate(self): @@ -195,7 +194,7 @@ def _validate(self): ) @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=True): if dtype: assert dtype == "string" @@ -203,20 +202,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if copy and result is scalars: result = result.copy() - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the values there. 
- na_values = isna(result) - has_nans = na_values.any() - if has_nans and result is scalars: - # force a copy now, if we haven't already - result = result.copy() - - # convert to str, then to object to avoid dtype like ' Date: Mon, 3 Aug 2020 19:07:31 +0100 Subject: [PATCH 2/9] Use ensure_string_array also in construct_1d_ndarray_preserving_na --- pandas/_libs/lib.pyx | 44 ++++++++++++++++++++++++++++------- pandas/core/arrays/string_.py | 5 ++-- pandas/core/dtypes/cast.py | 14 +++-------- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d052e3eaeff03..191340b6f9294 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1698,18 +1698,46 @@ cpdef bint is_string_array(ndarray values, bint skipna=False): return validator.validate(values) -cpdef ndarray ensure_string_array(ndarray values, object na_value): +cpdef ndarray ensure_string_array( + values, object na_value=np.nan, bint convert_na_value=True, bint copy=True): + """Returns a new numpy array with object dtype and only strings and na values. + + Parameters + --------- + values : array-like + The values to be converted to str, if needed + na_value : Any + The value to use for na. 
For example, np.nan or pd.NAN + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array + copy : bool, default True + Whether to wnsure that a new array is returned + + Returns + ------- + ndarray + """ cdef: Py_ssize_t i = 0, n = len(values) - for i in range(n): - val = values[i] - if not checknull(val): - values[i] = str(val) - else: - values[i] = na_value + result = np.asarray(values, dtype="object") + if copy and result is values: + result = result.copy() - return values + if convert_na_value: + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + else: + result[i] = na_value + else: + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + + return result cdef class BytesValidator(Validator): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e7a89a3174ac0..68d09556d97d6 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -199,11 +199,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=True): assert dtype == "string" result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array(result, StringDtype.na_value) + result = lib.ensure_string_array( + result, na_value=StringDtype.na_value, copy=copy) return cls(result) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 228329898b6a4..5fcad6009504b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na( >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) array(['1.0', '2.0', None], dtype=object) """ - subarr = np.array(values, dtype=dtype, copy=copy) if dtype is not None and dtype.kind == "U": - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since 
- # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. - na_values = isna(values) - subarr2 = subarr.astype(object) - subarr2[na_values] = np.asarray(values, dtype=object)[na_values] - subarr = subarr2 + subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr From 887736ab5da9b336bce25b7d0a4e5811475aa8d2 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 Aug 2020 19:31:19 +0100 Subject: [PATCH 3/9] fix linting --- pandas/_libs/lib.pyx | 6 +++--- pandas/core/arrays/string_.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 191340b6f9294..8b42328ba8e8c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1701,9 +1701,9 @@ cpdef bint is_string_array(ndarray values, bint skipna=False): cpdef ndarray ensure_string_array( values, object na_value=np.nan, bint convert_na_value=True, bint copy=True): """Returns a new numpy array with object dtype and only strings and na values. 
- + Parameters - --------- + ---------- values : array-like The values to be converted to str, if needed na_value : Any @@ -1715,7 +1715,7 @@ cpdef ndarray ensure_string_array( Returns ------- - ndarray + ndarray """ cdef: Py_ssize_t i = 0, n = len(values) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 68d09556d97d6..738571551de15 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -202,7 +202,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=True): # convert non-na-likes to str, and nan-likes to StringDtype.na_value result = lib.ensure_string_array( - result, na_value=StringDtype.na_value, copy=copy) + result, na_value=StringDtype.na_value, copy=copy + ) return cls(result) From 61f3bd393e01f88f212ff6f74ece3e8a831c11bb Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 Aug 2020 20:01:33 +0100 Subject: [PATCH 4/9] fix copy param --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 738571551de15..381968f9724b6 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -194,7 +194,7 @@ def _validate(self): ) @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=True): + def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" From c6afa1e5ab3cb5d26436573106ce76a8ac67ccf1 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 Aug 2020 21:48:25 +0100 Subject: [PATCH 5/9] fix comments --- pandas/_libs/lib.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8b42328ba8e8c..5132ce234c9a7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1705,13 +1705,13 @@ cpdef ndarray ensure_string_array( Parameters ---------- values : array-like - The values to be converted to str, if needed + The values to be converted to str, if needed. 
na_value : Any - The value to use for na. For example, np.nan or pd.NAN + The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True - If False, existing na values will be used unchanged in the new array + If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to wnsure that a new array is returned + Whether to ensure that a new array is returned. Returns ------- From ce18bb9f15905c01c4b1574862ca70beef839784 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 4 Aug 2020 08:08:22 +0100 Subject: [PATCH 6/9] delete libs_.lib.astype_str --- pandas/_libs/lib.pyx | 93 ++++++++++++++------------------------ pandas/core/dtypes/cast.py | 3 +- 2 files changed, 36 insertions(+), 60 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5132ce234c9a7..796f4357c397a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: - """ - Convert all elements in an array to string. +cpdef ndarray[object] ensure_string_array( + ndarray[object] arr, + object na_value=np.nan, + bint convert_na_value=True, + bint copy=True, + bint skipna=True, +): + """Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- - arr : ndarray - The array whose elements we are casting. - skipna : bool, default False + arr : array-like + The values to be converted to str, if needed. + na_value : Any + The value to use for na. For example, np.nan or pd.NA. + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array. + copy : bool, default True + Whether to ensure that a new array is returned. + skipna : bool, default True Whether or not to coerce nulls to their stringified form - (e.g. NaN becomes 'nan'). + (e.g. 
if False, NaN becomes 'nan'). Returns ------- ndarray - A new array with the input array's elements casted. + An array with the input array's elements casted to str or nan-like. """ cdef: - object arr_i - Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) - - for i in range(n): - arr_i = arr[i] + Py_ssize_t i = 0, n = len(arr) - if not (skipna and checknull(arr_i)): - arr_i = str(arr_i) + result = np.asarray(arr, dtype="object") + if copy and result is arr: + result = result.copy() - result[i] = arr_i + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = str(val) return result @@ -1698,48 +1715,6 @@ cpdef bint is_string_array(ndarray values, bint skipna=False): return validator.validate(values) -cpdef ndarray ensure_string_array( - values, object na_value=np.nan, bint convert_na_value=True, bint copy=True): - """Returns a new numpy array with object dtype and only strings and na values. - - Parameters - ---------- - values : array-like - The values to be converted to str, if needed. - na_value : Any - The value to use for na. For example, np.nan or pd.NA. - convert_na_value : bool, default True - If False, existing na values will be used unchanged in the new array. - copy : bool, default True - Whether to ensure that a new array is returned. 
- - Returns - ------- - ndarray - """ - cdef: - Py_ssize_t i = 0, n = len(values) - - result = np.asarray(values, dtype="object") - if copy and result is values: - result = result.copy() - - if convert_na_value: - for i in range(n): - val = result[i] - if not checknull(val): - result[i] = str(val) - else: - result[i] = na_value - else: - for i in range(n): - val = result[i] - if not checknull(val): - result[i] = str(val) - - return result - - cdef class BytesValidator(Validator): cdef inline bint is_value_typed(self, object value) except -1: return isinstance(value, bytes) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5fcad6009504b..6c7c14740f6bd 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): dtype = pandas_dtype(dtype) if issubclass(dtype.type, str): - return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape) + return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape) elif is_datetime64_dtype(arr): if is_object_dtype(dtype): @@ -1610,6 +1610,7 @@ def construct_1d_ndarray_preserving_na( """ if dtype is not None and dtype.kind == "U": + values = np.asarray(values, dtype="object") subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) else: subarr = np.array(values, dtype=dtype, copy=copy) From 9ef03559f4709224af7f1b11955fe9459f78f605 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 4 Aug 2020 08:33:47 +0100 Subject: [PATCH 7/9] correct input parameter type --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 796f4357c397a..52c694435e05a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -619,7 +619,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[object] ensure_string_array( - ndarray[object] arr, + 
ndarray arr, object na_value=np.nan, bint convert_na_value=True, bint copy=True, From 3db2884c9878e3f7e1361a0b664a0b6d497db421 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 4 Aug 2020 23:23:45 +0100 Subject: [PATCH 8/9] Add ASVs --- asv_bench/benchmarks/array.py | 2 ++ asv_bench/benchmarks/strings.py | 15 +++++++++++++++ pandas/_libs/lib.pyx | 2 +- pandas/core/dtypes/cast.py | 1 - 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 103df0fd94847..19e54e5ebcaa3 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,6 +2,8 @@ import pandas as pd +from .pandas_vb_common import tm + class BooleanArray: def setup(self): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..61e47fb388662 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,6 +7,21 @@ from .pandas_vb_common import tm +class Construction: + + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self): + self.data = tm.rands_array(nchars=10 ** 5, size=10) + + def time_construction(self, dtype): + Series(self.data, dtype=dtype) + + def peakmem_construction(self, dtype): + Series(self.data, dtype=dtype) + + class Methods: def setup(self): self.s = Series(tm.makeStringIndex(10 ** 5)) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 52c694435e05a..eadfcefaac73d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -619,7 +619,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[object] ensure_string_array( - ndarray arr, + arr, object na_value=np.nan, bint convert_na_value=True, bint copy=True, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6c7c14740f6bd..2697f42eb05a4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1610,7 +1610,6 @@ def 
construct_1d_ndarray_preserving_na( """ if dtype is not None and dtype.kind == "U": - values = np.asarray(values, dtype="object") subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) else: subarr = np.array(values, dtype=dtype, copy=copy) From 47b5d69748eaaa5d58750926250c0fcc3455fdee Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 4 Aug 2020 23:45:43 +0100 Subject: [PATCH 9/9] cleanups --- asv_bench/benchmarks/array.py | 2 -- asv_bench/benchmarks/strings.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 19e54e5ebcaa3..103df0fd94847 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,8 +2,6 @@ import pandas as pd -from .pandas_vb_common import tm - class BooleanArray: def setup(self): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 61e47fb388662..2023858181baa 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -12,7 +12,7 @@ class Construction: params = ["str", "string"] param_names = ["dtype"] - def setup(self): + def setup(self, dtype): self.data = tm.rands_array(nchars=10 ** 5, size=10) def time_construction(self, dtype):