From 7abf9ff25e7712d9859c633eb683589c2d8b2d71 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 12 Sep 2020 23:34:37 +0100 Subject: [PATCH 1/4] PERF: StringArray construction --- pandas/core/arrays/string_.py | 44 +++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..2907cd3ae9d02 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,5 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Optional, Type, Union import numpy as np @@ -122,6 +122,9 @@ class StringArray(PandasArray): copy : bool, default False Whether to copy the array of data. + convert : bool, default False + If true, force conversion of non-na scalars to strings. + If False, raises a ValueError, if a scalar is neither a string nor na. Attributes ---------- @@ -162,7 +165,15 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string - However, instantiating StringArrays directly with non-strings will raise an error. + Instantiating StringArrays directly with non-strings will raise an error unless + ``convert=True``. 
+ + >>> pd.arrays.StringArray(['1', 1]) + TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got list) + >>> pd.arrays.StringArray(['1', 1], convert=True) + + ['1', '1'] + Length: 2, dtype: string For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: @@ -175,22 +186,30 @@ class StringArray(PandasArray): # undo the PandasArray hack _typ = "extension" - def __init__(self, values, copy=False): + def __init__(self, values, copy=False, convert: bool = False): values = extract_array(values) + if not isinstance(values, type(self)): + if convert: + values = lib.ensure_string_array( + values, na_value=StringDtype.na_value, copy=copy + ) + else: + self._validate(values) super().__init__(values, copy=copy) self._dtype = StringDtype() - if not isinstance(values, type(self)): - self._validate() - def _validate(self): + def _validate(self, values: Optional[np.ndarray] = None) -> None: """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + if values is None: + values = self._ndarray + + if len(values) and not lib.is_string_array(values, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if self._ndarray.dtype != "object": + if values.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{self._ndarray.dtype}' dtype instead." + f"'{values.dtype}' dtype instead." 
) @classmethod @@ -200,12 +219,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = np.asarray(scalars, dtype="object") - # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array( - result, na_value=StringDtype.na_value, copy=copy - ) - - return cls(result) + return cls(result, copy=copy, convert=True) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): From f1721ac58773e94d754ffda045102edb9713c92a Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 13 Sep 2020 07:53:19 +0100 Subject: [PATCH 2/4] add issue number --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/string_.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f398af6e4dd5e..278fb7274bd9b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -219,7 +219,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) +- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2907cd3ae9d02..3b58570c1adff 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -165,11 +165,11 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string - Instantiating StringArrays directly with non-strings will raise an 
error unless - ``convert=True``. + Instantiating StringArrays directly with non-strings arrays will raise an error + unless ``convert=True``. - >>> pd.arrays.StringArray(['1', 1]) - TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got list) + >>> pd.arrays.StringArray(np.array(['1', 1], dtype=object)) + ValueError: StringArray requires a sequence of strings or pandas.NA >>> pd.arrays.StringArray(['1', 1], convert=True) ['1', '1'] From ee01e0282c59f845bd914d10b6a238c45abe1628 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 13 Sep 2020 08:42:29 +0100 Subject: [PATCH 3/4] clean doc string --- pandas/core/arrays/string_.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b58570c1adff..6f678d2225b49 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -168,7 +168,7 @@ class StringArray(PandasArray): Instantiating StringArrays directly with non-strings arrays will raise an error unless ``convert=True``. 
- >>> pd.arrays.StringArray(np.array(['1', 1], dtype=object)) + >>> pd.arrays.StringArray(np.array(['1', 1])) ValueError: StringArray requires a sequence of strings or pandas.NA >>> pd.arrays.StringArray(['1', 1], convert=True) @@ -190,9 +190,8 @@ def __init__(self, values, copy=False, convert: bool = False): values = extract_array(values) if not isinstance(values, type(self)): if convert: - values = lib.ensure_string_array( - values, na_value=StringDtype.na_value, copy=copy - ) + na_val = StringDtype.na_value + values = lib.ensure_string_array(values, na_value=na_val, copy=copy) else: self._validate(values) From 39ea860483533965df934ce328ee7af8de85afd7 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 16 Sep 2020 13:22:06 +0100 Subject: [PATCH 4/4] Refactor to avoid call to StringArray.__init__ & validation --- pandas/core/arrays/string_.py | 50 +++++++++++++++-------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6f678d2225b49..cef35f2b1137c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,5 @@ import operator -from typing import TYPE_CHECKING, Optional, Type, Union +from typing import TYPE_CHECKING, Type, Union import numpy as np @@ -122,9 +122,6 @@ class StringArray(PandasArray): copy : bool, default False Whether to copy the array of data. - convert : bool, default False - If true, force conversion of non-na scalars to strings. - If False, raises a ValueError, if a scalar is neither a string nor na. Attributes ---------- @@ -165,15 +162,7 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string - Instantiating StringArrays directly with non-strings arrays will raise an error - unless ``convert=True``. 
- - >>> pd.arrays.StringArray(np.array(['1', 1])) - ValueError: StringArray requires a sequence of strings or pandas.NA - >>> pd.arrays.StringArray(['1', 1], convert=True) - - ['1', '1'] - Length: 2, dtype: string + However, instantiating StringArrays directly with non-strings will raise an error. For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: @@ -186,29 +175,22 @@ class StringArray(PandasArray): # undo the PandasArray hack _typ = "extension" - def __init__(self, values, copy=False, convert: bool = False): + def __init__(self, values, copy=False): values = extract_array(values) - if not isinstance(values, type(self)): - if convert: - na_val = StringDtype.na_value - values = lib.ensure_string_array(values, na_value=na_val, copy=copy) - else: - self._validate(values) super().__init__(values, copy=copy) self._dtype = StringDtype() + if not isinstance(values, type(self)): + self._validate() - def _validate(self, values: Optional[np.ndarray] = None) -> None: + def _validate(self): """Validate that we only store NA or strings.""" - if values is None: - values = self._ndarray - - if len(values) and not lib.is_string_array(values, skipna=True): + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") - if values.dtype != "object": + if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " - f"'{values.dtype}' dtype instead." + f"'{self._ndarray.dtype}' dtype instead." 
) @classmethod @@ -217,8 +199,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): assert dtype == "string" result = np.asarray(scalars, dtype="object") - - return cls(result, copy=copy, convert=True) + # convert non-na-likes to str, and nan-likes to StringDtype.na_value + result = lib.ensure_string_array( + result, na_value=StringDtype.na_value, copy=copy + ) + + # Manually creating the new array skips the validation step in __init__, + # so it is faster. TODO: refactor away the need for validation? + new_string_array = object.__new__(cls) + new_string_array._dtype = StringDtype() + new_string_array._ndarray = result + + return new_string_array @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):