Skip to content

Commit 39ea860

Browse files
committed
Refactor to avoid call to StringArray.__init__ & validation
1 parent ee01e02 commit 39ea860

File tree

1 file changed

+21
-29
lines changed

1 file changed

+21
-29
lines changed

pandas/core/arrays/string_.py

+21-29
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import operator
2-
from typing import TYPE_CHECKING, Optional, Type, Union
2+
from typing import TYPE_CHECKING, Type, Union
33

44
import numpy as np
55

@@ -122,9 +122,6 @@ class StringArray(PandasArray):
122122
123123
copy : bool, default False
124124
Whether to copy the array of data.
125-
convert : bool, default False
126-
If true, force conversion of non-na scalars to strings.
127-
If False, raises a ValueError, if a scalar is neither a string nor na.
128125
129126
Attributes
130127
----------
@@ -165,15 +162,7 @@ class StringArray(PandasArray):
165162
['1', '1']
166163
Length: 2, dtype: string
167164
168-
Instantiating StringArrays directly with non-strings arrays will raise an error
169-
unless ``convert=True``.
170-
171-
>>> pd.arrays.StringArray(np.array(['1', 1]))
172-
ValueError: StringArray requires a sequence of strings or pandas.NA
173-
>>> pd.arrays.StringArray(['1', 1], convert=True)
174-
<StringArray>
175-
['1', '1']
176-
Length: 2, dtype: string
165+
However, instantiating StringArrays directly with non-strings will raise an error.
177166
178167
For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
179168
@@ -186,29 +175,22 @@ class StringArray(PandasArray):
186175
# undo the PandasArray hack
187176
_typ = "extension"
188177

189-
def __init__(self, values, copy=False, convert: bool = False):
178+
def __init__(self, values, copy=False):
190179
values = extract_array(values)
191-
if not isinstance(values, type(self)):
192-
if convert:
193-
na_val = StringDtype.na_value
194-
values = lib.ensure_string_array(values, na_value=na_val, copy=copy)
195-
else:
196-
self._validate(values)
197180

198181
super().__init__(values, copy=copy)
199182
self._dtype = StringDtype()
183+
if not isinstance(values, type(self)):
184+
self._validate()
200185

201-
def _validate(self, values: Optional[np.ndarray] = None) -> None:
186+
def _validate(self):
202187
"""Validate that we only store NA or strings."""
203-
if values is None:
204-
values = self._ndarray
205-
206-
if len(values) and not lib.is_string_array(values, skipna=True):
188+
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
207189
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
208-
if values.dtype != "object":
190+
if self._ndarray.dtype != "object":
209191
raise ValueError(
210192
"StringArray requires a sequence of strings or pandas.NA. Got "
211-
f"'{values.dtype}' dtype instead."
193+
f"'{self._ndarray.dtype}' dtype instead."
212194
)
213195

214196
@classmethod
@@ -217,8 +199,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
217199
assert dtype == "string"
218200

219201
result = np.asarray(scalars, dtype="object")
220-
221-
return cls(result, copy=copy, convert=True)
202+
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
203+
result = lib.ensure_string_array(
204+
result, na_value=StringDtype.na_value, copy=copy
205+
)
206+
207+
# Manually creating the new array avoids the validation step in __init__, so it is
208+
# faster. TODO: refactor the need for validation?
209+
new_string_array = object.__new__(cls)
210+
new_string_array._dtype = StringDtype()
211+
new_string_array._ndarray = result
212+
213+
return new_string_array
222214

223215
@classmethod
224216
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):

0 commit comments

Comments
 (0)