Skip to content

Commit b6ea970

Browse files
authored
API: more permissive conversion to StringDtype (#33465)
1 parent 8c7d653 commit b6ea970

File tree

14 files changed

+136
-20
lines changed

14 files changed

+136
-20
lines changed

doc/source/user_guide/text.rst

+23
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,29 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
6363
s
6464
s.astype("string")
6565
66+
67+
.. versionchanged:: 1.1.0
68+
69+
You can also use :class:`StringDtype`/``"string"`` as the dtype on non-string data and
70+
it will be converted to ``string`` dtype:
71+
72+
.. ipython:: python
73+
74+
s = pd.Series(['a', 2, np.nan], dtype="string")
75+
s
76+
type(s[1])
77+
78+
or convert from existing pandas data:
79+
80+
.. ipython:: python
81+
82+
s1 = pd.Series([1, 2, np.nan], dtype="Int64")
83+
s1
84+
s2 = s1.astype("string")
85+
s2
86+
type(s2[0])
87+
88+
6689
.. _text.differences:
6790

6891
Behavior differences

doc/source/whatsnew/v1.1.0.rst

+18
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,24 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.astype_string:
17+
18+
All dtypes can now be converted to ``StringDtype``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data already consisted solely of ``str`` or nan-like values (:issue:`31204`).
22+
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work.
23+
24+
For example, the below now works:
25+
26+
.. ipython:: python
27+
28+
ser = pd.Series([1, "abc", np.nan], dtype="string")
29+
ser
30+
ser[0]
31+
pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
32+
33+
1634
.. _whatsnew_110.period_index_partial_string_slicing:
1735

1836
Nonmonotonic PeriodIndex Partial String Slicing

pandas/core/arrays/base.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.util._validators import validate_fillna_kwargs
2121

2222
from pandas.core.dtypes.cast import maybe_cast_to_extension_array
23-
from pandas.core.dtypes.common import is_array_like, is_list_like
23+
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
2424
from pandas.core.dtypes.dtypes import ExtensionDtype
2525
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
2626
from pandas.core.dtypes.missing import isna
@@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
178178
----------
179179
scalars : Sequence
180180
Each element will be an instance of the scalar type for this
181-
array, ``cls.dtype.type``.
181+
array, ``cls.dtype.type`` or be converted into this type in this method.
182182
dtype : dtype, optional
183183
Construct for this particular dtype. This should be a Dtype
184184
compatible with the ExtensionArray.
@@ -451,6 +451,12 @@ def astype(self, dtype, copy=True):
451451
array : ndarray
452452
NumPy ndarray with 'dtype' for its dtype.
453453
"""
454+
from pandas.core.arrays.string_ import StringDtype
455+
456+
dtype = pandas_dtype(dtype)
457+
if isinstance(dtype, StringDtype): # allow conversion to StringArrays
458+
return dtype.construct_array_type()._from_sequence(self, copy=False)
459+
454460
return np.array(self, dtype=dtype, copy=copy)
455461

456462
def isna(self) -> ArrayLike:

pandas/core/arrays/datetimelike.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
is_datetime64tz_dtype,
2828
is_datetime_or_timedelta_dtype,
2929
is_dtype_equal,
30+
is_extension_array_dtype,
3031
is_float_dtype,
3132
is_integer_dtype,
3233
is_list_like,
@@ -619,7 +620,11 @@ def astype(self, dtype, copy=True):
619620
if is_object_dtype(dtype):
620621
return self._box_values(self.asi8.ravel()).reshape(self.shape)
621622
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
622-
return self._format_native_types()
623+
if is_extension_array_dtype(dtype):
624+
arr_cls = dtype.construct_array_type()
625+
return arr_cls._from_sequence(self, dtype=dtype)
626+
else:
627+
return self._format_native_types()
623628
elif is_integer_dtype(dtype):
624629
# we deliberately ignore int32 vs. int64 here.
625630
# See https://github.com/pandas-dev/pandas/issues/24381 for more.

pandas/core/arrays/integer.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import numbers
2-
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
2+
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
33
import warnings
44

55
import numpy as np
@@ -442,17 +442,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
442442
if incompatible type with an IntegerDtype, equivalent of same_kind
443443
casting
444444
"""
445-
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
445+
from pandas.core.arrays.boolean import BooleanDtype
446+
from pandas.core.arrays.string_ import StringDtype
446447

447448
dtype = pandas_dtype(dtype)
448449

449450
# if we are astyping to an existing IntegerDtype we can fastpath
450451
if isinstance(dtype, _IntegerDtype):
451452
result = self._data.astype(dtype.numpy_dtype, copy=False)
452-
return type(self)(result, mask=self._mask, copy=False)
453+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
453454
elif isinstance(dtype, BooleanDtype):
454455
result = self._data.astype("bool", copy=False)
455-
return BooleanArray(result, mask=self._mask, copy=False)
456+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
457+
elif isinstance(dtype, StringDtype):
458+
return dtype.construct_array_type()._from_sequence(self, copy=False)
456459

457460
# coerce
458461
if is_float_dtype(dtype):
@@ -722,7 +725,7 @@ class UInt64Dtype(_IntegerDtype):
722725
__doc__ = _dtype_docstring.format(dtype="uint64")
723726

724727

725-
_dtypes = {
728+
_dtypes: Dict[str, _IntegerDtype] = {
726729
"int8": Int8Dtype(),
727730
"int16": Int16Dtype(),
728731
"int32": Int32Dtype(),

pandas/core/arrays/interval.py

+6
Original file line numberDiff line numberDiff line change
@@ -680,8 +680,11 @@ def astype(self, dtype, copy=True):
680680
array : ExtensionArray or ndarray
681681
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
682682
"""
683+
from pandas.core.arrays.string_ import StringDtype
684+
683685
if dtype is not None:
684686
dtype = pandas_dtype(dtype)
687+
685688
if is_interval_dtype(dtype):
686689
if dtype == self.dtype:
687690
return self.copy() if copy else self
@@ -698,6 +701,9 @@ def astype(self, dtype, copy=True):
698701
return self._shallow_copy(new_left, new_right)
699702
elif is_categorical_dtype(dtype):
700703
return Categorical(np.asarray(self))
704+
elif isinstance(dtype, StringDtype):
705+
return dtype.construct_array_type()._from_sequence(self, copy=False)
706+
701707
# TODO: This try/except will be repeated.
702708
try:
703709
return np.asarray(self).astype(dtype, copy=copy)

pandas/core/arrays/sparse/dtype.py

+4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas.core.dtypes.cast import astype_nansafe
1414
from pandas.core.dtypes.common import (
1515
is_bool_dtype,
16+
is_extension_array_dtype,
1617
is_object_dtype,
1718
is_scalar,
1819
is_string_dtype,
@@ -322,6 +323,9 @@ def update_dtype(self, dtype):
322323
dtype = pandas_dtype(dtype)
323324

324325
if not isinstance(dtype, cls):
326+
if is_extension_array_dtype(dtype):
327+
raise TypeError("sparse arrays of extension dtypes not supported")
328+
325329
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
326330
dtype = cls(dtype, fill_value=fill_value)
327331

pandas/core/arrays/string_.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,21 @@ class StringArray(PandasArray):
152152
['This is', 'some text', <NA>, 'data.']
153153
Length: 4, dtype: string
154154
155-
Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
156-
values.
155+
Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
156+
will convert the values to strings.
157157
158+
>>> pd.array(['1', 1], dtype="object")
159+
<PandasArray>
160+
['1', 1]
161+
Length: 2, dtype: object
158162
>>> pd.array(['1', 1], dtype="string")
159-
Traceback (most recent call last):
160-
...
161-
ValueError: StringArray requires an object-dtype ndarray of strings.
163+
<StringArray>
164+
['1', '1']
165+
Length: 2, dtype: string
166+
167+
However, instantiating StringArrays directly with non-strings will raise an error.
162168
163-
For comparison methods, this returns a :class:`pandas.BooleanArray`
169+
For comparison methods, ``StringArray`` returns a :class:`pandas.BooleanArray`:
164170
165171
>>> pd.array(["a", None, "c"], dtype="string") == "a"
166172
<BooleanArray>
@@ -203,10 +209,15 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
203209
# TODO: it would be nice to do this in _validate / lib.is_string_array
204210
# We are already doing a scan over the values there.
205211
na_values = isna(result)
206-
if na_values.any():
207-
if result is scalars:
208-
# force a copy now, if we haven't already
209-
result = result.copy()
212+
has_nans = na_values.any()
213+
if has_nans and result is scalars:
214+
# force a copy now, if we haven't already
215+
result = result.copy()
216+
217+
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
218+
result = np.asarray(result, dtype=str)
219+
result = np.asarray(result, dtype="object")
220+
if has_nans:
210221
result[na_values] = StringDtype.na_value
211222

212223
return cls(result)

pandas/core/dtypes/cast.py

+7
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,16 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None):
337337
-------
338338
ExtensionArray or obj
339339
"""
340+
from pandas.core.arrays.string_ import StringArray
341+
340342
assert isinstance(cls, type), f"must pass a type: {cls}"
341343
assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
342344
assert issubclass(cls, ABCExtensionArray), assertion_msg
345+
346+
# Everything can be converted to StringArrays, but we may not want to convert
347+
if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string":
348+
return obj
349+
343350
try:
344351
result = cls._from_sequence(obj, dtype=dtype)
345352
except Exception:

pandas/tests/extension/base/casting.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,13 @@ def test_tolist(self, data):
3333

3434
def test_astype_str(self, data):
3535
result = pd.Series(data[:5]).astype(str)
36-
expected = pd.Series(data[:5].astype(str))
36+
expected = pd.Series([str(x) for x in data[:5]], dtype=str)
37+
self.assert_series_equal(result, expected)
38+
39+
def test_astype_string(self, data):
40+
# GH-33465
41+
result = pd.Series(data[:5]).astype("string")
42+
expected = pd.Series([str(x) for x in data[:5]], dtype="string")
3743
self.assert_series_equal(result, expected)
3844

3945
def test_to_numpy(self, data):

pandas/tests/extension/decimal/array.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
from pandas.core.dtypes.base import ExtensionDtype
10+
from pandas.core.dtypes.common import pandas_dtype
1011

1112
import pandas as pd
1213
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,9 +131,11 @@ def copy(self):
130131
return type(self)(self._data.copy())
131132

132133
def astype(self, dtype, copy=True):
134+
dtype = pandas_dtype(dtype)
133135
if isinstance(dtype, type(self.dtype)):
134136
return type(self)(self._data, context=dtype.context)
135-
return np.asarray(self, dtype=dtype)
137+
138+
return super().astype(dtype, copy=copy)
136139

137140
def __setitem__(self, key, value):
138141
if pd.api.types.is_list_like(value):

pandas/tests/extension/json/array.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import numpy as np
2323

24+
from pandas.core.dtypes.common import pandas_dtype
25+
2426
import pandas as pd
2527
from pandas.api.extensions import ExtensionArray, ExtensionDtype
2628

@@ -160,12 +162,18 @@ def astype(self, dtype, copy=True):
160162
# NumPy has issues when all the dicts are the same length.
161163
# np.array([UserDict(...), UserDict(...)]) fails,
162164
# but np.array([{...}, {...}]) works, so cast.
165+
from pandas.core.arrays.string_ import StringDtype
163166

167+
dtype = pandas_dtype(dtype)
164168
# needed to add this check for the Series constructor
165169
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
166170
if copy:
167171
return self.copy()
168172
return self
173+
elif isinstance(dtype, StringDtype):
174+
value = self.astype(str) # numpy doesn't like nested dicts
175+
return dtype.construct_array_type()._from_sequence(value, copy=False)
176+
169177
return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
170178

171179
def unique(self):

pandas/tests/extension/test_numpy.py

+6
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,12 @@ def test_astype_str(self, data):
139139
# ValueError: setting an array element with a sequence
140140
super().test_astype_str(data)
141141

142+
@skip_nested
143+
def test_astype_string(self, data):
144+
# GH-33465
145+
# ValueError: setting an array element with a sequence
146+
super().test_astype_string(data)
147+
142148

143149
class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
144150
@pytest.mark.skip(reason="We don't register our dtype")

pandas/tests/extension/test_sparse.py

+10
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,16 @@ def test_astype_object_frame(self, all_data):
343343
# comp = result.dtypes.equals(df.dtypes)
344344
# assert not comp.any()
345345

346+
def test_astype_str(self, data):
347+
result = pd.Series(data[:5]).astype(str)
348+
expected_dtype = pd.SparseDtype(str, str(data.fill_value))
349+
expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
350+
self.assert_series_equal(result, expected)
351+
352+
@pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype")
353+
def test_astype_string(self, data):
354+
super().test_astype_string(data)
355+
346356

347357
class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
348358
series_scalar_exc = None

0 commit comments

Comments
 (0)