Skip to content

Commit e5c533f

Browse files
committed
API: More permissive conversion to StringDtype
1 parent cb35d8a commit e5c533f

File tree

14 files changed

+117
-18
lines changed

14 files changed

+117
-18
lines changed

doc/source/user_guide/text.rst

+21
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
6363
s
6464
s.astype("string")
6565
66+
67+
.. versionchanged:: 1.1.0
68+
69+
You can also use ``string`` dtype on non-string data and it will be converted to
70+
``string`` dtype:
71+
72+
.. ipython:: python
73+
74+
s = pd.Series(['a', 2, np.nan], dtype="string")
75+
s
76+
type(s[1])
77+
78+
or convert from existing pandas data:
79+
80+
s1 = pd.Series([1,2, np.nan], dtype="Int64")
81+
s1
82+
s2 = s1.astype("string")
83+
s2
84+
type(s2[0])
85+
86+
6687
.. _text.differences:
6788

6889
Behavior differences

doc/source/whatsnew/v1.1.0.rst

+27
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,32 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.astype_string:
17+
18+
All dtypes can now be converted to ``StringDtype``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data already consisted solely of ``str`` or nan-like values.
22+
For example:
23+
24+
.. code-block:: ipython
25+
26+
In [1]: pd.Series([1, "abc", np.nan], dtype="string")
27+
Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
28+
In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
29+
Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
30+
31+
This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
32+
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:
33+
34+
.. ipython:: python
35+
36+
ser = pd.Series([1, "abc", np.nan], dtype="string")
37+
ser
38+
ser[0]
39+
pd.Series([1,2, np.nan], dtype="Int64").astype("string")
40+
41+
1642
.. _whatsnew_110.period_index_partial_string_slicing:
1743

1844
Nonmonotonic PeriodIndex Partial String Slicing
@@ -210,6 +236,7 @@ Other enhancements
210236
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
211237
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
212238
- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`).
239+
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
213240
- The :meth:`DataFrame.to_feather` method now supports additional keyword
214241
arguments (e.g. to set the compression) that are added in pyarrow 0.17
215242
(:issue:`33422`).

pandas/core/arrays/base.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.util._validators import validate_fillna_kwargs
2121

2222
from pandas.core.dtypes.cast import maybe_cast_to_extension_array
23-
from pandas.core.dtypes.common import is_array_like, is_list_like
23+
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
2424
from pandas.core.dtypes.dtypes import ExtensionDtype
2525
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
2626
from pandas.core.dtypes.missing import isna
@@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
178178
----------
179179
scalars : Sequence
180180
Each element will be an instance of the scalar type for this
181-
array, ``cls.dtype.type``.
181+
array, ``cls.dtype.type`` or be converted into this type in this method.
182182
dtype : dtype, optional
183183
Construct for this particular dtype. This should be a Dtype
184184
compatible with the ExtensionArray.
@@ -451,6 +451,11 @@ def astype(self, dtype, copy=True):
451451
array : ndarray
452452
NumPy ndarray with 'dtype' for its dtype.
453453
"""
454+
from pandas.core.arrays.string_ import StringDtype
455+
456+
dtype = pandas_dtype(dtype)
457+
if isinstance(dtype, StringDtype):
458+
return dtype.construct_array_type()._from_sequence(self, copy=False)
454459
return np.array(self, dtype=dtype, copy=copy)
455460

456461
def isna(self) -> ArrayLike:

pandas/core/arrays/integer.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import numbers
2-
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
2+
from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Type, Union
33
import warnings
44

55
import numpy as np
@@ -442,17 +442,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
442442
if incompatible type with an IntegerDtype, equivalent of same_kind
443443
casting
444444
"""
445-
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
445+
from pandas.core.arrays.boolean import BooleanDtype
446+
from pandas.core.arrays.string_ import StringDtype
446447

447448
dtype = pandas_dtype(dtype)
448449

449450
# if we are astyping to an existing IntegerDtype we can fastpath
450451
if isinstance(dtype, _IntegerDtype):
451452
result = self._data.astype(dtype.numpy_dtype, copy=False)
452-
return type(self)(result, mask=self._mask, copy=False)
453+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
453454
elif isinstance(dtype, BooleanDtype):
454455
result = self._data.astype("bool", copy=False)
455-
return BooleanArray(result, mask=self._mask, copy=False)
456+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
457+
elif isinstance(dtype, StringDtype):
458+
return dtype.construct_array_type()._from_sequence(self, copy=False)
456459

457460
# coerce
458461
if is_float_dtype(dtype):
@@ -722,7 +725,7 @@ class UInt64Dtype(_IntegerDtype):
722725
__doc__ = _dtype_docstring.format(dtype="uint64")
723726

724727

725-
_dtypes = {
728+
_dtypes: Dict[str, _IntegerDtype] = {
726729
"int8": Int8Dtype(),
727730
"int16": Int16Dtype(),
728731
"int32": Int32Dtype(),

pandas/core/arrays/interval.py

+6
Original file line numberDiff line numberDiff line change
@@ -680,8 +680,11 @@ def astype(self, dtype, copy=True):
680680
array : ExtensionArray or ndarray
681681
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
682682
"""
683+
from pandas.core.arrays.string_ import StringDtype
684+
683685
if dtype is not None:
684686
dtype = pandas_dtype(dtype)
687+
685688
if is_interval_dtype(dtype):
686689
if dtype == self.dtype:
687690
return self.copy() if copy else self
@@ -698,6 +701,9 @@ def astype(self, dtype, copy=True):
698701
return self._shallow_copy(new_left, new_right)
699702
elif is_categorical_dtype(dtype):
700703
return Categorical(np.asarray(self))
704+
elif isinstance(dtype, StringDtype):
705+
return dtype.construct_array_type()._from_sequence(self, copy=False)
706+
701707
# TODO: This try/except will be repeated.
702708
try:
703709
return np.asarray(self).astype(dtype, copy=copy)

pandas/core/arrays/sparse/dtype.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ def update_dtype(self, dtype):
322322
dtype = pandas_dtype(dtype)
323323

324324
if not isinstance(dtype, cls):
325-
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
325+
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
326326
dtype = cls(dtype, fill_value=fill_value)
327327

328328
return dtype

pandas/core/arrays/string_.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
203203
# TODO: it would be nice to do this in _validate / lib.is_string_array
204204
# We are already doing a scan over the values there.
205205
na_values = isna(result)
206-
if na_values.any():
207-
if result is scalars:
208-
# force a copy now, if we haven't already
209-
result = result.copy()
206+
has_nans = na_values.any()
207+
if has_nans and result is scalars:
208+
# force a copy now, if we haven't already
209+
result = result.copy()
210+
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
211+
result = np.asarray(result, dtype=str)
212+
result = np.asarray(result, dtype="object")
213+
if has_nans:
210214
result[na_values] = StringDtype.na_value
211215

212216
return cls(result)

pandas/core/series.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2676,7 +2676,7 @@ def _construct_result(
26762676
out.name = name
26772677
return out
26782678

2679-
def combine(self, other, func, fill_value=None) -> "Series":
2679+
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
26802680
"""
26812681
Combine the Series with a Series or scalar according to `func`.
26822682
@@ -2695,6 +2695,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
26952695
The value to assume when an index is missing from
26962696
one Series or the other. The default specifies to use the
26972697
appropriate NaN value for the underlying dtype of the Series.
2698+
dtype : str, numpy.dtype, or ExtensionDtype, optional
2699+
Data type for the output Series. If not specified, this will be
2700+
inferred from the combined data.
2701+
2702+
.. versionadded:: 1.1.0
26982703
26992704
Returns
27002705
-------
@@ -2765,6 +2770,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
27652770
new_values = [func(lv, other) for lv in self._values]
27662771
new_name = self.name
27672772

2773+
if dtype is not None:
2774+
return self._constructor(
2775+
new_values, index=new_index, name=new_name, dtype=dtype
2776+
)
27682777
if is_categorical_dtype(self.dtype):
27692778
pass
27702779
elif is_extension_array_dtype(self.dtype):

pandas/tests/arrays/sparse/test_array.py

-1
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,6 @@ def test_astype_all(self, any_real_dtype):
542542
np.array([0, 1], dtype="datetime64[ns]"),
543543
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
544544
),
545-
marks=[pytest.mark.xfail(reason="NumPy-7619")],
546545
),
547546
(
548547
SparseArray([0, 1, 10]),

pandas/tests/extension/base/casting.py

+5
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
3636
expected = pd.Series(data[:5].astype(str))
3737
self.assert_series_equal(result, expected)
3838

39+
def test_astype_string(self, data):
40+
result = pd.Series(data[:5]).astype("string")
41+
expected = pd.Series(data[:5].astype("string"))
42+
self.assert_series_equal(result, expected)
43+
3944
def test_to_numpy(self, data):
4045
expected = np.asarray(data)
4146

pandas/tests/extension/base/methods.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -188,15 +188,16 @@ def test_combine_le(self, data_repeated):
188188
orig_data1, orig_data2 = data_repeated(2)
189189
s1 = pd.Series(orig_data1)
190190
s2 = pd.Series(orig_data2)
191-
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
191+
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
192192
expected = pd.Series(
193-
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
193+
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
194+
dtype="boolean",
194195
)
195196
self.assert_series_equal(result, expected)
196197

197198
val = s1.iloc[0]
198-
result = s1.combine(val, lambda x1, x2: x1 <= x2)
199-
expected = pd.Series([a <= val for a in list(orig_data1)])
199+
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
200+
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
200201
self.assert_series_equal(result, expected)
201202

202203
def test_combine_add(self, data_repeated):

pandas/tests/extension/decimal/array.py

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
from pandas.core.dtypes.base import ExtensionDtype
10+
from pandas.core.dtypes.common import pandas_dtype
1011

1112
import pandas as pd
1213
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
130131
return type(self)(self._data.copy())
131132

132133
def astype(self, dtype, copy=True):
134+
from pandas.core.arrays.string_ import StringDtype
135+
136+
dtype = pandas_dtype(dtype)
133137
if isinstance(dtype, type(self.dtype)):
134138
return type(self)(self._data, context=dtype.context)
139+
elif isinstance(dtype, StringDtype):
140+
return dtype.construct_array_type()._from_sequence(self, copy=False)
135141
return np.asarray(self, dtype=dtype)
136142

137143
def __setitem__(self, key, value):

pandas/tests/extension/json/array.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import numpy as np
2323

24+
from pandas.core.dtypes.common import pandas_dtype
25+
2426
import pandas as pd
2527
from pandas.api.extensions import ExtensionArray, ExtensionDtype
2628

@@ -160,12 +162,18 @@ def astype(self, dtype, copy=True):
160162
# NumPy has issues when all the dicts are the same length.
161163
# np.array([UserDict(...), UserDict(...)]) fails,
162164
# but np.array([{...}, {...}]) works, so cast.
165+
from pandas.core.arrays.string_ import StringDtype
163166

167+
dtype = pandas_dtype(dtype)
164168
# needed to add this check for the Series constructor
165169
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
166170
if copy:
167171
return self.copy()
168172
return self
173+
elif isinstance(dtype, StringDtype):
174+
value = self.astype(str) # numpy doesn't like nested dicts
175+
return dtype.construct_array_type()._from_sequence(value, copy=False)
176+
169177
return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
170178

171179
def unique(self):

pandas/tests/extension/test_numpy.py

+5
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ def test_astype_str(self, data):
139139
# ValueError: setting an array element with a sequence
140140
super().test_astype_str(data)
141141

142+
@skip_nested
143+
def test_astype_string(self, data):
144+
# ValueError: setting an array element with a sequence
145+
super().test_astype_string(data)
146+
142147

143148
class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
144149
@pytest.mark.skip(reason="We don't register our dtype")

0 commit comments

Comments
 (0)