Skip to content

Commit 59e9c3b

Browse files
committed
use _from_sequence + add tests
1 parent cf501bc commit 59e9c3b

File tree

17 files changed

+123
-64
lines changed

17 files changed

+123
-64
lines changed

doc/source/user_guide/text.rst

+21
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
6363
s
6464
s.astype("string")
6565
66+
67+
.. versionchanged:: 1.1.0
68+
69+
You can also use ``string`` dtype on non-string data and it will be converted to
70+
``string`` dtype:
71+
72+
.. ipython:: python
73+
74+
s = pd.Series(['a', 2, np.nan], dtype="string")
75+
s
76+
type(s[1])
77+
78+
or convert from existing pandas data:

.. ipython:: python
79+
80+
s1 = pd.Series([1,2, np.nan], dtype="Int64")
81+
s1
82+
s2 = s1.astype("string")
83+
s2
84+
type(s2[0])
85+
86+
6687
.. _text.differences:
6788

6889
Behavior differences

doc/source/whatsnew/v1.1.0.rst

+27
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,32 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.astype_string:
17+
18+
All dtypes can now be converted to ``StringDtype``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data already consisted solely of ``str`` or nan-like values.
22+
For example:
23+
24+
.. code-block:: ipython
25+
26+
In [1]: pd.Series([1, "abc", np.nan], dtype="string")
27+
Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
28+
In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
29+
Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
30+
31+
This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
32+
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:
33+
34+
.. ipython:: python
35+
36+
ser = pd.Series([1, "abc", np.nan], dtype="string")
37+
ser
38+
ser[0]
39+
pd.Series([1,2, np.nan], dtype="Int64").astype("string")
40+
41+
1642
.. _whatsnew_110.period_index_partial_string_slicing:
1743

1844
Nonmonotonic PeriodIndex Partial String Slicing
@@ -88,6 +114,7 @@ Other enhancements
88114
- :class:`Series.str` now has a ``fullmatch`` method that matches a regular expression against the entire string in each row of the series, similar to :func:`re.fullmatch` (:issue:`32806`).
89115
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
90116
- :meth:`MultiIndex.union` will now raise ``RuntimeWarning`` if the objects inside are unsortable. Pass ``sort=False`` to suppress this warning (:issue:`33015`)
117+
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
91118
- The :meth:`DataFrame.to_feather` method now supports additional keyword
92119
arguments (e.g. to set the compression) that are added in pyarrow 0.17
93120
(:issue:`33422`).

pandas/core/arrays/base.py

+7-25
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.util._validators import validate_fillna_kwargs
2121

2222
from pandas.core.dtypes.cast import maybe_cast_to_extension_array
23-
from pandas.core.dtypes.common import is_array_like, is_list_like
23+
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
2424
from pandas.core.dtypes.dtypes import ExtensionDtype
2525
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
2626
from pandas.core.dtypes.missing import isna
@@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
176176
----------
177177
scalars : Sequence
178178
Each element will be an instance of the scalar type for this
179-
array, ``cls.dtype.type``.
179+
array, ``cls.dtype.type`` or be converted into this type in this method.
180180
dtype : dtype, optional
181181
Construct for this particular dtype. This should be a Dtype
182182
compatible with the ExtensionArray.
@@ -213,29 +213,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
213213
"""
214214
raise AbstractMethodError(cls)
215215

216-
@classmethod
217-
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
218-
"""
219-
Construct a new ExtensionArray from a sequence of unknown types of scalars.
220-
221-
.. versionadded:: 1.1.0
222-
223-
Parameters
224-
----------
225-
scalars : Sequence
226-
Each element can be an instance of unknown scalar types.
227-
dtype : dtype, optional
228-
Construct for this particular dtype. This should be a Dtype
229-
compatible with the ExtensionArray.
230-
copy : bool, default False
231-
If True, copy the underlying data.
232-
233-
Returns
234-
-------
235-
ExtensionArray
236-
"""
237-
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
238-
239216
@classmethod
240217
def _from_factorized(cls, values, original):
241218
"""
@@ -454,6 +431,11 @@ def astype(self, dtype, copy=True):
454431
array : ndarray
455432
NumPy ndarray with 'dtype' for its dtype.
456433
"""
434+
from pandas.core.arrays.string_ import StringDtype
435+
436+
dtype = pandas_dtype(dtype)
437+
if isinstance(dtype, StringDtype):
438+
return dtype.construct_array_type()._from_sequence(self, copy=False)
457439
return np.array(self, dtype=dtype, copy=copy)
458440

459441
def isna(self) -> ArrayLike:

pandas/core/arrays/integer.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -450,17 +450,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
450450
if incompatible type with an IntegerDtype, equivalent of same_kind
451451
casting
452452
"""
453-
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
453+
from pandas.core.arrays.boolean import BooleanDtype
454+
from pandas.core.arrays.string_ import StringDtype
454455

455456
dtype = pandas_dtype(dtype)
456457

457458
# if we are astyping to an existing IntegerDtype we can fastpath
458459
if isinstance(dtype, _IntegerDtype):
459460
result = self._data.astype(dtype.numpy_dtype, copy=False)
460-
return type(self)(result, mask=self._mask, copy=False)
461+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
461462
elif isinstance(dtype, BooleanDtype):
462463
result = self._data.astype("bool", copy=False)
463-
return BooleanArray(result, mask=self._mask, copy=False)
464+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
465+
elif isinstance(dtype, StringDtype):
466+
return dtype.construct_array_type()._from_sequence(self, copy=False)
464467

465468
# coerce
466469
if is_float_dtype(dtype):

pandas/core/arrays/interval.py

+5
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
678678
array : ExtensionArray or ndarray
679679
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
680680
"""
681+
from pandas.core.arrays.string_ import StringDtype
682+
681683
dtype = pandas_dtype(dtype)
682684
if is_interval_dtype(dtype):
683685
if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
695697
return self._shallow_copy(new_left, new_right)
696698
elif is_categorical_dtype(dtype):
697699
return Categorical(np.asarray(self))
700+
elif isinstance(dtype, StringDtype):
701+
return dtype.construct_array_type()._from_sequence(self, copy=False)
702+
698703
# TODO: This try/except will be repeated.
699704
try:
700705
return np.asarray(self).astype(dtype, copy=copy)

pandas/core/arrays/sparse/dtype.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
320320
dtype = pandas_dtype(dtype)
321321

322322
if not isinstance(dtype, cls):
323-
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
323+
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
324324
dtype = cls(dtype, fill_value=fill_value)
325325

326326
return dtype

pandas/core/arrays/string_.py

+8-19
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
203203
# TODO: it would be nice to do this in _validate / lib.is_string_array
204204
# We are already doing a scan over the values there.
205205
na_values = isna(result)
206-
if na_values.any():
207-
if result is scalars:
208-
# force a copy now, if we haven't already
209-
result = result.copy()
206+
has_nans = na_values.any()
207+
if has_nans and result is scalars:
208+
# force a copy now, if we haven't already
209+
result = result.copy()
210+
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
211+
result = np.asarray(result, dtype=str)
212+
result = np.asarray(result, dtype="object")
213+
if has_nans:
210214
result[na_values] = StringDtype.na_value
211215

212216
return cls(result)
@@ -215,21 +219,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
215219
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
216220
return cls._from_sequence(strings, dtype=dtype, copy=copy)
217221

218-
@classmethod
219-
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
220-
values = np.asarray(scalars, dtype="object")
221-
na_values = isna(values)
222-
has_nans = na_values.any()
223-
if has_nans and values is scalars:
224-
# force a copy now, if we haven't already
225-
values = values.copy()
226-
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
227-
values = np.asarray(values, dtype=str)
228-
values = np.asarray(values, dtype="object")
229-
if has_nans:
230-
values[na_values] = dtype.na_value
231-
return cls._from_sequence(values, dtype=dtype, copy=copy)
232-
233222
def __arrow_array__(self, type=None):
234223
"""
235224
Convert myself into a pyarrow Array.

pandas/core/construction.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def array(
283283

284284
if is_extension_array_dtype(dtype):
285285
cls = cast(ExtensionDtype, dtype).construct_array_type()
286-
return cls._from_sequence_of_any_type(data, dtype=dtype, copy=copy)
286+
return cls._from_sequence(data, dtype=dtype, copy=copy)
287287

288288
if dtype is None:
289289
inferred_dtype = lib.infer_dtype(data, skipna=True)
@@ -562,7 +562,7 @@ def _try_cast(
562562
elif is_extension_array_dtype(dtype):
563563
# create an extension array from its dtype
564564
dtype = cast(ExtensionDtype, dtype)
565-
array_type = dtype.construct_array_type()._from_sequence_of_any_type
565+
array_type = dtype.construct_array_type()._from_sequence
566566
subarr = array_type(arr, dtype=dtype, copy=copy)
567567
elif dtype is not None and raise_cast_failure:
568568
raise

pandas/core/dtypes/cast.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -924,8 +924,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
924924
"""
925925
# dispatch on extension dtype if needed
926926
if is_extension_array_dtype(dtype):
927-
arr_type = dtype.construct_array_type()._from_sequence_of_any_type
928-
return arr_type(arr, dtype=dtype, copy=copy)
927+
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
929928

930929
if not isinstance(dtype, np.dtype):
931930
dtype = pandas_dtype(dtype)

pandas/core/generic.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -10478,7 +10478,7 @@ def _doc_parms(cls):
1047810478
True
1047910479
>>> pd.Series([True, False]).all()
1048010480
False
10481-
>>> pd.Series([], dtype=object).all()
10481+
>>> pd.Series([]).all()
1048210482
True
1048310483
>>> pd.Series([np.nan]).all()
1048410484
True
@@ -10846,7 +10846,7 @@ def _doc_parms(cls):
1084610846
False
1084710847
>>> pd.Series([True, False]).any()
1084810848
True
10849-
>>> pd.Series([], dtype=object).any()
10849+
>>> pd.Series([]).any()
1085010850
False
1085110851
>>> pd.Series([np.nan]).any()
1085210852
False
@@ -10948,13 +10948,13 @@ def _doc_parms(cls):
1094810948
1094910949
By default, the sum of an empty or all-NA Series is ``0``.
1095010950
10951-
>>> pd.Series([], dtype=float).sum() # min_count=0 is the default
10951+
>>> pd.Series([]).sum() # min_count=0 is the default
1095210952
0.0
1095310953
1095410954
This can be controlled with the ``min_count`` parameter. For example, if
1095510955
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
1095610956
10957-
>>> pd.Series([], dtype=float).sum(min_count=1)
10957+
>>> pd.Series([]).sum(min_count=1)
1095810958
nan
1095910959
1096010960
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
@@ -10995,12 +10995,12 @@ def _doc_parms(cls):
1099510995
--------
1099610996
By default, the product of an empty or all-NA Series is ``1``
1099710997
10998-
>>> pd.Series([], dtype=float).prod()
10998+
>>> pd.Series([]).prod()
1099910999
1.0
1100011000
1100111001
This can be controlled with the ``min_count`` parameter
1100211002
11003-
>>> pd.Series([], dtype=float).prod(min_count=1)
11003+
>>> pd.Series([]).prod(min_count=1)
1100411004
nan
1100511005
1100611006
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and

pandas/core/series.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2660,7 +2660,7 @@ def _construct_result(
26602660
out.name = name
26612661
return out
26622662

2663-
def combine(self, other, func, fill_value=None) -> "Series":
2663+
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
26642664
"""
26652665
Combine the Series with a Series or scalar according to `func`.
26662666
@@ -2679,6 +2679,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
26792679
The value to assume when an index is missing from
26802680
one Series or the other. The default specifies to use the
26812681
appropriate NaN value for the underlying dtype of the Series.
2682+
dtype : str, numpy.dtype, or ExtensionDtype, optional
2683+
Data type for the output Series. If not specified, this will be
2684+
inferred from the combined data.
2685+
2686+
.. versionadded:: 1.1.0
26822687
26832688
Returns
26842689
-------
@@ -2749,6 +2754,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
27492754
new_values = [func(lv, other) for lv in self._values]
27502755
new_name = self.name
27512756

2757+
if dtype is not None:
2758+
return self._constructor(
2759+
new_values, index=new_index, name=new_name, dtype=dtype
2760+
)
27522761
if is_categorical_dtype(self.dtype):
27532762
pass
27542763
elif is_extension_array_dtype(self.dtype):

pandas/tests/arrays/sparse/test_array.py

-1
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
529529
np.array([0, 1], dtype="datetime64[ns]"),
530530
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
531531
),
532-
marks=[pytest.mark.xfail(reason="NumPy-7619")],
533532
),
534533
(
535534
SparseArray([0, 1, 10]),

pandas/tests/extension/base/casting.py

+5
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
3636
expected = pd.Series(data[:5].astype(str))
3737
self.assert_series_equal(result, expected)
3838

39+
def test_astype_string(self, data):
40+
result = pd.Series(data[:5]).astype("string")
41+
expected = pd.Series(data[:5].astype("string"))
42+
self.assert_series_equal(result, expected)
43+
3944
def test_to_numpy(self, data):
4045
expected = np.asarray(data)
4146

pandas/tests/extension/base/methods.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
172172
orig_data1, orig_data2 = data_repeated(2)
173173
s1 = pd.Series(orig_data1)
174174
s2 = pd.Series(orig_data2)
175-
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
175+
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
176176
expected = pd.Series(
177-
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
177+
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
178+
dtype="boolean",
178179
)
179180
self.assert_series_equal(result, expected)
180181

181182
val = s1.iloc[0]
182-
result = s1.combine(val, lambda x1, x2: x1 <= x2)
183-
expected = pd.Series([a <= val for a in list(orig_data1)])
183+
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
184+
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
184185
self.assert_series_equal(result, expected)
185186

186187
def test_combine_add(self, data_repeated):

pandas/tests/extension/decimal/array.py

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
from pandas.core.dtypes.base import ExtensionDtype
10+
from pandas.core.dtypes.common import pandas_dtype
1011

1112
import pandas as pd
1213
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
130131
return type(self)(self._data.copy())
131132

132133
def astype(self, dtype, copy=True):
134+
from pandas.core.arrays.string_ import StringDtype
135+
136+
dtype = pandas_dtype(dtype)
133137
if isinstance(dtype, type(self.dtype)):
134138
return type(self)(self._data, context=dtype.context)
139+
elif isinstance(dtype, StringDtype):
140+
return dtype.construct_array_type()._from_sequence(self, copy=False)
135141
return np.asarray(self, dtype=dtype)
136142

137143
def __setitem__(self, key, value):

0 commit comments

Comments
 (0)