Skip to content

Commit 92f15b8

Browse files
committed
use _from_sequence + add tests
1 parent 8315e85 commit 92f15b8

File tree

16 files changed

+124
-64
lines changed

16 files changed

+124
-64
lines changed

doc/source/user_guide/text.rst

+21
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
6363
s
6464
s.astype("string")
6565
66+
67+
.. versionchanged:: 1.1.0
68+
69+
You can also use ``string`` dtype on non-string data and it will be converted to
70+
``string`` dtype:
71+
72+
.. ipython:: python
73+
74+
s = pd.Series(['a', 2, np.nan], dtype="string")
75+
s
76+
type(s[1])
77+
78+
or convert from existing pandas data:

.. ipython:: python
79+
80+
s1 = pd.Series([1,2, np.nan], dtype="Int64")
81+
s1
82+
s2 = s1.astype("string")
83+
s2
84+
type(s2[0])
85+
86+
6687
.. _text.differences:
6788

6889
Behavior differences

doc/source/whatsnew/v1.1.0.rst

+27
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,32 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.astype_string:
17+
18+
All dtypes can now be converted to ``StringDtype``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data already consisted solely of ``str`` or nan-like values.
22+
For example:
23+
24+
.. code-block:: ipython
25+
26+
In [1]: pd.Series([1, "abc", np.nan], dtype="string")
27+
Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
28+
In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
29+
Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
30+
31+
This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
32+
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:
33+
34+
.. ipython:: python
35+
36+
ser = pd.Series([1, "abc", np.nan], dtype="string")
37+
ser
38+
ser[0]
39+
pd.Series([1,2, np.nan], dtype="Int64").astype("string")
40+
41+
1642
.. _whatsnew_110.period_index_partial_string_slicing:
1743

1844
Nonmonotonic PeriodIndex Partial String Slicing
@@ -88,6 +114,7 @@ Other enhancements
88114
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
89115
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
90116
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`)
117+
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
91118
-
92119

93120
.. ---------------------------------------------------------------------------

pandas/core/arrays/base.py

+8-26
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.util._validators import validate_fillna_kwargs
2121

2222
from pandas.core.dtypes.cast import maybe_cast_to_extension_array
23-
from pandas.core.dtypes.common import is_array_like, is_list_like
23+
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
2424
from pandas.core.dtypes.dtypes import ExtensionDtype
2525
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
2626
from pandas.core.dtypes.missing import isna
@@ -175,8 +175,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
175175
Parameters
176176
----------
177177
scalars : Sequence
178-
Each element will be an instance of the scalar type for this
179-
array, ``cls.dtype.type``.
178+
Each element will be an instance of the scalar type for this
179+
array, ``cls.dtype.type`` or be converted into this type in this method.
180180
dtype : dtype, optional
181181
Construct for this particular dtype. This should be a Dtype
182182
compatible with the ExtensionArray.
@@ -213,29 +213,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
213213
"""
214214
raise AbstractMethodError(cls)
215215

216-
@classmethod
217-
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
218-
"""
219-
Construct a new ExtensionArray from a sequence of unknown types of scalars.
220-
221-
.. versionadded:: 1.1.0
222-
223-
Parameters
224-
----------
225-
scalars : Sequence
226-
Each element can be an instance of unknown scalar types.
227-
dtype : dtype, optional
228-
Construct for this particular dtype. This should be a Dtype
229-
compatible with the ExtensionArray.
230-
copy : bool, default False
231-
If True, copy the underlying data.
232-
233-
Returns
234-
-------
235-
ExtensionArray
236-
"""
237-
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
238-
239216
@classmethod
240217
def _from_factorized(cls, values, original):
241218
"""
@@ -454,6 +431,11 @@ def astype(self, dtype, copy=True):
454431
array : ndarray
455432
NumPy ndarray with 'dtype' for its dtype.
456433
"""
434+
from pandas.core.arrays.string_ import StringDtype
435+
436+
dtype = pandas_dtype(dtype)
437+
if isinstance(dtype, StringDtype):
438+
return dtype.construct_array_type()._from_sequence(self, copy=False)
457439
return np.array(self, dtype=dtype, copy=copy)
458440

459441
def isna(self) -> ArrayLike:

pandas/core/arrays/integer.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -455,17 +455,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
455455
if incompatible type with an IntegerDtype, equivalent of same_kind
456456
casting
457457
"""
458-
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
458+
from pandas.core.arrays.boolean import BooleanDtype
459+
from pandas.core.arrays.string_ import StringDtype
459460

460461
dtype = pandas_dtype(dtype)
461462

462463
# if we are astyping to an existing IntegerDtype we can fastpath
463464
if isinstance(dtype, _IntegerDtype):
464465
result = self._data.astype(dtype.numpy_dtype, copy=False)
465-
return type(self)(result, mask=self._mask, copy=False)
466+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
466467
elif isinstance(dtype, BooleanDtype):
467468
result = self._data.astype("bool", copy=False)
468-
return BooleanArray(result, mask=self._mask, copy=False)
469+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
470+
elif isinstance(dtype, StringDtype):
471+
return dtype.construct_array_type()._from_sequence(self, copy=False)
469472

470473
# coerce
471474
if is_float_dtype(dtype):

pandas/core/arrays/interval.py

+5
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
678678
array : ExtensionArray or ndarray
679679
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
680680
"""
681+
from pandas.core.arrays.string_ import StringDtype
682+
681683
dtype = pandas_dtype(dtype)
682684
if is_interval_dtype(dtype):
683685
if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
695697
return self._shallow_copy(new_left, new_right)
696698
elif is_categorical_dtype(dtype):
697699
return Categorical(np.asarray(self))
700+
elif isinstance(dtype, StringDtype):
701+
return dtype.construct_array_type()._from_sequence(self, copy=False)
702+
698703
# TODO: This try/except will be repeated.
699704
try:
700705
return np.asarray(self).astype(dtype, copy=copy)

pandas/core/arrays/sparse/dtype.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
320320
dtype = pandas_dtype(dtype)
321321

322322
if not isinstance(dtype, cls):
323-
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
323+
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
324324
dtype = cls(dtype, fill_value=fill_value)
325325

326326
return dtype

pandas/core/arrays/string_.py

+8-19
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
203203
# TODO: it would be nice to do this in _validate / lib.is_string_array
204204
# We are already doing a scan over the values there.
205205
na_values = isna(result)
206-
if na_values.any():
207-
if result is scalars:
208-
# force a copy now, if we haven't already
209-
result = result.copy()
206+
has_nans = na_values.any()
207+
if has_nans and result is scalars:
208+
# force a copy now, if we haven't already
209+
result = result.copy()
210+
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
211+
result = np.asarray(result, dtype=str)
212+
result = np.asarray(result, dtype="object")
213+
if has_nans:
210214
result[na_values] = StringDtype.na_value
211215

212216
return cls(result)
@@ -215,21 +219,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
215219
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
216220
return cls._from_sequence(strings, dtype=dtype, copy=copy)
217221

218-
@classmethod
219-
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
220-
values = np.asarray(scalars, dtype="object")
221-
na_values = isna(values)
222-
has_nans = na_values.any()
223-
if has_nans and values is scalars:
224-
# force a copy now, if we haven't already
225-
values = values.copy()
226-
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
227-
values = np.asarray(values, dtype=str)
228-
values = np.asarray(values, dtype="object")
229-
if has_nans:
230-
values[na_values] = dtype.na_value
231-
return cls._from_sequence(values, dtype=dtype, copy=copy)
232-
233222
def __arrow_array__(self, type=None):
234223
"""
235224
Convert myself into a pyarrow Array.

pandas/core/construction.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def array(
283283

284284
if is_extension_array_dtype(dtype):
285285
cls = cast(ExtensionDtype, dtype).construct_array_type()
286-
return cls._from_sequence_of_any_type(data, dtype=dtype, copy=copy)
286+
return cls._from_sequence(data, dtype=dtype, copy=copy)
287287

288288
if dtype is None:
289289
inferred_dtype = lib.infer_dtype(data, skipna=True)
@@ -562,7 +562,7 @@ def _try_cast(
562562
elif is_extension_array_dtype(dtype):
563563
# create an extension array from its dtype
564564
dtype = cast(ExtensionDtype, dtype)
565-
array_type = dtype.construct_array_type()._from_sequence_of_any_type
565+
array_type = dtype.construct_array_type()._from_sequence
566566
subarr = array_type(arr, dtype=dtype, copy=copy)
567567
elif dtype is not None and raise_cast_failure:
568568
raise

pandas/core/dtypes/cast.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -924,8 +924,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
924924
"""
925925
# dispatch on extension dtype if needed
926926
if is_extension_array_dtype(dtype):
927-
arr_type = dtype.construct_array_type()._from_sequence_of_any_type
928-
return arr_type(arr, dtype=dtype, copy=copy)
927+
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
929928

930929
if not isinstance(dtype, np.dtype):
931930
dtype = pandas_dtype(dtype)

pandas/core/generic.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -10485,7 +10485,7 @@ def _doc_parms(cls):
1048510485
True
1048610486
>>> pd.Series([True, False]).all()
1048710487
False
10488-
>>> pd.Series([], dtype=object).all()
10488+
>>> pd.Series([]).all()
1048910489
True
1049010490
>>> pd.Series([np.nan]).all()
1049110491
True
@@ -10853,7 +10853,7 @@ def _doc_parms(cls):
1085310853
False
1085410854
>>> pd.Series([True, False]).any()
1085510855
True
10856-
>>> pd.Series([], dtype=object).any()
10856+
>>> pd.Series([]).any()
1085710857
False
1085810858
>>> pd.Series([np.nan]).any()
1085910859
False
@@ -10955,13 +10955,13 @@ def _doc_parms(cls):
1095510955
1095610956
By default, the sum of an empty or all-NA Series is ``0``.
1095710957
10958-
>>> pd.Series([], dtype=float).sum() # min_count=0 is the default
10958+
>>> pd.Series([]).sum() # min_count=0 is the default
1095910959
0.0
1096010960
1096110961
This can be controlled with the ``min_count`` parameter. For example, if
1096210962
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
1096310963
10964-
>>> pd.Series([], dtype=float).sum(min_count=1)
10964+
>>> pd.Series([]).sum(min_count=1)
1096510965
nan
1096610966
1096710967
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
@@ -11002,12 +11002,12 @@ def _doc_parms(cls):
1100211002
--------
1100311003
By default, the product of an empty or all-NA Series is ``1``
1100411004
11005-
>>> pd.Series([], dtype=float).prod()
11005+
>>> pd.Series([]).prod()
1100611006
1.0
1100711007
1100811008
This can be controlled with the ``min_count`` parameter
1100911009
11010-
>>> pd.Series([], dtype=float).prod(min_count=1)
11010+
>>> pd.Series([]).prod(min_count=1)
1101111011
nan
1101211012
1101311013
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and

pandas/core/series.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2629,7 +2629,7 @@ def _binop(self, other, func, level=None, fill_value=None):
26292629
ret = ops._construct_result(self, result, new_index, name)
26302630
return ret
26312631

2632-
def combine(self, other, func, fill_value=None) -> "Series":
2632+
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
26332633
"""
26342634
Combine the Series with a Series or scalar according to `func`.
26352635
@@ -2648,6 +2648,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
26482648
The value to assume when an index is missing from
26492649
one Series or the other. The default specifies to use the
26502650
appropriate NaN value for the underlying dtype of the Series.
2651+
dtype : str, numpy.dtype, or ExtensionDtype, optional
2652+
Data type for the output Series. If not specified, this will be
2653+
inferred from the combined data.
2654+
2655+
.. versionadded:: 1.1.0
26512656
26522657
Returns
26532658
-------
@@ -2718,6 +2723,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
27182723
new_values = [func(lv, other) for lv in self._values]
27192724
new_name = self.name
27202725

2726+
if dtype is not None:
2727+
return self._constructor(
2728+
new_values, index=new_index, name=new_name, dtype=dtype
2729+
)
27212730
if is_categorical_dtype(self.dtype):
27222731
pass
27232732
elif is_extension_array_dtype(self.dtype):

pandas/tests/extension/base/casting.py

+5
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ def test_astype_str(self, data):
2424
expected = pd.Series(data[:5].astype(str))
2525
self.assert_series_equal(result, expected)
2626

27+
def test_astype_string(self, data):
28+
result = pd.Series(data[:5]).astype("string")
29+
expected = pd.Series(data[:5].astype("string"))
30+
self.assert_series_equal(result, expected)
31+
2732
def test_to_numpy(self, data):
2833
expected = np.asarray(data)
2934

pandas/tests/extension/base/methods.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
172172
orig_data1, orig_data2 = data_repeated(2)
173173
s1 = pd.Series(orig_data1)
174174
s2 = pd.Series(orig_data2)
175-
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
175+
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
176176
expected = pd.Series(
177-
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
177+
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
178+
dtype="boolean",
178179
)
179180
self.assert_series_equal(result, expected)
180181

181182
val = s1.iloc[0]
182-
result = s1.combine(val, lambda x1, x2: x1 <= x2)
183-
expected = pd.Series([a <= val for a in list(orig_data1)])
183+
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
184+
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
184185
self.assert_series_equal(result, expected)
185186

186187
def test_combine_add(self, data_repeated):

pandas/tests/extension/decimal/array.py

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
from pandas.core.dtypes.base import ExtensionDtype
10+
from pandas.core.dtypes.common import pandas_dtype
1011

1112
import pandas as pd
1213
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
130131
return type(self)(self._data.copy())
131132

132133
def astype(self, dtype, copy=True):
134+
from pandas.core.arrays.string_ import StringDtype
135+
136+
dtype = pandas_dtype(dtype)
133137
if isinstance(dtype, type(self.dtype)):
134138
return type(self)(self._data, context=dtype.context)
139+
elif isinstance(dtype, StringDtype):
140+
return dtype.construct_array_type()._from_sequence(self, copy=False)
135141
return np.asarray(self, dtype=dtype)
136142

137143
def __setitem__(self, key, value):

0 commit comments

Comments
 (0)