Skip to content

Commit 2a835e4

Browse files
committed
Merge branch 'astype_string' of https://github.com/topper-123/pandas into astype_string
2 parents 556e1c2 + 59e9c3b commit 2a835e4

File tree

14 files changed

+116
-18
lines changed

14 files changed

+116
-18
lines changed

doc/source/user_guide/text.rst

+21
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
6363
s
6464
s.astype("string")
6565
66+
67+
.. versionchanged:: 1.1.0
68+
69+
You can also use ``string`` dtype on non-string data and it will be converted to
70+
``string`` dtype:
71+
72+
.. ipython:: python
73+
74+
s = pd.Series(['a', 2, np.nan], dtype="string")
75+
s
76+
type(s[1])
77+
78+
or convert from existing pandas data:
79+
80+
s1 = pd.Series([1,2, np.nan], dtype="Int64")
81+
s1
82+
s2 = s1.astype("string")
83+
s2
84+
type(s2[0])
85+
86+
6687
.. _text.differences:
6788

6889
Behavior differences

doc/source/whatsnew/v1.1.0.rst

+27
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,32 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_110.astype_string:
17+
18+
All dtypes can now be converted to ``StringDtype``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
Previously, declaring or converting to :class:`StringDtype` was generally only possible if the data already consisted solely of ``str`` or nan-like values.
22+
For example:
23+
24+
.. code-block:: ipython
25+
26+
In [1]: pd.Series([1, "abc", np.nan], dtype="string")
27+
Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
28+
In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
29+
Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
30+
31+
This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
32+
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:
33+
34+
.. ipython:: python
35+
36+
ser = pd.Series([1, "abc", np.nan], dtype="string")
37+
ser
38+
ser[0]
39+
pd.Series([1,2, np.nan], dtype="Int64").astype("string")
40+
41+
1642
.. _whatsnew_110.period_index_partial_string_slicing:
1743

1844
Nonmonotonic PeriodIndex Partial String Slicing
@@ -89,6 +115,7 @@ Other enhancements
89115
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
90116
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
91117
- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` accessor that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`).
118+
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
92119
- The :meth:`DataFrame.to_feather` method now supports additional keyword
93120
arguments (e.g. to set the compression) that are added in pyarrow 0.17
94121
(:issue:`33422`).

pandas/core/arrays/base.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.util._validators import validate_fillna_kwargs
2121

2222
from pandas.core.dtypes.cast import maybe_cast_to_extension_array
23-
from pandas.core.dtypes.common import is_array_like, is_list_like
23+
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
2424
from pandas.core.dtypes.dtypes import ExtensionDtype
2525
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
2626
from pandas.core.dtypes.missing import isna
@@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
176176
----------
177177
scalars : Sequence
178178
Each element will be an instance of the scalar type for this
179-
array, ``cls.dtype.type``.
179+
array, ``cls.dtype.type`` or be converted into this type in this method.
180180
dtype : dtype, optional
181181
Construct for this particular dtype. This should be a Dtype
182182
compatible with the ExtensionArray.
@@ -431,6 +431,11 @@ def astype(self, dtype, copy=True):
431431
array : ndarray
432432
NumPy ndarray with 'dtype' for its dtype.
433433
"""
434+
from pandas.core.arrays.string_ import StringDtype
435+
436+
dtype = pandas_dtype(dtype)
437+
if isinstance(dtype, StringDtype):
438+
return dtype.construct_array_type()._from_sequence(self, copy=False)
434439
return np.array(self, dtype=dtype, copy=copy)
435440

436441
def isna(self) -> ArrayLike:

pandas/core/arrays/integer.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import numbers
2-
from typing import TYPE_CHECKING, Tuple, Type, Union
2+
from typing import TYPE_CHECKING, Dict, Tuple, Type, Union
33
import warnings
44

55
import numpy as np
@@ -449,17 +449,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
449449
if incompatible type with an IntegerDtype, equivalent of same_kind
450450
casting
451451
"""
452-
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
452+
from pandas.core.arrays.boolean import BooleanDtype
453+
from pandas.core.arrays.string_ import StringDtype
453454

454455
dtype = pandas_dtype(dtype)
455456

456457
# if we are astyping to an existing IntegerDtype we can fastpath
457458
if isinstance(dtype, _IntegerDtype):
458459
result = self._data.astype(dtype.numpy_dtype, copy=False)
459-
return type(self)(result, mask=self._mask, copy=False)
460+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
460461
elif isinstance(dtype, BooleanDtype):
461462
result = self._data.astype("bool", copy=False)
462-
return BooleanArray(result, mask=self._mask, copy=False)
463+
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
464+
elif isinstance(dtype, StringDtype):
465+
return dtype.construct_array_type()._from_sequence(self, copy=False)
463466

464467
# coerce
465468
if is_float_dtype(dtype):
@@ -748,7 +751,7 @@ class UInt64Dtype(_IntegerDtype):
748751
__doc__ = _dtype_docstring.format(dtype="uint64")
749752

750753

751-
_dtypes = {
754+
_dtypes: Dict[str, _IntegerDtype] = {
752755
"int8": Int8Dtype(),
753756
"int16": Int16Dtype(),
754757
"int32": Int32Dtype(),

pandas/core/arrays/interval.py

+5
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
678678
array : ExtensionArray or ndarray
679679
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
680680
"""
681+
from pandas.core.arrays.string_ import StringDtype
682+
681683
dtype = pandas_dtype(dtype)
682684
if is_interval_dtype(dtype):
683685
if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
695697
return self._shallow_copy(new_left, new_right)
696698
elif is_categorical_dtype(dtype):
697699
return Categorical(np.asarray(self))
700+
elif isinstance(dtype, StringDtype):
701+
return dtype.construct_array_type()._from_sequence(self, copy=False)
702+
698703
# TODO: This try/except will be repeated.
699704
try:
700705
return np.asarray(self).astype(dtype, copy=copy)

pandas/core/arrays/sparse/dtype.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
320320
dtype = pandas_dtype(dtype)
321321

322322
if not isinstance(dtype, cls):
323-
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
323+
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
324324
dtype = cls(dtype, fill_value=fill_value)
325325

326326
return dtype

pandas/core/arrays/string_.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
203203
# TODO: it would be nice to do this in _validate / lib.is_string_array
204204
# We are already doing a scan over the values there.
205205
na_values = isna(result)
206-
if na_values.any():
207-
if result is scalars:
208-
# force a copy now, if we haven't already
209-
result = result.copy()
206+
has_nans = na_values.any()
207+
if has_nans and result is scalars:
208+
# force a copy now, if we haven't already
209+
result = result.copy()
210+
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
211+
result = np.asarray(result, dtype=str)
212+
result = np.asarray(result, dtype="object")
213+
if has_nans:
210214
result[na_values] = StringDtype.na_value
211215

212216
return cls(result)

pandas/core/series.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2663,7 +2663,7 @@ def _construct_result(
26632663
out.name = name
26642664
return out
26652665

2666-
def combine(self, other, func, fill_value=None) -> "Series":
2666+
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
26672667
"""
26682668
Combine the Series with a Series or scalar according to `func`.
26692669
@@ -2682,6 +2682,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
26822682
The value to assume when an index is missing from
26832683
one Series or the other. The default specifies to use the
26842684
appropriate NaN value for the underlying dtype of the Series.
2685+
dtype : str, numpy.dtype, or ExtensionDtype, optional
2686+
Data type for the output Series. If not specified, this will be
2687+
inferred from the combined data.
2688+
2689+
.. versionadded:: 1.1.0
26852690
26862691
Returns
26872692
-------
@@ -2752,6 +2757,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
27522757
new_values = [func(lv, other) for lv in self._values]
27532758
new_name = self.name
27542759

2760+
if dtype is not None:
2761+
return self._constructor(
2762+
new_values, index=new_index, name=new_name, dtype=dtype
2763+
)
27552764
if is_categorical_dtype(self.dtype):
27562765
pass
27572766
elif is_extension_array_dtype(self.dtype):

pandas/tests/arrays/sparse/test_array.py

-1
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
529529
np.array([0, 1], dtype="datetime64[ns]"),
530530
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
531531
),
532-
marks=[pytest.mark.xfail(reason="NumPy-7619")],
533532
),
534533
(
535534
SparseArray([0, 1, 10]),

pandas/tests/extension/base/casting.py

+5
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
3636
expected = pd.Series(data[:5].astype(str))
3737
self.assert_series_equal(result, expected)
3838

39+
def test_astype_string(self, data):
40+
result = pd.Series(data[:5]).astype("string")
41+
expected = pd.Series(data[:5].astype("string"))
42+
self.assert_series_equal(result, expected)
43+
3944
def test_to_numpy(self, data):
4045
expected = np.asarray(data)
4146

pandas/tests/extension/base/methods.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
172172
orig_data1, orig_data2 = data_repeated(2)
173173
s1 = pd.Series(orig_data1)
174174
s2 = pd.Series(orig_data2)
175-
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
175+
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
176176
expected = pd.Series(
177-
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
177+
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
178+
dtype="boolean",
178179
)
179180
self.assert_series_equal(result, expected)
180181

181182
val = s1.iloc[0]
182-
result = s1.combine(val, lambda x1, x2: x1 <= x2)
183-
expected = pd.Series([a <= val for a in list(orig_data1)])
183+
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
184+
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
184185
self.assert_series_equal(result, expected)
185186

186187
def test_combine_add(self, data_repeated):

pandas/tests/extension/decimal/array.py

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import numpy as np
88

99
from pandas.core.dtypes.base import ExtensionDtype
10+
from pandas.core.dtypes.common import pandas_dtype
1011

1112
import pandas as pd
1213
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
130131
return type(self)(self._data.copy())
131132

132133
def astype(self, dtype, copy=True):
134+
from pandas.core.arrays.string_ import StringDtype
135+
136+
dtype = pandas_dtype(dtype)
133137
if isinstance(dtype, type(self.dtype)):
134138
return type(self)(self._data, context=dtype.context)
139+
elif isinstance(dtype, StringDtype):
140+
return dtype.construct_array_type()._from_sequence(self, copy=False)
135141
return np.asarray(self, dtype=dtype)
136142

137143
def __setitem__(self, key, value):

pandas/tests/extension/json/array.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import numpy as np
2323

24+
from pandas.core.dtypes.common import pandas_dtype
25+
2426
import pandas as pd
2527
from pandas.api.extensions import ExtensionArray, ExtensionDtype
2628

@@ -154,12 +156,18 @@ def astype(self, dtype, copy=True):
154156
# NumPy has issues when all the dicts are the same length.
155157
# np.array([UserDict(...), UserDict(...)]) fails,
156158
# but np.array([{...}, {...}]) works, so cast.
159+
from pandas.core.arrays.string_ import StringDtype
157160

161+
dtype = pandas_dtype(dtype)
158162
# needed to add this check for the Series constructor
159163
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
160164
if copy:
161165
return self.copy()
162166
return self
167+
elif isinstance(dtype, StringDtype):
168+
value = self.astype(str) # numpy doesn't like nested dicts
169+
return dtype.construct_array_type()._from_sequence(value, copy=False)
170+
163171
return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
164172

165173
def unique(self):

pandas/tests/extension/test_numpy.py

+5
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ def test_astype_str(self, data):
139139
# ValueError: setting an array element with a sequence
140140
super().test_astype_str(data)
141141

142+
@skip_nested
143+
def test_astype_string(self, data):
144+
# ValueError: setting an array element with a sequence
145+
super().test_astype_string(data)
146+
142147

143148
class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
144149
@pytest.mark.skip(reason="We don't register our dtype")

0 commit comments

Comments
 (0)