From 9b90c3ce401b83ad19e9c4fc44218d1f0ce9195b Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 14 Apr 2020 08:06:05 +0100 Subject: [PATCH 01/10] API: More permissive conversion to StringDtype --- doc/source/user_guide/text.rst | 21 ++++++++++++++++++ doc/source/whatsnew/v1.1.0.rst | 27 ++++++++++++++++++++++++ pandas/core/arrays/base.py | 9 ++++++-- pandas/core/arrays/integer.py | 13 +++++++----- pandas/core/arrays/interval.py | 6 ++++++ pandas/core/arrays/sparse/dtype.py | 2 +- pandas/core/arrays/string_.py | 12 +++++++---- pandas/core/series.py | 11 +++++++++- pandas/tests/arrays/sparse/test_array.py | 1 - pandas/tests/extension/base/casting.py | 5 +++++ pandas/tests/extension/base/methods.py | 9 ++++---- pandas/tests/extension/decimal/array.py | 6 ++++++ pandas/tests/extension/json/array.py | 8 +++++++ pandas/tests/extension/test_numpy.py | 5 +++++ 14 files changed, 117 insertions(+), 18 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index bea0f42f6849c..4f57a7c2825cf 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") + +.. versionchanged:: 1.1.0 + +You can also use ``string`` dtype on non-string data and it will be converted to +``string`` dtype: + +.. ipython:: python + + s = pd.Series(['a', 2, np.nan], dtype="string") + s + type(s[1]) + +or convert from existing pandas data: + + s1 = pd.Series([1,2, np.nan], dtype="Int64") + s1 + s2 = s1.astype("string") + s2 + type(s2[0]) + + .. _text.differences: Behavior differences diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 20e2cce1a3dfa..6e197c0139921 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,32 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.astype_string: + +All dtypes can now be converted to ``StringDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like. +For example: + +.. code-block:: ipython + + In [1]: pd.Series([1, "abc", np.nan], dtype="string") + Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA + In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string") + Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA + +This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive. +:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: + +.. ipython:: python + + ser = pd.Series([1, "abc", np.nan], dtype="string") + ser + ser[0] + pd.Series([1,2, np.nan], dtype="Int64").astype("string") + + .. _whatsnew_110.period_index_partial_string_slicing: Nonmonotonic PeriodIndex Partial String Slicing @@ -210,6 +236,7 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). +- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fb9e2f6732018..86150834e4180 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,7 +20,7 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): ---------- scalars : Sequence Each element will be an instance of the scalar type for this - array, ``cls.dtype.type``. + array, ``cls.dtype.type`` or be converted into this type in this method. dtype : dtype, optional Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. @@ -451,6 +451,11 @@ def astype(self, dtype, copy=True): array : ndarray NumPy ndarray with 'dtype' for its dtype. """ + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + if isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) def isna(self) -> ArrayLike: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 59954f548fd33..d7e5f33f3cb61 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Type, Union import warnings import numpy as np @@ -442,17 +442,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanArray, BooleanDtype + from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) # if we are astyping to an existing IntegerDtype we can fastpath if isinstance(dtype, _IntegerDtype): result = self._data.astype(dtype.numpy_dtype, copy=False) - return type(self)(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) elif isinstance(dtype, BooleanDtype): result = self._data.astype("bool", copy=False) - return BooleanArray(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) # coerce if is_float_dtype(dtype): @@ -722,7 +725,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -_dtypes = { +_dtypes: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c5366884fbdfe..c861d25afd13f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -680,8 +680,11 @@ def astype(self, dtype, copy=True): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. """ + from pandas.core.arrays.string_ import StringDtype + if dtype is not None: dtype = pandas_dtype(dtype) + if is_interval_dtype(dtype): if dtype == self.dtype: return self.copy() if copy else self @@ -698,6 +701,9 @@ def astype(self, dtype, copy=True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) + # TODO: This try/except will be repeated. try: return np.asarray(self).astype(dtype, copy=copy) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index a9090570e64a9..ffb1dcad85e7f 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -322,7 +322,7 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() + fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0] dtype = cls(dtype, fill_value=fill_value) return dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 537b1cf3dd439..bb01c37db2765 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. na_values = isna(result) - if na_values.any(): - if result is scalars: - # force a copy now, if we haven't already - result = result.copy() + has_nans = na_values.any() + if has_nans and result is scalars: + # force a copy now, if we haven't already + result = result.copy() + # convert to str, then to object to avoid dtype like ' "Series": + def combine(self, other, func, fill_value=None, dtype=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2695,6 +2695,11 @@ def combine(self, other, func, fill_value=None) -> "Series": The value to assume when an index is missing from one Series or the other. The default specifies to use the appropriate NaN value for the underlying dtype of the Series. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, this will be + inferred from the combined data. + + .. versionadded:: 1.1.0 Returns ------- @@ -2765,6 +2770,10 @@ def combine(self, other, func, fill_value=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name + if dtype is not None: + return self._constructor( + new_values, index=new_index, name=new_name, dtype=dtype + ) if is_categorical_dtype(self.dtype): pass elif is_extension_array_dtype(self.dtype): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8450253f853c3..7940e9c534908 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -542,7 +542,6 @@ def test_astype_all(self, any_real_dtype): np.array([0, 1], dtype="datetime64[ns]"), dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), ), - marks=[pytest.mark.xfail(reason="NumPy-7619")], ), ( SparseArray([0, 1, 10]), diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index f33f960e8e341..f7dae9f460d77 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -36,6 +36,11 @@ def test_astype_str(self, data): expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + def test_astype_string(self, data): + result = pd.Series(data[:5]).astype("string") + expected = pd.Series(data[:5].astype("string")) + self.assert_series_equal(result, expected) + def test_to_numpy(self, data): expected = np.asarray(data) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 874a8dfd4253f..c4ce579eadb1d 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -188,15 +188,16 @@ def test_combine_le(self, data_repeated): orig_data1, orig_data2 = data_repeated(2) s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) - result = s1.combine(s2, lambda x1, x2: x1 <= x2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean") expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", ) self.assert_series_equal(result, expected) val = s1.iloc[0] - result = s1.combine(val, lambda x1, x2: x1 <= x2) - expected = pd.Series([a <= val for a in list(orig_data1)]) + result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean") + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") self.assert_series_equal(result, expected) def test_combine_add(self, data_repeated): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 85d8ad6ec6e38..fb55c5ae03925 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -7,6 +7,7 @@ import numpy as np from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import pandas_dtype import pandas as pd from pandas.api.extensions import no_default, register_extension_dtype @@ -130,8 +131,13 @@ def copy(self): return type(self)(self._data.copy()) def astype(self, dtype, copy=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) return np.asarray(self, dtype=dtype) def __setitem__(self, key, value): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 94f971938b690..3132b39a7d6d6 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -21,6 +21,8 @@ import numpy as np +from pandas.core.dtypes.common import pandas_dtype + import pandas as pd from pandas.api.extensions import ExtensionArray, ExtensionDtype @@ -160,12 +162,18 @@ def astype(self, dtype, copy=True): # NumPy has issues when all the dicts are the same length. # np.array([UserDict(...), UserDict(...)]) fails, # but np.array([{...}, {...}]) works, so cast. + from pandas.core.arrays.string_ import StringDtype + dtype = pandas_dtype(dtype) # needed to add this check for the Series constructor if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: if copy: return self.copy() return self + elif isinstance(dtype, StringDtype): + value = self.astype(str) # numpy doesn'y like nested dicts + return dtype.construct_array_type()._from_sequence(value, copy=False) + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1e21249988df6..3d5329b98daf9 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -139,6 +139,11 @@ def test_astype_str(self, data): # ValueError: setting an array element with a sequence super().test_astype_str(data) + @skip_nested + def test_astype_string(self, data): + # ValueError: setting an array element with a sequence + super().test_astype_string(data) + class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): @pytest.mark.skip(reason="We don't register our dtype") From f94169e1652ef0c6a11ded7e9a45b6301e76c42b Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 14 Apr 2020 09:26:11 +0100 Subject: [PATCH 02/10] clean up rst and doc strings --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- pandas/core/arrays/string_.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6e197c0139921..5fe7c6b2dced4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -25,7 +25,7 @@ For example: In [1]: pd.Series([1, "abc", np.nan], dtype="string") Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA - In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string") + In [2]: pd.Series([1, 2, np.nan], dtype="Int64").astype("string") Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive. @@ -36,7 +36,7 @@ This meant that in order to convert arbitrary data to :class:`StringDtype`, you ser = pd.Series([1, "abc", np.nan], dtype="string") ser ser[0] - pd.Series([1,2, np.nan], dtype="Int64").astype("string") + pd.Series([1, 2, np.nan], dtype="Int64").astype("string") .. _whatsnew_110.period_index_partial_string_slicing: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bb01c37db2765..97f692314b5b9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -104,6 +104,11 @@ class StringArray(PandasArray): .. versionadded:: 1.0.0 + .. versionchanged:: 1.1.0 + + ``StringArray`` allow non-string input values, but will always convert the + values to strings. (Before Pandas 1.1 non-string values were not allowed). + .. warning:: StringArray is considered experimental. The implementation and @@ -152,13 +157,13 @@ class StringArray(PandasArray): ['This is', 'some text', , 'data.'] Length: 4, dtype: string - Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string - values. + Like ``object`` dtype arrays instantiated with ``dtype="str"``, ``StringArray`` + allows non-string values but will always convert the values to strings. >>> pd.array(['1', 1], dtype="string") - Traceback (most recent call last): - ... - ValueError: StringArray requires an object-dtype ndarray of strings. + + ['1', '1'] + Length: 2, dtype: string For comparison methods, this returns a :class:`pandas.BooleanArray` From 681a211a924bc2581e7b72cbdd768994472c0b0a Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 23 May 2020 00:53:31 +0100 Subject: [PATCH 03/10] update --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/arrays/datetimelike.py | 7 ++++++- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/period.py | 1 - pandas/core/arrays/string_.py | 13 ++++++------- pandas/core/series.py | 15 +++++---------- pandas/tests/extension/base/casting.py | 5 +++-- pandas/tests/extension/base/methods.py | 9 ++++----- pandas/tests/extension/test_sparse.py | 9 +++++++++ 9 files changed, 34 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5fe7c6b2dced4..9de192bdccbaa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -236,7 +236,6 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). -- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3c1602344c314..cf3cde155a3bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -27,6 +27,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, + is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, @@ -619,7 +620,11 @@ def astype(self, dtype, copy=True): if is_object_dtype(dtype): return self._box_values(self.asi8.ravel()).reshape(self.shape) elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): - return self._format_native_types() + if is_extension_array_dtype(dtype): + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(self, dtype=dtype) + else: + return self._format_native_types() elif is_integer_dtype(dtype): # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d7e5f33f3cb61..d4137f9666946 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8b2925b2c0827..bb0270b335ce4 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -564,7 +564,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): actually format my specific types """ values = self.astype(object) - if date_format: formatter = lambda dt: dt.strftime(date_format) else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 97f692314b5b9..d157d8ce0e996 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -104,11 +104,6 @@ class StringArray(PandasArray): .. versionadded:: 1.0.0 - .. versionchanged:: 1.1.0 - - ``StringArray`` allow non-string input values, but will always convert the - values to strings. (Before Pandas 1.1 non-string values were not allowed). - .. warning:: StringArray is considered experimental. The implementation and @@ -157,9 +152,13 @@ class StringArray(PandasArray): ['This is', 'some text', , 'data.'] Length: 4, dtype: string - Like ``object`` dtype arrays instantiated with ``dtype="str"``, ``StringArray`` - allows non-string values but will always convert the values to strings. + Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` + will convert the values to strings. + >>> pd.array(['1', 1], dtype="object") + + ['1', 1] + Length: 2, dtype: object >>> pd.array(['1', 1], dtype="string") ['1', '1'] diff --git a/pandas/core/series.py b/pandas/core/series.py index 56e13e5af542a..d7333970c0ee3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2676,7 +2676,7 @@ def _construct_result( out.name = name return out - def combine(self, other, func, fill_value=None, dtype=None) -> "Series": + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2695,11 +2695,6 @@ def combine(self, other, func, fill_value=None, dtype=None) -> "Series": The value to assume when an index is missing from one Series or the other. The default specifies to use the appropriate NaN value for the underlying dtype of the Series. - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Series. If not specified, this will be - inferred from the combined data. - - .. versionadded:: 1.1.0 Returns ------- @@ -2770,13 +2765,13 @@ def combine(self, other, func, fill_value=None, dtype=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name - if dtype is not None: - return self._constructor( - new_values, index=new_index, name=new_name, dtype=dtype - ) if is_categorical_dtype(self.dtype): pass elif is_extension_array_dtype(self.dtype): + # Everything can be be converted to strings, but we may not want to convert + if self.dtype == "string" and lib.infer_dtype(new_values) != "string": + return self._constructor(new_values, index=new_index, name=new_name) + # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index f7dae9f460d77..567a62a8b33a5 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -33,12 +33,13 @@ def test_tolist(self, data): def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) - expected = pd.Series(data[:5].astype(str)) + expected = pd.Series([str(x) for x in data[:5]], dtype=str) self.assert_series_equal(result, expected) def test_astype_string(self, data): + # GH-33465 result = pd.Series(data[:5]).astype("string") - expected = pd.Series(data[:5].astype("string")) + expected = pd.Series([str(x) for x in data[:5]], dtype="string") self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c4ce579eadb1d..874a8dfd4253f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -188,16 +188,15 @@ def test_combine_le(self, data_repeated): orig_data1, orig_data2 = data_repeated(2) s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) - result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean") + result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], - dtype="boolean", + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] ) self.assert_series_equal(result, expected) val = s1.iloc[0] - result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean") - expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)]) self.assert_series_equal(result, expected) def test_combine_add(self, data_repeated): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index e59b3f0600867..fc356d1332fb7 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -343,6 +343,15 @@ def test_astype_object_frame(self, all_data): # comp = result.dtypes.equals(df.dtypes) # assert not comp.any() + @pytest.mark.xfail(raises=AssertionError, reason="no sparse str dtype") + def test_astype_str(self, data): + # Sparse arrays don't support str dtype + super().test_astype_str(data) + + @pytest.mark.xfail(raises=AssertionError, reason="no sparse StringDtype") + def test_astype_string(self, data): + super().test_astype_string(data) + class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): series_scalar_exc = None From f316e4251df5ece017ad31adb93f678f25271cb0 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 24 May 2020 11:28:29 +0100 Subject: [PATCH 04/10] changes --- pandas/core/arrays/base.py | 1 + pandas/core/arrays/period.py | 1 + pandas/core/arrays/sparse/dtype.py | 6 +++++- pandas/tests/arrays/sparse/test_array.py | 1 + pandas/tests/extension/test_numpy.py | 1 + pandas/tests/extension/test_sparse.py | 9 +++++---- 6 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 86150834e4180..0d41b17272a7e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -456,6 +456,7 @@ def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> ArrayLike: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index bb0270b335ce4..8b2925b2c0827 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -564,6 +564,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): actually format my specific types """ values = self.astype(object) + if date_format: formatter = lambda dt: dt.strftime(date_format) else: diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index ffb1dcad85e7f..8d17ed412f6b4 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -322,7 +323,10 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0] + if is_extension_array_dtype(dtype): + raise TypeError("sparse arrays of extension dtypes not supported") + + fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() dtype = cls(dtype, fill_value=fill_value) return dtype diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 7940e9c534908..8450253f853c3 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -542,6 +542,7 @@ def test_astype_all(self, any_real_dtype): np.array([0, 1], dtype="datetime64[ns]"), dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), ), + marks=[pytest.mark.xfail(reason="NumPy-7619")], ), ( SparseArray([0, 1, 10]), diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 3d5329b98daf9..78000c0252375 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -141,6 +141,7 @@ def test_astype_str(self, data): @skip_nested def test_astype_string(self, data): + # GH-33465 # ValueError: setting an array element with a sequence super().test_astype_string(data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index fc356d1332fb7..332a7f5a4e124 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -343,12 +343,13 @@ def test_astype_object_frame(self, all_data): # comp = result.dtypes.equals(df.dtypes) # assert not comp.any() - @pytest.mark.xfail(raises=AssertionError, reason="no sparse str dtype") def test_astype_str(self, data): - # Sparse arrays don't support str dtype - super().test_astype_str(data) + result = pd.Series(data[:5]).astype(str) + expected_dtype = pd.SparseDtype(str, str(data.fill_value)) + expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype) + self.assert_series_equal(result, expected) - @pytest.mark.xfail(raises=AssertionError, reason="no sparse StringDtype") + @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype") def test_astype_string(self, data): super().test_astype_string(data) From 89ef931de7c24755fc4df61284a3ab46f1b01c8f Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 25 May 2020 23:30:21 +0100 Subject: [PATCH 05/10] Change according to comments --- doc/source/whatsnew/v1.1.0.rst | 14 +++----------- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/string_.py | 6 ++++++ pandas/core/dtypes/cast.py | 7 +++++++ pandas/core/series.py | 4 ---- pandas/tests/extension/decimal/array.py | 5 ++--- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9de192bdccbaa..4be1393f906df 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -18,19 +18,11 @@ Enhancements All dtypes can now be converted to ``StringDtype`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like. -For example: - -.. code-block:: ipython - - In [1]: pd.Series([1, "abc", np.nan], dtype="string") - Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA - In [2]: pd.Series([1, 2, np.nan], dtype="Int64").astype("string") - Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA - -This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive. +Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like (:issue:`31204`). :class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: +For example the below now work: + .. ipython:: python ser = pd.Series([1, "abc", np.nan], dtype="string") diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0d41b17272a7e..b5e917bafca7e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -454,7 +454,7 @@ def astype(self, dtype, copy=True): from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d157d8ce0e996..70e4135f3315d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -164,6 +164,11 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string + On the other hand, instantiating StringArrays directly with non-strings will + raise an error: + >>> pd.arrays.StringArray(np.array([1, 2])) + ValueError: StringArray requires a sequence of strings or pandas.NA + For comparison methods, this returns a :class:`pandas.BooleanArray` >>> pd.array(["a", None, "c"], dtype="string") == "a" @@ -211,6 +216,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if has_nans and result is scalars: # force a copy now, if we haven't already result = result.copy() + # convert to str, then to object to avoid dtype like ' "Series": if is_categorical_dtype(self.dtype): pass elif is_extension_array_dtype(self.dtype): - # Everything can be be converted to strings, but we may not want to convert - if self.dtype == "string" and lib.infer_dtype(new_values) != "string": - return self._constructor(new_values, index=new_index, name=new_name) - # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index fb55c5ae03925..d934ca8243c9b 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -136,9 +136,8 @@ def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) - elif isinstance(dtype, StringDtype): - return dtype.construct_array_type()._from_sequence(self, copy=False) - return np.asarray(self, dtype=dtype) + + return super().astype(dtype, copy=copy) def __setitem__(self, key, value): if pd.api.types.is_list_like(value): From 053dae43ca5e655394be1411837c5bd9c858e817 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 May 2020 00:43:51 +0100 Subject: [PATCH 06/10] Small clean-up --- pandas/core/arrays/string_.py | 1 + pandas/tests/extension/decimal/array.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 70e4135f3315d..a07a21e0dba87 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -166,6 +166,7 @@ class StringArray(PandasArray): On the other hand, instantiating StringArrays directly with non-strings will raise an error: + >>> pd.arrays.StringArray(np.array([1, 2])) ValueError: StringArray requires a sequence of strings or pandas.NA diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d934ca8243c9b..4d5be75ff8200 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -131,8 +131,6 @@ def copy(self): return type(self)(self._data.copy()) def astype(self, dtype, copy=True): - from pandas.core.arrays.string_ import StringDtype - dtype = pandas_dtype(dtype) if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) From af0bf4da865213dfc18c32aed270b9da380bf392 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 May 2020 07:22:33 +0100 Subject: [PATCH 07/10] failed doctest --- pandas/core/arrays/string_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a07a21e0dba87..1ef10450a1bd9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -168,6 +168,7 @@ class StringArray(PandasArray): raise an error: >>> pd.arrays.StringArray(np.array([1, 2])) + Traceback (most recent call last) ValueError: StringArray requires a sequence of strings or pandas.NA For comparison methods, this returns a :class:`pandas.BooleanArray` From c96656272f5c637ac4762b87b359c7107d367c4f Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 May 2020 11:27:33 +0100 Subject: [PATCH 08/10] update doc string --- pandas/core/arrays/string_.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1ef10450a1bd9..ac501a8afbe09 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -164,14 +164,9 @@ class StringArray(PandasArray): ['1', '1'] Length: 2, dtype: string - On the other hand, instantiating StringArrays directly with non-strings will - raise an error: + However, instantiating StringArrays directly with non-strings will raise an error. - >>> pd.arrays.StringArray(np.array([1, 2])) - Traceback (most recent call last) - ValueError: StringArray requires a sequence of strings or pandas.NA - - For comparison methods, this returns a :class:`pandas.BooleanArray` + For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: >>> pd.array(["a", None, "c"], dtype="string") == "a" From 914349a8bbc8233a43eef6e05d79b824038b4be2 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 May 2020 15:27:31 +0100 Subject: [PATCH 09/10] minor updates --- doc/source/user_guide/text.rst | 6 ++++-- doc/source/whatsnew/v1.1.0.rst | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 4f57a7c2825cf..b96cd5eb776e3 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -66,8 +66,8 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created .. versionchanged:: 1.1.0 -You can also use ``string`` dtype on non-string data and it will be converted to -``string`` dtype: +You can also use :class:`StringDtype`/``"string"`` as the dtype on non-string data and +it will be converted to ``string`` dtype: .. ipython:: python @@ -77,6 +77,8 @@ You can also use ``string`` dtype on non-string data and it will be converted to or convert from existing pandas data: +.. ipython:: python + s1 = pd.Series([1,2, np.nan], dtype="Int64") s1 s2 = s1.astype("string") diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4be1393f906df..64fdf5b2244cb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -21,7 +21,7 @@ All dtypes can now be converted to ``StringDtype`` Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like (:issue:`31204`). :class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: -For example the below now work: +For example, the below now works: .. ipython:: python From 08ff77af0b30d33055063af9cfcaa07fb2040b90 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 26 May 2020 16:47:06 +0100 Subject: [PATCH 10/10] lint --- doc/source/user_guide/text.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index b96cd5eb776e3..3408b98b3179d 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -79,7 +79,7 @@ or convert from existing pandas data: .. ipython:: python - s1 = pd.Series([1,2, np.nan], dtype="Int64") + s1 = pd.Series([1, 2, np.nan], dtype="Int64") s1 s2 = s1.astype("string") s2