diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2ada386f76b87..ae135fd41d977 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -88,6 +88,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :attr:`MultiIndex.codes`, :attr:`MultiIndex.levels`, and :attr:`MultiIndex.names` now returns a ``tuple`` instead of a ``FrozenList`` (:issue:`53531`) +- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index cbcc9b465762f..aeb6ef481e060 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -199,6 +199,8 @@ def floordiv_compat( npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -316,7 +318,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: """ Construct a new ExtensionArray from a sequence of strings. @@ -533,8 +535,9 @@ def _box_pa_array( ): # TODO: Move logic in _from_sequence_of_strings into # _box_pa_array + dtype = ArrowDtype(pa_type) return cls._from_sequence_of_strings( - value, dtype=pa_type + value, dtype=dtype )._pa_array else: raise diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 147b94e441f30..33e853ea16374 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -332,7 +332,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: """ Construct a new ExtensionArray from a sequence of strings. @@ -342,7 +342,7 @@ def _from_sequence_of_strings( strings : Sequence Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. - dtype : dtype, optional + dtype : ExtensionDtype Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. copy : bool, default False @@ -354,7 +354,9 @@ def _from_sequence_of_strings( Examples -------- - >>> pd.arrays.IntegerArray._from_sequence_of_strings(["1", "2", "3"]) + >>> pd.arrays.IntegerArray._from_sequence_of_strings( + ... ["1", "2", "3"], dtype=pd.Int64Dtype() + ... ) [1, 2, 3] Length: 3, dtype: Int64 diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 04e6f0a0bcdde..e347281a19b9f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,13 +29,14 @@ import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, type_t, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + @register_extension_dtype class BooleanDtype(BaseMaskedDtype): @@ -324,7 +325,7 @@ def _from_sequence_of_strings( cls, strings: list[str], *, - dtype: Dtype | None = None, + dtype: ExtensionDtype, copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..b946356a7f8ce 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -33,12 +33,13 @@ import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype @@ -270,7 +271,7 @@ def _coerce_to_array( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ab79622ddd8be..1de7a4ee5962f 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -90,6 +90,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -303,7 +305,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1b9f803bafc5d..06c54303187eb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -416,7 +416,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6f72d1dc8a93c..4b608a7efc82f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -62,6 +62,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series @@ -202,7 +204,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index 645e763fbf00c..d821c52d3becb 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -243,7 +243,7 @@ def test_coerce_to_numpy_array(): def test_to_boolean_array_from_strings(): result = BooleanArray._from_sequence_of_strings( np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object), - dtype="boolean", + dtype=pd.BooleanDtype(), ) expected = BooleanArray( np.array([True, False, True, True, False, False, False]), @@ -255,7 +255,7 @@ def test_to_boolean_array_from_strings(): def test_to_boolean_array_from_strings_invalid_string(): with pytest.raises(ValueError, match="cannot be cast"): - BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean") + BooleanArray._from_sequence_of_strings(["donkey"], dtype=pd.BooleanDtype()) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 521c1ff0b96bc..709cff59cd824 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -101,7 +101,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, *, dtype: ExtensionDtype, copy=False): return cls._from_sequence( [decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0347ad5420a3c..fe6e484fbd6e7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -354,10 +354,9 @@ def test_from_sequence_pa_array(self, data): assert isinstance(result._pa_array, pa.ChunkedArray) def test_from_sequence_pa_array_notimplemented(self, request): + dtype = ArrowDtype(pa.month_day_nano_interval()) with pytest.raises(NotImplementedError, match="Converting strings to"): - ArrowExtensionArray._from_sequence_of_strings( - ["12-1"], dtype=pa.month_day_nano_interval() - ) + ArrowExtensionArray._from_sequence_of_strings(["12-1"], dtype=dtype) def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype @@ -2409,7 +2408,8 @@ def test_duration_from_strings_with_nat(unit): # GH51175 strings = ["1000", "NaT"] pa_type = pa.duration(unit) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa_type) + dtype = ArrowDtype(pa_type) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type)) tm.assert_extension_array_equal(result, expected) @@ -2928,13 +2928,14 @@ def test_from_sequence_of_strings_boolean(): [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls) ) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + dtype = ArrowDtype(pa.bool_()) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = pd.array(bools, dtype="boolean[pyarrow]") tm.assert_extension_array_equal(result, expected) strings = ["True", "foo"] with pytest.raises(pa.ArrowInvalid, match="Failed to parse"): - ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) def test_concat_empty_arrow_backed_series(dtype):