From fc75adba687d101cae6e4b4db082239004a4472f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Dec 2023 12:51:58 -0800 Subject: [PATCH 1/5] TYP: make dtype required in _from_sequence_of_strings --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +++-- pandas/core/arrays/base.py | 7 ++++--- pandas/core/arrays/boolean.py | 3 +-- pandas/core/arrays/numeric.py | 3 +-- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/boolean/test_construction.py | 4 ++-- pandas/tests/extension/decimal/array.py | 2 +- pandas/tests/extension/test_arrow.py | 13 +++++++------ 11 files changed, 23 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0c4fb6d3d1164..8305d0599631e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -355,6 +355,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) +- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`??`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 6f7f42eca3794..2957e7736faef 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -286,7 +286,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: StringDtype, copy: bool = False ): """ Construct a new ExtensionArray from a sequence of strings. @@ -506,8 +506,9 @@ def _box_pa_array( ): # TODO: Move logic in _from_sequence_of_strings into # _box_pa_array + dtype = ArrowDtype(pa_type) return cls._from_sequence_of_strings( - value, dtype=pa_type + value, dtype=dtype )._pa_array else: raise diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3272a594f4cf4..8e4733574290b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -328,7 +328,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ): """ Construct a new ExtensionArray from a sequence of strings. @@ -338,7 +338,7 @@ def _from_sequence_of_strings( strings : Sequence Each element will be an instance of the scalar type for this array, ``cls.dtype.type``. - dtype : dtype, optional + dtype : ExtensionDtype Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. copy : bool, default False @@ -350,7 +350,8 @@ def _from_sequence_of_strings( Examples -------- - >>> pd.arrays.IntegerArray._from_sequence_of_strings(["1", "2", "3"]) + >>> pd.arrays.IntegerArray._from_sequence_of_strings( + ... ["1", "2", "3"], dtype=pd.Int64Dtype()) [1, 2, 3] Length: 3, dtype: Int64 diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 04e6f0a0bcdde..0ccd00f217cac 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -29,7 +29,6 @@ import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, @@ -324,7 +323,7 @@ def _from_sequence_of_strings( cls, strings: list[str], *, - dtype: Dtype | None = None, + dtype: BooleanDtype, copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 210450e868698..1be1804d14211 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -33,7 +33,6 @@ import pyarrow from pandas._typing import ( - Dtype, DtypeObj, Self, npt, @@ -270,7 +269,7 @@ def _coerce_to_array( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: NumericDtype, copy: bool = False ) -> Self: from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 0e2d4409b9f39..f8478e6c748ca 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -304,7 +304,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: PeriodDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 32dc4ab63cc21..da5f76979a59b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -413,7 +413,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: StringDtype, copy: bool = False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ebf5f2636bf12..11dca2b73daa7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, dtype: Dtype | None = None, copy: bool = False + cls, strings, *, dtype: StringDtype, copy: bool = False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..d045668993928 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -243,7 +243,7 @@ def test_coerce_to_numpy_array(): def test_to_boolean_array_from_strings(): result = BooleanArray._from_sequence_of_strings( np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object), - dtype="boolean", + dtype=pd.BooleanDtype(), ) expected = BooleanArray( np.array([True, False, True, True, False, False, False]), @@ -255,7 +255,7 @@ def test_to_boolean_array_from_strings(): def test_to_boolean_array_from_strings_invalid_string(): with pytest.raises(ValueError, match="cannot be cast"): - BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean") + BooleanArray._from_sequence_of_strings(["donkey"], dtype=pd.BooleanDtype()) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 521c1ff0b96bc..a49864660cfb9 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -101,7 +101,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, *, dtype: DecimalDtype, copy=False): return cls._from_sequence( [decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index de9f872aca01d..28728a2173a68 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -346,10 +346,9 @@ def test_from_sequence_pa_array(self, data): assert isinstance(result._pa_array, pa.ChunkedArray) def test_from_sequence_pa_array_notimplemented(self, request): + dtype = ArrowDtype(pa.month_day_nano_interval()) with pytest.raises(NotImplementedError, match="Converting strings to"): - ArrowExtensionArray._from_sequence_of_strings( - ["12-1"], dtype=pa.month_day_nano_interval() - ) + ArrowExtensionArray._from_sequence_of_strings(["12-1"], dtype=dtype) def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype @@ -2286,7 +2285,8 @@ def test_duration_from_strings_with_nat(unit): # GH51175 strings = ["1000", "NaT"] pa_type = pa.duration(unit) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa_type) + dtype = ArrowDtype(pa_type) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type)) tm.assert_extension_array_equal(result, expected) @@ -2700,13 +2700,14 @@ def test_from_sequence_of_strings_boolean(): [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls) ) - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + dtype = ArrowDtype(pa.bool_()) + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) expected = pd.array(bools, dtype="boolean[pyarrow]") tm.assert_extension_array_equal(result, expected) strings = ["True", "foo"] with pytest.raises(pa.ArrowInvalid, match="Failed to parse"): - ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) def test_concat_empty_arrow_backed_series(dtype): From fa58370b2d9b3ce27cd4f31f63f6cae8a6ddd62f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Dec 2023 12:52:42 -0800 Subject: [PATCH 2/5] GH ref --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8305d0599631e..5e778fd18b563 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -354,8 +354,8 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ +- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) -- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`??`) - .. --------------------------------------------------------------------------- From 579ac2f0205e0bfb92bb41b3c848d1a237c9c88a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Dec 2023 13:24:26 -0800 Subject: [PATCH 3/5] mypy fixup --- pandas/core/arrays/arrow/array.py | 4 +++- pandas/core/arrays/boolean.py | 4 +++- pandas/core/arrays/numeric.py | 4 +++- pandas/core/arrays/period.py | 4 +++- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 4 +++- pandas/tests/extension/decimal/array.py | 2 +- 7 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2957e7736faef..b0b9966ed7f52 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -167,6 +167,8 @@ def floordiv_compat( npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray @@ -286,7 +288,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: StringDtype, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ): """ Construct a new ExtensionArray from a sequence of strings. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0ccd00f217cac..e347281a19b9f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -35,6 +35,8 @@ type_t, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + @register_extension_dtype class BooleanDtype(BaseMaskedDtype): @@ -323,7 +325,7 @@ def _from_sequence_of_strings( cls, strings: list[str], *, - dtype: BooleanDtype, + dtype: ExtensionDtype, copy: bool = False, true_values: list[str] | None = None, false_values: list[str] | None = None, diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 1be1804d14211..b946356a7f8ce 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -38,6 +38,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype @@ -269,7 +271,7 @@ def _coerce_to_array( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: NumericDtype, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f8478e6c748ca..87069d8aba7fa 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -90,6 +90,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -304,7 +306,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: PeriodDtype, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> Self: return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index da5f76979a59b..5c75a68e7f98e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -413,7 +413,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: StringDtype, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 11dca2b73daa7..75c24728cef8d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -61,6 +61,8 @@ npt, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series @@ -178,7 +180,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: StringDtype, copy: bool = False + cls, strings, *, dtype: ExtensionDtype, copy: bool = False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index a49864660cfb9..709cff59cd824 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -101,7 +101,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod - def _from_sequence_of_strings(cls, strings, *, dtype: DecimalDtype, copy=False): + def _from_sequence_of_strings(cls, strings, *, dtype: ExtensionDtype, copy=False): return cls._from_sequence( [decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy ) From dfc73a0d639f6b60e990674ddee8234b3305f477 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Feb 2024 16:54:00 -0800 Subject: [PATCH 4/5] Move whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 076994466d4e3..d9ab0452c8334 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -504,7 +504,6 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - The hash values of nullable extension dtypes changed to improve the performance of the hashing operation (:issue:`56507`) - ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f9117253b61c1..d1841108edf87 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -85,6 +85,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) +- Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - .. --------------------------------------------------------------------------- From 7e09f6d8ce422e6dca25948a57b2943d10e506fc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Feb 2024 12:20:50 -0800 Subject: [PATCH 5/5] Update pandas/core/arrays/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 24bf7daf0b8c1..f79301d2dd46b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -355,7 +355,8 @@ def _from_sequence_of_strings( Examples -------- >>> pd.arrays.IntegerArray._from_sequence_of_strings( - ... ["1", "2", "3"], dtype=pd.Int64Dtype()) + ... ["1", "2", "3"], dtype=pd.Int64Dtype() + ... ) [1, 2, 3] Length: 3, dtype: Int64