From 2f1bc376892afa4b532ed1f70de98d3a72da4487 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 May 2024 17:32:40 +0200 Subject: [PATCH 01/14] rename storage option and add na_value keyword --- pandas/core/arrays/string_.py | 77 ++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 291cc2e62be62..3ad3b95909e3b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -9,7 +9,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_pyarrow_string_dtype, +) from pandas._libs import ( lib, @@ -83,6 +86,7 @@ class StringDtype(StorageExtensionDtype): ---------- storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. + na_value : Attributes ---------- @@ -113,30 +117,49 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": - return np.nan - else: - return libmissing.NA + return self._na_value _metadata = ("storage",) - def __init__(self, storage=None) -> None: - if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + def __init__(self, storage=None, na_value=None) -> None: + if not ( + na_value is None or (isinstance(na_value, float) and np.isnan(na_value)) + ): + raise ValueError( + "'na_value' must be the default value or pd.NA, got {na_value}" + ) + + # infer defaults + if storage is None and na_value is None: + if using_pyarrow_string_dtype(): + storage = "pyarrow" + na_value = np.nan else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + na_value = libmissing.NA + elif storage is None: + # in this case na_value is NaN + storage = get_option("mode.string_storage") + elif na_value is None: + na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA + if na_value is not libmissing.NA and storage == "python": + raise NotImplementedError( + "'python' mode for na_value of NaN not yet implemented" + ) + + if storage == "pyarrow_numpy": + # TODO raise a deprecation warning + storage = "pyarrow" + if storage not in {"python", "pyarrow"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under10p1: raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) self.storage = storage + self._na_value = na_value @property def type(self) -> type[str]: @@ -176,11 +199,14 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() + elif string == "String": + return cls(na_value=np.nan) elif string == "string[python]": - return cls(storage="python") + return cls(storage="python", na_value=np.nan) elif string == "string[pyarrow]": - return cls(storage="pyarrow") + return cls(storage="pyarrow", na_value=np.nan) elif string == "string[pyarrow_numpy]": + # TODO deprecate return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -205,10 +231,10 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray - elif self.storage == "pyarrow": - return ArrowStringArray - else: + elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArrayNumpySemantics + else: + return ArrowStringArray def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -217,13 +243,16 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. """ if self.storage == "pyarrow": - from pandas.core.arrays.string_arrow import ArrowStringArray + if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ( + ArrowStringArrayNumpySemantics, + ) - return ArrowStringArray(array) - elif self.storage == "pyarrow_numpy": - from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + return ArrowStringArrayNumpySemantics(array) + else: + from pandas.core.arrays.string_arrow import ArrowStringArray - return ArrowStringArrayNumpySemantics(array) + return ArrowStringArray(array) else: import pyarrow From e29ca8de77aef7f4cb51deba1a3e365d819eeb09 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Jul 2024 22:14:49 +0200 Subject: [PATCH 02/14] update init --- pandas/_libs/lib.pyx | 2 +- pandas/_testing/__init__.py | 4 +-- pandas/core/arrays/arrow/array.py | 6 ++-- pandas/core/arrays/string_.py | 46 ++++++++++----------------- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/construction.py | 4 +-- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/encoding.py | 3 +- pandas/core/reshape/merge.py | 3 +- pandas/core/tools/numeric.py | 9 ++++-- pandas/io/_util.py | 6 ++-- 12 files changed, 40 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2650d60eb3cef..0bb47541e5963 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 1cd91ee5b120c..3aa53d4b07aa5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool: if ( isinstance(left, ExtensionArray) and is_string_dtype(left.dtype) - and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) if ( isinstance(right, ExtensionArray) and is_string_dtype(right.dtype) - and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5da479760047f..a17056b51a014 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ): + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index edb6910b9fd5a..fed8943105fcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -11,7 +11,7 @@ from pandas._config import ( get_option, - using_pyarrow_string_dtype, + using_string_dtype, ) from pandas._libs import ( @@ -84,7 +84,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. na_value : @@ -121,35 +121,24 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override] _metadata = ("storage",) - def __init__(self, storage=None, na_value=None) -> None: + def __init__(self, storage=None, na_value=libmissing.NA) -> None: if not ( - na_value is None or (isinstance(na_value, float) and np.isnan(na_value)) + na_value is libmissing.NA + or (isinstance(na_value, float) and np.isnan(na_value)) ): - raise ValueError( - "'na_value' must be the default value or pd.NA, got {na_value}" - ) + raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") # infer defaults - if storage is None and na_value is None: - if using_pyarrow_string_dtype(): + if storage is None: + if using_string_dtype(): storage = "pyarrow" - na_value = np.nan else: storage = get_option("mode.string_storage") - na_value = libmissing.NA - elif storage is None: - # in this case na_value is NaN - storage = get_option("mode.string_storage") - elif na_value is None: - na_value = np.nan if using_pyarrow_string_dtype() else libmissing.NA - if na_value is not libmissing.NA and storage == "python": - raise NotImplementedError( - "'python' mode for na_value of NaN not yet implemented" - ) if storage == "pyarrow_numpy": # TODO raise a deprecation warning storage = "pyarrow" + if storage not in {"python", "pyarrow"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." @@ -199,12 +188,10 @@ def construct_from_string(cls, string) -> Self: ) if string == "string": return cls() - elif string == "String": - return cls(na_value=np.nan) elif string == "string[python]": - return cls(storage="python", na_value=np.nan) + return cls(storage="python") elif string == "string[pyarrow]": - return cls(storage="pyarrow", na_value=np.nan) + return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": # TODO deprecate return cls(storage="pyarrow_numpy") @@ -232,9 +219,9 @@ def construct_array_type( # type: ignore[override] if self.storage == "python": return StringArray elif self.storage == "pyarrow" and self._na_value is libmissing.NA: - return ArrowStringArrayNumpySemantics - else: return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -244,15 +231,16 @@ def __from_arrow__( """ if self.storage == "pyarrow": if self._na_value is libmissing.NA: + from pandas.core.arrays.string_arrow import ArrowStringArray + + return ArrowStringArray(array) + else: from pandas.core.arrays.string_arrow import ( ArrowStringArrayNumpySemantics, ) return ArrowStringArrayNumpySemantics(array) - else: - from pandas.core.arrays.string_arrow import ArrowStringArray - return ArrowStringArray(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97c06149d0b7e..d8c322fdbf27a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -597,7 +597,7 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow_numpy" + _storage = "pyarrow" @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 32792aa7f0543..81aeb40f375b0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -574,7 +574,7 @@ def sanitize_array( if isinstance(data, str) and using_string_dtype() and original_dtype is None: from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype("pyarrow", na_value=np.nan) data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -608,7 +608,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 21e45505b40fc..d750451a1ca84 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c31479b3011e5..08e1650a5de12 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -302,7 +302,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype(storage="pyarrow", na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 9d88e61951e99..c397c1c2566a5 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex from pandas.core.dtypes.common import ( @@ -256,7 +257,7 @@ def _get_dummies_1d( dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] elif ( isinstance(input_dtype, StringDtype) - and input_dtype.storage != "pyarrow_numpy" + and input_dtype.na_value is libmissing.NA ): dtype = pandas_dtype("boolean") # type: ignore[assignment] else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2ce77ac19b9c5..6364072fd215c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2677,8 +2677,7 @@ def _factorize_keys( elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) - and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" ): import pyarrow as pa import pyarrow.compute as pc diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 3d406d3bfb115..26e73794af298 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -7,7 +7,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -218,7 +221,7 @@ def to_numeric( coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy", + and values_dtype.na_value is libmissing.NA, ) if new_mask is not None: @@ -229,7 +232,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) - and not values_dtype.storage == "pyarrow_numpy" + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index cb0f89945e440..a72a16269959d 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +import numpy as np + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow_numpy"), - pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), }.get From cb7410f461ff3b99d3dbab339ff50fb6255e5308 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 13:08:22 +0200 Subject: [PATCH 03/14] fix propagating na_value to Array class + fix some tests --- pandas/core/arrays/string_.py | 1 + pandas/core/arrays/string_arrow.py | 4 +- pandas/tests/arrays/string_/test_string.py | 53 ++++++++----------- .../tests/arrays/string_/test_string_arrow.py | 2 +- pandas/tests/extension/base/methods.py | 8 +-- pandas/tests/extension/test_string.py | 6 +-- 6 files changed, 35 insertions(+), 39 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fed8943105fcf..55977c2a2e93e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -138,6 +138,7 @@ def __init__(self, storage=None, na_value=libmissing.NA) -> None: if storage == "pyarrow_numpy": # TODO raise a deprecation warning storage = "pyarrow" + na_value = np.nan if storage not in {"python", "pyarrow"}: raise ValueError( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d8c322fdbf27a..894dd1b0afc18 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" + _na_value = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() @@ -140,7 +141,7 @@ def __init__(self, values) -> None: values = pc.cast(values, pa.large_string()) super().__init__(values) - self._dtype = StringDtype(storage=self._storage) + self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -598,6 +599,7 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow" + _na_value = np.nan @classmethod def _result_converter(cls, values, na=None): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 597b407a29c94..ee648783c47be 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -20,13 +20,6 @@ ) -def na_val(dtype): - if dtype.storage == "pyarrow_numpy": - return np.nan - else: - return pd.NA - - @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -41,22 +34,22 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and dtype.na_value is pd.NA: arr_name = "ArrowStringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -68,7 +61,7 @@ def test_repr(dtype): def test_none_to_nan(cls, dtype): a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None - assert a[1] is na_val(a.dtype) + assert a[1] is a.dtype.na_value def test_setitem_validates(cls, dtype): @@ -225,7 +218,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -244,7 +237,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -271,7 +264,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -293,7 +286,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -387,7 +380,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.na_value is np.nan: err = ValueError msg = "cannot convert float NaN to integer" else: @@ -441,7 +434,7 @@ def test_min_max(method, skipna, dtype): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is na_val(arr.dtype) + assert result is arr.dtype.na_value @pytest.mark.parametrize("method", ["min", "max"]) @@ -522,7 +515,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is na_val(result["a"].dtype) + assert result.loc[2, "a"] is result["a"].dtype.na_value @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") @@ -556,10 +549,10 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -573,10 +566,10 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = np.float64 + elif dtype.storage == "pyarrow": + exp_dtype = "double[pyarrow]" else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -586,10 +579,10 @@ def test_value_counts_with_normalize(dtype): def test_value_counts_sort_false(dtype): - if getattr(dtype, "storage", "") == "pyarrow": - exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + if dtype.na_value is np.nan: exp_dtype = "int64" + elif dtype.storage == "pyarrow": + exp_dtype = "int64[pyarrow]" else: exp_dtype = "Int64" ser = pd.Series(["a", "b", "c", "b"], dtype=dtype) @@ -621,7 +614,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", na_val(dtype), "b"], dtype=object) + expected = np.array(["a", dtype.na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -661,7 +654,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is na_val(ser.dtype) + assert ser.array[1] is ser.dtype.na_value # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 405c1c217b04d..458a3f5ae77d4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -260,6 +260,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python' or 'pyarrow'." with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b7f0f973e640a..dd2ed0bd62a02 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -66,14 +66,14 @@ def test_value_counts_with_normalize(self, data): expected = pd.Series(0.0, index=result.index, name="proportion") expected[result > 0] = 1 / len(values) - if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( + if isinstance(data.dtype, pd.StringDtype) and data.dtype.na_value is np.nan: + # TODO: avoid special-casing + expected = expected.astype("float64") + elif getattr(data.dtype, "storage", "") == "pyarrow" or isinstance( data.dtype, pd.ArrowDtype ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") - elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": - # TODO: avoid special-casing - expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49ad3fce92a5c..a44bbca07039b 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -190,7 +190,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) @@ -198,10 +198,10 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: cast_to = dtype + elif dtype.na_value is np.nan: + cast_to = np.bool_ # type: ignore[assignment] elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) From ffa7eaddd3934413938708e3d578655225af8948 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 15:32:53 +0200 Subject: [PATCH 04/14] fix more tests --- pandas/core/indexes/base.py | 3 ++- pandas/tests/strings/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e67c59c86dd0c..50f44cc728aea 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5453,9 +5453,10 @@ def equals(self, other: Any) -> bool: if ( isinstance(self.dtype, StringDtype) - and self.dtype.storage == "pyarrow_numpy" + and self.dtype.na_value is np.nan and other.dtype != self.dtype ): + # TODO(infer_string) can we avoid this special case? # special case for object behavior return other.equals(self.astype(object)) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..e94f656fc9823 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -7,7 +7,7 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.na_value is np.nan: expected = expected.fillna(np.nan) else: # GH#18463 From a9c466bf83e5c1339bc0da68640df7da1a7141b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 15:45:57 +0200 Subject: [PATCH 05/14] disallow pyarrow_numpy as option + fix more cases of checking storage to be pyarrow_numpy --- pandas/core/arrays/string_arrow.py | 5 +--- pandas/core/config_init.py | 2 +- pandas/tests/arrays/string_/test_string.py | 8 ++++- .../tests/arrays/string_/test_string_arrow.py | 2 ++ pandas/tests/extension/test_string.py | 30 +++++++------------ .../frame/methods/test_convert_dtypes.py | 6 +++- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 894dd1b0afc18..e7acb7ca8af7a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -188,10 +188,7 @@ def _from_sequence( if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "pyarrow", - "pyarrow_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 352020f45388f..073d1de11e0db 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,7 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + validator=is_one_of_factory(["python", "pyarrow"]), ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ee648783c47be..a7f45c185481c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -483,7 +483,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) @@ -501,6 +501,10 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): reason="infer_string takes precedence over string storage" ) ) + if string_storage2 == "pyarrow_numpy": + # we cannot set "pyarrow_numpy" as storage option anymore, need to + # update the tests for this + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) @@ -531,6 +535,8 @@ def test_arrow_load_from_zero_chunks( reason="infer_string takes precedence over string storage" ) ) + if string_storage2 == "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 458a3f5ae77d4..c610ef5315723 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,6 +29,8 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage != "pyarrow_numpy": request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) + if string_storage == "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a44bbca07039b..b4858f02ad411 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -140,28 +140,21 @@ def _get_expected_exception( self, op_name: str, obj, other ) -> type[Exception] | None: if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) and cast( - StringDtype, tm.get_dtype(obj) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if ( + isinstance(obj, pd.Series) + and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError - elif isinstance(other, pd.Series) and cast( - StringDtype, tm.get_dtype(other) - ).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + elif ( + isinstance(other, pd.Series) + and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow" + ): # TODO: re-raise as TypeError? return NotImplementedError return TypeError elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": return NotImplementedError return TypeError elif op_name in ["__mul__", "__rmul__"]: @@ -175,10 +168,7 @@ def _get_expected_exception( "__sub__", "__rsub__", ]: - if cast(StringDtype, tm.get_dtype(obj)).storage in [ - "pyarrow", - "pyarrow_numpy", - ]: + if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow": import pyarrow as pa # TODO: better to re-raise as TypeError? diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 521d2cb14ac6a..4d2d3747a5298 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -18,13 +18,17 @@ def test_convert_dtypes( # Just check that it works for DataFrame here if using_infer_string: string_storage = "pyarrow_numpy" + + string_storage_option = string_storage + if string_storage == "pyarrow_numpy": + string_storage_option = "pyarrow" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - with pd.option_context("string_storage", string_storage): + with pd.option_context("string_storage", string_storage_option): result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { From 1fc2113e9627b96a447c4cc6265a65dc8acf221b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 16:09:23 +0200 Subject: [PATCH 06/14] restore pyarrow_numpy as option for now --- pandas/core/config_init.py | 2 +- pandas/tests/arrays/string_/test_string.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 073d1de11e0db..352020f45388f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,7 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a7f45c185481c..78adc10631a79 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -501,10 +501,6 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): reason="infer_string takes precedence over string storage" ) ) - if string_storage2 == "pyarrow_numpy": - # we cannot set "pyarrow_numpy" as storage option anymore, need to - # update the tests for this - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) @@ -535,8 +531,6 @@ def test_arrow_load_from_zero_chunks( reason="infer_string takes precedence over string storage" ) ) - if string_storage2 == "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) From b347b942e35cda66bd4e751504533bed0b1ab591 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Jul 2024 16:33:09 +0200 Subject: [PATCH 07/14] linting --- pandas/core/arrays/string_.py | 3 ++- pandas/core/arrays/string_arrow.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 55977c2a2e93e..f4d9c8a6efde4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -86,7 +86,8 @@ class StringDtype(StorageExtensionDtype): ---------- storage : {"python", "pyarrow"}, optional If not given, the value of ``pd.options.mode.string_storage``. - na_value : + na_value : {np.nan, pd.NA}, default pd.NA + Whether the dtype follows NaN or NA missing value semantics. Attributes ---------- diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e7acb7ca8af7a..869cc34d5f61d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -131,7 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _storage = "pyarrow" - _na_value = libmissing.NA + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values) -> None: _chk_pyarrow_available() From 80489fe1d01aa3b40447ae1c1e263653645eaa3b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 15:36:29 +0200 Subject: [PATCH 08/14] try fix typing --- pandas/core/arrays/string_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f4d9c8a6efde4..0a731759243f9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -122,7 +122,9 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override] _metadata = ("storage",) - def __init__(self, storage=None, na_value=libmissing.NA) -> None: + def __init__( + self, storage=None, na_value: libmissing.NAType | float = libmissing.NA + ) -> None: if not ( na_value is libmissing.NA or (isinstance(na_value, float) and np.isnan(na_value)) From 8587297eb4955d0f3a97bdb8b34d920014e8493a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 15:37:03 +0200 Subject: [PATCH 09/14] try fix typing --- pandas/core/arrays/string_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0a731759243f9..29cd0308831c8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -123,7 +123,9 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override] _metadata = ("storage",) def __init__( - self, storage=None, na_value: libmissing.NAType | float = libmissing.NA + self, + storage: str | None = None, + na_value: libmissing.NAType | float = libmissing.NA, ) -> None: if not ( na_value is libmissing.NA From a9650bb3ba4075f878e2dcb93aa552f91a224a07 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 17:35:35 +0200 Subject: [PATCH 10/14] fix dtype equality to take into account the NaN vs NA --- pandas/core/arrays/string_.py | 35 +++++++++++++++++----- pandas/tests/arrays/string_/test_string.py | 21 +++++++++++++ pandas/tests/extension/test_string.py | 8 ++++- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 29cd0308831c8..a14f1dd840668 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,19 +120,13 @@ class StringDtype(StorageExtensionDtype): def na_value(self) -> libmissing.NAType | float: # type: ignore[override] return self._na_value - _metadata = ("storage",) + _metadata = ("storage", "na_value") def __init__( self, storage: str | None = None, na_value: libmissing.NAType | float = libmissing.NA, ) -> None: - if not ( - na_value is libmissing.NA - or (isinstance(na_value, float) and np.isnan(na_value)) - ): - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") - # infer defaults if storage is None: if using_string_dtype(): @@ -145,6 +139,7 @@ def __init__( storage = "pyarrow" na_value = np.nan + # validate options if storage not in {"python", "pyarrow"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." @@ -153,9 +148,35 @@ def __init__( raise ImportError( "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) + + if isinstance(na_value, float) and np.isnan(na_value): + # when passed a NaN value, always set to np.nan to ensure we use + # a consistent NaN value (and we can use `dtype.na_value is np.nan`) + na_value = np.nan + elif na_value is not libmissing.NA: + raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + self.storage = storage self._na_value = na_value + def __eq__(self, other: object) -> bool: + # we need to override the base class __eq__ because na_value (NA or NaN) + # cannot be checked with normal `==` + if isinstance(other, str): + if other == self.name: + return True + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return self.storage == other.storage and self.na_value is other.na_value + return False + + def __hash__(self) -> int: + # need to override __hash__ as well because of overriding __eq__ + return super().__hash__() + @property def type(self) -> type[str]: return str diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 78adc10631a79..7757847f3c841 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -32,6 +32,27 @@ def cls(dtype): return dtype.construct_array_type() +def test_dtype_equality(): + pytest.importorskip("pyarrow") + + dtype1 = pd.StringDtype("python") + dtype2 = pd.StringDtype("pyarrow") + dtype3 = pd.StringDtype("pyarrow", na_value=np.nan) + + assert dtype1 == pd.StringDtype("python", na_value=pd.NA) + assert dtype1 != dtype2 + assert dtype1 != dtype3 + + assert dtype2 == pd.StringDtype("pyarrow", na_value=pd.NA) + assert dtype2 != dtype1 + assert dtype2 != dtype3 + + assert dtype3 == pd.StringDtype("pyarrow", na_value=np.nan) + assert dtype3 == pd.StringDtype("pyarrow", na_value=float("nan")) + assert dtype3 != dtype1 + assert dtype3 != dtype2 + + def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) if dtype.na_value is np.nan: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index b4858f02ad411..4628c5568b49b 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -96,9 +96,15 @@ def data_for_grouping(dtype, chunked): class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): - assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + if dtype.na_value is pd.NA: + # only the NA-variant supports parametrized string alias + assert dtype == f"string[{dtype.storage}]" + elif dtype.storage == "pyarrow": + # TODO(infer_string) deprecate this + assert dtype == "string[pyarrow_numpy]" + def test_is_not_string_type(self, dtype): # Different from BaseDtypeTests.test_is_not_string_type # because StringDtype is a string type From 4136c9eca61dd6f93a47ffd1c460b5a6c24ed18a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 18:13:01 +0200 Subject: [PATCH 11/14] fix pickling of dtype --- pandas/core/arrays/string_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a14f1dd840668..d9a6036aa4da9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,7 +120,7 @@ class StringDtype(StorageExtensionDtype): def na_value(self) -> libmissing.NAType | float: # type: ignore[override] return self._na_value - _metadata = ("storage", "na_value") + _metadata = ("storage", "_na_value") def __init__( self, @@ -177,6 +177,9 @@ def __hash__(self) -> int: # need to override __hash__ as well because of overriding __eq__ return super().__hash__() + def __reduce__(self): + return StringDtype, (self.storage, self.na_value) + @property def type(self) -> type[str]: return str From c33e14af9fc7228249adc10fe3dcd15f1fd8a51a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 18:15:14 +0200 Subject: [PATCH 12/14] fix test_convert_dtypes --- pandas/tests/frame/methods/test_convert_dtypes.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 4d2d3747a5298..9cbbebf35b2d1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -19,16 +19,13 @@ def test_convert_dtypes( if using_infer_string: string_storage = "pyarrow_numpy" - string_storage_option = string_storage - if string_storage == "pyarrow_numpy": - string_storage_option = "pyarrow" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), } ) - with pd.option_context("string_storage", string_storage_option): + with pd.option_context("string_storage", string_storage): result = df.convert_dtypes(True, True, convert_integer, False) expected = pd.DataFrame( { From 151e3d110964a62652cfb6a0c0f9872ddb9c7ab4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 18:19:35 +0200 Subject: [PATCH 13/14] update expected result for dtype='string' --- pandas/tests/series/test_constructors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 44a7862c21273..91cf1708ed43b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2113,9 +2113,12 @@ def test_series_string_inference_array_string_dtype(self): tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): - # GH#54793 + # https://github.com/pandas-dev/pandas/issues/54793 + # but after PDEP-14 (string dtype), it was decided to keep dtype="string" + # returning the NA string dtype, so expected is changed from + # "string[pyarrow_numpy]" to "string[pyarrow]" pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + expected = Series(["a", "b"], dtype="string[pyarrow]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") tm.assert_series_equal(result, expected) From 899e3fc96912f82fbc65956d512f7146a12bfefa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jul 2024 20:10:17 +0200 Subject: [PATCH 14/14] suppress typing error with _metadata attribute --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d9a6036aa4da9..cae770d85637c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -120,7 +120,7 @@ class StringDtype(StorageExtensionDtype): def na_value(self) -> libmissing.NAType | float: # type: ignore[override] return self._na_value - _metadata = ("storage", "_na_value") + _metadata = ("storage", "_na_value") # type: ignore[assignment] def __init__( self,