From 63a7fc57f1548ceb164708889dc5254f64981c23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 10:13:33 +0200 Subject: [PATCH 01/12] String dtype: implement object-dtype based StringArray variant with NumPy semantics --- pandas/_libs/lib.pyx | 2 +- pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/conftest.py | 2 + pandas/core/arrays/string_.py | 174 +++++++++++++++--- pandas/core/config_init.py | 6 +- pandas/core/construction.py | 4 +- pandas/tests/arrays/string_/test_string.py | 50 ++--- .../tests/arrays/string_/test_string_arrow.py | 4 +- pandas/tests/extension/test_string.py | 2 +- pandas/tests/series/test_constructors.py | 27 ++- pandas/tests/strings/__init__.py | 4 +- pandas/tests/strings/test_find_replace.py | 18 +- 13 files changed, 220 insertions(+), 77 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5b6d83ba8e9ee..2199071e7ec4f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2692,7 +2692,7 @@ def maybe_convert_objects(ndarray[object] objects, if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index caa00b205a29c..cb6cecb4bdf08 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,6 +26,7 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -189,6 +190,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p0", "pa_version_under14p1", "pa_version_under16p0", + "HAS_PYARROW", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 5a96e5a4cc49a..2e0135a41d94d 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") + HAS_PYARROW = True except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -25,3 +26,4 @@ pa_version_under14p1 = True pa_version_under15p0 = True pa_version_under16p0 = True + HAS_PYARROW = False diff --git a/pandas/conftest.py b/pandas/conftest.py index 21100178262c8..9890deb4084f8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1292,6 +1292,7 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ "python", + "python_numpy", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] @@ -1353,6 +1354,7 @@ def object_dtype(request): params=[ "object", "string[python]", + "string[python_numpy]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 291cc2e62be62..7a4fa46e83c53 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,7 +1,9 @@ from __future__ import annotations +import operator from typing import ( TYPE_CHECKING, + Any, ClassVar, Literal, cast, @@ -9,7 +11,10 @@ import numpy as np -from pandas._config 
import get_option +from pandas._config import ( + get_option, + using_pyarrow_string_dtype, +) from pandas._libs import ( lib, @@ -17,7 +22,10 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under10p1, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -81,7 +89,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow", "pyarrow_numpy"}, optional + storage : {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -113,7 +121,7 @@ class StringDtype(StorageExtensionDtype): # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] - if self.storage == "pyarrow_numpy": + if self.storage in ("pyarrow_numpy", "python_numpy"): return np.nan else: return libmissing.NA @@ -122,15 +130,17 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override] def __init__(self, storage=None) -> None: if storage is None: - infer_string = get_option("future.infer_string") - if infer_string: - storage = "pyarrow_numpy" + if using_pyarrow_string_dtype(): + if HAS_PYARROW: + storage = "pyarrow_numpy" + else: + storage = "python_numpy" else: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow", "pyarrow_numpy"}: + if storage not in {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " - f"Got {storage} instead." + "Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'" + f". Got {storage} instead." 
) if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: raise ImportError( @@ -178,6 +188,8 @@ def construct_from_string(cls, string) -> Self: return cls() elif string == "string[python]": return cls(storage="python") + elif string == "string[python_numpy]": + return cls(storage="python_numpy") elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": @@ -207,6 +219,8 @@ def construct_array_type( # type: ignore[override] return StringArray elif self.storage == "pyarrow": return ArrowStringArray + elif self.storage == "python_numpy": + return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -238,7 +252,7 @@ def __from_arrow__( # convert chunk by chunk to numpy and concatenate then, to avoid # overflow for large string data when concatenating the pyarrow arrays arr = arr.to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = ensure_string_array(arr, na_value=self.na_value) results.append(arr) if len(chunks) == 0: @@ -248,11 +262,7 @@ def __from_arrow__( # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) - NDArrayBacked.__init__( - new_string_array, - arr, - StringDtype(storage="python"), - ) + NDArrayBacked.__init__(new_string_array, arr, self) return new_string_array @@ -360,6 +370,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" + _storage = "python" def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -367,7 +378,7 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) + NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) def _validate(self) -> None: """Validate that we only store NA or strings.""" @@ -385,22 +396,41 @@ def _validate(self) -> None: else: lib.convert_nans_to_NA(self._ndarray) + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + if isna(value): + return self.dtype.na_value + elif not isinstance(value, str): + raise TypeError( + f"Cannot set non-string value '{value}' into a string array." 
+ ) + return value + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "python" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "python", + "python_numpy", + ) + else: + if get_option("future.infer_string"): + dtype = StringDtype(storage="python_numpy") + else: + dtype = StringDtype(storage="python") from pandas.core.arrays.masked import BaseMaskedArray + na_value = dtype.na_value if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = libmissing.NA + result[na_values] = na_value else: if lib.is_pyarrow_array(scalars): @@ -409,12 +439,12 @@ def _from_sequence( # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? new_string_array = cls.__new__(cls) - NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) + NDArrayBacked.__init__(new_string_array, result, dtype) return new_string_array @@ -464,7 +494,7 @@ def __setitem__(self, key, value) -> None: # validate new items if scalar_value: if isna(value): - value = libmissing.NA + value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( f"Cannot set non-string value '{value}' into a StringArray." @@ -478,7 +508,7 @@ def __setitem__(self, key, value) -> None: mask = isna(value) if mask.any(): value = value.copy() - value[isna(value)] = libmissing.NA + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) @@ -591,9 +621,9 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") - result[mask] = libmissing.NA + result[mask] = self.dtype.na_value result[valid] = op(self._ndarray[valid], other) - return StringArray(result) + return self._from_backing_data(result) else: # logical result = np.zeros(len(self._ndarray), dtype="bool") @@ -662,3 +692,97 @@ def _str_map( # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) + + +class StringArrayNumpySemantics(StringArray): + _storage = "python_numpy" + + @classmethod + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy: bool = False + ) -> Self: + if dtype is None: + dtype = StringDtype(storage="python_numpy") + return super()._from_sequence(scalars, dtype=dtype, copy=copy) + + def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray: + # need to overrde NumpyExtensionArray._from_backing_data to ensure + # we always preserve the dtype + return NDArrayBacked._from_backing_data(self, arr) + + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: + # the masked_reductions use pd.NA + if result is libmissing.NA: + return np.nan + return super()._wrap_reduction_result(axis, result) + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self._ndarray, sort=False, dropna=dropna) + result.index = result.index.astype(self.dtype) + return result + + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + convert = convert and not np.all(mask) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + # if is_integer_dtype(dtype): + # na_value = np.nan + # else: + # na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 46c9139c3456c..c31834253a2dc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -460,7 +460,9 @@ def is_terminal() -> bool: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + validator=is_one_of_factory( + ["python", "pyarrow", "python_numpy", "pyarrow_numpy"] + ), ) @@ -858,7 +860,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2718e9819cdf8..09985cfe61e28 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -581,7 +581,7 @@ def sanitize_array( ): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype("pyarrow_numpy") + dtype = StringDtype() data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data @@ -622,7 +622,7 @@ def sanitize_array( elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow_numpy") + dtype = StringDtype() subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 597b407a29c94..e63f5d85afef1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -14,6 +14,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.string_ import StringArrayNumpySemantics from pandas.core.arrays.string_arrow import ( ArrowStringArray, ArrowStringArrayNumpySemantics, @@ -21,7 +22,7 @@ def na_val(dtype): - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): return np.nan else: return pd.NA @@ -41,13 +42,13 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = " A\n0 a\n1 NaN\n2 b" else: expected = " A\n0 a\n1 \n2 b" assert repr(df) == expected - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" else: expected = "0 a\n1 \n2 b\nName: A, dtype: string" @@ -59,6 +60,9 @@ def test_repr(dtype): elif dtype.storage == "pyarrow_numpy": arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + elif dtype.storage == "python_numpy": + arr_name = "StringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" @@ -74,14 +78,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if cls is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Cannot set non-string value '10' into a StringArray." 
else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if cls is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Must provide strings." else: msg = "Scalar must be NA or str" @@ -225,7 +229,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = np.array([getattr(item, op_name)(other) for item in a]) if comparison_op == operator.ne: expected[1] = True @@ -244,7 +248,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): if operator.ne == comparison_op: expected = np.array([True, True, True]) else: @@ -271,7 +275,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): expected_data = { "__eq__": [False, False, False], "__ne__": [True, True, True], @@ -293,7 +297,7 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): if operator.ne == comparison_op: expected = np.array([True, True, False]) else: @@ -321,7 +325,7 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: msg = "StringArray requires a sequence of strings or pandas.NA" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -332,7 +336,7 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - if cls is pd.arrays.StringArray: + if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs # for string dtype cls(np.array(["a", np.nan], dtype=object)) @@ -387,7 +391,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - if dtype.storage == "pyarrow_numpy": + if dtype.storage in ("python_numpy", "pyarrow_numpy"): err = ValueError msg = "cannot convert float NaN to integer" else: @@ -492,7 +496,7 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -502,7 +506,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": + if using_infer_string and string_storage2 not in ("python_numpy", "pyarrow_numpy"): request.applymarker( pytest.mark.xfail( reason="infer_string takes precedence over string storage" @@ -512,7 +516,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = 
pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -542,7 +546,7 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage == "python": + if dtype.storage in ("python", "python_numpy"): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -558,7 +562,7 @@ def test_arrow_load_from_zero_chunks( def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = "int64" else: exp_dtype = "Int64" @@ -575,7 +579,7 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = np.float64 else: exp_dtype = "Float64" @@ -588,7 +592,7 @@ def test_value_counts_with_normalize(dtype): def test_value_counts_sort_false(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" - elif getattr(dtype, "storage", "") == "pyarrow_numpy": + elif getattr(dtype, "storage", "") in ("python_numpy", "pyarrow_numpy"): exp_dtype = "int64" else: exp_dtype = "Int64" @@ -641,7 +645,11 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - expected = pd.Series([True, False, True]) + if dtype.storage == "python_numpy": + # TODO what do we want here? + expected = pd.Series([True, False, False]) + else: + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -665,7 +673,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if type(ser.array) is pd.arrays.StringArray: + if dtype.storage in ("python", "python_numpy"): msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 405c1c217b04d..90ca48a6a469d 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,7 +27,7 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": + if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"): request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage @@ -260,6 +260,6 @@ def test_pickle_roundtrip(dtype): def test_string_dtype_error_message(): # GH#55051 pytest.importorskip("pyarrow") - msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + msg = "Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'." 
with pytest.raises(ValueError, match=msg): StringDtype("bla") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 49ad3fce92a5c..7756ab40ebe56 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -200,7 +200,7 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): cast_to = dtype elif dtype.storage == "pyarrow": cast_to = "boolean[pyarrow]" # type: ignore[assignment] - elif dtype.storage == "pyarrow_numpy": + elif dtype.storage in ("python_numpy", "pyarrow_numpy"): cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" # type: ignore[assignment] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..d9c45e1498d5c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -2079,11 +2080,10 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) expected = Series(["a", 1], dtype="object") @@ -2094,35 +2094,34 @@ def test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_scalar(self): # GH#54430 - pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) def test_series_string_inference_storage_definition(self): # GH#54793 - pytest.importorskip("pyarrow") - expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series(["a", "b"], dtype="string") + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_series_constructor_infer_string_scalar(self): @@ -2135,10 +2134,10 @@ def test_series_constructor_infer_string_scalar(self): def 
test_series_string_inference_na_first(self): # GH#55655 - pytest.importorskip("pyarrow") - expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) + dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) def test_inference_on_pandas_objects(self): diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 01b49b5e5b633..4b3cc125fdf7d 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -2,12 +2,12 @@ import pandas as pd -object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") +object_pyarrow_numpy = ("object", "string[python_numpy]", "string[pyarrow_numpy]") def _convert_na_value(ser, expected): if ser.dtype != object: - if ser.dtype.storage == "pyarrow_numpy": + if ser.dtype.storage in ("python_numpy", "pyarrow_numpy"): expected = expected.fillna(np.nan) else: # GH#18463 diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index fb308b72e47f5..2ac362fbe1aea 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -220,14 +220,18 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": - expected = Series([True, True, True], dtype=np.bool_) + if any_string_dtype == "string[python_numpy]": + with pytest.raises(TypeError): + result = s.str.contains("foo", na="foo") else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) + else: + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" From 0eee6254c198266687fe6ebe02ef385cacbd31c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:27:13 +0200 Subject: [PATCH 02/12] fix constructor to not convert to NA --- pandas/_testing/asserters.py | 10 ++++++++++ pandas/core/arrays/string_.py | 13 +++++++++++++ pandas/tests/arrays/string_/test_string.py | 6 +++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 543d7944e4c5d..ba7c28ba18c53 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -805,6 +805,16 @@ def assert_extension_array_equal( left_na, right_na, obj=f"{obj} NA mask", index_values=index_values ) + # Specifically for StringArrayNumpySemantics, validate here we have a valid array + if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": + assert np.all( + [np.isnan(val) for val in left._ndarray[left_na]] + ), "wrong missing value sentinels" + if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": + assert np.all( + [np.isnan(val) for val in right._ndarray[right_na]] + ), "wrong missing 
value sentinels" + left_valid = left[~left_na].to_numpy(dtype=object) right_valid = right[~right_na].to_numpy(dtype=object) if check_exact: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a4fa46e83c53..f23a34f3298dd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -697,6 +697,19 @@ def _str_map( class StringArrayNumpySemantics(StringArray): _storage = "python_numpy" + def _validate(self) -> None: + """Validate that we only store NaN or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN" + ) + if self._ndarray.dtype != "object": + raise ValueError( + "StringArrayNumpySemantics requires a sequence of strings or NaN. Got " + f"'{self._ndarray.dtype}' dtype instead." + ) + # TODO validate or force NA/None to NaN + @classmethod def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e63f5d85afef1..e88beb73bacff 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -325,8 +325,10 @@ def test_comparison_methods_array(comparison_op, dtype): def test_constructor_raises(cls): - if cls is pd.arrays.StringArray or cls is StringArrayNumpySemantics: + if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" + elif cls is StringArrayNumpySemantics: + msg = "StringArrayNumpySemantics requires a sequence of strings or NaN" else: msg = "Unsupported type '' for ArrowExtensionArray" @@ -377,6 +379,8 @@ def test_from_sequence_no_mutate(copy, cls, dtype): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) + elif cls is StringArrayNumpySemantics: + expected = cls(nan_arr) else: expected = cls(na_arr) From 607b95e376528ca690988716edf8034158f227c1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:34:49 +0200 Subject: [PATCH 03/12] fix typing --- pandas/_testing/asserters.py | 4 ++-- pandas/core/arrays/string_.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ba7c28ba18c53..927003b0e955b 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -808,11 +808,11 @@ def assert_extension_array_equal( # Specifically for StringArrayNumpySemantics, validate here we have a valid array if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": assert np.all( - [np.isnan(val) for val in left._ndarray[left_na]] + [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": assert np.all( - [np.isnan(val) for val in right._ndarray[right_na]] + [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" left_valid = left[~left_na].to_numpy(dtype=object) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f23a34f3298dd..8aec02d2902aa 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -718,7 +718,7 @@ def _from_sequence( dtype = StringDtype(storage="python_numpy") return super()._from_sequence(scalars, dtype=dtype, copy=copy) - def _from_backing_data(self, arr: 
np.ndarray) -> NumpyExtensionArray: + def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # need to overrde NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) From bca157daeb757f80e684d0bc1d36c3f3c0a6d45b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 27 Apr 2024 20:55:56 +0200 Subject: [PATCH 04/12] improve logic in str_map --- pandas/core/arrays/string_.py | 48 ++++++++++++++++------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8aec02d2902aa..224bea4ac7b91 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -760,32 +760,28 @@ def _str_map( convert = convert and not np.all(mask) if is_integer_dtype(dtype) or is_bool_dtype(dtype): - # if is_integer_dtype(dtype): - # na_value = np.nan - # else: - # na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result + na_value_is_na = isna(na_value) + if na_value_is_na: + if is_integer_dtype(dtype): + na_value = 0 + else: + na_value = True + + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(cast(type, dtype)), + ) + if na_value_is_na and mask.any(): + if is_integer_dtype(dtype): + result = result.astype("float64") + else: + result = result.astype("object") + result[mask] = np.nan + return result elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. 
StringDtype From ab96aa4734998dd8e44c75f3951973f7c8b24743 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 09:28:47 +0200 Subject: [PATCH 05/12] remove most usage of python_numpy --- pandas/_testing/asserters.py | 20 +++++++++--- pandas/core/arrays/string_.py | 32 ++++++++++++------- pandas/tests/arrays/string_/test_string.py | 16 +++++----- .../tests/arrays/string_/test_string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 2 +- 5 files changed, 46 insertions(+), 26 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 82fc4d51c5724..f627272de781b 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -578,13 +578,17 @@ def raise_assert_detail( if isinstance(left, np.ndarray): left = pprint_thing(left) - elif isinstance(left, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(left, (CategoricalDtype, NumpyEADtype)): left = repr(left) + elif isinstance(left, StringDtype): + left = f"StringDtype(storage={left.storage}, na_value={left.na_value})" if isinstance(right, np.ndarray): right = pprint_thing(right) - elif isinstance(right, (CategoricalDtype, NumpyEADtype, StringDtype)): + elif isinstance(right, (CategoricalDtype, NumpyEADtype)): right = repr(right) + elif isinstance(right, StringDtype): + right = f"StringDtype(storage={right.storage}, na_value={right.na_value})" msg += f""" [left]: {left} @@ -791,11 +795,19 @@ def assert_extension_array_equal( ) # Specifically for StringArrayNumpySemantics, validate here we have a valid array - if isinstance(left.dtype, StringDtype) and left.dtype.storage == "python_numpy": + if ( + isinstance(left.dtype, StringDtype) + and left.dtype.storage == "python" + and left.dtype.na_value is np.nan + ): assert np.all( [np.isnan(val) for val in left._ndarray[left_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" - if isinstance(right.dtype, StringDtype) and right.dtype.storage == "python_numpy": + if ( + isinstance(right.dtype, StringDtype) + and right.dtype.storage == "python" + and right.dtype.na_value is np.nan + ): assert np.all( [np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined] ), "wrong missing value sentinels" diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 775e6cf47d1e8..b1751a24f1c4f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -146,6 +146,10 @@ def __init__( # TODO raise a deprecation warning storage = "pyarrow" na_value = np.nan + if storage == "python_numpy": + # TODO remove + storage = "python" + na_value = np.nan # validate options if storage not in {"python", "pyarrow"}: @@ -229,7 +233,8 @@ def construct_from_string(cls, string) -> Self: elif string == "string[python]": return cls(storage="python") elif string == "string[python_numpy]": - return cls(storage="python_numpy") + # TODO remove + return cls(storage="python", na_value=np.nan) elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": @@ -256,11 +261,11 @@ def construct_array_type( # type: ignore[override] ArrowStringArrayNumpySemantics, ) - if self.storage == "python": + if self.storage == "python" and self._na_value is libmissing.NA: return StringArray elif self.storage == "pyarrow" and self._na_value is libmissing.NA: return ArrowStringArray - elif self.storage == "python_numpy": + elif self.storage == "python": return StringArrayNumpySemantics else: return ArrowStringArrayNumpySemantics @@ -416,6 +421,7 @@ 
class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" _storage = "python" + _na_value = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) @@ -423,7 +429,11 @@ def __init__(self, values, copy: bool = False) -> None: super().__init__(values, copy=copy) if not isinstance(values, type(self)): self._validate() - NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage)) + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(storage=self._storage, na_value=self._na_value), + ) def _validate(self) -> None: """Validate that we only store NA or strings.""" @@ -457,13 +467,10 @@ def _from_sequence( ) -> Self: if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage in ( - "python", - "python_numpy", - ) + assert isinstance(dtype, StringDtype) and dtype.storage == "python" else: - if get_option("future.infer_string"): - dtype = StringDtype(storage="python_numpy") + if using_string_dtype(): + dtype = StringDtype(storage="python", na_value=np.nan) else: dtype = StringDtype(storage="python") @@ -749,7 +756,8 @@ def _str_map( class StringArrayNumpySemantics(StringArray): - _storage = "python_numpy" + _storage = "python" + _na_value = np.nan def _validate(self) -> None: """Validate that we only store NaN or strings.""" @@ -769,7 +777,7 @@ def _from_sequence( cls, scalars, *, dtype: Dtype | None = None, copy: bool = False ) -> Self: if dtype is None: - dtype = StringDtype(storage="python_numpy") + dtype = StringDtype(storage="python", na_value=np.nan) return super()._from_sequence(scalars, dtype=dtype, copy=copy) def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index bde5aaa8ecf5b..62632d4cf17b7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -74,7 +74,7 @@ def test_repr(dtype): elif dtype.storage == "pyarrow" and dtype.na_value is np.nan: arr_name = "ArrowStringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" - elif dtype.storage == "python_numpy": + elif dtype.storage == "python" and dtype.na_value is np.nan: arr_name = "StringArrayNumpySemantics" expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: @@ -92,14 +92,14 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Cannot set non-string value '10' into a StringArray." else: msg = "Scalar must be NA or str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Must provide strings." 
else: msg = "Scalar must be NA or str" @@ -514,7 +514,7 @@ def test_arrow_array(dtype): expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage == "pyarrow" and pa_version_under12p0: expected = pa.chunked_array(expected) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": expected = pc.cast(expected, pa.string()) assert arr.equals(expected) @@ -534,7 +534,7 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -564,7 +564,7 @@ def test_arrow_load_from_zero_chunks( data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" @@ -663,7 +663,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python_numpy": + if dtype.storage == "python" and dtype.na_value is np.nan: # TODO what do we want here? expected = pd.Series([True, False, False]) else: @@ -691,7 +691,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if dtype.storage in ("python", "python_numpy"): + if dtype.storage == "python": msg = "Cannot set non-string value" else: msg = "Scalar must be NA or str" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a2a17c9d3b938..fca8a0b39135b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -29,7 +29,7 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): if using_infer_string and string_storage in ("python_numpy", "pyarrow_numpy"): request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if string_storage in ("pyarrow_numpy", "python_numpy"): request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 64b383ded97b5..a747b9c30bb7f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -192,7 +192,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.na_value is np.nan # type: ignore[union-attr] + or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr] and op_name in ("any", "all") ) From bae8d65b5f0b97511b2320fa0d133905e8e0e1ba Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 30 Jul 2024 09:54:09 +0200 Subject: [PATCH 06/12] update tests to avoid string[python_numpy] --- pandas/tests/series/test_constructors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 113b3b9f4a93b..e66dc824b059c 100644 --- 
a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2077,7 +2077,7 @@ def test_series_string_inference(self): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2091,7 +2091,7 @@ def test_series_string_with_na_inference(self, na_value): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", None], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2099,7 +2099,7 @@ def test_series_string_inference_scalar(self): # GH#54430 with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series("a", index=[1], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2107,7 +2107,7 @@ def test_series_string_inference_array_string_dtype(self): # GH#54496 with pd.option_context("future.infer_string", True): ser = Series(np.array(["a", "b"])) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series(["a", "b"], dtype=dtype) tm.assert_series_equal(ser, expected) @@ -2134,7 +2134,7 @@ def test_series_string_inference_na_first(self): # GH#55655 with pd.option_context("future.infer_string", True): result = Series([pd.NA, "b"]) - dtype = "string[pyarrow_numpy]" if HAS_PYARROW else "string[python_numpy]" + dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan) expected = Series([None, "b"], dtype=dtype) tm.assert_series_equal(result, expected) From 864c166751087aa20a1be8c93321583f12ea61f6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 21:12:10 +0200 Subject: [PATCH 07/12] remove all python_numpy usage --- pandas/conftest.py | 2 +- pandas/core/arrays/string_.py | 7 ------- pandas/tests/strings/test_find_replace.py | 18 +++++++----------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index c6f0afd90d036..7c485515f0784 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1295,7 +1295,6 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ "python", - "python_numpy", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), ] ) @@ -1314,6 +1313,7 @@ def string_storage(request): ("python", pd.NA), pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), ] ) def string_dtype_arguments(request): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2040ebf992981..59621647ea6bd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -146,10 +146,6 @@ def __init__( # TODO raise a deprecation warning storage = "pyarrow" na_value = np.nan - if storage == "python_numpy": - # TODO remove - storage = "python" - na_value = np.nan # validate options if storage not in {"python", "pyarrow"}: @@ -234,9 +230,6 
@@ def construct_from_string(cls, string) -> Self: return cls() elif string == "string[python]": return cls(storage="python") - elif string == "string[python_numpy]": - # TODO remove - return cls(storage="python", na_value=np.nan) elif string == "string[pyarrow]": return cls(storage="pyarrow") elif string == "string[pyarrow_numpy]": diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 4c977dd640ac1..00677ef4fcfe9 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -232,18 +232,14 @@ def test_contains_nan(any_string_dtype): expected = Series([True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) - if any_string_dtype == "string[python_numpy]": - with pytest.raises(TypeError): - result = s.str.contains("foo", na="foo") + result = s.str.contains("foo", na="foo") + if any_string_dtype == "object": + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype.na_value is np.nan: + expected = Series([True, True, True], dtype=np.bool_) else: - result = s.str.contains("foo", na="foo") - if any_string_dtype == "object": - expected = Series(["foo", "foo", "foo"], dtype=np.object_) - elif any_string_dtype == "string[pyarrow_numpy]": - expected = Series([True, True, True], dtype=np.bool_) - else: - expected = Series([True, True, True], dtype="boolean") - tm.assert_series_equal(result, expected) + expected = Series([True, True, True], dtype="boolean") + tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected_dtype = ( From d3ad7b02208a036635455f953da514591f63fdcd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Aug 2024 16:39:19 +0200 Subject: [PATCH 08/12] remove hardcoded storage --- pandas/core/dtypes/cast.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/io/_util.py | 4 ++-- pandas/io/pytables.py | 7 ++++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d750451a1ca84..162f6a4d30f3f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: if using_string_dtype(): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) elif isinstance(val, (np.datetime64, dt.datetime)): try: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 08e1650a5de12..535397871588c 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -302,7 +302,7 @@ def ndarray_to_mgr( nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] elif dtype is None and values.dtype.kind == "U" and using_string_dtype(): - dtype = StringDtype(storage="pyarrow", na_value=np.nan) + dtype = StringDtype(na_value=np.nan) obj_columns = list(values) block_values = [ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index a72a16269959d..f502f827faa4e 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -34,6 +34,6 @@ def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") return { - pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), - pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan), + pa.string(): pd.StringDtype(na_value=np.nan), + pa.large_string(): pd.StringDtype(na_value=np.nan), }.get diff --git 
a/pandas/io/pytables.py b/pandas/io/pytables.py index 4b569fb7e39e2..618254fee9259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -75,6 +75,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, concat, isna, @@ -3295,7 +3296,7 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - result = result.astype("string[pyarrow_numpy]") + result = result.astype(StringDtype(na_value=np.nan)) return result def write(self, obj, **kwargs) -> None: @@ -3364,7 +3365,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_string_dtype() and is_string_array(values, skipna=True): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) if len(dfs) > 0: @@ -4741,7 +4742,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - df = df.astype("string[pyarrow_numpy]") + df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: From 028dc2c6a37c2a5876646df66965cdb4434a651c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Aug 2024 16:52:42 +0200 Subject: [PATCH 09/12] implement any/all reductions --- pandas/core/arrays/string_.py | 16 +++++++++++++++- pandas/tests/extension/test_string.py | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 59621647ea6bd..c3724be2181b3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -43,7 +43,10 @@ pandas_dtype, ) -from pandas.core import ops +from pandas.core import ( + nanops, + ops, +) from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -780,6 +783,17 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + if name == "any": + return nanops.nanany(self._ndarray, skipna=skipna) + else: + return nanops.nanall(self._ndarray, skipna=skipna) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: # the masked_reductions use pd.NA if result is libmissing.NA: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 9ee506f560a32..2ab248787a1cf 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -193,7 +193,7 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or (ser.dtype.storage == "pyarrow" and ser.dtype.na_value is np.nan) # type: ignore[union-attr] + or ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) From 7f4baf79bd7633825df230920ffb9ef7008ceeb2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 3 Aug 2024 12:42:48 +0200 Subject: [PATCH 10/12] fix typing --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c3724be2181b3..0a8988eeca480 100644 --- 
a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -419,7 +419,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc] # undo the NumpyExtensionArray hack _typ = "extension" _storage = "python" - _na_value = libmissing.NA + _na_value: libmissing.NAType | float = libmissing.NA def __init__(self, values, copy: bool = False) -> None: values = extract_array(values) From fe6fce693f51aed7c095d1e2ba78f34c581b809e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Aug 2024 14:36:02 +0200 Subject: [PATCH 11/12] Update pandas/core/arrays/string_.py Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0a8988eeca480..4fa33977b579d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -779,7 +779,7 @@ def _from_sequence( return super()._from_sequence(scalars, dtype=dtype, copy=copy) def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics: - # need to overrde NumpyExtensionArray._from_backing_data to ensure + # need to override NumpyExtensionArray._from_backing_data to ensure # we always preserve the dtype return NDArrayBacked._from_backing_data(self, arr) From 70325d4d9bf5003ea82d3929ff9fa718261e8b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Aug 2024 14:45:47 +0200 Subject: [PATCH 12/12] update todo comment --- pandas/tests/arrays/string_/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f57a0f804dc02..3688d2998b3c7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -653,7 +653,7 @@ def test_isin(dtype, fixed_now_ts): result = s.isin(["a", pd.NA]) if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO what do we want here? + # TODO(infer_string) we should make this consistent expected = pd.Series([True, False, False]) else: expected = pd.Series([True, False, True])
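
For readers trying out the series, here is a minimal, illustrative usage sketch (not itself part of any patch), assuming a pandas build with these commits applied. It exercises the object-dtype backed variant directly via the NaN-valued StringDtype spelling used in the test updates above; the printed values are approximate and depend on whether pyarrow is installed.

```python
import numpy as np
import pandas as pd

# Request the object-dtype backed string array with NumPy semantics by
# constructing the dtype with np.nan as its missing-value sentinel
# (the spelling used in the constructor tests in this series).
dtype = pd.StringDtype(storage="python", na_value=np.nan)

arr = pd.array(["a", None, "c"], dtype=dtype)
print(arr)               # <StringArrayNumpySemantics> ['a', nan, 'c'] ... dtype: string

# Missing entries are stored as np.nan rather than pd.NA.
print(np.isnan(arr[1]))  # True

# Comparisons return plain NumPy boolean arrays; missing positions become
# False (True for !=), mirroring the pyarrow-backed "numpy semantics" variant.
print(arr == "a")        # [ True False False]

# With the future option enabled, plain string data is inferred to a
# NaN-valued string dtype: pyarrow-backed when pyarrow is installed,
# otherwise this object-dtype variant.
with pd.option_context("future.infer_string", True):
    ser = pd.Series(["x", "y", None])
    print(ser.dtype)     # string
```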