From f4441058c72cf455daf4a42fe45696f9d7e7ef09 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 13:04:03 +0200 Subject: [PATCH 01/21] String dtype: map builtin str alias to StringDtype --- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/datetimelike.py | 10 ++++++++-- pandas/core/dtypes/base.py | 6 ++++++ pandas/tests/arrays/floating/test_astype.py | 6 ++---- pandas/tests/arrays/integer/test_dtypes.py | 6 ++---- pandas/tests/dtypes/test_common.py | 12 ++++++++++++ pandas/tests/frame/test_constructors.py | 12 +++++++++--- pandas/tests/series/test_constructors.py | 2 +- 8 files changed, 41 insertions(+), 15 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 5fa1a984b8aea..0be01da1816a2 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -108,7 +108,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fbe1677b95b33..94a57d30020f3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + arr_object = self._format_native_types() + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return arr_object + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d8a42d83b6c54..a23d1ac847b82 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -562,6 +562,12 @@ def find( return the first matching dtype, otherwise return None """ if not isinstance(dtype, str): + # builtin aliases + if dtype is str: + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + dtype_type: type_t if not isinstance(dtype, type): dtype_type = type(dtype) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fadd7ac67b58d..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 2c2dff7a957fe..e338fb1331734 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..ad78e75c6a400 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -82,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. "[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self): @@ -1766,12 +1766,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1771a4dfdb71f..69f42b5e42878 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) From 630d41c2f8420ec33323f3ef7328411cbe10a130 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 15:55:30 +0200 Subject: [PATCH 02/21] fix tests --- pandas/core/arrays/categorical.py | 10 ++++++- pandas/core/dtypes/base.py | 4 ++- pandas/core/indexes/base.py | 8 +++-- pandas/core/indexes/interval.py | 3 +- pandas/core/strings/object_array.py | 1 + pandas/tests/frame/methods/test_astype.py | 8 ++--- .../indexes/datetimes/methods/test_astype.py | 10 +++---- .../tests/indexes/interval/test_indexing.py | 4 +-- pandas/tests/indexes/object/test_astype.py | 4 +-- .../indexes/period/methods/test_astype.py | 2 +- .../indexes/timedeltas/methods/test_astype.py | 2 +- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 6 ++-- .../io/parser/dtypes/test_dtypes_basic.py | 15 ++++++---- pandas/tests/series/methods/test_astype.py | 30 +++++++++++-------- 15 files changed, 67 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..a4a7900342f5d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -13,7 +13,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_string_dtype, +) from pandas._libs import ( NaT, @@ -2685,6 +2688,11 @@ def _str_get_dummies(self, sep: str = "|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray + if using_string_dtype(): + return NumpyExtensionArray(self.astype(str).to_numpy())._str_get_dummies( + sep + ) + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) # ------------------------------------------------------------------------ diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a23d1ac847b82..42b51b11ad41f 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import missing as libmissing from pandas._libs.hashtable import object_hash from pandas._libs.properties import cache_readonly @@ -563,7 +565,7 @@ def find( """ if not isinstance(dtype, str): # builtin aliases - if dtype is str: + if dtype is str and using_string_dtype(): from pandas.core.arrays.string_ import StringDtype return StringDtype(na_value=np.nan) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 582e1f96fa562..0a52a3691bc82 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3658,7 +3658,6 @@ def get_indexer( method = clean_reindex_fill_method(method) orig_target = target target = self._maybe_cast_listlike_indexer(target) - self._check_indexing_method(method, limit, tolerance) if not self._index_as_unique: @@ -6261,7 +6260,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ @@ -7727,6 +7730,7 @@ def get_values_for_csv( values = cast("IntervalArray", values) mask = values.isna() if not quoting: + # TODO result = np.asarray(values).astype(str) else: result = np.array(values, dtype=object, copy=True) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 359cdf880937b..8feac890883eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -51,6 +51,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -712,7 +713,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c6b18d7049c57..28bd943688b79 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -405,6 +405,7 @@ def _str_get_dummies(self, sep: str = "|"): try: arr = sep + arr + sep except (TypeError, NotImplementedError): + # TODO arr = sep + arr.astype(str) + sep tags: set[str] = set() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 8647df0e8ad96..65a53ccb95141 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -168,7 +168,7 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) @@ -176,13 +176,13 @@ def test_astype_str(self): def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame(["nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 81dc3b3ecc45e..03957053199a8 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -107,7 +107,7 @@ def test_astype_str_nat(self): idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype="str") tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -117,7 +117,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -132,7 +132,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -143,7 +143,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -155,7 +155,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 787461b944bd0..00d01c47251e3 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -365,9 +365,9 @@ def test_get_indexer_datetime(self): # TODO: with mismatched resolution get_indexer currently raises; # this should probably coerce? target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]") - result = ii.get_indexer(target) + # result = ii.get_indexer(target) expected = np.array([0], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) + # tm.assert_numpy_array_equal(result, expected) result = ii.get_indexer(target.astype(str)) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..1f9dad7c972d4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -41,7 +41,7 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + expected = Index([str(x) for x in idx], name="idx", dtype="str") tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..1a855e728b0e7 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -61,7 +61,7 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + expected = Index([str(x) for x in idx], name="idx", dtype="str") tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 76910db941d36..6885f2d6d0ee8 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b831ec3bb2c6a..3989e022dbbd2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), }, ), ], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 07f29518b7881..1e35a62e08c96 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -566,7 +568,7 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 pytest.importorskip("pyarrow") @@ -578,10 +580,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = "str" if dtype is str and using_infer_string else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), ) @@ -592,7 +595,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), }, columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 579d41f964df0..4a7e204ee4161 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype): def test_astype_str_map(self, dtype, data, using_infer_string): # see GH#4405 series = Series(data) + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=any_float_dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, any_float_dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) From d127770052be4544468d3e50cb465b8a089dff30 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 16:59:46 +0200 Subject: [PATCH 03/21] fix datetimelike astype and more tests --- pandas/_libs/lib.pyx | 6 +++++- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_arrow.py | 20 ------------------- pandas/tests/frame/methods/test_astype.py | 11 +++++----- .../tests/indexes/interval/test_indexing.py | 4 ++-- .../io/parser/dtypes/test_dtypes_basic.py | 2 +- 8 files changed, 18 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..0a51dcf117331 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -750,7 +750,11 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 94a57d30020f3..88aad1565f540 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -472,12 +472,12 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) elif is_string_dtype(dtype): - arr_object = self._format_native_types() if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) cls = dtype.construct_array_type() return cls._from_sequence(arr_object, dtype=dtype, copy=False) else: - return arr_object + return self._format_native_types() elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 143a13c54dbbb..28d315a28919e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -625,7 +625,7 @@ def _from_sequence( # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=True) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e8e74b0ba1215..68d7606cc3b3f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -200,7 +200,7 @@ def _from_sequence( return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + result = lib.ensure_string_array(scalars, copy=True) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3dbdda388d035..028409e720129 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -45,7 +45,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -312,25 +311,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 65a53ccb95141..ab3743283ea13 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -173,10 +173,10 @@ def test_astype_str(self): tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="str") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) @@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 00d01c47251e3..787461b944bd0 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -365,9 +365,9 @@ def test_get_indexer_datetime(self): # TODO: with mismatched resolution get_indexer currently raises; # this should probably coerce? target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]") - # result = ii.get_indexer(target) + result = ii.get_indexer(target) expected = np.array([0], dtype=np.intp) - # tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) result = ii.get_indexer(target.astype(str)) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1e35a62e08c96..256b09c62ec76 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -580,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) - expected_dtype = "str" if dtype is str and using_infer_string else object + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), From 38d011a7463629b84b18d84d6597e4fe4352f60b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 17:15:01 +0200 Subject: [PATCH 04/21] remove xfails --- pandas/tests/extension/json/array.py | 3 +-- pandas/tests/frame/methods/test_select_dtypes.py | 5 ++++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 -- pandas/tests/io/parser/test_na_values.py | 2 -- pandas/tests/io/parser/test_python_parser_only.py | 6 ++---- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 3a4391edc99ef..4fa48023fbc95 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -208,9 +208,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 256b09c62ec76..a9c3ae8f0fba4 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -302,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -318,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 360a5feebe073..f0a04a46c6f54 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -719,7 +718,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 26480010fc687..a5bb151e84f47 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ From c4ed9d360d902cc985e00a1f611d52e3d0a09238 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 17:22:51 +0200 Subject: [PATCH 05/21] try fix typing --- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a4a7900342f5d..0a45cdb116e8f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -544,7 +544,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: return res @overload - def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> ArrayLike: ... @overload def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 88aad1565f540..7be8daa09c758 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -473,7 +473,7 @@ def astype(self, dtype, copy: bool = True): elif is_string_dtype(dtype): if isinstance(dtype, ExtensionDtype): - arr_object = self._format_native_types(na_rep=dtype.na_value) + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] cls = dtype.construct_array_type() return cls._from_sequence(arr_object, dtype=dtype, copy=False) else: From cad7d8f97be875ba987583e03e20148d9f2c5b12 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 17:31:26 +0200 Subject: [PATCH 06/21] fix copy_view tests --- pandas/tests/copy_view/test_astype.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..d6046f86be79f 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -92,7 +92,12 @@ def test_astype_string_and_object(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df_orig = df.copy() df2 = df.astype(new_dtype) - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if new_dtype == "string": + # cast to string has to copy to avoid mutating the original during + # the call to ensure_string_array -> never a delayed copy + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = "x" tm.assert_frame_equal(df, df_orig) @@ -105,7 +110,12 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df2 = df.astype(new_dtype) df_orig = df2.copy() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if new_dtype == "string": + # cast to string has to copy to avoid mutating the original during + # the call to ensure_string_array -> never a delayed copy + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df.iloc[0, 0] = "x" tm.assert_frame_equal(df2, df_orig) @@ -220,7 +230,7 @@ def test_convert_dtypes(): df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) From 189e26d77799e9f3082d4b7f7604653b4346ea41 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 19:24:26 +0200 Subject: [PATCH 07/21] fix remaining tests with infer_string enabled --- pandas/core/arrays/categorical.py | 6 +++--- pandas/tests/arrays/sparse/test_astype.py | 4 ++-- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/extension/base/casting.py | 4 ++-- pandas/tests/indexes/datetimes/methods/test_astype.py | 7 +++++-- pandas/tests/indexes/period/methods/test_astype.py | 9 +++++++-- pandas/tests/indexes/test_base.py | 3 --- pandas/tests/indexes/timedeltas/methods/test_astype.py | 9 +++++++-- pandas/tests/series/methods/test_map.py | 4 +--- pandas/tests/test_algos.py | 7 +++++-- 10 files changed, 33 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0a45cdb116e8f..cddb70dad6dba 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2689,9 +2689,9 @@ def _str_get_dummies(self, sep: str = "|"): from pandas.core.arrays import NumpyExtensionArray if using_string_dtype(): - return NumpyExtensionArray(self.astype(str).to_numpy())._str_get_dummies( - sep - ) + return NumpyExtensionArray( + self.astype(str).to_numpy(na_value="NaN") + )._str_get_dummies(sep) return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 1819744d9a9ae..6143163735ab8 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index e924e38ee5030..8e3f21e1a4f56 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -44,8 +44,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 03957053199a8..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype="str") + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index 1f9dad7c972d4..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype="str") + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7ec66100b7291..cb4e922977d5e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -76,9 +76,6 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 1a855e728b0e7..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype="str") + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fe84ffafa70b4..7fa8686fcc6c8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -549,13 +549,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06fd81ed722d9..696ea75cd8995 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1877,13 +1877,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 1089eb30948e1f2d5270198d23006b3c4583e5a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 19:41:56 +0200 Subject: [PATCH 08/21] ignore typing issue for now --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cddb70dad6dba..19052701a59ab 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -544,7 +544,7 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: return res @overload - def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> ArrayLike: ... + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @overload def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ... @@ -2690,7 +2690,7 @@ def _str_get_dummies(self, sep: str = "|"): if using_string_dtype(): return NumpyExtensionArray( - self.astype(str).to_numpy(na_value="NaN") + self.astype(str).to_numpy(na_value="NaN") # type: ignore[attr-defined] )._str_get_dummies(sep) return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) From 8f7e96874b54df681ca9bacd1d3b8401b3909eb0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 20:23:59 +0200 Subject: [PATCH 09/21] move to common.py --- pandas/core/dtypes/base.py | 8 -------- pandas/core/dtypes/common.py | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 42b51b11ad41f..d8a42d83b6c54 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -14,8 +14,6 @@ import numpy as np -from pandas._config import using_string_dtype - from pandas._libs import missing as libmissing from pandas._libs.hashtable import object_hash from pandas._libs.properties import cache_readonly @@ -564,12 +562,6 @@ def find( return the first matching dtype, otherwise return None """ if not isinstance(dtype, str): - # builtin aliases - if dtype is str and using_string_dtype(): - from pandas.core.arrays.string_ import StringDtype - - return StringDtype(na_value=np.nan) - dtype_type: type_t if not isinstance(dtype, type): dtype_type = type(dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..95c4fde422af5 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1703,6 +1705,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: From 15f45d27be5b4b02167a3216ed1b91c65262b8ec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 20:27:27 +0200 Subject: [PATCH 10/21] simplify Categorical._str_get_dummies --- pandas/core/arrays/categorical.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 19052701a59ab..0cbddadd1b5a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -13,10 +13,7 @@ import numpy as np -from pandas._config import ( - get_option, - using_string_dtype, -) +from pandas._config import get_option from pandas._libs import ( NaT, @@ -2688,12 +2685,9 @@ def _str_get_dummies(self, sep: str = "|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - if using_string_dtype(): - return NumpyExtensionArray( - self.astype(str).to_numpy(na_value="NaN") # type: ignore[attr-defined] - )._str_get_dummies(sep) - - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep + ) # ------------------------------------------------------------------------ # GroupBy Methods From 4464fb113a61bde7749fc19db78efb92b65f2f03 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 20:36:44 +0200 Subject: [PATCH 11/21] small cleanup --- pandas/core/indexes/base.py | 2 +- pandas/core/strings/object_array.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0a52a3691bc82..3b5751f188912 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3658,6 +3658,7 @@ def get_indexer( method = clean_reindex_fill_method(method) orig_target = target target = self._maybe_cast_listlike_indexer(target) + self._check_indexing_method(method, limit, tolerance) if not self._index_as_unique: @@ -7730,7 +7731,6 @@ def get_values_for_csv( values = cast("IntervalArray", values) mask = values.isna() if not quoting: - # TODO result = np.asarray(values).astype(str) else: result = np.array(values, dtype=object, copy=True) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 28bd943688b79..c6b18d7049c57 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -405,7 +405,6 @@ def _str_get_dummies(self, sep: str = "|"): try: arr = sep + arr + sep except (TypeError, NotImplementedError): - # TODO arr = sep + arr.astype(str) + sep tags: set[str] = set() From 650f694db269289c80469947659263ae9d1a4b29 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 21:27:27 +0200 Subject: [PATCH 12/21] fix ensure_string_array to not modify extension arrays inplace --- pandas/_libs/lib.pyx | 1 - pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/copy_view/test_astype.py | 16 +++------------- 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0a51dcf117331..f375e4040f3ad 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -760,7 +760,6 @@ cpdef ndarray[object] ensure_string_array( out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 28d315a28919e..143a13c54dbbb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -625,7 +625,7 @@ def _from_sequence( # zero_copy_only to True which caused problems see GH#52076 scalars = np.array(scalars) # convert non-na-likes to str, and nan-likes to StringDtype().na_value - result = lib.ensure_string_array(scalars, na_value=na_value, copy=True) + result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 68d7606cc3b3f..e8e74b0ba1215 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -200,7 +200,7 @@ def _from_sequence( return cls(pc.cast(scalars, pa.large_string())) # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=True) + result = lib.ensure_string_array(scalars, copy=copy) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d6046f86be79f..de56d5e4a07ee 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -92,12 +92,7 @@ def test_astype_string_and_object(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df_orig = df.copy() df2 = df.astype(new_dtype) - if new_dtype == "string": - # cast to string has to copy to avoid mutating the original during - # the call to ensure_string_array -> never a delayed copy - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = "x" tm.assert_frame_equal(df, df_orig) @@ -110,12 +105,7 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) df2 = df.astype(new_dtype) df_orig = df2.copy() - if new_dtype == "string": - # cast to string has to copy to avoid mutating the original during - # the call to ensure_string_array -> never a delayed copy - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df.iloc[0, 0] = "x" tm.assert_frame_equal(df2, df_orig) @@ -230,7 +220,7 @@ def test_convert_dtypes(): df_orig = df.copy() df2 = df.convert_dtypes() - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) From 9164dbb90caa563c99215e5ab8bb3c19d82c34c3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 09:47:32 +0200 Subject: [PATCH 13/21] fix ensure_string_array once more + fix is_extension_array_dtype for str --- pandas/_libs/lib.pyx | 18 ++++++++++++------ pandas/core/dtypes/common.py | 3 +++ pandas/tests/extension/test_arrow.py | 8 -------- pandas/tests/test_algos.py | 2 +- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f375e4040f3ad..35351fa0e371f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -765,11 +767,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 95c4fde422af5..6e503704efd99 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1410,6 +1410,9 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: + # TODO ugly -> move into registry find()? Or make this work with pandas_dtype? + if dtype is str and using_string_dtype(): + return True return registry.find(dtype) is not None diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 028409e720129..1c8577844cd32 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -301,14 +301,6 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) super().test_astype_str(data) def test_from_dtype(self, data, request): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 696ea75cd8995..dac74a0e32a42 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1883,7 +1883,7 @@ def test_strobj_multi_char(self, dt, using_infer_string): ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - if using_infer_string: + if using_infer_string and dt is str: tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) else: tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) From cf9f855eb004b9370671cd203dad76c724de5bff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 11:28:17 +0200 Subject: [PATCH 14/21] still xfail TestArrowArray::test_astype_str when not using infer_string --- pandas/tests/extension/test_arrow.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1c8577844cd32..e906c08a50257 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -293,7 +293,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -301,6 +301,15 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", + ) + ) super().test_astype_str(data) def test_from_dtype(self, data, request): From bd79fc9b6ced05c869d3c0d8715eb3bfc2247ad9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 11:51:36 +0200 Subject: [PATCH 15/21] ensure maybe_convert_objects copies object dtype input array when inferring StringDtype --- pandas/_libs/lib.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 35351fa0e371f..919495e9a666a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2712,13 +2712,17 @@ def maybe_convert_objects(ndarray[object] objects, from pandas.core.arrays.string_ import StringDtype dtype = StringDtype() - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + return dtype.construct_array_type()._from_sequence( + objects, dtype=dtype, copy=True + ) elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) - return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + return dtype.construct_array_type()._from_sequence( + objects, dtype=dtype, copy=True + ) seen.object_ = True elif seen.interval_: From 4c775d118184eb6d4bc11e960811182df4bbd420 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Sep 2024 18:16:22 +0200 Subject: [PATCH 16/21] update test_1d_object_array_does_not_copy test --- pandas/tests/frame/test_constructors.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ad78e75c6a400..5df9742500e31 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -300,11 +299,22 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + # TODO(infer_string): this should be fixed to still share memory? + assert not np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") From b0276b2b1c8d78001142510046f660e1fffa778f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 09:28:38 +0200 Subject: [PATCH 17/21] update constructor copy test + do not copy in maybe_convert_objects? --- pandas/_libs/lib.pyx | 9 +++------ pandas/tests/frame/test_constructors.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e7b93bfe510e2..08ed997cc8d78 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -762,6 +762,7 @@ cpdef ndarray[object] ensure_string_array( out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") @@ -2735,17 +2736,13 @@ def maybe_convert_objects(ndarray[object] objects, from pandas.core.arrays.string_ import StringDtype dtype = StringDtype() - return dtype.construct_array_type()._from_sequence( - objects, dtype=dtype, copy=True - ) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) - return dtype.construct_array_type()._from_sequence( - objects, dtype=dtype, copy=True - ) + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True elif seen.interval_: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 298666590a3fb..0a924aa393be5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -309,19 +309,28 @@ def test_1d_object_array_does_not_copy(self, using_infer_string): # no numpy arrays to compare pass else: - # TODO(infer_string): this should be fixed to still share memory? - assert not np.shares_memory(df[0].to_numpy(), arr) + assert np.shares_memory(df[0].to_numpy(), arr) else: assert np.shares_memory(df.values, arr) df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): From d413fc649c230a846246902f4f1992ca84f62220 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 13:57:41 +0200 Subject: [PATCH 18/21] skip str.get_dummies test for now --- pandas/tests/strings/test_get_dummies.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 0656f505dc745..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -96,6 +98,7 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) result = s.str.get_dummies("|", dtype=str) From db8900cc05a17a0f8cf18fa7cb0ababda4dde2c4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Sep 2024 10:55:58 +0200 Subject: [PATCH 19/21] use pandas_dtype() instead of registry.find --- pandas/core/dtypes/common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ddf21473d8017..d72e489708b51 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1450,10 +1450,11 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - # TODO ugly -> move into registry find()? Or make this work with pandas_dtype? - if dtype is str and using_string_dtype(): - return True - return registry.find(dtype) is not None + try: + dtype = pandas_dtype(dtype) + except TypeError: + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: From e6aad17b36296520dee47e462158f6b5313ef0a1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Sep 2024 11:25:48 +0200 Subject: [PATCH 20/21] fix corner cases for calling pandas_dtype --- pandas/core/dtypes/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index d72e489708b51..daa073d0b4eda 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1451,8 +1451,12 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: return False else: try: - dtype = pandas_dtype(dtype) - except TypeError: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) can raise ValueError return False return isinstance(dtype, ExtensionDtype) From 4e6cf04cf41b66a0deb9a7073d24ebcddaeaad21 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 21 Sep 2024 15:44:15 +0200 Subject: [PATCH 21/21] add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e0c39f1769210..8af48a861967a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -757,6 +757,9 @@ cpdef ndarray[object] ensure_string_array( if ( hasattr(arr, "dtype") and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) and not hasattr(arr, "_pa_array") ): # dtype check to exclude DataFrame