From 910ed9be46891881eb87c3c3c556f2120e350d58 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:55:36 +0000 Subject: [PATCH 1/8] wip --- pandas/core/dtypes/missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a635ac77566e1..94c65d17bf8b8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -643,7 +643,8 @@ def infer_fill_value(val): return np.array("NaT", dtype=TD64NS_DTYPE) return np.array(np.nan, dtype=object) elif val.dtype.kind == "U": - return np.array(np.nan, dtype=val.dtype) + return np.array(np.nan, dtype=object) + # return np.array(np.nan, dtype=val.dtype) return np.nan From b3774b18e1877267b2d51a2f0947f281f90d15ee Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:33:13 +0000 Subject: [PATCH 2/8] Fixed bug when creating new column with missing values when setting a single string value --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/dtypes/missing.py | 11 +++++++--- pandas/core/indexing.py | 4 +++- pandas/tests/frame/indexing/test_indexing.py | 22 ++++++++++++++++++++ 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 9e3eb90436642..6c3a4f33ceb22 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 94c65d17bf8b8..a82305ad20f7b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,7 +624,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val): +def infer_fill_value(val, index: Index = None): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -643,8 +643,13 @@ def infer_fill_value(val): return np.array("NaT", dtype=TD64NS_DTYPE) return np.array(np.nan, dtype=object) elif val.dtype.kind == "U": - return np.array(np.nan, dtype=object) - # return np.array(np.nan, dtype=val.dtype) + if get_option("future.infer_string"): + from pandas import Series + + return Series( + [np.nan] * len(index), dtype="string[pyarrow_numpy]", index=index + ) + return None return np.nan diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e3928621a4e48..78cece7522406 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1879,7 +1879,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): else: # FIXME: GH#42099#issuecomment-864326014 - self.obj[key] = infer_fill_value(value) + self.obj[key] = infer_fill_value( + value, index=self.obj.index + ) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index dfb4a3092789a..a296c75784dc0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1922,6 +1922,28 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_adding_new_conditional_column_with_string() -> None: + # https://github.com/pandas-dev/pandas/issues/56204 + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + df.loc[lambda x: x.a == 1, "c"] = "1" + expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", None]}).astype( + {"a": "int64", "b": "int64", "c": "object"} + ) + tm.assert_frame_equal(df, expected) + + +def test_adding_new_conditional_column_with_infer_string() -> None: + # https://github.com/pandas-dev/pandas/issues/56204 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + with pd.option_context("future.infer_string", True): + df.loc[lambda x: x.a == 1, "c"] = "1" + expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", None]}).astype( + {"a": "int64", "b": "int64", "c": "string[pyarrow_numpy]"} + ) + tm.assert_frame_equal(df, expected) + + def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") From 5ab3d09dde51e65ccd23fd3169679a67b2b12eac Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:13:33 +0000 Subject: [PATCH 3/8] typing --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a82305ad20f7b..64a634846549f 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,7 +624,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val, index: Index = None): +def infer_fill_value(val, index: Index): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped From 61cdef1bd42da74c75910b95e01be6fce87e8522 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 6 Dec 2023 09:51:58 +0000 Subject: [PATCH 4/8] use pd.array instead --- pandas/core/dtypes/missing.py | 8 +++----- pandas/core/indexing.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index c67a24da6f50c..3cf38d5692e0e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,7 +624,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val, index: Index): +def infer_fill_value(val, length: int): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -644,11 +644,9 @@ def infer_fill_value(val, index: Index): return np.array(np.nan, dtype=object) elif val.dtype.kind == "U": if get_option("future.infer_string"): - from pandas import Series + from pandas.core.construction import array as pd_array - return Series( - [np.nan] * len(index), dtype="string[pyarrow_numpy]", index=index - ) + return pd_array([np.nan] * length, dtype="string[pyarrow_numpy]") return None return np.nan diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 78cece7522406..3278f960227e2 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1880,7 +1880,7 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value( - value, index=self.obj.index + value, length=len(self.obj) ) new_indexer = convert_from_missing_indexer_tuple( From 9e7055af16bc1d8e449ebd6824e01687424a9139 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:27:55 +0000 Subject: [PATCH 5/8] move in whatsnew section, parametrise --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/tests/frame/indexing/test_indexing.py | 26 +++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 40ab5c324787b..59c0b2d2c5173 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -13,7 +13,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) - Fixed regression when trying to read a pickled pandas :class:`DataFrame` from pandas 1.3 (:issue:`55137`) - @@ -33,6 +32,7 @@ Bug fixes - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) +- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a296c75784dc0..2e437604b7eb0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1922,24 +1922,22 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) -def test_adding_new_conditional_column_with_string() -> None: - # https://github.com/pandas-dev/pandas/issues/56204 - df = DataFrame({"a": [1, 2], "b": [3, 4]}) - df.loc[lambda x: x.a == 1, "c"] = "1" - expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", None]}).astype( - {"a": "int64", "b": "int64", "c": "object"} - ) - tm.assert_frame_equal(df, expected) - - -def test_adding_new_conditional_column_with_infer_string() -> None: +@pytest.mark.parametrize( + ("dtype", "infer_string"), + [ + (object, False), + ("string[pyarrow_numpy]", True), + ], +) +def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2], "b": [3, 4]}) - with pd.option_context("future.infer_string", True): - df.loc[lambda x: x.a == 1, "c"] = "1" + with pd.option_context("future.infer_string", infer_string): + df.loc[df["a"] == 1, "c"] = "1" expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", None]}).astype( - {"a": "int64", "b": "int64", "c": "string[pyarrow_numpy]"} + {"a": "int64", "b": "int64", "c": dtype} ) tm.assert_frame_equal(df, expected) From 20898db5f8d7e32b3ba1420de2e3b82ed69326d0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 7 Dec 2023 10:53:27 +0000 Subject: [PATCH 6/8] find empty_value dtype by constructing array and doing take --- pandas/core/indexing.py | 13 +++++++++---- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_set_value.py | 5 +---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3278f960227e2..12590b6c5f9e2 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -68,6 +68,7 @@ from pandas.core.construction import ( array as pd_array, extract_array, + sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -1876,12 +1877,16 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): return self.obj[key] = empty_value - + elif not is_list_like(value): + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(len(self.obj), dtype=np.intp) + empty_value = algos.take_nd(arr, taker) + self.obj[key] = empty_value else: # FIXME: GH#42099#issuecomment-864326014 - self.obj[key] = infer_fill_value( - value, length=len(self.obj) - ) + self.obj[key] = infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 2e437604b7eb0..5e14728201a6a 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1936,7 +1936,7 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" - expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", None]}).astype( + expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", float("nan")]}).astype( {"a": "int64", "b": "int64", "c": dtype} ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index 32312868adacb..eec953ebb8fe6 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -30,10 +30,7 @@ def test_set_value_resize(self, float_frame): assert res["baz"].dtype == np.object_ res = float_frame.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - res._set_value("foobar", "baz", True) + res._set_value("foobar", "baz", True) assert res["baz"].dtype == np.object_ res = float_frame.copy() From 8c30505b2299d8bd3dc228fef9e1fc5161c4e35a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 7 Dec 2023 10:54:22 +0000 Subject: [PATCH 7/8] revert --- pandas/core/dtypes/missing.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 3cf38d5692e0e..4dc0d477f89e8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -624,7 +624,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val, length: int): +def infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -643,11 +643,7 @@ def infer_fill_value(val, length: int): return np.array("NaT", dtype=TD64NS_DTYPE) return np.array(np.nan, dtype=object) elif val.dtype.kind == "U": - if get_option("future.infer_string"): - from pandas.core.construction import array as pd_array - - return pd_array([np.nan] * length, dtype="string[pyarrow_numpy]") - return None + return np.array(np.nan, dtype=val.dtype) return np.nan From 40093c2286cc8f92eed3571796e7f3737a31be7b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:14:34 +0000 Subject: [PATCH 8/8] move whatsnew note to 2.2.0 --- doc/source/whatsnew/v2.1.4.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 5bd14bf93a2cf..57b83a294963b 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -35,7 +35,6 @@ Bug fixes - Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) -- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) .. --------------------------------------------------------------------------- .. _whatsnew_214.contributors: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8c475791df64d..80d86805ded49 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -618,6 +618,7 @@ Indexing - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) +- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) Missing ^^^^^^^