From 03d4943fb9ff325afb9c4e43a0655ec586214a45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 10:44:30 +0200 Subject: [PATCH 1/7] String dtype: maybe_convert_objects gives precedence to nullable dtype --- pandas/_libs/lib.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..e1a2a0142c52e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True From c00577816012807ed191673436162dcbf8ab508d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 15:07:07 +0200 Subject: [PATCH 2/7] update datetimelike input validation --- pandas/core/arrays/datetimelike.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ad0bde3abbdd4..32a90cf83321d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -130,9 +130,9 @@ from pandas.core.arrays.integer import IntegerArray import pandas.core.common as com from pandas.core.construction import ( - array as pd_array, ensure_wrapped_if_datetimelike, extract_array, + sanitize_array, 
) from pandas.core.indexers import ( check_array_indexer, @@ -667,12 +667,10 @@ def _validate_listlike(self, value, allow_object: bool = False): msg = self._validation_error_message(value, True) raise TypeError(msg) from err - # Do type inference if necessary up front (after unpacking - # NumpyExtensionArray) + # Do type inference if necessary up front # e.g. we passed PeriodIndex.values and got an ndarray of Periods - value = extract_array(value, extract_numpy=True) - value = pd_array(value) - value = extract_array(value, extract_numpy=True) + value = sanitize_array(value, index=None) + value = ensure_wrapped_if_datetimelike(value) if is_all_strings(value): # We got a StringArray From 0bee1ac482232897761f760f93df003436d96025 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 15:16:10 +0200 Subject: [PATCH 3/7] update tests and remove xfails --- pandas/tests/arrays/string_/test_string_arrow.py | 13 +++---------- pandas/tests/base/test_value_counts.py | 4 ++-- pandas/tests/dtypes/cast/test_construct_ndarray.py | 2 +- .../tests/io/parser/usecols/test_usecols_basic.py | 3 --- 4 files changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 7d4aae0f7bb4e..ef882a81f882f 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,20 +26,14 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - +def 
test_config(string_storage): with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype( - string_storage, na_value=np.nan if using_infer_string else pd.NA - ) + # pd.array(..) by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index c72abfeb9f3e7..bcb31829a201f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git 
a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d02364a77df90..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError from pandas import ( @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ From 0057158bedc675850607c4bd970dbc586e7550c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 15:30:29 +0200 Subject: [PATCH 4/7] explicitly test pd.array() behaviour (remove xfail) --- pandas/tests/arrays/test_array.py | 34 +++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 76b8928f28b65..0bb023e6b3b92 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -215,6 +213,13 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), ( ["a", None], pd.StringDtype(), @@ -222,14 +227,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + 
.construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), ( # numpy array with string dtype np.array(["a", "b"], dtype=str), - None, + pd.StringDtype(), pd.StringDtype() .construct_array_type() ._from_sequence(["a", "b"], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -287,7 +307,6 @@ def test_array_copy(): assert tm.shares_memory(a, b) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -387,6 +406,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), From 8af4cdac2c792a0c0ebabe378d85c71273d7c1ab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 13 Aug 2024 08:51:29 +0200 Subject: [PATCH 5/7] fixup allow_2d --- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/construction.py | 24 +++++------------------- pandas/tests/arrays/test_datetimelike.py | 2 +- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e4995a3bb4898..3c69a5a450dd1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -669,7 +669,7 @@ def _validate_listlike(self, value, allow_object: bool = False): # Do type inference if necessary up front # e.g. 
we passed PeriodIndex.values and got an ndarray of Periods - value = sanitize_array(value, index=None) + value = sanitize_array(value, index=None, allow_2d=True) value = ensure_wrapped_if_datetimelike(value) if is_all_strings(value): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..8035a550d175b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -37,7 +37,6 @@ from pandas.core.dtypes.common import ( ensure_object, is_list_like, - is_object_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -49,8 +48,6 @@ ) from pandas.core.dtypes.missing import isna -import pandas.core.common as com - if TYPE_CHECKING: from collections.abc import Sequence @@ -706,23 +703,12 @@ def _sanitize_ndim( result = _maybe_repeat(result, index) elif result.ndim > 1: - if isinstance(data, np.ndarray): - if allow_2d: - return result - raise ValueError( - f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead" - ) - if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype): - # i.e. 
NumpyEADtype("O") + if allow_2d: + return result + raise ValueError( + f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead" + ) - result = com.asarray_tuplesafe(data, dtype=np.dtype("object")) - cls = dtype.construct_array_type() - result = cls._from_sequence(result, dtype=dtype) - else: - # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type - # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str, - # dtype[Any], None]" - result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type] return result diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 5834b268be2be..cb9e14aaeb691 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -336,7 +336,7 @@ def test_searchsorted_castable_strings( match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " "or array of those. Got " - f"{'str' if using_infer_string else 'string'} array instead." + f"{'str' if using_infer_string else 'object'} array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) From 8b92517ceeb9cd2fc7908e29a908025cb3a013df Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 14 Aug 2024 23:29:51 +0200 Subject: [PATCH 6/7] undo changes related to datetimelike input validation --- pandas/core/arrays/datetimelike.py | 10 ++++++---- pandas/core/construction.py | 24 +++++++++++++++++++----- pandas/tests/arrays/test_datetimelike.py | 7 ++----- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3c69a5a450dd1..c5a08c1834a3e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -130,9 +130,9 @@ from pandas.core.arrays.integer import IntegerArray import pandas.core.common as com from pandas.core.construction import ( + array as pd_array, ensure_wrapped_if_datetimelike, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -667,10 +667,12 @@ def _validate_listlike(self, value, allow_object: bool = False): msg = self._validation_error_message(value, True) raise TypeError(msg) from err - # Do type inference if necessary up front + # Do type inference if necessary up front (after unpacking + # NumpyExtensionArray) # e.g. 
we passed PeriodIndex.values and got an ndarray of Periods - value = sanitize_array(value, index=None, allow_2d=True) - value = ensure_wrapped_if_datetimelike(value) + value = extract_array(value, extract_numpy=True) + value = pd_array(value) + value = extract_array(value, extract_numpy=True) if is_all_strings(value): # We got a StringArray diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8035a550d175b..665eb75953078 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -37,6 +37,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_list_like, + is_object_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -48,6 +49,8 @@ ) from pandas.core.dtypes.missing import isna +import pandas.core.common as com + if TYPE_CHECKING: from collections.abc import Sequence @@ -703,12 +706,23 @@ def _sanitize_ndim( result = _maybe_repeat(result, index) elif result.ndim > 1: - if allow_2d: - return result - raise ValueError( - f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead" - ) + if isinstance(data, np.ndarray): + if allow_2d: + return result + raise ValueError( + f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead" + ) + if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype): + # i.e. 
NumpyEADtype("O") + result = com.asarray_tuplesafe(data, dtype=np.dtype("object")) + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) + else: + # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type + # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str, + # dtype[Any], None]" + result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type] return result diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index cb9e14aaeb691..3d8f8d791b763 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -297,9 +297,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings( - self, arr1d, box, string_storage, using_infer_string - ): + def test_searchsorted_castable_strings(self, arr1d, box, string_storage): arr = arr1d if box is None: pass @@ -335,8 +333,7 @@ def test_searchsorted_castable_strings( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got " - f"{'str' if using_infer_string else 'object'} array instead." + "or array of those. Got string array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) From 548b501e6162b73dea34abc52bd199c85ca16dc4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Aug 2024 21:11:22 +0200 Subject: [PATCH 7/7] fix test for str on current main --- pandas/tests/arrays/test_array.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 0bb023e6b3b92..4070a2844846f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import register_extension_dtype @@ -218,7 +220,9 @@ def test_dt64_array(dtype_unit): "str", pd.StringDtype(na_value=np.nan) .construct_array_type() - ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), ), ( ["a", None],