From 129640f554981a996a0df1eee126b31e656a81b9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Oct 2022 16:39:55 -0700 Subject: [PATCH 1/6] fix extension tests --- pandas/tests/extension/base/dtype.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index ea4443010c6a6..32a9246264d69 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -45,10 +45,10 @@ def test_is_dtype_other_input(self, dtype): assert dtype.is_dtype([1, 2, 3]) is False def test_is_not_string_type(self, dtype): - return not is_string_dtype(dtype) + assert not is_string_dtype(dtype) def test_is_not_object_type(self, dtype): - return not is_object_dtype(dtype) + assert not is_object_dtype(dtype) def test_eq_with_str(self, dtype): assert dtype == dtype.name From 854ba71e131b59c72016dbce8191628020450336 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Oct 2022 15:09:39 -0700 Subject: [PATCH 2/6] Make is_string_dtype more strict --- doc/source/whatsnew/v2.0.0.rst | 4 ++-- pandas/core/dtypes/common.py | 17 +++++++++-------- pandas/tests/dtypes/test_common.py | 9 +++++++++ pandas/tests/extension/test_numpy.py | 7 +++++++ pandas/tests/extension/test_string.py | 4 ++++ 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5aa753dffcf7f..504f1f3038bf5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -141,7 +141,7 @@ Other API changes - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). 
(:issue:`49111`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) - :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`) -- +- :func:`pandas.api.types.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`) .. --------------------------------------------------------------------------- .. _whatsnew_200.deprecations: @@ -349,7 +349,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in :func:`pandas.api.types.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`) - Interval diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3c2aa1f6bab5d..fb4569ec3290a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -524,15 +524,16 @@ def is_string_dtype(arr_or_dtype) -> bool: >>> is_string_dtype(pd.Series([1, 2])) False """ - # TODO: gh-15585: consider making the checks stricter. 
- def condition(dtype) -> bool: - return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + if hasattr(arr_or_dtype, "dtype"): + if arr_or_dtype.dtype.kind in "SU": + return True + elif arr_or_dtype.dtype.kind == "O": + return is_all_strings(arr_or_dtype) + else: + return False - def is_excluded_dtype(dtype) -> bool: - """ - These have kind = "O" but aren't string dtypes so need to be explicitly excluded - """ - return isinstance(dtype, (PeriodDtype, IntervalDtype, CategoricalDtype)) + def condition(dtype) -> bool: + return is_string_or_object_np_dtype(dtype) or dtype == "string" return _is_dtype(arr_or_dtype, condition) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 589e2e04d668a..c8a3c992248ad 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -290,6 +290,15 @@ def test_is_string_dtype(): assert com.is_string_dtype(pd.StringDtype()) +@pytest.mark.parametrize( + "data", + [[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)], +) +def test_is_string_dtype_arraylike_with_object_elements_not_strings(data): + # GH 15585 + assert not com.is_string_dtype(pd.Series(data)) + + def test_is_string_dtype_nullable(nullable_string_dtype): assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 148059a6a16f3..2e349d824e32f 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -218,6 +218,13 @@ def test_check_dtype(self, data, request): ) super().test_check_dtype(data) + def test_is_not_object_type(self, dtype, request): + if dtype.numpy_dtype == "object": + request.node.add_marker( + pytest.mark.xfail(reason="PandasDtype(object) should be object") + ) + super().test_is_not_object_type(dtype) + class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): @skip_nested diff --git 
a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index c5aebb282bafa..776c136fc4393 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -106,6 +106,10 @@ def test_eq_with_str(self, dtype): assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) + @pytest.mark.xfail(reason="StringDtype is a string dtype") + def test_is_not_string_type(self, dtype): + super().test_is_not_string_type(dtype) + class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): From b7f50577b8be81107a5b49a7eed8418e979d5b69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Oct 2022 15:18:59 -0700 Subject: [PATCH 3/6] Simplify and add example --- pandas/core/dtypes/common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index fb4569ec3290a..2633801a0e24e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -500,6 +500,9 @@ def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. + If an array is passed with an object dtype, the elements must be + inferred as strings. 
+ Parameters ---------- arr_or_dtype : array-like or dtype @@ -523,14 +526,11 @@ def is_string_dtype(arr_or_dtype) -> bool: True >>> is_string_dtype(pd.Series([1, 2])) False + >>> is_string_dtype(pd.Series([1, 2], dtype=object)) + False """ - if hasattr(arr_or_dtype, "dtype"): - if arr_or_dtype.dtype.kind in "SU": - return True - elif arr_or_dtype.dtype.kind == "O": - return is_all_strings(arr_or_dtype) - else: - return False + if hasattr(arr_or_dtype, "dtype") and arr_or_dtype.dtype.kind == "O": + return is_all_strings(arr_or_dtype) def condition(dtype) -> bool: return is_string_or_object_np_dtype(dtype) or dtype == "string" From 459a300799cec2048c4c5565c30755ac15fd22c6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Oct 2022 16:26:03 -0700 Subject: [PATCH 4/6] get dtype --- pandas/core/dtypes/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2633801a0e24e..0a8a42b502c04 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -521,7 +521,6 @@ def is_string_dtype(arr_or_dtype) -> bool: True >>> is_string_dtype(int) False - >>> >>> is_string_dtype(np.array(['a', 'b'])) True >>> is_string_dtype(pd.Series([1, 2])) @@ -529,7 +528,7 @@ def is_string_dtype(arr_or_dtype) -> bool: >>> is_string_dtype(pd.Series([1, 2], dtype=object)) False """ - if hasattr(arr_or_dtype, "dtype") and arr_or_dtype.dtype.kind == "O": + if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O": return is_all_strings(arr_or_dtype) def condition(dtype) -> bool: From 5a140b4ef0f6ee2007314888a9138fedf3f1ae82 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:26:12 -0700 Subject: [PATCH 5/6] Catch cases when TypeError is raised --- pandas/core/dtypes/common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0a8a42b502c04..a7b8e720ad8e2 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -532,7 +532,12 @@ def is_string_dtype(arr_or_dtype) -> bool: return is_all_strings(arr_or_dtype) def condition(dtype) -> bool: - return is_string_or_object_np_dtype(dtype) or dtype == "string" + if is_string_or_object_np_dtype(dtype): + return True + try: + return dtype == "string" + except TypeError: + return False return _is_dtype(arr_or_dtype, condition) From 2dac8f70cc3827480a6292c12aab6eecd0a6e7b2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Nov 2022 10:13:11 -0700 Subject: [PATCH 6/6] Change --- pandas/tests/extension/test_numpy.py | 10 ++++++---- pandas/tests/extension/test_string.py | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 2e349d824e32f..d6a5557c89f14 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -26,6 +26,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_object_dtype from pandas.core.arrays.numpy_ import PandasArray from pandas.core.internals import blocks from pandas.tests.extension import base @@ -220,10 +221,11 @@ def test_check_dtype(self, data, request): def test_is_not_object_type(self, dtype, request): if dtype.numpy_dtype == "object": - request.node.add_marker( - pytest.mark.xfail(reason="PandasDtype(object) should be object") - ) - super().test_is_not_object_type(dtype) + # Different from BaseDtypeTests.test_is_not_object_type + # because PandasDtype(object) is an object type + assert is_object_dtype(dtype) + else: + super().test_is_not_object_type(dtype) class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 
776c136fc4393..8cbd4342ea13f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,6 +26,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base @@ -106,9 +107,10 @@ def test_eq_with_str(self, dtype): assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) - @pytest.mark.xfail(reason="StringDtype is a string dtype") def test_is_not_string_type(self, dtype): - super().test_is_not_string_type(dtype) + # Different from BaseDtypeTests.test_is_not_string_type + # because StringDtype is a string type + assert is_string_dtype(dtype) class TestInterface(base.BaseInterfaceTests):