Skip to content

API/BUG: Fix is_string_dtype and make more strict #49378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 3, 2022
Merged
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ Other API changes
- The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`)
- When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`)
- :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`)
- :func:`pandas.api.dtypes.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`)
- Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`)
-

Expand Down Expand Up @@ -366,7 +367,7 @@ Conversion

Strings
^^^^^^^
-
- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`)
-

Interval
Expand Down
23 changes: 14 additions & 9 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@ def is_string_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of the string dtype.

If an array is passed with an object dtype, the elements must be
inferred as strings.

Parameters
----------
arr_or_dtype : array-like or dtype
Expand All @@ -518,21 +521,23 @@ def is_string_dtype(arr_or_dtype) -> bool:
True
>>> is_string_dtype(int)
False
>>>
>>> is_string_dtype(np.array(['a', 'b']))
True
>>> is_string_dtype(pd.Series([1, 2]))
False
>>> is_string_dtype(pd.Series([1, 2], dtype=object))
False
"""
# TODO: gh-15585: consider making the checks stricter.
def condition(dtype) -> bool:
return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype)
if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O":
return is_all_strings(arr_or_dtype)

def is_excluded_dtype(dtype) -> bool:
"""
These have kind = "O" but aren't string dtypes so need to be explicitly excluded
"""
return isinstance(dtype, (PeriodDtype, IntervalDtype, CategoricalDtype))
def condition(dtype) -> bool:
if is_string_or_object_np_dtype(dtype):
return True
try:
return dtype == "string"
except TypeError:
return False

return _is_dtype(arr_or_dtype, condition)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,15 @@ def test_is_string_dtype():
assert com.is_string_dtype(pd.StringDtype())


@pytest.mark.parametrize(
"data",
[[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)],
)
def test_is_string_dtype_arraylike_with_object_elements_not_strings(data):
# GH 15585
assert not com.is_string_dtype(pd.Series(data))


def test_is_string_dtype_nullable(nullable_string_dtype):
assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ def test_is_dtype_other_input(self, dtype):
assert dtype.is_dtype([1, 2, 3]) is False

def test_is_not_string_type(self, dtype):
return not is_string_dtype(dtype)
assert not is_string_dtype(dtype)

def test_is_not_object_type(self, dtype):
return not is_object_dtype(dtype)
assert not is_object_dtype(dtype)

def test_eq_with_str(self, dtype):
assert dtype == dtype.name
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,13 @@ def test_check_dtype(self, data, request):
)
super().test_check_dtype(data)

def test_is_not_object_type(self, dtype, request):
if dtype.numpy_dtype == "object":
request.node.add_marker(
pytest.mark.xfail(reason="PandasDtype(object) should be object")
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can an xfail be avoided here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the same reasoning in https://github.com/pandas-dev/pandas/pull/49378/files#r1009898830, I think this should unexpectedly (and understandably) fail

super().test_is_not_object_type(dtype)


class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
@skip_nested
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ def test_eq_with_str(self, dtype):
assert dtype == f"string[{dtype.storage}]"
super().test_eq_with_str(dtype)

@pytest.mark.xfail(reason="StringDtype is a string dtype")
def test_is_not_string_type(self, dtype):
super().test_is_not_string_type(dtype)


class TestInterface(base.BaseInterfaceTests):
def test_view(self, data, request):
Expand Down