Skip to content

API/BUG: Fix is_string_dtype and make more strict #49378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 3, 2022
Merged
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ Other API changes
- The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`)
- When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`)
- :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`)
- :func:`pandas.api.dtypes.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`)
- Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`)
- Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`)
-
Expand Down Expand Up @@ -393,7 +394,7 @@ Conversion

Strings
^^^^^^^
-
- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`)
-

Interval
Expand Down
23 changes: 14 additions & 9 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@ def is_string_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of the string dtype.

If an array is passed with an object dtype, the elements must be
inferred as strings.

Parameters
----------
arr_or_dtype : array-like or dtype
Expand All @@ -518,21 +521,23 @@ def is_string_dtype(arr_or_dtype) -> bool:
True
>>> is_string_dtype(int)
False
>>>
>>> is_string_dtype(np.array(['a', 'b']))
True
>>> is_string_dtype(pd.Series([1, 2]))
False
>>> is_string_dtype(pd.Series([1, 2], dtype=object))
False
"""
# TODO: gh-15585: consider making the checks stricter.
def condition(dtype) -> bool:
return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype)
if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O":
return is_all_strings(arr_or_dtype)

def is_excluded_dtype(dtype) -> bool:
"""
These have kind = "O" but aren't string dtypes so need to be explicitly excluded
"""
return isinstance(dtype, (PeriodDtype, IntervalDtype, CategoricalDtype))
def condition(dtype) -> bool:
if is_string_or_object_np_dtype(dtype):
return True
try:
return dtype == "string"
except TypeError:
return False

return _is_dtype(arr_or_dtype, condition)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,15 @@ def test_is_string_dtype():
assert com.is_string_dtype(pd.StringDtype())


@pytest.mark.parametrize(
"data",
[[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)],
)
def test_is_string_dtype_arraylike_with_object_elements_not_strings(data):
# GH 15585
assert not com.is_string_dtype(pd.Series(data))


def test_is_string_dtype_nullable(nullable_string_dtype):
assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ def test_is_dtype_other_input(self, dtype):
assert dtype.is_dtype([1, 2, 3]) is False

def test_is_not_string_type(self, dtype):
return not is_string_dtype(dtype)
assert not is_string_dtype(dtype)

def test_is_not_object_type(self, dtype):
return not is_object_dtype(dtype)
assert not is_object_dtype(dtype)

def test_eq_with_str(self, dtype):
assert dtype == dtype.name
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_object_dtype
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.internals import blocks
from pandas.tests.extension import base
Expand Down Expand Up @@ -218,6 +219,14 @@ def test_check_dtype(self, data, request):
)
super().test_check_dtype(data)

def test_is_not_object_type(self, dtype, request):
if dtype.numpy_dtype == "object":
# Different from BaseDtypeTests.test_is_not_object_type
# because PandasDtype(object) is an object type
assert is_object_dtype(dtype)
else:
super().test_is_not_object_type(dtype)


class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
@skip_nested
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_string_dtype
from pandas.core.arrays import ArrowStringArray
from pandas.core.arrays.string_ import StringDtype
from pandas.tests.extension import base
Expand Down Expand Up @@ -106,6 +107,11 @@ def test_eq_with_str(self, dtype):
assert dtype == f"string[{dtype.storage}]"
super().test_eq_with_str(dtype)

def test_is_not_string_type(self, dtype):
# Different from BaseDtypeTests.test_is_not_string_type
# because StringDtype is a string type
assert is_string_dtype(dtype)


class TestInterface(base.BaseInterfaceTests):
def test_view(self, data, request):
Expand Down