Skip to content

Commit 7e1470d

Browse files
mroeschke authored and phofl committed
API/BUG: Fix is_string_dtype and make more strict (pandas-dev#49378)
* fix extension tests * Make is_string_dtype more strict * Simplify and add example * get dtype * Catch case when TypeError is raised * Change
1 parent 1335f3e commit 7e1470d

File tree

6 files changed

+42
-12
lines changed

6 files changed

+42
-12
lines changed

doc/source/whatsnew/v2.0.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ Other API changes
147147
- The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`)
148148
- When creating a :class:`Series` with an object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`)
149149
- :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`)
150+
- :func:`pandas.api.dtypes.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`)
150151
- Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`)
151152
- Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`)
152153
-
@@ -395,7 +396,7 @@ Conversion
395396

396397
Strings
397398
^^^^^^^
398-
-
399+
- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`)
399400
-
400401

401402
Interval

pandas/core/dtypes/common.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,9 @@ def is_string_dtype(arr_or_dtype) -> bool:
500500
"""
501501
Check whether the provided array or dtype is of the string dtype.
502502
503+
If an array is passed with an object dtype, the elements must be
504+
inferred as strings.
505+
503506
Parameters
504507
----------
505508
arr_or_dtype : array-like or dtype
@@ -518,21 +521,23 @@ def is_string_dtype(arr_or_dtype) -> bool:
518521
True
519522
>>> is_string_dtype(int)
520523
False
521-
>>>
522524
>>> is_string_dtype(np.array(['a', 'b']))
523525
True
524526
>>> is_string_dtype(pd.Series([1, 2]))
525527
False
528+
>>> is_string_dtype(pd.Series([1, 2], dtype=object))
529+
False
526530
"""
527-
# TODO: gh-15585: consider making the checks stricter.
528-
def condition(dtype) -> bool:
529-
return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype)
531+
if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O":
532+
return is_all_strings(arr_or_dtype)
530533

531-
def is_excluded_dtype(dtype) -> bool:
532-
"""
533-
These have kind = "O" but aren't string dtypes so need to be explicitly excluded
534-
"""
535-
return isinstance(dtype, (PeriodDtype, IntervalDtype, CategoricalDtype))
534+
def condition(dtype) -> bool:
535+
if is_string_or_object_np_dtype(dtype):
536+
return True
537+
try:
538+
return dtype == "string"
539+
except TypeError:
540+
return False
536541

537542
return _is_dtype(arr_or_dtype, condition)
538543

pandas/tests/dtypes/test_common.py

+9
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,15 @@ def test_is_string_dtype():
290290
assert com.is_string_dtype(pd.StringDtype())
291291

292292

293+
@pytest.mark.parametrize(
294+
"data",
295+
[[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)],
296+
)
297+
def test_is_string_dtype_arraylike_with_object_elements_not_strings(data):
298+
# GH 15585
299+
assert not com.is_string_dtype(pd.Series(data))
300+
301+
293302
def test_is_string_dtype_nullable(nullable_string_dtype):
294303
assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
295304

pandas/tests/extension/base/dtype.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ def test_is_dtype_other_input(self, dtype):
4545
assert dtype.is_dtype([1, 2, 3]) is False
4646

4747
def test_is_not_string_type(self, dtype):
48-
return not is_string_dtype(dtype)
48+
assert not is_string_dtype(dtype)
4949

5050
def test_is_not_object_type(self, dtype):
51-
return not is_object_dtype(dtype)
51+
assert not is_object_dtype(dtype)
5252

5353
def test_eq_with_str(self, dtype):
5454
assert dtype == dtype.name

pandas/tests/extension/test_numpy.py

+9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import pandas as pd
2828
import pandas._testing as tm
29+
from pandas.api.types import is_object_dtype
2930
from pandas.core.arrays.numpy_ import PandasArray
3031
from pandas.core.internals import blocks
3132
from pandas.tests.extension import base
@@ -218,6 +219,14 @@ def test_check_dtype(self, data, request):
218219
)
219220
super().test_check_dtype(data)
220221

222+
def test_is_not_object_type(self, dtype, request):
223+
if dtype.numpy_dtype == "object":
224+
# Different from BaseDtypeTests.test_is_not_object_type
225+
# because PandasDtype(object) is an object type
226+
assert is_object_dtype(dtype)
227+
else:
228+
super().test_is_not_object_type(dtype)
229+
221230

222231
class TestGetitem(BaseNumPyTests, base.BaseGetitemTests):
223232
@skip_nested

pandas/tests/extension/test_string.py

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import pandas as pd
2828
import pandas._testing as tm
29+
from pandas.api.types import is_string_dtype
2930
from pandas.core.arrays import ArrowStringArray
3031
from pandas.core.arrays.string_ import StringDtype
3132
from pandas.tests.extension import base
@@ -106,6 +107,11 @@ def test_eq_with_str(self, dtype):
106107
assert dtype == f"string[{dtype.storage}]"
107108
super().test_eq_with_str(dtype)
108109

110+
def test_is_not_string_type(self, dtype):
111+
# Different from BaseDtypeTests.test_is_not_string_type
112+
# because StringDtype is a string type
113+
assert is_string_dtype(dtype)
114+
109115

110116
class TestInterface(base.BaseInterfaceTests):
111117
def test_view(self, data, request):

0 commit comments

Comments
 (0)