diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index c1d4797af9145..720cbdc2aeba8 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -613,7 +613,7 @@ Strings
 ^^^^^^^
 
 - Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
--
+- Improved error message when passing :class:`Series` of wrong dtype to :meth:`Series.str.cat` (:issue:`22722`)
 -
 
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 619b49438cdbb..413c0e73f8410 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2,7 +2,7 @@
 from functools import wraps
 import re
 import textwrap
-from typing import Dict
+from typing import Dict, List
 import warnings
 
 import numpy as np
@@ -31,7 +31,7 @@
 _shared_docs = dict()  # type: Dict[str, str]
 
 
-def cat_core(list_of_columns, sep):
+def cat_core(list_of_columns: List, sep: str):
     """
     Auxiliary function for :meth:`str.cat`
 
@@ -53,6 +53,51 @@ def cat_core(list_of_columns, sep):
     return np.sum(list_with_sep, axis=0)
 
 
+def cat_safe(list_of_columns: List, sep: str):
+    """
+    Auxiliary function for :meth:`str.cat`.
+
+    Same signature as cat_core, but handles TypeErrors in concatenation, which
+    happen if the arrays in list_of_columns have the wrong dtypes or content.
+
+    Parameters
+    ----------
+    list_of_columns : list of numpy arrays
+        List of arrays to be concatenated with sep;
+        these arrays may not contain NaNs!
+    sep : string
+        The separator string for concatenating the columns
+
+    Returns
+    -------
+    np.ndarray
+        The concatenation of list_of_columns with sep
+
+    Raises
+    ------
+    TypeError
+        If the concatenation fails, e.g. because a column contains
+        non-string values
+    """
+    try:
+        result = cat_core(list_of_columns, sep)
+    except TypeError:
+        # if there are any non-string values (wrong dtype or hidden behind
+        # object dtype), np.sum will fail; catch and return with better message
+        for column in list_of_columns:
+            dtype = lib.infer_dtype(column, skipna=True)
+            if dtype not in ['string', 'empty']:
+                raise TypeError(
+                    'Concatenation requires list-likes containing only '
+                    'strings (or missing values). Offending values found in '
+                    'column {}'.format(dtype)) from None
+        # no offending column could be identified, so re-raise the original
+        # TypeError rather than falling through to ``return result`` while
+        # ``result`` is still unbound (which would raise UnboundLocalError)
+        raise
+    return result
+
+
 def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
     return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
@@ -2314,16 +2359,16 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
             np.putmask(result, union_mask, np.nan)
 
             not_masked = ~union_mask
-            result[not_masked] = cat_core([x[not_masked] for x in all_cols],
+            result[not_masked] = cat_safe([x[not_masked] for x in all_cols],
                                           sep)
         elif na_rep is not None and union_mask.any():
             # fill NaNs with na_rep in case there are actually any NaNs
             all_cols = [np.where(nm, na_rep, col)
                         for nm, col in zip(na_masks, all_cols)]
-            result = cat_core(all_cols, sep)
+            result = cat_safe(all_cols, sep)
         else:
             # no NaNs - can just concatenate
-            result = cat_core(all_cols, sep)
+            result = cat_safe(all_cols, sep)
 
         if isinstance(self._orig, Index):
             # add dtype for case that result is all-NA
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index a1d522930e9aa..955554f60aa1f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -428,6 +428,23 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep):
         result = s.str.cat(t, sep=sep)
         assert_series_or_index_equal(result, expected)
 
+    # test integer/float dtypes (inferred by constructor) and mixed
+    @pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']],
+                             ids=['integers', 'floats', 'mixed'])
+    # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
+    @pytest.mark.parametrize('box', [Series, Index, list,
+                                     lambda x: np.array(x, dtype=object)],
+                             ids=['Series', 'Index', 'list', 'np.array'])
+    def test_str_cat_wrong_dtype_raises(self, box, data):
+        # GH 22722
+        s = Series(['a', 'b', 'c'])
+        t = box(data)
+
+        msg = 'Concatenation requires list-likes containing only strings.*'
+        with pytest.raises(TypeError, match=msg):
+            # need to use outer and na_rep, as otherwise Index would not raise
+            s.str.cat(t, join='outer', na_rep='-')
+
     @pytest.mark.parametrize('box', [Series, Index])
     def test_str_cat_mixed_inputs(self, box):
         s = Index(['a', 'b', 'c', 'd'])