Skip to content

Commit 4dc53a4

Browse files
h-vetinaritm9k1
authored and committed
CLN/ERR: str.cat internals (pandas-dev#22725)
1 parent 8b3cedb commit 4dc53a4

File tree

2 files changed

+57
-157
lines changed

2 files changed

+57
-157
lines changed

pandas/core/strings.py

+51-110
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33

44
from pandas.compat import zip
55
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
6-
from pandas.core.dtypes.missing import isna, notna
6+
from pandas.core.dtypes.missing import isna
77
from pandas.core.dtypes.common import (
8+
ensure_object,
89
is_bool_dtype,
910
is_categorical_dtype,
1011
is_object_dtype,
@@ -36,114 +37,26 @@
3637
_shared_docs = dict()
3738

3839

39-
def _get_array_list(arr, others):
40-
"""
41-
Auxiliary function for :func:`str_cat`
42-
43-
Parameters
44-
----------
45-
arr : ndarray
46-
The left-most ndarray of the concatenation
47-
others : list, ndarray, Series
48-
The rest of the content to concatenate. If list of list-likes,
49-
all elements must be passable to ``np.asarray``.
50-
51-
Returns
52-
-------
53-
list
54-
List of all necessary arrays
55-
"""
56-
from pandas.core.series import Series
57-
58-
if len(others) and isinstance(com.values_from_object(others)[0],
59-
(list, np.ndarray, Series)):
60-
arrays = [arr] + list(others)
61-
else:
62-
arrays = [arr, others]
63-
64-
return [np.asarray(x, dtype=object) for x in arrays]
65-
66-
67-
def str_cat(arr, others=None, sep=None, na_rep=None):
40+
def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep
    """
    # Interleave the separator between the columns:
    # [col_0, sep, col_1, sep, ..., col_n]
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    # Build the object array explicitly. Handing the mixed list of arrays and
    # scalar strings straight to np.sum relies on implicit ragged-sequence
    # coercion, which newer NumPy versions reject with a ValueError.
    arr_with_sep = np.asarray(list_with_sep, dtype=object)
    # The separator is interleaved even when it is '', so that non-string
    # values still fail with TypeError instead of being summed numerically.
    return np.sum(arr_with_sep, axis=0)
14760

14861

14962
def _na_map(f, arr, na_result=np.nan, dtype=object):
@@ -2283,6 +2196,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
22832196

22842197
if isinstance(others, compat.string_types):
22852198
raise ValueError("Did you mean to supply a `sep` keyword?")
2199+
if sep is None:
2200+
sep = ''
22862201

22872202
if isinstance(self._orig, Index):
22882203
data = Series(self._orig, index=self._orig)
@@ -2291,9 +2206,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
22912206

22922207
# concatenate Series/Index with itself if no "others"
22932208
if others is None:
2294-
result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2295-
return self._wrap_result(result,
2296-
use_codes=(not self._is_categorical))
2209+
data = ensure_object(data)
2210+
na_mask = isna(data)
2211+
if na_rep is None and na_mask.any():
2212+
data = data[~na_mask]
2213+
elif na_rep is not None and na_mask.any():
2214+
data = np.where(na_mask, na_rep, data)
2215+
return sep.join(data)
22972216

22982217
try:
22992218
# turn anything in "others" into lists of Series
@@ -2320,23 +2239,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
23202239
"'outer'|'inner'|'right'`. The future default will "
23212240
"be `join='left'`.", FutureWarning, stacklevel=2)
23222241

2242+
# if join is None, _get_series_list already force-aligned indexes
2243+
join = 'left' if join is None else join
2244+
23232245
# align if required
2324-
if join is not None:
2246+
if any(not data.index.equals(x.index) for x in others):
23252247
# Need to add keys for uniqueness in case of duplicate columns
23262248
others = concat(others, axis=1,
23272249
join=(join if join == 'inner' else 'outer'),
2328-
keys=range(len(others)))
2250+
keys=range(len(others)), copy=False)
23292251
data, others = data.align(others, join=join)
23302252
others = [others[x] for x in others] # again list of Series
23312253

2332-
# str_cat discards index
2333-
res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
2254+
all_cols = [ensure_object(x) for x in [data] + others]
2255+
na_masks = np.array([isna(x) for x in all_cols])
2256+
union_mask = np.logical_or.reduce(na_masks, axis=0)
2257+
2258+
if na_rep is None and union_mask.any():
2259+
# no na_rep means NaNs for all rows where any column has a NaN
2260+
# only necessary if there are actually any NaNs
2261+
result = np.empty(len(data), dtype=object)
2262+
np.putmask(result, union_mask, np.nan)
2263+
2264+
not_masked = ~union_mask
2265+
result[not_masked] = cat_core([x[not_masked] for x in all_cols],
2266+
sep)
2267+
elif na_rep is not None and union_mask.any():
2268+
# fill NaNs with na_rep in case there are actually any NaNs
2269+
all_cols = [np.where(nm, na_rep, col)
2270+
for nm, col in zip(na_masks, all_cols)]
2271+
result = cat_core(all_cols, sep)
2272+
else:
2273+
# no NaNs - can just concatenate
2274+
result = cat_core(all_cols, sep)
23342275

23352276
if isinstance(self._orig, Index):
2336-
res = Index(res, name=self._orig.name)
2277+
result = Index(result, name=self._orig.name)
23372278
else: # Series
2338-
res = Series(res, index=data.index, name=self._orig.name)
2339-
return res
2279+
result = Series(result, index=data.index, name=self._orig.name)
2280+
return result
23402281

23412282
_shared_docs['str_split'] = ("""
23422283
Split strings around given separator/delimiter.

pandas/tests/test_strings.py

+6-47
Original file line numberDiff line numberDiff line change
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
9797
assert i == 100
9898
assert s == 'h'
9999

100-
def test_cat(self):
101-
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
102-
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)
103-
104-
# single array
105-
result = strings.str_cat(one)
106-
exp = 'aabbc'
107-
assert result == exp
108-
109-
result = strings.str_cat(one, na_rep='NA')
110-
exp = 'aabbcNA'
111-
assert result == exp
112-
113-
result = strings.str_cat(one, na_rep='-')
114-
exp = 'aabbc-'
115-
assert result == exp
116-
117-
result = strings.str_cat(one, sep='_', na_rep='NA')
118-
exp = 'a_a_b_b_c_NA'
119-
assert result == exp
120-
121-
result = strings.str_cat(two, sep='-')
122-
exp = 'a-b-d-foo'
123-
assert result == exp
124-
125-
# Multiple arrays
126-
result = strings.str_cat(one, [two], na_rep='NA')
127-
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
128-
dtype=np.object_)
129-
tm.assert_numpy_array_equal(result, exp)
130-
131-
result = strings.str_cat(one, two)
132-
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
133-
tm.assert_almost_equal(result, exp)
134-
135-
# error for incorrect lengths
136-
rgx = 'All arrays must be same length'
137-
three = Series(['1', '2', '3'])
138-
139-
with tm.assert_raises_regex(ValueError, rgx):
140-
strings.str_cat(one, three)
141-
142-
# error for incorrect type
143-
rgx = "Must pass arrays containing strings to str_cat"
144-
with tm.assert_raises_regex(ValueError, rgx):
145-
strings.str_cat(one, 'three')
146-
147100
@pytest.mark.parametrize('box', [Series, Index])
148101
@pytest.mark.parametrize('other', [None, Series, Index])
149102
def test_str_cat_name(self, box, other):
@@ -414,6 +367,12 @@ def test_str_cat_align_mixed_inputs(self, join):
414367
with tm.assert_raises_regex(ValueError, rgx):
415368
s.str.cat([t, z], join=join)
416369

370+
def test_str_cat_raises(self):
    # Object dtype does not guarantee string content: concatenating a
    # Series of integers with itself has to raise a TypeError.
    ser = Series([1, 2, 3, 4], dtype='object')
    expected_msg = "unsupported operand type.*"
    with tm.assert_raises_regex(TypeError, expected_msg):
        ser.str.cat(ser)
375+
417376
def test_str_cat_special_cases(self):
418377
s = Series(['a', 'b', 'c', 'd'])
419378
t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])

0 commit comments

Comments
 (0)