3
3
4
4
from pandas .compat import zip
5
5
from pandas .core .dtypes .generic import ABCSeries , ABCIndex
6
- from pandas .core .dtypes .missing import isna , notna
6
+ from pandas .core .dtypes .missing import isna
7
7
from pandas .core .dtypes .common import (
8
+ ensure_object ,
8
9
is_bool_dtype ,
9
10
is_categorical_dtype ,
10
11
is_object_dtype ,
36
37
_shared_docs = dict ()
37
38
38
39
39
- def _get_array_list (arr , others ):
40
- """
41
- Auxiliary function for :func:`str_cat`
42
-
43
- Parameters
44
- ----------
45
- arr : ndarray
46
- The left-most ndarray of the concatenation
47
- others : list, ndarray, Series
48
- The rest of the content to concatenate. If list of list-likes,
49
- all elements must be passable to ``np.asarray``.
50
-
51
- Returns
52
- -------
53
- list
54
- List of all necessary arrays
55
- """
56
- from pandas .core .series import Series
57
-
58
- if len (others ) and isinstance (com .values_from_object (others )[0 ],
59
- (list , np .ndarray , Series )):
60
- arrays = [arr ] + list (others )
61
- else :
62
- arrays = [arr , others ]
63
-
64
- return [np .asarray (x , dtype = object ) for x in arrays ]
65
-
66
-
67
- def str_cat (arr , others = None , sep = None , na_rep = None ):
40
def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Concatenate the given columns element-wise, inserting `sep` between
    the values of neighbouring columns.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of equal-length object arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep
    """
    if sep == '':
        # Nothing to interleave: stacking the columns as-is avoids creating
        # (and adding) a run of empty separator strings.
        arr_of_cols = np.asarray(list_of_columns, dtype=object)
        return np.sum(arr_of_cols, axis=0)

    # Lay the columns out at the even positions and the separator at the
    # odd ones: [col_0, sep, col_1, sep, ..., col_k].
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns

    # The list mixes arrays with scalar strings, i.e. it is "ragged".
    # Request dtype=object explicitly; letting np.sum convert the raw list
    # relies on implicit ragged-array creation, which newer NumPy rejects.
    arr_with_sep = np.asarray(list_with_sep, dtype=object)

    # Summing object entries along axis 0 chains ``col_0 + sep + col_1 ...``,
    # which broadcasts the separator over every row.
    return np.sum(arr_with_sep, axis=0)
147
60
148
61
149
62
def _na_map (f , arr , na_result = np .nan , dtype = object ):
@@ -2283,6 +2196,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2283
2196
2284
2197
if isinstance (others , compat .string_types ):
2285
2198
raise ValueError ("Did you mean to supply a `sep` keyword?" )
2199
+ if sep is None :
2200
+ sep = ''
2286
2201
2287
2202
if isinstance (self ._orig , Index ):
2288
2203
data = Series (self ._orig , index = self ._orig )
@@ -2291,9 +2206,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2291
2206
2292
2207
# concatenate Series/Index with itself if no "others"
2293
2208
if others is None :
2294
- result = str_cat (data , others = others , sep = sep , na_rep = na_rep )
2295
- return self ._wrap_result (result ,
2296
- use_codes = (not self ._is_categorical ))
2209
+ data = ensure_object (data )
2210
+ na_mask = isna (data )
2211
+ if na_rep is None and na_mask .any ():
2212
+ data = data [~ na_mask ]
2213
+ elif na_rep is not None and na_mask .any ():
2214
+ data = np .where (na_mask , na_rep , data )
2215
+ return sep .join (data )
2297
2216
2298
2217
try :
2299
2218
# turn anything in "others" into lists of Series
@@ -2320,23 +2239,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
2320
2239
"'outer'|'inner'|'right'`. The future default will "
2321
2240
"be `join='left'`." , FutureWarning , stacklevel = 2 )
2322
2241
2242
+ # if join is None, _get_series_list already force-aligned indexes
2243
+ join = 'left' if join is None else join
2244
+
2323
2245
# align if required
2324
- if join is not None :
2246
+ if any ( not data . index . equals ( x . index ) for x in others ) :
2325
2247
# Need to add keys for uniqueness in case of duplicate columns
2326
2248
others = concat (others , axis = 1 ,
2327
2249
join = (join if join == 'inner' else 'outer' ),
2328
- keys = range (len (others )))
2250
+ keys = range (len (others )), copy = False )
2329
2251
data , others = data .align (others , join = join )
2330
2252
others = [others [x ] for x in others ] # again list of Series
2331
2253
2332
- # str_cat discards index
2333
- res = str_cat (data , others = others , sep = sep , na_rep = na_rep )
2254
+ all_cols = [ensure_object (x ) for x in [data ] + others ]
2255
+ na_masks = np .array ([isna (x ) for x in all_cols ])
2256
+ union_mask = np .logical_or .reduce (na_masks , axis = 0 )
2257
+
2258
+ if na_rep is None and union_mask .any ():
2259
+ # no na_rep means NaNs for all rows where any column has a NaN
2260
+ # only necessary if there are actually any NaNs
2261
+ result = np .empty (len (data ), dtype = object )
2262
+ np .putmask (result , union_mask , np .nan )
2263
+
2264
+ not_masked = ~ union_mask
2265
+ result [not_masked ] = cat_core ([x [not_masked ] for x in all_cols ],
2266
+ sep )
2267
+ elif na_rep is not None and union_mask .any ():
2268
+ # fill NaNs with na_rep in case there are actually any NaNs
2269
+ all_cols = [np .where (nm , na_rep , col )
2270
+ for nm , col in zip (na_masks , all_cols )]
2271
+ result = cat_core (all_cols , sep )
2272
+ else :
2273
+ # no NaNs - can just concatenate
2274
+ result = cat_core (all_cols , sep )
2334
2275
2335
2276
if isinstance (self ._orig , Index ):
2336
- res = Index (res , name = self ._orig .name )
2277
+ result = Index (result , name = self ._orig .name )
2337
2278
else : # Series
2338
- res = Series (res , index = data .index , name = self ._orig .name )
2339
- return res
2279
+ result = Series (result , index = data .index , name = self ._orig .name )
2280
+ return result
2340
2281
2341
2282
_shared_docs ['str_split' ] = ("""
2342
2283
Split strings around given separator/delimiter.
0 commit comments