Skip to content

Commit 835476f

Browse files
committed
ENH: additional unicode handling
1 parent f0afc3b commit 835476f

File tree

2 files changed, with 169 additions and 10 deletions.

pandas/core/strings.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -83,10 +83,10 @@ def wrapped(arr, n_results=None, *otherargs):
 
 return wrapped
 
-startswith = mapwrap(str.startswith)
-contains = mapwrap(str.__contains__)
-upper = mapwrap(str.upper)
-lower = mapwrap(str.lower)
+startswith = mapwrap(lambda x, p: x.startswith(p))
+contains = mapwrap(lambda x, p: x.__contains__(p))
+upper = mapwrap(lambda x: x.upper())
+lower = mapwrap(lambda x: x.lower())
 
 def _re_get_groups(pattern, n):
 def inner(s, *groups):
@@ -294,7 +294,7 @@ def str_lower(arr):
 -------
 lowercase : array
 """
-return _na_map(str.lower, arr)
+return _na_map(lambda x: x.lower(), arr)
 
 
 def str_upper(arr):
@@ -305,7 +305,7 @@ def str_upper(arr):
 -------
 uppercase : array
 """
-return _na_map(str.upper, arr)
+return _na_map(lambda x: x.upper(), arr)
 
 
 def str_replace(arr, pat, repl, n=0):
@@ -358,7 +358,7 @@ def rep(x, r):
 except TypeError:
 return unicode.__mul__(x, r)
 repeats = np.asarray(repeats, dtype=object)
-result = lib.vec_binop(arr, repeats, str.__mul__)
+result = lib.vec_binop(arr, repeats, rep)
 return result
 
 def str_match(arr, pat):
@@ -535,7 +535,7 @@ def str_strip(arr):
 -------
 stripped : array
 """
-return _na_map(str.strip, arr)
+return _na_map(lambda x: x.strip(), arr)
 
 
 def str_lstrip(arr):
@@ -547,7 +547,7 @@ def str_lstrip(arr):
 -------
 stripped : array
 """
-return _na_map(str.lstrip, arr)
+return _na_map(lambda x: x.lstrip(), arr)
 
 
 def str_rstrip(arr):
@@ -559,7 +559,7 @@ def str_rstrip(arr):
 -------
 stripped : array
 """
-return _na_map(str.rstrip, arr)
+return _na_map(lambda x: x.rstrip(), arr)
 
 
 def str_wrap(arr, width=80):

pandas/tests/test_strings.py

Lines changed: 159 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,17 @@ def test_count(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = [u'foo', u'foofoo', NA, u'foooofooofommmfoo']
+
+result = strings.str_count(values, 'f[o]+')
+exp = [1, 2, NA, 4]
+tm.assert_almost_equal(result, exp)
+
+result = Series(values).str.count('f[o]+')
+self.assert_(isinstance(result, Series))
+tm.assert_almost_equal(result, exp)
+
 def test_contains(self):
 values = ['foo', NA, 'fooommm__foo', 'mmm_']
 pat = 'mmm[_]+'
@@ -95,6 +106,20 @@ def test_contains(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = [u'foo', NA, u'fooommm__foo', u'mmm_']
+pat = 'mmm[_]+'
+
+result = strings.str_contains(values, pat)
+expected = [False, np.nan, True, True]
+tm.assert_almost_equal(result, expected)
+
+values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
+result = strings.str_contains(values, pat)
+expected = [False, False, True, True]
+self.assert_(result.dtype == np.bool_)
+tm.assert_almost_equal(result, expected)
+
 def test_startswith(self):
 values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
 
@@ -112,6 +137,14 @@ def test_startswith(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'om', NA, u'foo_nom', u'nom', u'bar_foo', NA,
+u'foo'])
+
+result = values.str.startswith('foo')
+exp = Series([False, NA, True, False, False, NA, True])
+tm.assert_series_equal(result, exp)
+
 def test_endswith(self):
 values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
 
@@ -129,6 +162,14 @@ def test_endswith(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'om', NA, u'foo_nom', u'nom', u'bar_foo', NA,
+u'foo'])
+
+result = values.str.endswith('foo')
+exp = Series([False, NA, False, False, True, NA, True])
+tm.assert_series_equal(result, exp)
+
 def test_lower_upper(self):
 values = Series(['om', NA, 'nom', 'nom'])
 
@@ -148,6 +189,16 @@ def test_lower_upper(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'om', NA, u'nom', u'nom'])
+
+result = values.str.upper()
+exp = Series([u'OM', NA, u'NOM', u'NOM'])
+tm.assert_series_equal(result, exp)
+
+result = result.str.lower()
+tm.assert_series_equal(result, values)
+
 def test_replace(self):
 values = Series(['fooBAD__barBAD', NA])
 
@@ -168,6 +219,17 @@ def test_replace(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'fooBAD__barBAD', NA])
+
+result = values.str.replace('BAD[_]*', '')
+exp = Series([u'foobar', NA])
+tm.assert_series_equal(result, exp)
+
+result = values.str.replace('BAD[_]*', '', n=1)
+exp = Series([u'foobarBAD', NA])
+tm.assert_series_equal(result, exp)
+
 def test_repeat(self):
 values = Series(['a', 'b', NA, 'c', NA, 'd'])
 
@@ -188,6 +250,18 @@ def test_repeat(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a', u'b', NA, u'c', NA, u'd'])
+
+result = values.str.repeat(3)
+exp = Series([u'aaa', u'bbb', NA, u'ccc', NA, u'ddd'])
+tm.assert_series_equal(result, exp)
+
+result = values.str.repeat([1, 2, 3, 4, 5, 6])
+exp = Series([u'a', u'bb', NA, u'cccc', NA, u'dddddd'])
+tm.assert_series_equal(result, exp)
+
+
 def test_match(self):
 values = Series(['fooBAD__barBAD', NA, 'foo'])
 
@@ -204,6 +278,13 @@ def test_match(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'fooBAD__barBAD', NA, u'foo'])
+
+result = values.str.match('.*(BAD[_]+).*(BAD)')
+exp = Series([(u'BAD__', u'BAD'), NA, []])
+tm.assert_series_equal(result, exp)
+
 def test_join(self):
 values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
 result = values.str.split('_').str.join('_')
@@ -219,6 +300,11 @@ def test_join(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a_b_c', u'c_d_e', np.nan, u'f_g_h'])
+result = values.str.split('_').str.join('_')
+tm.assert_series_equal(values, result)
+
 def test_len(self):
 values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])
 
@@ -236,6 +322,13 @@ def test_len(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'foo', u'fooo', u'fooooo', np.nan, u'fooooooo'])
+
+result = values.str.len()
+exp = values.map(lambda x: len(x) if com.notnull(x) else NA)
+tm.assert_series_equal(result, exp)
+
 def test_findall(self):
 values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD'])
 
@@ -253,6 +346,13 @@ def test_findall(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'fooBAD__barBAD', NA, u'foo', u'BAD'])
+
+result = values.str.findall('BAD[_]*')
+exp = Series([[u'BAD__', u'BAD'], NA, [], [u'BAD']])
+tm.assert_almost_equal(result, exp)
+
 def test_pad(self):
 values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
 
@@ -296,6 +396,21 @@ def test_pad(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a', u'b', NA, u'c', NA, u'eeeeee'])
+
+result = values.str.pad(5, side='left')
+exp = Series([u' a', u' b', NA, u' c', NA, u'eeeeee'])
+tm.assert_almost_equal(result, exp)
+
+result = values.str.pad(5, side='right')
+exp = Series([u'a ', u'b ', NA, u'c ', NA, u'eeeeee'])
+tm.assert_almost_equal(result, exp)
+
+result = values.str.pad(5, side='both')
+exp = Series([u' a ', u' b ', NA, u' c ', NA, u'eeeeee'])
+tm.assert_almost_equal(result, exp)
+
 def test_center(self):
 values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
 
@@ -314,6 +429,13 @@ def test_center(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a', u'b', NA, u'c', NA, u'eeeeee'])
+
+result = values.str.center(5)
+exp = Series([u' a ', u' b ', NA, u' c ', NA, u'eeeeee'])
+tm.assert_almost_equal(result, exp)
+
 def test_split(self):
 values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
 
@@ -332,6 +454,14 @@ def test_split(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h'])
+
+result = values.str.split('_')
+exp = Series([[u'a', u'b', u'c'], [u'c', u'd', u'e'], NA,
+[u'f', u'g', u'h']])
+tm.assert_series_equal(result, exp)
+
 def test_slice(self):
 values = Series(['aafootwo','aabartwo', NA, 'aabazqux'])
 
@@ -350,6 +480,13 @@ def test_slice(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'aafootwo', u'aabartwo', NA, u'aabazqux'])
+
+result = values.str.slice(2, 5)
+exp = Series([u'foo', u'bar', NA, u'baz'])
+tm.assert_series_equal(result, exp)
+
 def test_slice_replace(self):
 pass
 
@@ -393,6 +530,21 @@ def test_strip_lstrip_rstrip(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u' aa ', u' bb \n', NA, u'cc '])
+
+result = values.str.strip()
+exp = Series([u'aa', u'bb', NA, u'cc'])
+tm.assert_series_equal(result, exp)
+
+result = values.str.lstrip()
+exp = Series([u'aa ', u'bb \n', NA, u'cc '])
+tm.assert_series_equal(result, exp)
+
+result = values.str.rstrip()
+exp = Series([u' aa', u' bb', NA, u'cc'])
+tm.assert_series_equal(result, exp)
+
 def test_wrap(self):
 pass
 
@@ -414,6 +566,13 @@ def test_get(self):
 self.assert_(isinstance(rs, Series))
 tm.assert_almost_equal(rs, xp)
 
+#unicode
+values = Series([u'a_b_c', u'c_d_e', np.nan, u'f_g_h'])
+
+result = values.str.split('_').str.get(1)
+expected = Series([u'b', u'd', np.nan, u'g'])
+tm.assert_series_equal(result, expected)
+
 if __name__ == '__main__':
 nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
 exit=False)

0 commit comments

Comments
 (0)