@@ -415,6 +415,94 @@ def test_match(self):
415
415
exp = Series ([(u ('BAD__' ), u ('BAD' )), NA , []])
416
416
tm .assert_series_equal (result , exp )
417
417
418
def test_extract(self):
    """Check ``Series.str.extract`` across group layouts and input types.

    The assertions below pin the following expectations:

    * a pattern with one capture group yields a Series, a pattern with
      two capture groups yields a DataFrame;
    * rows that do not match (or are not strings) come back as NA;
    * named groups supply the Series name / DataFrame column labels,
      while unnamed groups are labelled by position;
    * non-capturing groups contribute no output column;
    * a pattern without any capture group raises ``ValueError``.
    """
    # Mirrors the scenarios in test_match and adds several more.
    empty_row = [NA, NA]
    bad_pattern = '.*(BAD[_]+).*(BAD)'

    # plain strings: only the first entry matches both groups
    plain = Series(['fooBAD__barBAD', NA, 'foo'])
    expected = DataFrame([['BAD__', 'BAD'], empty_row, empty_row])
    actual = plain.str.extract(bad_pattern)
    tm.assert_frame_equal(actual, expected)

    # mixed: non-string entries are expected to produce empty rows
    mixed = Series([
        'aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
        'foo', None, 1, 2.,
    ])
    expected = DataFrame(
        [['BAD_', 'BAD'], empty_row, ['BAD_', 'BAD']] + [empty_row] * 6
    )
    actual = Series(mixed).str.extract(bad_pattern)
    tm.assert_frame_equal(actual, expected)

    # unicode
    uni = Series([u('fooBAD__barBAD'), NA, u('foo')])
    expected = DataFrame([[u('BAD__'), u('BAD')], empty_row, empty_row])
    actual = uni.str.extract(bad_pattern)
    tm.assert_frame_equal(actual, expected)

    codes = Series(['A1', 'B2', 'C3'])

    # no groups
    self.assertRaises(ValueError, codes.str.extract, '[ABC][123]')

    # only non-capturing groups
    self.assertRaises(ValueError, codes.str.extract, '(?:[AB]).*')

    # one group, no matches
    expected = Series([NA, NA, NA])
    actual = codes.str.extract('(_)')
    tm.assert_series_equal(actual, expected)

    # two groups, no matches
    expected = DataFrame([[NA, NA], [NA, NA], [NA, NA]])
    actual = codes.str.extract('(_)(_)')
    tm.assert_frame_equal(actual, expected)

    # one group, some matches
    expected = Series(['A', 'B', NA])
    actual = codes.str.extract('([AB])[123]')
    tm.assert_series_equal(actual, expected)

    # two groups, some matches
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
    actual = codes.str.extract('([AB])([123])')
    tm.assert_frame_equal(actual, expected)

    # named group/groups: names become the column labels / Series name
    expected = DataFrame(
        [['A', '1'], ['B', '2'], [NA, NA]],
        columns=['letter', 'number'],
    )
    actual = codes.str.extract('(?P<letter>[AB])(?P<number>[123])')
    tm.assert_frame_equal(actual, expected)

    expected = Series(['A', 'B', NA], name='letter')
    actual = codes.str.extract('(?P<letter>[AB])')
    tm.assert_series_equal(actual, expected)

    # mix named and unnamed groups: the unnamed one is labelled 0
    expected = DataFrame(
        [['A', '1'], ['B', '2'], [NA, NA]],
        columns=[0, 'number'],
    )
    actual = codes.str.extract('([AB])(?P<number>[123])')
    tm.assert_frame_equal(actual, expected)

    # one normal group, one non-capturing group
    expected = Series(['A', 'B', NA])
    actual = codes.str.extract('([AB])(?:[123])')
    tm.assert_series_equal(actual, expected)

    # two normal groups, one non-capturing group
    doubled = Series(['A11', 'B22', 'C33'])
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
    actual = doubled.str.extract('([AB])([123])(?:[123])')
    tm.assert_frame_equal(actual, expected)

    # one optional group followed by one normal group
    optional_first = Series(['A1', 'B2', '3'])
    expected = DataFrame(
        [['A', '1'], ['B', '2'], [NA, '3']],
        columns=['letter', 'number'],
    )
    actual = optional_first.str.extract(
        '(?P<letter>[AB])?(?P<number>[123])'
    )
    tm.assert_frame_equal(actual, expected)

    # one normal group followed by one optional group
    optional_last = Series(['A1', 'B2', 'C'])
    expected = DataFrame(
        [['A', '1'], ['B', '2'], ['C', NA]],
        columns=['letter', 'number'],
    )
    actual = optional_last.str.extract(
        '(?P<letter>[ABC])(?P<number>[123])?'
    )
    tm.assert_frame_equal(actual, expected)
505
+
418
506
def test_join (self ):
419
507
values = Series (['a_b_c' , 'c_d_e' , np .nan , 'f_g_h' ])
420
508
result = values .str .split ('_' ).str .join ('_' )
0 commit comments