4
4
import numpy as np
5
5
import pytest
6
6
7
+ import pandas .util ._test_decorators as td
8
+
7
9
import pandas as pd
8
10
from pandas import (
9
11
Index ,
12
14
)
13
15
14
16
15
- def test_contains ():
17
+ @pytest .fixture (
18
+ params = [
19
+ "object" ,
20
+ "string" ,
21
+ pytest .param (
22
+ "arrow_string" , marks = td .skip_if_no ("pyarrow" , min_version = "1.0.0" )
23
+ ),
24
+ ]
25
+ )
26
+ def any_string_dtype (request ):
27
+ """
28
+ Parametrized fixture for string dtypes.
29
+ * 'object'
30
+ * 'string'
31
+ * 'arrow_string'
32
+ """
33
+ from pandas .core .arrays .string_arrow import ArrowStringDtype # noqa: F401
34
+
35
+ return request .param
36
+
37
+
38
+ def test_contains (any_string_dtype ):
16
39
values = np .array (
17
40
["foo" , np .nan , "fooommm__foo" , "mmm_" , "foommm[_]+bar" ], dtype = np .object_
18
41
)
19
- values = Series (values )
42
+ values = Series (values , dtype = any_string_dtype )
20
43
pat = "mmm[_]+"
21
44
22
45
result = values .str .contains (pat )
23
- expected = Series (np .array ([False , np .nan , True , True , False ], dtype = np .object_ ))
46
+ expected_dtype = "object" if any_string_dtype == "object" else "boolean"
47
+ expected = Series (
48
+ np .array ([False , np .nan , True , True , False ], dtype = np .object_ ),
49
+ dtype = expected_dtype ,
50
+ )
24
51
tm .assert_series_equal (result , expected )
25
52
26
53
result = values .str .contains (pat , regex = False )
27
- expected = Series (np .array ([False , np .nan , False , False , True ], dtype = np .object_ ))
54
+ expected = Series (
55
+ np .array ([False , np .nan , False , False , True ], dtype = np .object_ ),
56
+ dtype = expected_dtype ,
57
+ )
28
58
tm .assert_series_equal (result , expected )
29
59
30
- values = Series (np .array (["foo" , "xyz" , "fooommm__foo" , "mmm_" ], dtype = object ))
60
+ values = Series (
61
+ np .array (["foo" , "xyz" , "fooommm__foo" , "mmm_" ], dtype = object ),
62
+ dtype = any_string_dtype ,
63
+ )
31
64
result = values .str .contains (pat )
32
- expected = Series ( np .array ([ False , False , True , True ]))
33
- assert result . dtype == np .bool_
65
+ expected_dtype = np .bool_ if any_string_dtype == "object" else "boolean"
66
+ expected = Series ( np .array ([ False , False , True , True ]), dtype = expected_dtype )
34
67
tm .assert_series_equal (result , expected )
35
68
36
69
# case insensitive using regex
37
- values = Series (np .array (["Foo" , "xYz" , "fOOomMm__fOo" , "MMM_" ], dtype = object ))
70
+ values = Series (
71
+ np .array (["Foo" , "xYz" , "fOOomMm__fOo" , "MMM_" ], dtype = object ),
72
+ dtype = any_string_dtype ,
73
+ )
38
74
result = values .str .contains ("FOO|mmm" , case = False )
39
- expected = Series (np .array ([True , False , True , True ]))
75
+ expected = Series (np .array ([True , False , True , True ]), dtype = expected_dtype )
40
76
tm .assert_series_equal (result , expected )
41
77
42
78
# case insensitive without regex
43
- result = Series ( values ) .str .contains ("foo" , regex = False , case = False )
44
- expected = Series (np .array ([True , False , True , False ]))
79
+ result = values .str .contains ("foo" , regex = False , case = False )
80
+ expected = Series (np .array ([True , False , True , False ]), dtype = expected_dtype )
45
81
tm .assert_series_equal (result , expected )
46
82
47
- # mixed
83
+ # unicode
84
+ values = Series (
85
+ np .array (["foo" , np .nan , "fooommm__foo" , "mmm_" ], dtype = np .object_ ),
86
+ dtype = any_string_dtype ,
87
+ )
88
+ pat = "mmm[_]+"
89
+
90
+ result = values .str .contains (pat )
91
+ expected_dtype = "object" if any_string_dtype == "object" else "boolean"
92
+ expected = Series (
93
+ np .array ([False , np .nan , True , True ], dtype = np .object_ ), dtype = expected_dtype
94
+ )
95
+ tm .assert_series_equal (result , expected )
96
+
97
+ result = values .str .contains (pat , na = False )
98
+ expected_dtype = np .bool_ if any_string_dtype == "object" else "boolean"
99
+ expected = Series (np .array ([False , False , True , True ]), dtype = expected_dtype )
100
+ tm .assert_series_equal (result , expected )
101
+
102
+ values = Series (
103
+ np .array (["foo" , "xyz" , "fooommm__foo" , "mmm_" ], dtype = np .object_ ),
104
+ dtype = any_string_dtype ,
105
+ )
106
+ result = values .str .contains (pat )
107
+ expected = Series (np .array ([False , False , True , True ]), dtype = expected_dtype )
108
+ tm .assert_series_equal (result , expected )
109
+
110
+
111
+ def test_contains_object_mixed ():
48
112
mixed = Series (
49
113
np .array (
50
114
["a" , np .nan , "b" , True , datetime .today (), "foo" , None , 1 , 2.0 ],
51
115
dtype = object ,
52
116
)
53
117
)
54
- rs = mixed .str .contains ("o" )
55
- xp = Series (
118
+ result = mixed .str .contains ("o" )
119
+ expected = Series (
56
120
np .array (
57
121
[False , np .nan , False , np .nan , np .nan , True , np .nan , np .nan , np .nan ],
58
122
dtype = np .object_ ,
59
123
)
60
124
)
61
- tm .assert_series_equal (rs , xp )
62
-
63
- rs = mixed .str .contains ("o" )
64
- xp = Series ([False , np .nan , False , np .nan , np .nan , True , np .nan , np .nan , np .nan ])
65
- assert isinstance (rs , Series )
66
- tm .assert_series_equal (rs , xp )
67
-
68
- # unicode
69
- values = Series (np .array (["foo" , np .nan , "fooommm__foo" , "mmm_" ], dtype = np .object_ ))
70
- pat = "mmm[_]+"
71
-
72
- result = values .str .contains (pat )
73
- expected = Series (np .array ([False , np .nan , True , True ], dtype = np .object_ ))
74
- tm .assert_series_equal (result , expected )
75
-
76
- result = values .str .contains (pat , na = False )
77
- expected = Series (np .array ([False , False , True , True ]))
78
- tm .assert_series_equal (result , expected )
79
-
80
- values = Series (np .array (["foo" , "xyz" , "fooommm__foo" , "mmm_" ], dtype = np .object_ ))
81
- result = values .str .contains (pat )
82
- expected = Series (np .array ([False , False , True , True ]))
83
- assert result .dtype == np .bool_
84
125
tm .assert_series_equal (result , expected )
85
126
86
127
87
- def test_contains_for_object_category ():
128
+ def test_contains_na_kwarg_for_object_category ():
88
129
# gh 22158
89
130
90
131
# na for category
@@ -108,6 +149,29 @@ def test_contains_for_object_category():
108
149
tm .assert_series_equal (result , expected )
109
150
110
151
152
+ @pytest .mark .parametrize (
153
+ "na, expected" ,
154
+ [
155
+ (None , pd .NA ),
156
+ (True , True ),
157
+ (False , False ),
158
+ (0 , False ),
159
+ (3 , True ),
160
+ (np .nan , pd .NA ),
161
+ ],
162
+ )
163
+ @pytest .mark .parametrize ("regex" , [True , False ])
164
+ def test_contains_na_kwarg_for_nullable_string_dtype (
165
+ nullable_string_dtype , na , expected , regex
166
+ ):
167
+ # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
168
+
169
+ values = Series (["a" , "b" , "c" , "a" , np .nan ], dtype = nullable_string_dtype )
170
+ result = values .str .contains ("a" , na = na , regex = regex )
171
+ expected = Series ([True , False , False , True , expected ], dtype = "boolean" )
172
+ tm .assert_series_equal (result , expected )
173
+
174
+
111
175
@pytest .mark .parametrize ("dtype" , [None , "category" ])
112
176
@pytest .mark .parametrize ("null_value" , [None , np .nan , pd .NA ])
113
177
@pytest .mark .parametrize ("na" , [True , False ])
@@ -508,59 +572,73 @@ def _check(result, expected):
508
572
tm .assert_series_equal (result , expected )
509
573
510
574
511
- def test_contains_moar ():
575
+ def test_contains_moar (any_string_dtype ):
512
576
# PR #1179
513
- s = Series (["A" , "B" , "C" , "Aaba" , "Baca" , "" , np .nan , "CABA" , "dog" , "cat" ])
577
+ s = Series (
578
+ ["A" , "B" , "C" , "Aaba" , "Baca" , "" , np .nan , "CABA" , "dog" , "cat" ],
579
+ dtype = any_string_dtype ,
580
+ )
514
581
515
582
result = s .str .contains ("a" )
583
+ expected_dtype = "object" if any_string_dtype == "object" else "boolean"
516
584
expected = Series (
517
- [False , False , False , True , True , False , np .nan , False , False , True ]
585
+ [False , False , False , True , True , False , np .nan , False , False , True ],
586
+ dtype = expected_dtype ,
518
587
)
519
588
tm .assert_series_equal (result , expected )
520
589
521
590
result = s .str .contains ("a" , case = False )
522
591
expected = Series (
523
- [True , False , False , True , True , False , np .nan , True , False , True ]
592
+ [True , False , False , True , True , False , np .nan , True , False , True ],
593
+ dtype = expected_dtype ,
524
594
)
525
595
tm .assert_series_equal (result , expected )
526
596
527
597
result = s .str .contains ("Aa" )
528
598
expected = Series (
529
- [False , False , False , True , False , False , np .nan , False , False , False ]
599
+ [False , False , False , True , False , False , np .nan , False , False , False ],
600
+ dtype = expected_dtype ,
530
601
)
531
602
tm .assert_series_equal (result , expected )
532
603
533
604
result = s .str .contains ("ba" )
534
605
expected = Series (
535
- [False , False , False , True , False , False , np .nan , False , False , False ]
606
+ [False , False , False , True , False , False , np .nan , False , False , False ],
607
+ dtype = expected_dtype ,
536
608
)
537
609
tm .assert_series_equal (result , expected )
538
610
539
611
result = s .str .contains ("ba" , case = False )
540
612
expected = Series (
541
- [False , False , False , True , True , False , np .nan , True , False , False ]
613
+ [False , False , False , True , True , False , np .nan , True , False , False ],
614
+ dtype = expected_dtype ,
542
615
)
543
616
tm .assert_series_equal (result , expected )
544
617
545
618
546
- def test_contains_nan ():
619
+ def test_contains_nan (any_string_dtype ):
547
620
# PR #14171
548
- s = Series ([np .nan , np .nan , np .nan ], dtype = np . object_ )
621
+ s = Series ([np .nan , np .nan , np .nan ], dtype = any_string_dtype )
549
622
550
623
result = s .str .contains ("foo" , na = False )
551
- expected = Series ([False , False , False ], dtype = np .bool_ )
624
+ expected_dtype = np .bool_ if any_string_dtype == "object" else "boolean"
625
+ expected = Series ([False , False , False ], dtype = expected_dtype )
552
626
tm .assert_series_equal (result , expected )
553
627
554
628
result = s .str .contains ("foo" , na = True )
555
- expected = Series ([True , True , True ], dtype = np . bool_ )
629
+ expected = Series ([True , True , True ], dtype = expected_dtype )
556
630
tm .assert_series_equal (result , expected )
557
631
558
632
result = s .str .contains ("foo" , na = "foo" )
559
- expected = Series (["foo" , "foo" , "foo" ], dtype = np .object_ )
633
+ if any_string_dtype == "object" :
634
+ expected = Series (["foo" , "foo" , "foo" ], dtype = np .object_ )
635
+ else :
636
+ expected = Series ([True , True , True ], dtype = "boolean" )
560
637
tm .assert_series_equal (result , expected )
561
638
562
639
result = s .str .contains ("foo" )
563
- expected = Series ([np .nan , np .nan , np .nan ], dtype = np .object_ )
640
+ expected_dtype = "object" if any_string_dtype == "object" else "boolean"
641
+ expected = Series ([np .nan , np .nan , np .nan ], dtype = expected_dtype )
564
642
tm .assert_series_equal (result , expected )
565
643
566
644
@@ -609,14 +687,14 @@ def test_replace_moar():
609
687
tm .assert_series_equal (result , expected )
610
688
611
689
612
- def test_match_findall_flags ( ):
690
+ def test_flags_kwarg ( any_string_dtype ):
613
691
data = {
614
692
615
693
616
694
617
695
"Wes" : np .nan ,
618
696
}
619
- data = Series (data )
697
+ data = Series (data , dtype = any_string_dtype )
620
698
621
699
pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
622
700
0 commit comments