@@ -1429,12 +1429,10 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
1429
1429
def fullmatch (self , pat , case : bool = True , flags : int = 0 , na = lib .no_default ):
1430
1430
"""
1431
1431
Determine if each string entirely matches a regular expression.
1432
-
1433
1432
Checks if each string in the Series or Index fully matches the
1434
1433
specified regular expression pattern. This function is useful when the
1435
1434
requirement is for an entire string to conform to a pattern, such as
1436
1435
validating formats like phone numbers or email addresses.
1437
-
1438
1436
Parameters
1439
1437
----------
1440
1438
pat : str
@@ -1448,29 +1446,77 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
1448
1446
array. For object-dtype, ``numpy.nan`` is used. For the nullable
1449
1447
``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
1450
1448
``False`` is used.
1451
-
1452
1449
Returns
1453
1450
-------
1454
1451
Series/Index/array of boolean values
1455
1452
The function returns a Series, Index, or array of boolean values,
1456
1453
where True indicates that the entire string matches the regular
1457
1454
expression pattern and False indicates that it does not.
1458
-
1459
1455
See Also
1460
1456
--------
1461
1457
match : Similar, but also returns `True` when only a *prefix* of the string
1462
1458
matches the regular expression.
1463
1459
extract : Extract matched groups.
1464
-
1460
+ Notes
1461
+ -----
1462
+ This method enforces consistent behavior between Python's string dtype
1463
+ and PyArrow-backed string arrays when using regular expressions
1464
+ containing alternation (|). For regex patterns with alternation operators,
1465
+ the method ensures proper grouping by wrapping the pattern in parentheses
1466
+ when using PyArrow-backed string arrays.
1465
1467
Examples
1466
1468
--------
1467
1469
>>> ser = pd.Series(["cat", "duck", "dove"])
1468
1470
>>> ser.str.fullmatch(r"d.+")
1469
- 0 False
1470
- 1 True
1471
- 2 True
1471
+ 0 False
1472
+ 1 True
1473
+ 2 True
1474
+ dtype: bool
1475
+ Ensure consistent behavior with alternation patterns:
1476
+ >>> ser = pd.Series(['asdf', 'as'], dtype='string[pyarrow]')
1477
+ >>> ser.str.fullmatch(r"(as)|(as)")
1478
+ 0 False
1479
+ 1 True
1472
1480
dtype: bool
1473
1481
"""
1482
+ is_pyarrow = False
1483
+ arr = self ._data .array
1484
+ arr_type = type (arr ).__name__
1485
+ is_pyarrow = arr_type == "ArrowStringArray"
1486
+ if not is_pyarrow :
1487
+ is_pyarrow = "Arrow" in arr_type
1488
+ if not is_pyarrow and hasattr (arr , "dtype" ):
1489
+ dtype_str = str (arr .dtype )
1490
+ is_pyarrow = "pyarrow" in dtype_str .lower () or "arrow" in dtype_str .lower ()
1491
+ if is_pyarrow and "|" in pat :
1492
+ def _is_fully_wrapped (pattern ):
1493
+ if not (pattern .startswith ('(' ) and pattern .endswith (')' )):
1494
+ return False
1495
+ inner = pattern [1 :- 1 ]
1496
+ level = 0
1497
+ escape = False
1498
+ in_char_class = False
1499
+ for char in inner :
1500
+ if escape :
1501
+ escape = False
1502
+ continue
1503
+ if char == '\\ ' :
1504
+ escape = True
1505
+ elif not in_char_class and char == '[' :
1506
+ in_char_class = True
1507
+ elif in_char_class and char == ']' :
1508
+ in_char_class = False
1509
+ elif not in_char_class :
1510
+ if char == '(' :
1511
+ level += 1
1512
+ elif char == ')' :
1513
+ if level == 0 :
1514
+ return False
1515
+ level -= 1
1516
+ return level == 0
1517
+ if not (pat .startswith ('(' ) and pat .endswith (')' ) and
1518
+ _is_fully_wrapped (pat )):
1519
+ pat = f"({ pat } )"
1474
1520
result = self ._data .array ._str_fullmatch (pat , case = case , flags = flags , na = na )
1475
1521
return self ._wrap_result (result , fill_value = na , returns_string = False )
1476
1522
0 commit comments