Skip to content

Commit 255e8ac

Browse files
mroeschke authored and pmhatre1 committed
PERF: Return RangeIndex columns in str.extract when possible (pandas-dev#57542)
1 parent ce29ce4 commit 255e8ac

File tree

3 files changed

+14
-7
lines changed

3 files changed

+14
-7
lines changed

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ Performance improvements
176176
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
177177
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
178178
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
179-
-
179+
- :meth:`Series.str.extract` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57542`)
180180

181181
.. ---------------------------------------------------------------------------
182182
.. _whatsnew_300.bug_fixes:

pandas/core/strings/accessor.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -3557,7 +3557,7 @@ def _get_single_group_name(regex: re.Pattern) -> Hashable:
35573557
return None
35583558

35593559

3560-
def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3560+
def _get_group_names(regex: re.Pattern) -> list[Hashable] | range:
35613561
"""
35623562
Get named groups from compiled regex.
35633563
@@ -3571,8 +3571,15 @@ def _get_group_names(regex: re.Pattern) -> list[Hashable]:
35713571
-------
35723572
list of column labels
35733573
"""
3574+
rng = range(regex.groups)
35743575
names = {v: k for k, v in regex.groupindex.items()}
3575-
return [names.get(1 + i, i) for i in range(regex.groups)]
3576+
if not names:
3577+
return rng
3578+
result: list[Hashable] = [names.get(1 + i, i) for i in rng]
3579+
arr = np.array(result)
3580+
if arr.dtype.kind == "i" and lib.is_range_indexer(arr, len(arr)):
3581+
return rng
3582+
return result
35763583

35773584

35783585
def str_extractall(arr, pat, flags: int = 0) -> DataFrame:

pandas/tests/strings/test_extract.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -563,13 +563,13 @@ def test_extractall_no_matches(data, names, any_string_dtype):
563563

564564
# one un-named group.
565565
result = s.str.extractall("(z)")
566-
expected = DataFrame(columns=[0], index=expected_index, dtype=any_string_dtype)
567-
tm.assert_frame_equal(result, expected)
566+
expected = DataFrame(columns=range(1), index=expected_index, dtype=any_string_dtype)
567+
tm.assert_frame_equal(result, expected, check_column_type=True)
568568

569569
# two un-named groups.
570570
result = s.str.extractall("(z)(z)")
571-
expected = DataFrame(columns=[0, 1], index=expected_index, dtype=any_string_dtype)
572-
tm.assert_frame_equal(result, expected)
571+
expected = DataFrame(columns=range(2), index=expected_index, dtype=any_string_dtype)
572+
tm.assert_frame_equal(result, expected, check_column_type=True)
573573

574574
# one named group.
575575
result = s.str.extractall("(?P<first>z)")

0 commit comments

Comments
 (0)