Skip to content

[ArrowStringArray] REF: extract/extractall column names #41417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 12, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import re
from typing import (
Dict,
Hashable,
List,
Optional,
Pattern,
)
import warnings

Expand Down Expand Up @@ -3036,13 +3038,31 @@ def _result_dtype(arr):
return object


def _get_single_group_name(rx):
try:
return list(rx.groupindex.keys()).pop()
except IndexError:
def _get_single_group_name(regex: Pattern) -> Hashable:
if regex.groupindex:
return next(iter(regex.groupindex))
else:
return None


def _get_group_names(regex: Pattern) -> List[Hashable]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code has been working for a while, but I'm wondering whether it would be useful to add a test for this function; it seems trivial to add.

"""
Get named groups from compiled regex.

Unnamed groups are numbered.

Parameters
----------
regex : compiled regex

Returns
-------
list of column labels
"""
names = {v: k for k, v in regex.groupindex.items()}
return [names.get(1 + i, i) for i in range(regex.groups)]


def _str_extract_noexpand(arr, pat, flags=0):
"""
Find groups in each string in the Series using passed regular
Expand All @@ -3069,8 +3089,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
if isinstance(arr, ABCIndex):
raise ValueError("only one regex group is supported with Index")
name = None
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
columns = _get_group_names(regex)
if arr.size == 0:
# error: Incompatible types in assignment (expression has type
# "DataFrame", variable has type "ndarray")
Expand Down Expand Up @@ -3101,8 +3120,7 @@ def _str_extract_frame(arr, pat, flags=0):

regex = re.compile(pat, flags=flags)
groups_or_na = _groups_or_na_fun(regex)
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
columns = _get_group_names(regex)

if len(arr) == 0:
return DataFrame(columns=columns, dtype=object)
Expand Down Expand Up @@ -3139,8 +3157,7 @@ def str_extractall(arr, pat, flags=0):
if isinstance(arr, ABCIndex):
arr = arr.to_series().reset_index(drop=True)

names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
columns = _get_group_names(regex)
match_list = []
index_list = []
is_mi = arr.index.nlevels > 1
Expand Down