Skip to content

Commit 2ef0292

Browse files
[ArrowStringArray] REF: str.extract - precursor to move from accessor to array (#41539)
1 parent ef99443 commit 2ef0292

File tree

1 file changed

+16
-42
lines changed

1 file changed

+16
-42
lines changed

pandas/core/strings/accessor.py

+16-42
Original file line number | Diff line number | Diff line change
@@ -3034,22 +3034,6 @@ def cat_core(list_of_columns: List, sep: str):
30343034
return np.sum(arr_with_sep, axis=0)
30353035

30363036

3037-
def _groups_or_na_fun(regex):
3038-
"""Used in both extract_noexpand and extract_frame"""
3039-
empty_row = [np.nan] * regex.groups
3040-
3041-
def f(x):
3042-
if not isinstance(x, str):
3043-
return empty_row
3044-
m = regex.search(x)
3045-
if m:
3046-
return [np.nan if item is None else item for item in m.groups()]
3047-
else:
3048-
return empty_row
3049-
3050-
return f
3051-
3052-
30533037
def _result_dtype(arr):
30543038
# workaround #27953
30553039
# ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
@@ -3087,41 +3071,31 @@ def _get_group_names(regex: Pattern) -> List[Hashable]:
30873071
return [names.get(1 + i, i) for i in range(regex.groups)]
30883072

30893073

3090-
def _str_extract_noexpand(arr: ArrayLike, pat: str, flags=0):
3074+
def _str_extract(arr: ArrayLike, pat: str, flags=0, expand: bool = True):
30913075
"""
30923076
Find groups in each string in the array using passed regular expression.
30933077
3094-
This function is called from str_extract(expand=False) when there is a single group
3095-
in the regex.
3096-
30973078
Returns
30983079
-------
3099-
np.ndarray
3080+
np.ndarray, or list of lists if expand is True
31003081
"""
31013082
regex = re.compile(pat, flags=flags)
3102-
groups_or_na = _groups_or_na_fun(regex)
3103-
3104-
result = np.array([groups_or_na(val)[0] for val in np.asarray(arr)], dtype=object)
3105-
return result
3106-
31073083

3108-
def _str_extract_expand(arr: ArrayLike, pat: str, flags: int = 0) -> List[List]:
3109-
"""
3110-
Find groups in each string in the array using passed regular expression.
3111-
3112-
For each subject string in the array, extract groups from the first match of
3113-
regular expression pat. This function is called from str_extract(expand=True) or
3114-
str_extract(expand=False) when there is more than one group in the regex.
3084+
empty_row = [np.nan] * regex.groups
31153085

3116-
Returns
3117-
-------
3118-
list of lists
3086+
def f(x):
3087+
if not isinstance(x, str):
3088+
return empty_row
3089+
m = regex.search(x)
3090+
if m:
3091+
return [np.nan if item is None else item for item in m.groups()]
3092+
else:
3093+
return empty_row
31193094

3120-
"""
3121-
regex = re.compile(pat, flags=flags)
3122-
groups_or_na = _groups_or_na_fun(regex)
3095+
if expand:
3096+
return [f(val) for val in np.asarray(arr)]
31233097

3124-
return [groups_or_na(val) for val in np.asarray(arr)]
3098+
return np.array([f(val)[0] for val in np.asarray(arr)], dtype=object)
31253099

31263100

31273101
def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool = True):
@@ -3143,7 +3117,7 @@ def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool
31433117
result = DataFrame(columns=columns, dtype=result_dtype)
31443118

31453119
else:
3146-
result_list = _str_extract_expand(obj.array, pat, flags=flags)
3120+
result_list = _str_extract(obj.array, pat, flags=flags, expand=returns_df)
31473121

31483122
result_index: Optional["Index"]
31493123
if isinstance(obj, ABCSeries):
@@ -3157,7 +3131,7 @@ def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool
31573131

31583132
else:
31593133
name = _get_single_group_name(regex)
3160-
result_arr = _str_extract_noexpand(obj.array, pat, flags=flags)
3134+
result_arr = _str_extract(obj.array, pat, flags=flags, expand=returns_df)
31613135
# not dispatching, so we have to reconstruct here.
31623136
result = pd_array(result_arr, dtype=result_dtype)
31633137
return accessor._wrap_result(result, name=name)

0 commit comments

Comments
 (0)