@@ -3034,22 +3034,6 @@ def cat_core(list_of_columns: List, sep: str):
3034
3034
return np .sum (arr_with_sep , axis = 0 )
3035
3035
3036
3036
3037
- def _groups_or_na_fun (regex ):
3038
- """Used in both extract_noexpand and extract_frame"""
3039
- empty_row = [np .nan ] * regex .groups
3040
-
3041
- def f (x ):
3042
- if not isinstance (x , str ):
3043
- return empty_row
3044
- m = regex .search (x )
3045
- if m :
3046
- return [np .nan if item is None else item for item in m .groups ()]
3047
- else :
3048
- return empty_row
3049
-
3050
- return f
3051
-
3052
-
3053
3037
def _result_dtype (arr ):
3054
3038
# workaround #27953
3055
3039
# ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
@@ -3087,41 +3071,31 @@ def _get_group_names(regex: Pattern) -> List[Hashable]:
3087
3071
return [names .get (1 + i , i ) for i in range (regex .groups )]
3088
3072
3089
3073
3090
- def _str_extract_noexpand (arr : ArrayLike , pat : str , flags = 0 ):
3074
+ def _str_extract (arr : ArrayLike , pat : str , flags = 0 , expand : bool = True ):
3091
3075
"""
3092
3076
Find groups in each string in the array using passed regular expression.
3093
3077
3094
- This function is called from str_extract(expand=False) when there is a single group
3095
- in the regex.
3096
-
3097
3078
Returns
3098
3079
-------
3099
- np.ndarray
3080
+ np.ndarray, or list of lists if expand is True
3100
3081
"""
3101
3082
regex = re .compile (pat , flags = flags )
3102
- groups_or_na = _groups_or_na_fun (regex )
3103
-
3104
- result = np .array ([groups_or_na (val )[0 ] for val in np .asarray (arr )], dtype = object )
3105
- return result
3106
-
3107
3083
3108
- def _str_extract_expand (arr : ArrayLike , pat : str , flags : int = 0 ) -> List [List ]:
3109
- """
3110
- Find groups in each string in the array using passed regular expression.
3111
-
3112
- For each subject string in the array, extract groups from the first match of
3113
- regular expression pat. This function is called from str_extract(expand=True) or
3114
- str_extract(expand=False) when there is more than one group in the regex.
3084
+ empty_row = [np .nan ] * regex .groups
3115
3085
3116
- Returns
3117
- -------
3118
- list of lists
3086
+ def f (x ):
3087
+ if not isinstance (x , str ):
3088
+ return empty_row
3089
+ m = regex .search (x )
3090
+ if m :
3091
+ return [np .nan if item is None else item for item in m .groups ()]
3092
+ else :
3093
+ return empty_row
3119
3094
3120
- """
3121
- regex = re .compile (pat , flags = flags )
3122
- groups_or_na = _groups_or_na_fun (regex )
3095
+ if expand :
3096
+ return [f (val ) for val in np .asarray (arr )]
3123
3097
3124
- return [ groups_or_na (val ) for val in np .asarray (arr )]
3098
+ return np . array ([ f (val )[ 0 ] for val in np .asarray (arr )], dtype = object )
3125
3099
3126
3100
3127
3101
def str_extract (accessor : StringMethods , pat : str , flags : int = 0 , expand : bool = True ):
@@ -3143,7 +3117,7 @@ def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool
3143
3117
result = DataFrame (columns = columns , dtype = result_dtype )
3144
3118
3145
3119
else :
3146
- result_list = _str_extract_expand (obj .array , pat , flags = flags )
3120
+ result_list = _str_extract (obj .array , pat , flags = flags , expand = returns_df )
3147
3121
3148
3122
result_index : Optional ["Index" ]
3149
3123
if isinstance (obj , ABCSeries ):
@@ -3157,7 +3131,7 @@ def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool
3157
3131
3158
3132
else :
3159
3133
name = _get_single_group_name (regex )
3160
- result_arr = _str_extract_noexpand (obj .array , pat , flags = flags )
3134
+ result_arr = _str_extract (obj .array , pat , flags = flags , expand = returns_df )
3161
3135
# not dispatching, so we have to reconstruct here.
3162
3136
result = pd_array (result_arr , dtype = result_dtype )
3163
3137
return accessor ._wrap_result (result , name = name )
0 commit comments