Skip to content

Commit df7498f

Browse files
authored
ENH: Implement str.extract for ArrowDtype (#56334)
1 parent 2718b4e commit df7498f

File tree

3 files changed

+45
-8
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ Other enhancements
227227
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
228228
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
229229
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
230+
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
230231
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as "BMS" (:issue:`56243`)
231232
- Improved error message when constructing :class:`Period` with invalid offsets such as "QS" (:issue:`55785`)
232233

pandas/core/arrays/arrow/array.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -2297,9 +2297,19 @@ def _str_encode(self, encoding: str, errors: str = "strict"):
22972297
return type(self)(pa.chunked_array(result))
22982298

22992299
def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
2300-
raise NotImplementedError(
2301-
"str.extract not supported with pd.ArrowDtype(pa.string())."
2302-
)
2300+
if flags:
2301+
raise NotImplementedError("Only flags=0 is implemented.")
2302+
groups = re.compile(pat).groupindex.keys()
2303+
if len(groups) == 0:
2304+
raise ValueError(f"{pat=} must contain a symbolic group name.")
2305+
result = pc.extract_regex(self._pa_array, pat)
2306+
if expand:
2307+
return {
2308+
col: type(self)(pc.struct_field(result, [i]))
2309+
for col, i in zip(groups, range(result.type.num_fields))
2310+
}
2311+
else:
2312+
return type(self)(pc.struct_field(result, [0]))
23032313

23042314
def _str_findall(self, pat: str, flags: int = 0):
23052315
regex = re.compile(pat, flags=flags)

pandas/tests/extension/test_arrow.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -2159,14 +2159,40 @@ def test_str_rsplit():
21592159
tm.assert_frame_equal(result, expected)
21602160

21612161

2162-
def test_str_unsupported_extract():
2163-
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
2164-
with pytest.raises(
2165-
NotImplementedError, match="str.extract not supported with pd.ArrowDtype"
2166-
):
2162+
def test_str_extract_non_symbolic():
2163+
ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
2164+
with pytest.raises(ValueError, match="pat=.* must contain a symbolic group name."):
21672165
ser.str.extract(r"[ab](\d)")
21682166

21692167

2168+
@pytest.mark.parametrize("expand", [True, False])
2169+
def test_str_extract(expand):
2170+
ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
2171+
result = ser.str.extract(r"(?P<letter>[ab])(?P<digit>\d)", expand=expand)
2172+
expected = pd.DataFrame(
2173+
{
2174+
"letter": ArrowExtensionArray(pa.array(["a", "b", None])),
2175+
"digit": ArrowExtensionArray(pa.array(["1", "2", None])),
2176+
}
2177+
)
2178+
tm.assert_frame_equal(result, expected)
2179+
2180+
2181+
def test_str_extract_expand():
2182+
ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
2183+
result = ser.str.extract(r"[ab](?P<digit>\d)", expand=True)
2184+
expected = pd.DataFrame(
2185+
{
2186+
"digit": ArrowExtensionArray(pa.array(["1", "2", None])),
2187+
}
2188+
)
2189+
tm.assert_frame_equal(result, expected)
2190+
2191+
result = ser.str.extract(r"[ab](?P<digit>\d)", expand=False)
2192+
expected = pd.Series(ArrowExtensionArray(pa.array(["1", "2", None])), name="digit")
2193+
tm.assert_series_equal(result, expected)
2194+
2195+
21702196
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"])
21712197
def test_duration_from_strings_with_nat(unit):
21722198
# GH51175

0 commit comments

Comments
 (0)