Skip to content

Commit 0c656fd

Browse files
committed
ENH: add regex parameter to .loc
1 parent 5b1a870 commit 0c656fd

File tree

4 files changed

+167
-2
lines changed

4 files changed

+167
-2
lines changed

doc/source/user_guide/indexing.rst

+32
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,38 @@ without using a temporary variable.
563563
(bb.groupby(['year', 'team']).sum()
564564
.loc[lambda df: df.r > 100])
565565
566+
567+
.. _indexing.selecting_with_regex:
568+
569+
Selection by regular expression
570+
-------------------------------
571+
572+
.. versionadded:: 0.25.0
573+
574+
It is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
575+
row/column axis labels that match a regular expression pattern.
576+
577+
.. ipython:: python
578+
579+
df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
580+
df
581+
582+
df.loc(regex=True)["B", "B"]
583+
df.loc(axis=1, regex=True)["B"]
584+
585+
Regex matching only applies when the key is a single string, not a list of strings or other list-like keys.
586+
587+
.. ipython:: python
588+
589+
df.loc(regex=True)[["A"], "A"]
590+
591+
*Notice*: It is currently not possible to set values for a given regular expression.
592+
593+
.. ipython:: python
   :okexcept:
594+
595+
df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
596+
597+
566598
.. _indexing.deprecate_ix:
567599

568600
IX indexer is deprecated

doc/source/whatsnew/v0.25.0.rst

+29
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,35 @@ The repr now looks like this:
182182
json_normalize(data, max_level=1)
183183
184184
185+
.. _whatsnew_0250.enhancements.loc_regex:
186+
187+
Selection using regular expressions
188+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
189+
190+
It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
191+
row/column axis labels that match a regular expression pattern.
192+
193+
.. ipython:: python
194+
195+
df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
196+
df
197+
198+
df.loc(regex=True)["B", "B"]
199+
df.loc(axis=1, regex=True)["B"]
200+
201+
Regex matching only applies when the key is a single string, not a list of strings or other list-like keys.
202+
203+
.. ipython:: python
204+
205+
df.loc(regex=True)[["A"], "A"]
206+
207+
It is currently not possible to set values for a given regular expression.
208+
209+
.. ipython:: python
   :okexcept:
210+
211+
df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
212+
213+
185214
.. _whatsnew_0250.enhancements.explode:
186215

187216
Series.explode to split list-like values to rows

pandas/core/indexing.py

+46-2
Original file line numberDiff line numberDiff line change
@@ -939,11 +939,15 @@ def _getitem_lowerdim(self, tup: Tuple):
939939
# slice returns a new object.
940940
if com.is_null_slice(new_key):
941941
return section
942-
# This is an elided recursive call to iloc/loc/etc'
943-
return getattr(section, self.name)[new_key]
942+
943+
return self._getitem_lower_dim(section, new_key)
944944

945945
raise IndexingError("not applicable")
946946

947+
def _getitem_lower_dim(self, section, key):
948+
# This is an elided recursive call to iloc/loc/etc'
949+
return getattr(section, self.name)[key]
950+
947951
def _getitem_nested_tuple(self, tup: Tuple):
948952
# we have a nested tuple so have at least 1 multi-index level
949953
# we should be able to match up the dimensionality here
@@ -1689,6 +1693,14 @@ class _LocIndexer(_LocationIndexer):
16891693
)
16901694
_exception = KeyError
16911695

1696+
regex = False
1697+
1698+
def __call__(self, axis=None, regex=False):
    """
    Build the indexer returned by the parent ``__call__`` for ``axis``.

    When ``regex`` is truthy it is stored on the returned indexer as its
    ``regex`` attribute; otherwise the returned indexer is left untouched.
    """
    indexer = super().__call__(axis=axis)
    if not regex:
        return indexer
    indexer.regex = regex
    return indexer
1703+
16921704
@Appender(_NDFrameIndexer._validate_key.__doc__)
16931705
def _validate_key(self, key, axis: int):
16941706

@@ -1755,6 +1767,27 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
17551767

17561768
return key
17571769

1770+
def _get_regex_mappings(self, key, axis=None):
1771+
import re
1772+
1773+
if axis is None:
1774+
axis = self.axis or 0
1775+
1776+
labels = self.obj._get_axis(axis)
1777+
1778+
matcher = re.compile(key)
1779+
1780+
def f(x):
1781+
return matcher.search(x) is not None
1782+
1783+
return labels.map(f)
1784+
1785+
def _getitem_regex(self, key, axis=None):
1786+
"""Subset obj by regex-searching axis for key."""
1787+
mapped = self._get_regex_mappings(key, axis)
1788+
1789+
return self.obj.loc(axis=axis)[mapped]
1790+
17581791
def _getitem_axis(self, key, axis: int):
17591792
key = item_from_zerodim(key)
17601793
if is_iterator(key):
@@ -1816,10 +1849,21 @@ def _getitem_axis(self, key, axis: int):
18161849
indexer[axis] = locs
18171850
return self.obj.iloc[tuple(indexer)]
18181851

1852+
if self.regex and isinstance(key, str):
1853+
return self._getitem_regex(key, axis=axis)
1854+
18191855
# fall thru to straight lookup
18201856
self._validate_key(key, axis)
18211857
return self._get_label(key, axis=axis)
18221858

1859+
def _getitem_lower_dim(self, section, key):
1860+
return getattr(section, self.name)(regex=self.regex)[key]
1861+
1862+
def __setitem__(self, key, value):
1863+
if self.regex:
1864+
raise TypeError("Inserting with regex not supported")
1865+
return super().__setitem__(key, value)
1866+
18231867

18241868
class _iLocIndexer(_LocationIndexer):
18251869
"""

pandas/tests/indexing/test_loc.py

+60
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,66 @@
1313
from pandas.util import testing as tm
1414

1515

16+
class TestLocRegex:
    """Tests for label selection through ``.loc(regex=True)[...]``."""

    idx = ["AB", "BC", "CD", "DE"]
    cols = idx[::-1]

    def test_regex_frame(self):
        # Patterns that match some labels select exactly those labels.
        rows, columns = self.idx, self.cols
        frame = pd.DataFrame(1, index=rows, columns=columns)
        series = frame["AB"]

        # single key on a frame: rows
        tm.assert_frame_equal(
            frame.loc(regex=True)["B"],
            pd.DataFrame(1, index=["AB", "BC"], columns=columns),
        )

        # single key on a series
        tm.assert_series_equal(
            series.loc(regex=True)["B"],
            pd.Series(1, index=["AB", "BC"], name="AB"),
        )

        # null slice on rows, regex on columns
        tm.assert_frame_equal(
            frame.loc(regex=True)[:, "B"],
            pd.DataFrame(1, index=rows, columns=["BC", "AB"]),
        )

        # regex on both axes
        tm.assert_frame_equal(
            frame.loc(regex=True)["B", "B"],
            pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"]),
        )

    def test_regex_empty(self):
        # Patterns that match no label give empty results.
        rows, columns = self.idx, self.cols
        frame = pd.DataFrame(1, index=rows, columns=columns)
        series = frame["AB"]

        tm.assert_frame_equal(
            frame.loc(regex=True)["X"],
            pd.DataFrame(columns=columns, dtype="int64"),
        )

        tm.assert_series_equal(
            series.loc(regex=True)["X"],
            pd.Series(name="AB", dtype="int64"),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)[:, "X"],
            pd.DataFrame(index=rows, dtype="int64"),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)["X", "X"],
            pd.DataFrame(dtype="int64"),
        )

    def test_regex_inserting(self):
        # Assignment through a regex-enabled indexer is rejected.
        frame = pd.DataFrame(1, index=self.idx, columns=self.cols)
        new_values = [[2, 2], [2, 2]]

        with pytest.raises(TypeError, match="Inserting with regex not supported"):
            frame.loc(regex=True)["B", "B"] = new_values
74+
75+
1676
class TestLoc(Base):
1777
def test_loc_getitem_dups(self):
1878
# GH 5678

0 commit comments

Comments
 (0)