Skip to content

Commit 4a992e5

Browse files
committed
ENH: add regex parameter to .loc
1 parent c633579 commit 4a992e5

File tree

4 files changed

+167
-2
lines changed

4 files changed

+167
-2
lines changed

doc/source/user_guide/indexing.rst

+32
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,38 @@ without using a temporary variable.
573573
(bb.groupby(['year', 'team']).sum()
574574
.loc[lambda df: df.r > 100])
575575
576+
577+
.. _indexing.selecting_with_regex:
578+
579+
Selection by regular expression
580+
-------------------------------
581+
582+
.. versionadded:: 0.25.0
583+
584+
It is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
585+
row/column axis labels that match a regular expression pattern.
586+
587+
.. ipython:: python
588+
589+
df = pd.DataFrame(1, index=["A", "AB", "BC", "CD"], columns=["CD", "BC", "AB", "A"])
590+
df
591+
592+
df.loc(regex=True)["B", "B"]
593+
df.loc(axis=1, regex=True)["B"]
594+
595+
The regex matching only works when looking up a single string, not a list of strings, etc.
596+
597+
.. ipython:: python
598+
599+
df.loc(regex=True)[["A"], "A"]
600+
601+
*Notice*: It is currently not possible to set values for a given regular expression; attempting to do so raises a ``TypeError``.
602+
603+
.. ipython:: python
604+
605+
df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
606+
607+
576608
.. _indexing.deprecate_ix:
577609

578610
IX indexer is deprecated

doc/source/whatsnew/v0.25.0.rst

+29
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,35 @@ The repr now looks like this:
182182
json_normalize(data, max_level=1)
183183
184184
185+
.. _whatsnew_0250.enhancements.loc_regex:
186+
187+
Selection using regular expressions
188+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
189+
190+
It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
191+
row/column axis labels that match a regular expression pattern.
192+
193+
.. ipython:: python
194+
195+
df = pd.DataFrame(1, index=["A", "AB", "BC", "CD"], columns=["CD", "BC", "AB", "A"])
196+
df
197+
198+
df.loc(regex=True)["B", "B"]
199+
df.loc(axis=1, regex=True)["B"]
200+
201+
The regex matching only works when looking up a single string, not a list of strings, etc.
202+
203+
.. ipython:: python
204+
205+
df.loc(regex=True)[["A"], "A"]
206+
207+
It is currently not possible to set values for a given regular expression; attempting to do so raises a ``TypeError``.
208+
209+
.. ipython:: python
210+
211+
df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
212+
213+
185214
.. _whatsnew_0250.enhancements.other:
186215

187216
Other enhancements

pandas/core/indexing.py

+46-2
Original file line numberDiff line numberDiff line change
@@ -942,11 +942,15 @@ def _getitem_lowerdim(self, tup):
942942
# slice returns a new object.
943943
if com.is_null_slice(new_key):
944944
return section
945-
# This is an elided recursive call to iloc/loc/etc'
946-
return getattr(section, self.name)[new_key]
945+
946+
return self._getitem_lower_dim(section, new_key)
947947

948948
raise IndexingError("not applicable")
949949

950+
def _getitem_lower_dim(self, section, key):
951+
# This is an elided recursive call to iloc/loc/etc.
952+
return getattr(section, self.name)[key]
953+
950954
def _getitem_nested_tuple(self, tup):
951955
# we have a nested tuple so have at least 1 multi-index level
952956
# we should be able to match up the dimensionality here
@@ -1692,6 +1696,14 @@ class _LocIndexer(_LocationIndexer):
16921696
)
16931697
_exception = KeyError
16941698

1699+
regex = False
1700+
1701+
def __call__(self, axis=None, regex=False):
1702+
new_self = super().__call__(axis=axis)
1703+
if regex:
1704+
new_self.regex = regex
1705+
return new_self
1706+
16951707
@Appender(_NDFrameIndexer._validate_key.__doc__)
16961708
def _validate_key(self, key, axis: int):
16971709

@@ -1761,6 +1773,27 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
17611773

17621774
return key
17631775

1776+
def _get_regex_mappings(self, key, axis=None):
1777+
import re
1778+
1779+
if axis is None:
1780+
axis = self.axis or 0
1781+
1782+
labels = self.obj._get_axis(axis)
1783+
1784+
matcher = re.compile(key)
1785+
1786+
def f(x):
1787+
return matcher.search(x) is not None
1788+
1789+
return labels.map(f)
1790+
1791+
def _getitem_regex(self, key, axis=None):
1792+
"""Subset obj by regex-searching axis for key."""
1793+
mapped = self._get_regex_mappings(key, axis)
1794+
1795+
return self.obj.loc(axis=axis)[mapped]
1796+
17641797
def _getitem_axis(self, key, axis: int):
17651798
key = item_from_zerodim(key)
17661799
if is_iterator(key):
@@ -1822,10 +1855,21 @@ def _getitem_axis(self, key, axis: int):
18221855
indexer[axis] = locs
18231856
return self.obj.iloc[tuple(indexer)]
18241857

1858+
if self.regex and isinstance(key, str):
1859+
return self._getitem_regex(key, axis=axis)
1860+
18251861
# fall thru to straight lookup
18261862
self._validate_key(key, axis)
18271863
return self._get_label(key, axis=axis)
18281864

1865+
def _getitem_lower_dim(self, section, key):
1866+
return getattr(section, self.name)(regex=self.regex)[key]
1867+
1868+
def __setitem__(self, key, value):
1869+
if self.regex:
1870+
raise TypeError("Inserting with regex not supported")
1871+
return super().__setitem__(key, value)
1872+
18291873

18301874
class _iLocIndexer(_LocationIndexer):
18311875
"""

pandas/tests/indexing/test_loc.py

+60
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,66 @@
1313
from pandas.util import testing as tm
1414

1515

16+
class TestLocRegex:
17+
# test calls to df.loc, with a regex parameter set to True,
18+
# i.e. df.loc(..., regex=True)[...]
19+
20+
idx = ["AB", "BC", "CD", "DE"]
21+
cols = idx[::-1]
22+
23+
def test_regex_frame(self):
24+
idx, cols = self.idx, self.cols
25+
26+
df = pd.DataFrame(1, index=idx, columns=cols)
27+
ser = df["AB"]
28+
29+
result = df.loc(regex=True)["B"]
30+
expected = pd.DataFrame(1, index=["AB", "BC"], columns=cols)
31+
tm.assert_frame_equal(result, expected)
32+
33+
result = ser.loc(regex=True)["B"]
34+
expected = pd.Series(1, index=["AB", "BC"], name="AB")
35+
tm.assert_series_equal(result, expected)
36+
37+
result = df.loc(regex=True)[:, "B"]
38+
expected = pd.DataFrame(1, index=idx, columns=["BC", "AB"])
39+
tm.assert_frame_equal(result, expected)
40+
41+
result = df.loc(regex=True)["B", "B"]
42+
expected = pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"])
43+
tm.assert_frame_equal(result, expected)
44+
45+
def test_regex_empty(self):
46+
idx, cols = self.idx, self.cols
47+
48+
df = pd.DataFrame(1, index=idx, columns=cols)
49+
ser = df["AB"]
50+
51+
result = df.loc(regex=True)["X"]
52+
expected = pd.DataFrame(columns=cols, dtype="int64")
53+
tm.assert_frame_equal(result, expected)
54+
55+
result = ser.loc(regex=True)["X"]
56+
expected = pd.Series(name="AB", dtype="int64")
57+
tm.assert_series_equal(result, expected)
58+
59+
result = df.loc(regex=True)[:, "X"]
60+
expected = pd.DataFrame(index=idx, dtype="int64")
61+
tm.assert_frame_equal(result, expected)
62+
63+
result = df.loc(regex=True)["X", "X"]
64+
expected = pd.DataFrame(dtype="int64")
65+
tm.assert_frame_equal(result, expected)
66+
67+
def test_regex_inserting(self):
68+
idx, cols = self.idx, self.cols
69+
70+
df = pd.DataFrame(1, index=idx, columns=cols)
71+
72+
with pytest.raises(TypeError, match="Inserting with regex not supported"):
73+
df.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]]
74+
75+
1676
class TestLoc(Base):
1777
def test_loc_getitem_dups(self):
1878
# GH 5678

0 commit comments

Comments
 (0)