Skip to content

ENH: add regex parameter to .loc #27363

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,39 @@ without using a temporary variable.
(bb.groupby(['year', 'team']).sum()
.loc[lambda df: df.r > 100])


.. _indexing.selecting_with_regex:

Selection by regular expression
-------------------------------

.. versionadded:: 1.0

It is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
row/columns axis labels that match a regular expression pattern.

.. ipython:: python

df_re = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
df_re

df_re.loc(regex=True)["B", "B"]
df_re.loc(axis=1, regex=True)["B"]

Regex matching only applies to keys that are single strings; lists of strings and other key types are looked up as ordinary labels.

.. ipython:: python

df_re.loc(regex=True)[["A"], "A"]

*Notice*: It is currently not possible to set values for a given regular expression.

.. ipython:: python
:okexcept:

df_re.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]


.. _indexing.deprecate_ix:

IX indexer is deprecated
Expand Down
30 changes: 30 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,36 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_1000.enhancements.loc_regex:

Selection using regular expressions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
row/columns axis labels that match a regular expression pattern.

.. ipython:: python

df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
df

df.loc(regex=True)["B", "B"]
df.loc(axis=1, regex=True)["B"]

Regex matching only applies to keys that are single strings; lists of strings and other key types are looked up as ordinary labels.

.. ipython:: python

df.loc(regex=True)[["A"], "A"]

It is currently not possible to set values for a given regular expression.

.. ipython:: python
:okexcept:

df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]


.. _whatsnew_1000.enhancements.other:

-
Expand Down
76 changes: 75 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import textwrap
from typing import Tuple
import warnings
Expand Down Expand Up @@ -25,6 +26,7 @@
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.missing import _infer_fill_value, isna

from pandas._typing import FrameOrSeries
import pandas.core.common as com
from pandas.core.index import Index, InvalidIndexError, MultiIndex
from pandas.core.indexers import is_list_like_indexer, length_of_indexer
Expand Down Expand Up @@ -1182,7 +1184,7 @@ def _validate_read_indexer(
)

if not (ax.is_categorical() or ax.is_interval()):
warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6)
warnings.warn(_missing_key_warning, FutureWarning, stacklevel=7)

def _convert_to_indexer(
self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False
Expand Down Expand Up @@ -1467,6 +1469,12 @@ class _LocIndexer(_LocationIndexer):
- A ``callable`` function with one argument (the calling Series or
DataFrame) and that returns valid output for indexing (one of the above)

``.loc`` can be called before selecting using these parameters:

- ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``.
- ``regex``, to let single strings be interpreted as regex patterns, e.g.
``.loc(regex=True)[:, '^col_']``

See more at :ref:`Selection by Label <indexing.label>`

Raises
Expand Down Expand Up @@ -1546,6 +1554,21 @@ class _LocIndexer(_LocationIndexer):
max_speed shield
sidewinder 7 8

The axis may be preselected

>>> df.loc(axis=1)["max_speed"]
cobra 1
viper 4
sidewinder 7
Name: max_speed, dtype: int64

Single strings are considered regex patterns if ``regex=True``

>>> df.loc(regex=True)["r$", "d$"]
max_speed shield
viper 4 5
sidewinder 7 8

**Setting values**

Set value for all items matching the list of labels
Expand Down Expand Up @@ -1689,6 +1712,19 @@ class _LocIndexer(_LocationIndexer):
)
_exception = KeyError

regex = False

def __call__(self, axis=None, regex=False):
    """
    Return a copy of this indexer with ``axis`` and/or ``regex`` preset.

    Parameters
    ----------
    axis : optional
        Forwarded unchanged to the parent class's ``__call__``.
    regex : default False
        When truthy, it is stored on the returned indexer so that single
        string keys are interpreted as regular expression patterns.
    """
    indexer = super().__call__(axis=axis)
    if not regex:
        # Leave the class-level default (``regex = False``) untouched.
        return indexer
    indexer.regex = regex
    return indexer

def __getitem__(self, key):
    """Dispatch to regex-based selection when ``self.regex`` is set."""
    if not self.regex:
        return super().__getitem__(key)
    return self._getitem_regex(key)

@Appender(_NDFrameIndexer._validate_key.__doc__)
def _validate_key(self, key, axis: int):

Expand Down Expand Up @@ -1755,6 +1791,36 @@ def _get_partial_string_timestamp_match_key(self, key, labels):

return key

def _getitem_regex(self, key, axis=None):
    """
    Subset ``self.obj`` by regex-searching axis labels for ``key``.

    Parameters
    ----------
    key : str, tuple or other indexer
        A single string is treated as a regex pattern for ``axis``.
        A 2-tuple is treated as ``(row_key, column_key)``; string elements
        are regex patterns, other elements are ordinary label indexers.
        Any other key (list, slice, boolean mask, callable, ...) is looked
        up as an ordinary label indexer.
    axis : int, optional
        Axis to search. Defaults to ``self.axis``, or 0 if that is unset.

    Returns
    -------
    The selected subset of ``self.obj``.

    Raises
    ------
    ValueError
        If ``key`` is a tuple that does not have exactly two elements.
    """
    if axis is None:
        axis = self.axis

    if isinstance(key, str):
        return self._get_regex_axis(self.obj, pattern=key, axis=axis or 0)

    if isinstance(key, tuple):
        if len(key) != 2:
            raise ValueError(
                "Selecting with regex and a tuple key requires exactly "
                "two indexers (rows, columns)"
            )
        result = self.obj  # type: ABCDataFrame
        # slicing columns first, then index is typically faster
        for ax, sub_key in zip([1, 0], reversed(key)):
            if isinstance(sub_key, str):
                result = self._get_regex_axis(result, pattern=sub_key, axis=ax)
            else:
                result = result.loc(axis=ax)[sub_key]
        return result

    # Any other key is not a regex pattern. Use a plain (non-regex) indexer,
    # as the tuple branch does for non-string sub-keys, instead of silently
    # returning None.
    return self.obj.loc(axis=axis)[key]

def _get_regex_axis(
    self, obj: "FrameOrSeries", pattern: str, axis: int
) -> "FrameOrSeries":
    """
    Subset a single axis of ``obj`` from a regex pattern.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to subset.
    pattern : str
        Regular expression. It is applied with ``re.search``, so it may
        match anywhere in a label unless anchored.
    axis : int
        Axis of ``obj`` whose labels are searched.

    Returns
    -------
    Same type as ``obj``, restricted along ``axis`` to the matching labels.
    """
    labels = obj._get_axis(axis)
    matcher = re.compile(pattern)

    def func(x):
        # Non-string labels (ints, tuples, timestamps, ...) can never match;
        # passing them to ``search`` would raise TypeError.
        return isinstance(x, str) and matcher.search(x) is not None

    # Boolean mask aligned positionally with ``labels``.
    mapped = labels.map(func)
    return obj.loc(axis=axis)[mapped]

def _getitem_axis(self, key, axis: int):
key = item_from_zerodim(key)
if is_iterator(key):
Expand Down Expand Up @@ -1816,10 +1882,18 @@ def _getitem_axis(self, key, axis: int):
indexer[axis] = locs
return self.obj.iloc[tuple(indexer)]

elif self.regex and isinstance(key, (str, bytes)):
return self._getitem_regex(key, axis=axis)

# fall thru to straight lookup
self._validate_key(key, axis)
return self._get_label(key, axis=axis)

def __setitem__(self, key, value):
    """Set values via the parent indexer; not supported when ``regex`` is set."""
    if not self.regex:
        return super().__setitem__(key, value)
    raise NotImplementedError("Inserting with regex has not been implemented")


class _iLocIndexer(_LocationIndexer):
"""
Expand Down
61 changes: 61 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,67 @@
from pandas.util import testing as tm


class TestLocRegex:
    """Tests for ``.loc`` called with ``regex=True``, i.e. ``df.loc(..., regex=True)[...]``."""

    idx = ["AB", "BC", "CD", "DE"]
    cols = idx[::-1]

    def test_regex_frame(self):
        # Pattern "B" matches the labels "AB" and "BC" on either axis.
        frame = pd.DataFrame(1, index=self.idx, columns=self.cols)
        series = frame["AB"]

        tm.assert_frame_equal(
            frame.loc(regex=True)["B"],
            pd.DataFrame(1, index=["AB", "BC"], columns=self.cols),
        )

        tm.assert_series_equal(
            series.loc(regex=True)["B"],
            pd.Series(1, index=["AB", "BC"], name="AB"),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)[:, "B"],
            pd.DataFrame(1, index=self.idx, columns=["BC", "AB"]),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)["B", "B"],
            pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"]),
        )

    def test_regex_empty(self):
        # Pattern "X" matches no label, so every selection is empty.
        frame = pd.DataFrame(1, index=self.idx, columns=self.cols)
        series = frame["AB"]

        tm.assert_frame_equal(
            frame.loc(regex=True)["X"],
            pd.DataFrame(columns=self.cols, dtype="int64"),
        )

        tm.assert_series_equal(
            series.loc(regex=True)["X"],
            pd.Series(name="AB", dtype="int64"),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)[:, "X"],
            pd.DataFrame(index=self.idx, dtype="int64"),
        )

        tm.assert_frame_equal(
            frame.loc(regex=True)["X", "X"],
            pd.DataFrame(dtype="int64"),
        )

    def test_regex_inserting(self):
        # Assignment through a regex indexer is explicitly unsupported.
        frame = pd.DataFrame(1, index=self.idx, columns=self.cols)

        msg = "Inserting with regex has not been implemented"
        with pytest.raises(NotImplementedError, match=msg):
            frame.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]]


class TestLoc(Base):
def test_loc_getitem_dups(self):
# GH 5678
Expand Down