Skip to content

Commit 52f5ce0

Browse files
committed
Refactored regex handling in LocIndexer
1 parent 01619b5 commit 52f5ce0

File tree

3 files changed

+37
-28
lines changed

3 files changed

+37
-28
lines changed

doc/source/user_guide/indexing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,7 @@ without using a temporary variable.
569569
Selection by regular expression
570570
-------------------------------
571571

572-
.. versionadded:: 0.25.0
572+
.. versionadded:: 1.0
573573

574574
It is possible to call :attr:`~DataFrame.loc` with the parameter ``regex=True`` to select by
575575
row/column axis labels that match a regular expression pattern.

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ including other versions of pandas.
2121
Enhancements
2222
~~~~~~~~~~~~
2323

24-
.. _whatsnew_0250.enhancements.loc_regex:
24+
.. _whatsnew_1000.enhancements.loc_regex:
2525

2626
Selection using regular expressions
2727
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/indexing.py

+35-26
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
2727
from pandas.core.dtypes.missing import _infer_fill_value, isna
2828

29+
from pandas._typing import FrameOrSeries
2930
import pandas.core.common as com
3031
from pandas.core.index import Index, InvalidIndexError, MultiIndex
3132
from pandas.core.indexers import is_list_like_indexer, length_of_indexer
@@ -940,15 +941,11 @@ def _getitem_lowerdim(self, tup: Tuple):
940941
# slice returns a new object.
941942
if com.is_null_slice(new_key):
942943
return section
943-
944-
return self._getitem_lower_section(section, new_key)
944+
# This is an elided recursive call to iloc/loc/etc.
945+
return getattr(section, self.name)[new_key]
945946

946947
raise IndexingError("not applicable")
947948

948-
def _getitem_lower_section(self, section, key):
949-
# This is an elided recursive call to iloc/loc/etc'
950-
return getattr(section, self.name)[key]
951-
952949
def _getitem_nested_tuple(self, tup: Tuple):
953950
# we have a nested tuple so have at least 1 multi-index level
954951
# we should be able to match up the dimensionality here
@@ -1472,10 +1469,10 @@ class _LocIndexer(_LocationIndexer):
14721469
- A ``callable`` function with one argument (the calling Series or
14731470
DataFrame) and that returns valid output for indexing (one of the above)
14741471
1475-
``.loc`` can be called before selecting using parameters:
1472+
``.loc`` can be called before selecting using these parameters:
14761473
14771474
- ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``.
1478-
- ``regex``, to let strings be interpreted as regex patterns, e.g.
1475+
- ``regex``, to let single strings be interpreted as regex patterns, e.g.
14791476
``.loc(regex=True)[:, '^col_']``
14801477
14811478
See more at :ref:`Selection by Label <indexing.label>`
@@ -1723,6 +1720,11 @@ def __call__(self, axis=None, regex=False):
17231720
new_self.regex = regex
17241721
return new_self
17251722

1723+
def __getitem__(self, key):
1724+
if self.regex:
1725+
return self._getitem_regex(key)
1726+
return super().__getitem__(key)
1727+
17261728
@Appender(_NDFrameIndexer._validate_key.__doc__)
17271729
def _validate_key(self, key, axis: int):
17281730

@@ -1789,25 +1791,35 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
17891791

17901792
return key
17911793

1792-
def _get_regex_mappings(self, key, axis=None):
1793-
1794-
if axis is None:
1795-
axis = self.axis or 0
1796-
1797-
labels = self.obj._get_axis(axis)
1794+
def _getitem_regex(self, key, axis=None):
1795+
"""Subset obj by regex-searching axis for key."""
1796+
if isinstance(key, str):
1797+
if axis is None:
1798+
axis = self.axis or 0
1799+
return self._get_regex_axis(self.obj, pattern=key, axis=axis)
1800+
elif isinstance(key, tuple):
1801+
assert len(key) == 2
1802+
result = self.obj # type: ABCDataFrame
1803+
# slicing columns first, then index is typically faster
1804+
for ax, sub_key in zip([1, 0], reversed(key)):
1805+
if isinstance(sub_key, str):
1806+
result = result._get_regex_axis(result, pattern=sub_key, axis=ax)
1807+
else:
1808+
result = result.loc(axis=ax)[sub_key]
1809+
return result
17981810

1799-
matcher = re.compile(key)
1811+
def _get_regex_axis(
1812+
self, obj: FrameOrSeries, pattern: str, axis: int
1813+
) -> FrameOrSeries:
1814+
"""Subset a single axis of ``obj`` from a regex pattern."""
1815+
labels = obj._get_axis(axis)
1816+
matcher = re.compile(pattern)
18001817

1801-
def f(x):
1818+
def func(x):
18021819
return matcher.search(x) is not None
18031820

1804-
return labels.map(f)
1805-
1806-
def _getitem_regex(self, key, axis=None):
1807-
"""Subset obj by regex-searching axis for key."""
1808-
mapped = self._get_regex_mappings(key, axis)
1809-
1810-
return self.obj.loc(axis=axis)[mapped]
1821+
mapped = labels.map(func)
1822+
return obj.loc(axis=axis)[mapped]
18111823

18121824
def _getitem_axis(self, key, axis: int):
18131825
key = item_from_zerodim(key)
@@ -1877,9 +1889,6 @@ def _getitem_axis(self, key, axis: int):
18771889
self._validate_key(key, axis)
18781890
return self._get_label(key, axis=axis)
18791891

1880-
def _getitem_lower_section(self, section, key):
1881-
return getattr(section, self.name)(regex=self.regex)[key]
1882-
18831892
def __setitem__(self, key, value):
18841893
if self.regex:
18851894
raise NotImplementedError("Inserting with regex has not been implemented")

0 commit comments

Comments
 (0)