pandas-dev · topper-123 · Jul 12, 2019 · Jul 12, 2019 · Jul 25, 2019 · Jul 26, 2019
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -563,6 +563,39 @@ without using a temporary variable.
    (bb.groupby(['year', 'team']).sum()
       .loc[lambda df: df.r > 100])
 
+
+.. _indexing.selecting_with_regex:
+
+Selection by regular expression
+-------------------------------
+
+.. versionadded:: 1.0
+
+It is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
+row/column axis labels that match a regular expression pattern.
+
+.. ipython:: python
+
+    df_re = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
+    df_re
+
+    df_re.loc(regex=True)["B", "B"]
+    df_re.loc(axis=1, regex=True)["B"]
+
+Regex matching only applies when looking up single strings, not lists of strings, etc.
+
+.. ipython:: python
+
+    df_re.loc(regex=True)[["A"], "A"]
+
+*Notice*: It is currently not possible to set values for a given regular expression.
+
+.. ipython:: python
+    :okexcept:
+
+    df_re.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
+
+
 .. _indexing.deprecate_ix:
 
 IX indexer is deprecated

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -21,6 +21,36 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_1000.enhancements.loc_regex:
+
+Selection using regular expressions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by
+row/column axis labels that match a regular expression pattern.
+
+.. ipython:: python
+
+    df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"])
+    df
+
+    df.loc(regex=True)["B", "B"]
+    df.loc(axis=1, regex=True)["B"]
+
+Regex matching only applies when looking up single strings, not lists of strings, etc.
+
+.. ipython:: python
+
+    df.loc(regex=True)[["A"], "A"]
+
+It is currently not possible to set values for a given regular expression.
+
+.. ipython:: python
+    :okexcept:
+
+    df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]]
+
+
 .. _whatsnew_1000.enhancements.other:
 
 -

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1,3 +1,4 @@
+import re
 import textwrap
 from typing import Tuple
 import warnings
@@ -25,6 +26,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.missing import _infer_fill_value, isna
 
+from pandas._typing import FrameOrSeries
 import pandas.core.common as com
 from pandas.core.index import Index, InvalidIndexError, MultiIndex
 from pandas.core.indexers import is_list_like_indexer, length_of_indexer
@@ -1182,7 +1184,7 @@ def _validate_read_indexer(
             )
 
             if not (ax.is_categorical() or ax.is_interval()):
-                warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6)
+                warnings.warn(_missing_key_warning, FutureWarning, stacklevel=7)
 
     def _convert_to_indexer(
         self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False
@@ -1467,6 +1469,12 @@ class _LocIndexer(_LocationIndexer):
     - A ``callable`` function with one argument (the calling Series or
       DataFrame) and that returns valid output for indexing (one of the above)
 
+    ``.loc`` can be called before selecting using these parameters:
+
+    - ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``.
+    - ``regex``, to let single strings be interpreted as regex patterns, e.g.
+      ``.loc(regex=True)[:, '^col_']``
+
     See more at :ref:`Selection by Label <indexing.label>`
 
     Raises
@@ -1546,6 +1554,21 @@ class _LocIndexer(_LocationIndexer):
                 max_speed  shield
     sidewinder          7       8
 
+    The axis may be preselected
+
+    >>> df.loc(axis=1)["max_speed"]
+    cobra         1
+    viper         4
+    sidewinder    7
+    Name: max_speed, dtype: int64
+
+    Single strings are considered regex patterns if ``regex=True``
+
+    >>> df.loc(regex=True)["r$", "d$"]
+                max_speed  shield
+    viper               4       5
+    sidewinder          7       8
+
     **Setting values**
 
     Set value for all items matching the list of labels
@@ -1689,6 +1712,19 @@ class _LocIndexer(_LocationIndexer):
     )
     _exception = KeyError
 
+    # Class-level default. When truthy (set per call through ``__call__``),
+    # ``__getitem__`` defers to ``_getitem_regex`` and ``__setitem__`` raises
+    # NotImplementedError.
+    regex = False
+
+    def __call__(self, axis=None, regex=False):
+        new_self = super().__call__(axis=axis)
+        if regex:
+            new_self.regex = regex
+        return new_self
+
+    def __getitem__(self, key):
+        if self.regex:
+            return self._getitem_regex(key)
+        return super().__getitem__(key)
+
     @Appender(_NDFrameIndexer._validate_key.__doc__)
     def _validate_key(self, key, axis: int):
 
@@ -1755,6 +1791,36 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
 
         return key
 
+    def _getitem_regex(self, key, axis=None):
+        """Subset obj by regex-searching axis for key."""
+        if isinstance(key, str):
+            if axis is None:
+                axis = self.axis or 0
+            return self._get_regex_axis(self.obj, pattern=key, axis=axis)
+        elif isinstance(key, tuple):
+            assert len(key) == 2
+            result = self.obj  # type: ABCDataFrame
+            # slicing columns first, then index is typically faster
+            for ax, sub_key in zip([1, 0], reversed(key)):
+                if isinstance(sub_key, str):
+                    result = self._get_regex_axis(result, pattern=sub_key, axis=ax)
+                else:
+                    result = result.loc(axis=ax)[sub_key]
+            return result
+
+    def _get_regex_axis(
+        self, obj: FrameOrSeries, pattern: str, axis: int
+    ) -> FrameOrSeries:
+        """Subset a single axis of ``obj`` from a regex pattern."""
+        labels = obj._get_axis(axis)
+        matcher = re.compile(pattern)
+
+        def func(x):
+            return matcher.search(x) is not None
+
+        mapped = labels.map(func)
+        return obj.loc(axis=axis)[mapped]
+
     def _getitem_axis(self, key, axis: int):
         key = item_from_zerodim(key)
         if is_iterator(key):
@@ -1816,10 +1882,18 @@ def _getitem_axis(self, key, axis: int):
                 indexer[axis] = locs
                 return self.obj.iloc[tuple(indexer)]
 
+        elif self.regex and isinstance(key, (str, bytes)):
+            return self._getitem_regex(key, axis=axis)
+
         # fall thru to straight lookup
         self._validate_key(key, axis)
         return self._get_label(key, axis=axis)
 
+    def __setitem__(self, key, value):
+        if self.regex:
+            raise NotImplementedError("Inserting with regex has not been implemented")
+        return super().__setitem__(key, value)
+
 
 class _iLocIndexer(_LocationIndexer):
     """

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -13,6 +13,67 @@
 from pandas.util import testing as tm
 
 
+class TestLocRegex:
+    # test calls to df.loc, with a regex parameter set to True,
+    # ie. df.loc(..., regex=True)[...]
+
+    idx = ["AB", "BC", "CD", "DE"]
+    cols = idx[::-1]
+
+    def test_regex_frame(self):
+        idx, cols = self.idx, self.cols
+
+        df = pd.DataFrame(1, index=idx, columns=cols)
+        ser = df["AB"]
+
+        result = df.loc(regex=True)["B"]
+        expected = pd.DataFrame(1, index=["AB", "BC"], columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = ser.loc(regex=True)["B"]
+        expected = pd.Series(1, index=["AB", "BC"], name="AB")
+        tm.assert_series_equal(result, expected)
+
+        result = df.loc(regex=True)[:, "B"]
+        expected = pd.DataFrame(1, index=idx, columns=["BC", "AB"])
+        tm.assert_frame_equal(result, expected)
+
+        result = df.loc(regex=True)["B", "B"]
+        expected = pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_regex_empty(self):
+        idx, cols = self.idx, self.cols
+
+        df = pd.DataFrame(1, index=idx, columns=cols)
+        ser = df["AB"]
+
+        result = df.loc(regex=True)["X"]
+        expected = pd.DataFrame(columns=cols, dtype="int64")
+        tm.assert_frame_equal(result, expected)
+
+        result = ser.loc(regex=True)["X"]
+        expected = pd.Series(name="AB", dtype="int64")
+        tm.assert_series_equal(result, expected)
+
+        result = df.loc(regex=True)[:, "X"]
+        expected = pd.DataFrame(index=idx, dtype="int64")
+        tm.assert_frame_equal(result, expected)
+
+        result = df.loc(regex=True)["X", "X"]
+        expected = pd.DataFrame(dtype="int64")
+        tm.assert_frame_equal(result, expected)
+
+    def test_regex_inserting(self):
+        idx, cols = self.idx, self.cols
+
+        df = pd.DataFrame(1, index=idx, columns=cols)
+
+        msg = "Inserting with regex has not been implemented"
+        with pytest.raises(NotImplementedError, match=msg):
+            df.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]]
+
+
 class TestLoc(Base):
     def test_loc_getitem_dups(self):
         # GH 5678