From 0c656fd44b1851fc470ae5434b39e903ed1784a9 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 12 Jul 2019 13:00:25 +0100 Subject: [PATCH 1/4] ENH: add regex parameter to .loc --- doc/source/user_guide/indexing.rst | 32 ++++++++++++++++ doc/source/whatsnew/v0.25.0.rst | 29 +++++++++++++++ pandas/core/indexing.py | 48 +++++++++++++++++++++++- pandas/tests/indexing/test_loc.py | 60 ++++++++++++++++++++++++++++++ 4 files changed, 167 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index e3b75afcf945e..32158dc0d1c91 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -563,6 +563,38 @@ without using a temporary variable. (bb.groupby(['year', 'team']).sum() .loc[lambda df: df.r > 100]) + +.. _indexing.selecting_with_regex: + +Selection by regular expression +------------------------------- + +.. versionadded:: 0.25.0 + +it is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by +row/columns axis labels that match a regular expression pattern. + +.. ipython:: python + + df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df + + df.loc(regex=True)["B", "B"] + df.loc(axis=1, regex=True)["B"] + +The regex matching will only work when looking up single strings, not list of strings etc. + +.. ipython:: python + + df.loc(regex=True)[["A"], "A"] + +*Notice*: Is is currently not possible to set values for a given regular expression. + +.. ipython:: python + + df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + + .. _indexing.deprecate_ix: IX indexer is deprecated diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 42e756635e739..3a29d9c1c335a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -182,6 +182,35 @@ The repr now looks like this: json_normalize(data, max_level=1) +.. 
_whatsnew_0250.enhancements.loc_regex: + +Selection using regular expressions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by +row/columns axis labels that match a regular expression pattern. + +.. ipython:: python + + df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df + + df.loc(regex=True)["B", "B"] + df.loc(axis=1, regex=True)["B"] + +The regex matching will only work when looking up single strings, not list of strings etc. + +.. ipython:: python + + df.loc(regex=True)[["A"], "A"] + +Is is currently not possible to set values for a given regular expression. + +.. ipython:: python + + df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + + .. _whatsnew_0250.enhancements.explode: Series.explode to split list-like values to rows diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 1d4ea54ef0d70..84afe6986f68d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -939,11 +939,15 @@ def _getitem_lowerdim(self, tup: Tuple): # slice returns a new object. 
if com.is_null_slice(new_key): return section - # This is an elided recursive call to iloc/loc/etc' - return getattr(section, self.name)[new_key] + + return self._getitem_lower_dim(section, new_key) raise IndexingError("not applicable") + def _getitem_lower_dim(self, section, key): + # This is an elided recursive call to iloc/loc/etc' + return getattr(section, self.name)[key] + def _getitem_nested_tuple(self, tup: Tuple): # we have a nested tuple so have at least 1 multi-index level # we should be able to match up the dimensionality here @@ -1689,6 +1693,14 @@ class _LocIndexer(_LocationIndexer): ) _exception = KeyError + regex = False + + def __call__(self, axis=None, regex=False): + new_self = super().__call__(axis=axis) + if regex: + new_self.regex = regex + return new_self + @Appender(_NDFrameIndexer._validate_key.__doc__) def _validate_key(self, key, axis: int): @@ -1755,6 +1767,27 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key + def _get_regex_mappings(self, key, axis=None): + import re + + if axis is None: + axis = self.axis or 0 + + labels = self.obj._get_axis(axis) + + matcher = re.compile(key) + + def f(x): + return matcher.search(x) is not None + + return labels.map(f) + + def _getitem_regex(self, key, axis=None): + """Subset obj by regex-searching axis for key.""" + mapped = self._get_regex_mappings(key, axis) + + return self.obj.loc(axis=axis)[mapped] + def _getitem_axis(self, key, axis: int): key = item_from_zerodim(key) if is_iterator(key): @@ -1816,10 +1849,21 @@ def _getitem_axis(self, key, axis: int): indexer[axis] = locs return self.obj.iloc[tuple(indexer)] + if self.regex and isinstance(key, str): + return self._getitem_regex(key, axis=axis) + # fall thru to straight lookup self._validate_key(key, axis) return self._get_label(key, axis=axis) + def _getitem_lower_dim(self, section, key): + return getattr(section, self.name)(regex=self.regex)[key] + + def __setitem__(self, key, value): + if self.regex: + raise 
TypeError("Inserting with regex not supported") + return super().__setitem__(key, value) + class _iLocIndexer(_LocationIndexer): """ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7..1f052f17c3d63 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,6 +13,66 @@ from pandas.util import testing as tm +class TestLocRegex: + # test calls to df.loc, with a regex parameter set to True, + # ie. df.loc(..., regex=True)[...] + + idx = ["AB", "BC", "CD", "DE"] + cols = idx[::-1] + + def test_regex_frame(self): + idx, cols = self.idx, self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + ser = df["AB"] + + result = df.loc(regex=True)["B"] + expected = pd.DataFrame(1, index=["AB", "BC"], columns=cols) + tm.assert_frame_equal(result, expected) + + result = ser.loc(regex=True)["B"] + expected = pd.Series(1, index=["AB", "BC"], name="AB") + tm.assert_series_equal(result, expected) + + result = df.loc(regex=True)[:, "B"] + expected = pd.DataFrame(1, index=idx, columns=["BC", "AB"]) + tm.assert_frame_equal(result, expected) + + result = df.loc(regex=True)["B", "B"] + expected = pd.DataFrame(1, index=["AB", "BC"], columns=["BC", "AB"]) + tm.assert_frame_equal(result, expected) + + def test_regex_empty(self): + idx, cols = self.idx, self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + ser = df["AB"] + + result = df.loc(regex=True)["X"] + expected = pd.DataFrame(columns=cols, dtype="int64") + tm.assert_frame_equal(result, expected) + + result = ser.loc(regex=True)["X"] + expected = pd.Series(name="AB", dtype="int64") + tm.assert_series_equal(result, expected) + + result = df.loc(regex=True)[:, "X"] + expected = pd.DataFrame(index=idx, dtype="int64") + tm.assert_frame_equal(result, expected) + + result = df.loc(regex=True)["X", "X"] + expected = pd.DataFrame(dtype="int64") + tm.assert_frame_equal(result, expected) + + def test_regex_inserting(self): + idx, cols = self.idx, 
self.cols + + df = pd.DataFrame(1, index=idx, columns=cols) + + with pytest.raises(TypeError, match="Inserting with regex not supported"): + df.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]] + + class TestLoc(Base): def test_loc_getitem_dups(self): # GH 5678 From 87a8ce6df3b151df908afb7de9d4c5724a3b9706 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 12 Jul 2019 22:23:53 +0100 Subject: [PATCH 2/4] NotImplementedError + doc string --- doc/source/user_guide/indexing.rst | 13 +++++++------ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexing.py | 28 +++++++++++++++++++++++++--- pandas/tests/indexing/test_loc.py | 3 ++- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 32158dc0d1c91..c613d0599ac4a 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -576,23 +576,24 @@ row/columns axis labels that match a regular expression pattern. .. ipython:: python - df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) - df + df_re = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df_re - df.loc(regex=True)["B", "B"] - df.loc(axis=1, regex=True)["B"] + df_re.loc(regex=True)["B", "B"] + df_re.loc(axis=1, regex=True)["B"] The regex matching will only work when looking up single strings, not list of strings etc. .. ipython:: python - df.loc(regex=True)[["A"], "A"] + df_re.loc(regex=True)[["A"], "A"] *Notice*: Is is currently not possible to set values for a given regular expression. .. ipython:: python + :okexcept: - df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + df_re.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] .. 
_indexing.deprecate_ix: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3a29d9c1c335a..18a40f1ae15b9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -207,6 +207,7 @@ The regex matching will only work when looking up single strings, not list of st Is is currently not possible to set values for a given regular expression. .. ipython:: python + :okexcept: df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 84afe6986f68d..f1931322837ef 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,3 +1,4 @@ +import re import textwrap from typing import Tuple import warnings @@ -1471,6 +1472,12 @@ class _LocIndexer(_LocationIndexer): - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) + ``.loc`` can be called before selecting using parameters: + + - ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``. + - ``regex``, to let strings be interpreted as regex patterns, e.g. 
+ ``.loc(regex=True)[:, '^col_']`` + See more at :ref:`Selection by Label <indexing.label>` Raises @@ -1550,6 +1557,21 @@ class _LocIndexer(_LocationIndexer): max_speed shield sidewinder 7 8 + The axis may be preselected + + >>> df.loc(axis=1)["max_speed"] + cobra 1 + viper 4 + sidewinder 7 + Name: max_speed, dtype: int64 + + Single strings are considered regex patterns if ``regex=True`` + + >>> df.loc(regex=True)["r$", "d$"] + max_speed shield + viper 4 5 + sidewinder 7 8 + **Setting values** Set value for all items matching the list of labels @@ -1768,7 +1790,6 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key def _get_regex_mappings(self, key, axis=None): - import re if axis is None: axis = self.axis or 0 @@ -1849,7 +1870,8 @@ def _getitem_axis(self, key, axis: int): indexer[axis] = locs return self.obj.iloc[tuple(indexer)] - if self.regex and isinstance(key, str): + elif self.regex and isinstance(key, (str, bytes)): + print(key, axis) return self._getitem_regex(key, axis=axis) # fall thru to straight lookup @@ -1861,7 +1883,7 @@ def _getitem_lower_dim(self, section, key): def __setitem__(self, key, value): if self.regex: - raise TypeError("Inserting with regex not supported") + raise NotImplementedError("Inserting with regex has not been implemented") return super().__setitem__(key, value) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 1f052f17c3d63..2f8cb6eb1401a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -69,7 +69,8 @@ def test_regex_inserting(self): df = pd.DataFrame(1, index=idx, columns=cols) - with pytest.raises(TypeError, match="Inserting with regex not supported"): + msg = "Inserting with regex has not been implemented" + with pytest.raises(NotImplementedError, match=msg): df.loc(regex=True)["B", "B"] = [[2, 2], [2, 2]] From 01619b55b1c483c41335f48c2775a3ff44b05d00 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 26 Jul 2019 00:49:57 +0100 Subject: [PATCH 
3/4] Move to 1.0 --- doc/source/whatsnew/v0.25.0.rst | 30 ------------------------------ doc/source/whatsnew/v1.0.0.rst | 30 ++++++++++++++++++++++++++++++ pandas/core/indexing.py | 7 +++---- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 18a40f1ae15b9..42e756635e739 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -182,36 +182,6 @@ The repr now looks like this: json_normalize(data, max_level=1) -.. _whatsnew_0250.enhancements.loc_regex: - -Selection using regular expressions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by -row/columns axis labels that match a regular expression pattern. - -.. ipython:: python - - df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) - df - - df.loc(regex=True)["B", "B"] - df.loc(axis=1, regex=True)["B"] - -The regex matching will only work when looking up single strings, not list of strings etc. - -.. ipython:: python - - df.loc(regex=True)[["A"], "A"] - -Is is currently not possible to set values for a given regular expression. - -.. ipython:: python - :okexcept: - - df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] - - .. _whatsnew_0250.enhancements.explode: Series.explode to split list-like values to rows diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa7b945492d5d..a32eb6b908ebd 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,6 +21,36 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_0250.enhancements.loc_regex: + +Selection using regular expressions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by +row/columns axis labels that match a regular expression pattern. + +.. 
ipython:: python + + df = pd.DataFrame(1, index=["A", "AB", "BC"], columns=["BC", "AB", "A"]) + df + + df.loc(regex=True)["B", "B"] + df.loc(axis=1, regex=True)["B"] + +The regex matching will only work when looking up single strings, not lists of strings etc. + +.. ipython:: python + + df.loc(regex=True)[["A"], "A"] + +It is currently not possible to set values for a given regular expression. + +.. ipython:: python + :okexcept: + + df.loc(regex=True)["B", "B"] = [[1, 2], [3, 4]] + + .. _whatsnew_1000.enhancements.other: - diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f1931322837ef..9a9fd198ba1e8 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -941,11 +941,11 @@ def _getitem_lowerdim(self, tup: Tuple): if com.is_null_slice(new_key): return section - return self._getitem_lower_dim(section, new_key) + return self._getitem_lower_section(section, new_key) raise IndexingError("not applicable") - def _getitem_lower_dim(self, section, key): + def _getitem_lower_section(self, section, key): # This is an elided recursive call to iloc/loc/etc' return getattr(section, self.name)[key] @@ -1871,14 +1871,13 @@ def _getitem_axis(self, key, axis: int): return self.obj.iloc[tuple(indexer)] elif self.regex and isinstance(key, (str, bytes)): - print(key, axis) return self._getitem_regex(key, axis=axis) # fall thru to straight lookup self._validate_key(key, axis) return self._get_label(key, axis=axis) - def _getitem_lower_dim(self, section, key): + def _getitem_lower_section(self, section, key): return getattr(section, self.name)(regex=self.regex)[key] def __setitem__(self, key, value): From 5a7c20d2d9b652884a3925fd44110844420d431f Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 26 Jul 2019 12:13:35 +0100 Subject: [PATCH 4/4] Refactored regex handling in LocIndexer --- doc/source/user_guide/indexing.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/indexing.py | 63 +++++++++++++++++------------- 3 files changed, 38 insertions(+), 29 
deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index c613d0599ac4a..81172e037f388 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -569,7 +569,7 @@ without using a temporary variable. Selection by regular expression ------------------------------- -.. versionadded:: 0.25.0 +.. versionadded:: 1.0 it is possible to call :attr:`~DataFrame.loc` with parameter ``regex=True`` to select by row/columns axis labels that match a regular expression pattern. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a32eb6b908ebd..de0c396721b6b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,7 +21,7 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_0250.enhancements.loc_regex: +.. _whatsnew_1000.enhancements.loc_regex: Selection using regular expressions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9a9fd198ba1e8..6ba07831156c5 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas._typing import FrameOrSeries import pandas.core.common as com from pandas.core.index import Index, InvalidIndexError, MultiIndex from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ -940,15 +941,11 @@ def _getitem_lowerdim(self, tup: Tuple): # slice returns a new object. 
if com.is_null_slice(new_key): return section - - return self._getitem_lower_section(section, new_key) + # This is an elided recursive call to iloc/loc/etc' + return getattr(section, self.name)[new_key] raise IndexingError("not applicable") - def _getitem_lower_section(self, section, key): - # This is an elided recursive call to iloc/loc/etc' - return getattr(section, self.name)[key] - def _getitem_nested_tuple(self, tup: Tuple): # we have a nested tuple so have at least 1 multi-index level # we should be able to match up the dimensionality here @@ -1187,7 +1184,7 @@ def _validate_read_indexer( ) if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) + warnings.warn(_missing_key_warning, FutureWarning, stacklevel=7) def _convert_to_indexer( self, obj, axis: int, is_setter: bool = False, raise_missing: bool = False @@ -1472,10 +1469,10 @@ class _LocIndexer(_LocationIndexer): - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) - ``.loc`` can be called before selecting using parameters: + ``.loc`` can be called before selecting using these parameters: - ``axis``, to select by a single axis on a DataFrame, e.g. ``.loc(axis=1)['a']``. - - ``regex``, to let strings be interpreted as regex patterns, e.g. + - ``regex``, to let single strings be interpreted as regex patterns, e.g. 
``.loc(regex=True)[:, '^col_']`` See more at :ref:`Selection by Label <indexing.label>` @@ -1723,6 +1720,11 @@ def __call__(self, axis=None, regex=False): new_self.regex = regex return new_self + def __getitem__(self, key): + if self.regex: + return self._getitem_regex(key) + return super().__getitem__(key) + @Appender(_NDFrameIndexer._validate_key.__doc__) def _validate_key(self, key, axis: int): @@ -1789,25 +1791,35 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key - def _get_regex_mappings(self, key, axis=None): - - if axis is None: - axis = self.axis or 0 - - labels = self.obj._get_axis(axis) + def _getitem_regex(self, key, axis=None): + """Subset obj by regex-searching axis for key.""" + if isinstance(key, str): + if axis is None: + axis = self.axis or 0 + return self._get_regex_axis(self.obj, pattern=key, axis=axis) + elif isinstance(key, tuple): + assert len(key) == 2 + result = self.obj # type: ABCDataFrame + # slicing columns first, then index is typically faster + for ax, sub_key in zip([1, 0], reversed(key)): + if isinstance(sub_key, str): + result = self._get_regex_axis(result, pattern=sub_key, axis=ax) + else: + result = result.loc(axis=ax)[sub_key] + return result - matcher = re.compile(key) + def _get_regex_axis( + self, obj: FrameOrSeries, pattern: str, axis: int + ) -> FrameOrSeries: + """Subset a single axis of ``obj`` from a regex pattern.""" + labels = obj._get_axis(axis) + matcher = re.compile(pattern) - def f(x): + def func(x): return matcher.search(x) is not None - return labels.map(f) - - def _getitem_regex(self, key, axis=None): - """Subset obj by regex-searching axis for key.""" - mapped = self._get_regex_mappings(key, axis) - - return self.obj.loc(axis=axis)[mapped] + mapped = labels.map(func) + return obj.loc(axis=axis)[mapped] def _getitem_axis(self, key, axis: int): key = item_from_zerodim(key) @@ -1877,9 +1889,6 @@ def _getitem_axis(self, key, axis: int): self._validate_key(key, axis) return self._get_label(key, axis=axis) 
- def _getitem_lower_section(self, section, key): - return getattr(section, self.name)(regex=self.regex)[key] - def __setitem__(self, key, value): if self.regex: raise NotImplementedError("Inserting with regex has not been implemented")