From 4ed06303b60f06c128e27127db0ebfc2c0119f2c Mon Sep 17 00:00:00 2001
From: tp
Date: Thu, 11 Jul 2019 15:50:07 +0100
Subject: [PATCH] ENH: add DataFrame.select_str

---
 doc/source/reference/frame.rst                |  1 +
 doc/source/reference/series.rst               |  1 +
 pandas/core/frame.py                          |  5 ++
 pandas/core/generic.py                        | 83 +++++++++++++++++++
 .../tests/frame/test_axis_select_reindex.py   | 18 ++++
 5 files changed, 108 insertions(+)

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index c0b58fd2d99f5..991ec4b3b0298 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -32,6 +32,7 @@ Attributes and underlying data
    DataFrame.get_dtype_counts
    DataFrame.get_ftype_counts
    DataFrame.select_dtypes
+   DataFrame.select_str
    DataFrame.values
    DataFrame.get_values
    DataFrame.axes
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 8d2a764c33a43..c267a2bfb10c1 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -211,6 +211,7 @@ Reindexing / selection / label manipulation
    Series.rename_axis
    Series.reset_index
    Series.sample
+   Series.select_str
    Series.set_axis
    Series.take
    Series.tail
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 55a9eb6a0810a..e540b0c8ba3f9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3347,6 +3347,11 @@ def select_dtypes(self, include=None, exclude=None):
         * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
           0.20.0) or ``'datetime64[ns, tz]'``
 
+        See Also
+        --------
+        DataFrame.select_str
+        DataFrame.loc
+
         Examples
         --------
         >>> df = pd.DataFrame({'a': [1, 2] * 3,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4e05dfca43e78..f394b8256d113 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4640,6 +4640,89 @@ def _reindex_with_indexers(
 
         return self._constructor(new_data).__finalize__(self)
 
+    def select_str(
+        self, *, startswith=None, endswith=None, regex=None, flags=0, axis=None
+    ):
+        """
+        Select rows or columns of dataframe from the string labels in the selected axis.
+
+        Only one of keyword arguments `startswith`, `endswith` and `regex` can be used.
+
+        Parameters
+        ----------
+        startswith : str, optional
+            Test if the start of each string element matches a pattern.
+            Equivalent to :meth:`str.startswith`.
+        endswith : str, optional
+            Test if the end of each string element matches a pattern.
+            Equivalent to :meth:`str.endswith`.
+        regex : str, optional
+            Keep labels from axis for which re.search(regex, label) is True.
+        flags : int, default 0 (no flags)
+            re module flags, e.g. re.IGNORECASE. Can only be used with parameter regex.
+        axis : int or string axis name
+            The axis to filter on. By default this is the info axis,
+            'index' for Series, 'columns' for DataFrame.
+
+        Returns
+        -------
+        same type as input object
+
+        See Also
+        --------
+        DataFrame.loc
+        DataFrame.select_dtypes
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
+        ...                   index=['mouse', 'rabbit'],
+        ...                   columns=['one', 'two', 'three'])
+
+        >>> df.select_str(startswith='t')
+                two  three
+        mouse     2      3
+        rabbit    5      6
+
+        >>> # select columns by regular expression
+        >>> df.select_str(regex=r'e$', axis=1)
+                one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select rows containing 'bbi'
+        >>> df.select_str(regex=r'bbi', axis=0)
+                one  two  three
+        rabbit    4    5      6
+        """
+        import re
+
+        num_kw = com.count_not_none(startswith, endswith, regex)
+        if num_kw != 1:
+            raise TypeError(
+                "Only one of keywords arguments `startswith`, `endswith` and "
+                "`regex` can be used."
+            )
+        if regex is None and flags != 0:
+            raise ValueError("'flags' can only be used together with 'regex'")
+
+        if axis is None:
+            axis = self._info_axis_name
+        labels = self._get_axis(axis)
+
+        if startswith is not None:
+            mapped = labels.str.startswith(startswith)
+        elif endswith is not None:
+            mapped = labels.str.endswith(endswith)
+        else:  # regex
+            matcher = re.compile(regex, flags=flags)
+
+            def f(x):
+                return matcher.search(x) is not None
+
+            mapped = labels.map(f)
+        return self.loc(axis=axis)[mapped]
+
     def filter(self, items=None, like=None, regex=None, axis=None):
         """
         Subset rows or columns of dataframe according to labels in
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
index 77be952506964..a4c6966077d8b 100644
--- a/pandas/tests/frame/test_axis_select_reindex.py
+++ b/pandas/tests/frame/test_axis_select_reindex.py
@@ -806,6 +806,24 @@ def test_align_series_combinations(self):
         tm.assert_series_equal(res1, exp2)
         tm.assert_frame_equal(res2, exp1)
 
+    def test_select_str(self, float_frame):
+        fcopy = float_frame.copy()
+        fcopy["AA"] = 1
+
+        # regex
+        selected = fcopy.select_str(regex="[A]+")
+        assert len(selected.columns) == 2
+        assert "AA" in selected
+
+        # doesn't have to be at beginning
+        df = DataFrame(
+            {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]}
+        )
+
+        result = df.select_str(regex="BB")
+        exp = df[[x for x in df.columns if "BB" in x]]
+        assert_frame_equal(result, exp)
+
     def test_filter(self, float_frame, float_string_frame):
         # Items
         filtered = float_frame.filter(["A", "B", "E"])