ENH: add DataFrame.select_str

topper-123 · topper-123 · commit c243435631a1 · 2019-07-11T17:55:22.000+01:00
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -32,6 +32,7 @@ Attributes and underlying data
    DataFrame.get_dtype_counts
    DataFrame.get_ftype_counts
    DataFrame.select_dtypes
+   DataFrame.select_str
    DataFrame.values
    DataFrame.get_values
    DataFrame.axes
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
@@ -211,6 +211,7 @@ Reindexing / selection / label manipulation
    Series.rename_axis
    Series.reset_index
    Series.sample
+   Series.select_str
    Series.set_axis
    Series.take
    Series.tail
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3347,6 +3347,11 @@ def select_dtypes(self, include=None, exclude=None):
         * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
           0.20.0) or ``'datetime64[ns, tz]'``
 
+        See Also
+        --------
+        DataFrame.select_str
+        DataFrame.loc
+
         Examples
         --------
         >>> df = pd.DataFrame({'a': [1, 2] * 3,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4640,6 +4640,95 @@ def _reindex_with_indexers(
 
         return self._constructor(new_data).__finalize__(self)
 
+    def select_str(
+        self, *, startswith=None, endswith=None, regex=None, flags=0, axis=None
+    ):
+        """
+        Select rows or columns whose string labels match a pattern.
+
+        Parameters
+        ----------
+        startswith : str, optional
+            Keep labels from axis for which label.startswith(startswith) is True.
+        endswith : str, optional
+            Keep labels from axis for which label.endswith(endswith) is True.
+        regex : str, optional
+            Keep labels from axis for which re.search(regex, label) is True.
+        flags : int, default 0 (no flags)
+            re module flags, e.g. re.IGNORECASE. Can only be used with
+            parameter regex.
+        axis : int or string axis name
+            The axis to filter on.  By default this is the info axis,
+            'index' for Series, 'columns' for DataFrame.
+
+        Returns
+        -------
+        same type as input object
+
+        Raises
+        ------
+        TypeError
+            If not exactly one of `startswith`, `endswith` and `regex` is given.
+        ValueError
+            If `flags` is non-zero while `regex` is not given.
+
+        See Also
+        --------
+        DataFrame.loc
+        DataFrame.select_dtypes
+
+        Notes
+        -----
+        ``axis`` defaults to the info axis that is used when indexing
+        with ``[]``.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
+        ...                   index=['mouse', 'rabbit'],
+        ...                   columns=['one', 'two', 'three'])
+
+        >>> df.select_str(startswith='t')
+                two  three
+        mouse     2      3
+        rabbit    5      6
+
+        >>> # select columns by regular expression
+        >>> df.select_str(regex=r'e$', axis=1)
+                one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select rows containing 'bbi'
+        >>> df.select_str(regex=r'bbi', axis=0)
+                one  two  three
+        rabbit    4    5      6
+        """
+        import re
+
+        # The three selectors are mutually exclusive and one is required.
+        if com.count_not_none(startswith, endswith, regex) != 1:
+            raise TypeError(
+                "Exactly one of the keyword arguments `startswith`, `endswith` "
+                "and `regex` must be given."
+            )
+        if regex is None and flags != 0:
+            raise ValueError("'flags' can only be used together with 'regex'")
+
+        if axis is None:
+            axis = self._info_axis_name
+        labels = self._get_axis(axis)
+
+        if startswith is not None:
+            mapped = labels.str.startswith(startswith)
+        elif endswith is not None:
+            mapped = labels.str.endswith(endswith)
+        else:
+            # search, not match: the pattern may occur anywhere in a label.
+            matcher = re.compile(regex, flags=flags)
+            mapped = labels.map(lambda label: matcher.search(label) is not None)
+        return self.loc(axis=axis)[mapped]
+
     def filter(self, items=None, like=None, regex=None, axis=None):
         """
         Subset rows or columns of dataframe according to labels in
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
@@ -806,6 +806,24 @@ def test_align_series_combinations(self):
         tm.assert_series_equal(res1, exp2)
         tm.assert_frame_equal(res2, exp1)
 
+    def test_select_str(self, float_frame):
+        # with no axis given, the regex is applied to the column labels
+        frame = float_frame.copy()
+        frame["AA"] = 1
+
+        selected = frame.select_str(regex="[A]+")
+        assert len(selected.columns) == 2
+        assert "AA" in selected
+
+        # the pattern may match anywhere in a label, not only at its start
+        data = {name: [1, 2] for name in ["aBBa", "BBaBB", "aCCa", "aCCaBB"]}
+        df = DataFrame(data)
+
+        result = df.select_str(regex="BB")
+        bb_columns = [col for col in df.columns if "BB" in col]
+        expected = df[bb_columns]
+        assert_frame_equal(result, expected)
+
     def test_filter(self, float_frame, float_string_frame):
         # Items
         filtered = float_frame.filter(["A", "B", "E"])