From 4ed06303b60f06c128e27127db0ebfc2c0119f2c Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Thu, 11 Jul 2019 15:50:07 +0100
Subject: [PATCH] ENH: add DataFrame.select_str

---
 doc/source/reference/frame.rst                |  1 +
 doc/source/reference/series.rst               |  1 +
 pandas/core/frame.py                          |  5 ++
 pandas/core/generic.py                        | 83 +++++++++++++++++++
 .../tests/frame/test_axis_select_reindex.py   | 18 ++++
 5 files changed, 108 insertions(+)

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index c0b58fd2d99f5..991ec4b3b0298 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -32,6 +32,7 @@ Attributes and underlying data
    DataFrame.get_dtype_counts
    DataFrame.get_ftype_counts
    DataFrame.select_dtypes
+   DataFrame.select_str
    DataFrame.values
    DataFrame.get_values
    DataFrame.axes
diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 8d2a764c33a43..c267a2bfb10c1 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -211,6 +211,7 @@ Reindexing / selection / label manipulation
    Series.rename_axis
    Series.reset_index
    Series.sample
+   Series.select_str
    Series.set_axis
    Series.take
    Series.tail
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 55a9eb6a0810a..e540b0c8ba3f9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3347,6 +3347,11 @@ def select_dtypes(self, include=None, exclude=None):
         * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
           0.20.0) or ``'datetime64[ns, tz]'``
 
+        See Also
+        --------
+        DataFrame.select_str : Select rows or columns by their string labels.
+        DataFrame.loc : Access a group of rows and columns by label(s).
+
         Examples
         --------
         >>> df = pd.DataFrame({'a': [1, 2] * 3,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4e05dfca43e78..f394b8256d113 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4640,6 +4640,89 @@ def _reindex_with_indexers(
 
         return self._constructor(new_data).__finalize__(self)
 
+    def select_str(
+        self, *, startswith=None, endswith=None, regex=None, flags=0, axis=None
+    ):
+        """
+        Select rows or columns of dataframe from the string labels in the selected axis.
+
+        Exactly one of `startswith`, `endswith` and `regex` must be given.
+
+        Parameters
+        ----------
+        startswith : str, optional
+            Keep labels starting with this string, as tested by :meth:`str.startswith`.
+        endswith : str, optional
+            Keep labels ending with this string, as tested by :meth:`str.endswith`.
+        regex : str, optional
+            Keep labels from axis for which re.search(regex, label) finds a match.
+        flags : int, default 0 (no flags)
+            re module flags, e.g. re.IGNORECASE. Can only be used with parameter regex.
+        axis : int or string axis name
+            The axis to filter on.  By default this is the info axis,
+            'index' for Series, 'columns' for DataFrame.
+
+        Returns
+        -------
+        same type as input object
+
+        Raises
+        ------
+        TypeError
+            If not exactly one of `startswith`, `endswith` and `regex` is given.
+        ValueError
+            If `flags` is non-zero while `regex` is not given.
+
+        See Also
+        --------
+        DataFrame.loc : Access a group of rows and columns by label(s).
+        DataFrame.select_dtypes : Select columns based on their dtype.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
+        ...                   index=['mouse', 'rabbit'],
+        ...                   columns=['one', 'two', 'three'])
+
+        >>> df.select_str(startswith='t')
+                two  three
+        mouse     2      3
+        rabbit    5      6
+
+        >>> # select columns by regular expression
+        >>> df.select_str(regex=r'e$', axis=1)
+                one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select rows containing 'bbi'
+        >>> df.select_str(regex=r'bbi', axis=0)
+                one  two  three
+        rabbit    4    5      6
+        """
+        import re
+
+        num_kw = com.count_not_none(startswith, endswith, regex)
+        if num_kw != 1:
+            raise TypeError(
+                "Exactly one of `startswith`, `endswith` and `regex` must be given."
+            )
+        if regex is None and flags != 0:
+            raise ValueError("`flags` can only be used together with `regex`.")
+
+        if axis is None:
+            axis = self._info_axis_name
+        labels = self._get_axis(axis)
+
+        if startswith is not None:
+            mapped = labels.str.startswith(startswith)
+        elif endswith is not None:
+            mapped = labels.str.endswith(endswith)
+        else:  # regex
+            matcher = re.compile(regex, flags=flags)
+            mapped = labels.map(lambda label: matcher.search(label) is not None)
+        return self.loc(axis=axis)[mapped]
+
     def filter(self, items=None, like=None, regex=None, axis=None):
         """
         Subset rows or columns of dataframe according to labels in
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
index 77be952506964..a4c6966077d8b 100644
--- a/pandas/tests/frame/test_axis_select_reindex.py
+++ b/pandas/tests/frame/test_axis_select_reindex.py
@@ -806,6 +806,24 @@ def test_align_series_combinations(self):
         tm.assert_series_equal(res1, exp2)
         tm.assert_frame_equal(res2, exp1)
 
+    def test_select_str(self, float_frame):
+        # a regex may select several labels at once
+        frame = float_frame.copy()
+        frame["AA"] = 1
+
+        selected = frame.select_str(regex="[A]+")
+        assert len(selected.columns) == 2
+        assert "AA" in selected
+
+        # the match may sit anywhere in the label, not only at its beginning
+        df = DataFrame(
+            {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]}
+        )
+        expected = df[[label for label in df.columns if "BB" in label]]
+
+        result = df.select_str(regex="BB")
+        assert_frame_equal(result, expected)
+
     def test_filter(self, float_frame, float_string_frame):
         # Items
         filtered = float_frame.filter(["A", "B", "E"])