Skip to content

Commit c243435

Browse files
committed
ENH: add DataFrame.select_str
1 parent 2d0b20b commit c243435

File tree

5 files changed

+114
-0
lines changed

5 files changed

+114
-0
lines changed

doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Attributes and underlying data
3232
DataFrame.get_dtype_counts
3333
DataFrame.get_ftype_counts
3434
DataFrame.select_dtypes
35+
DataFrame.select_str
3536
DataFrame.values
3637
DataFrame.get_values
3738
DataFrame.axes

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ Reindexing / selection / label manipulation
211211
Series.rename_axis
212212
Series.reset_index
213213
Series.sample
214+
Series.select_str
214215
Series.set_axis
215216
Series.take
216217
Series.tail

pandas/core/frame.py

+5
Original file line numberDiff line numberDiff line change
@@ -3347,6 +3347,11 @@ def select_dtypes(self, include=None, exclude=None):
33473347
* To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
33483348
0.20.0) or ``'datetime64[ns, tz]'``
33493349
3350+
See Also
3351+
--------
3352+
DataFrame.select_str
3353+
DataFrame.loc
3354+
33503355
Examples
33513356
--------
33523357
>>> df = pd.DataFrame({'a': [1, 2] * 3,

pandas/core/generic.py

+89
Original file line numberDiff line numberDiff line change
@@ -4640,6 +4640,95 @@ def _reindex_with_indexers(
46404640

46414641
return self._constructor(new_data).__finalize__(self)
46424642

4643+
def select_str(
    self, *, startswith=None, endswith=None, regex=None, flags=0, axis=None
):
    """
    Select rows or columns according to the string labels of an axis.

    Exactly one of the keyword arguments `startswith`, `endswith` and
    `regex` must be given.

    Parameters
    ----------
    startswith : str, optional
        Keep labels that start with this string.
        Equivalent to :meth:`str.startswith`.
    endswith : str, optional
        Keep labels that end with this string.
        Equivalent to :meth:`str.endswith`.
    regex : str, optional
        Keep labels from axis for which ``re.search(regex, label)``
        finds a match.
    flags : int, default 0 (no flags)
        re module flags, e.g. ``re.IGNORECASE``. Can only be used
        together with parameter `regex`.
    axis : int or string axis name, optional
        The axis to select on. By default this is the info axis,
        'index' for Series, 'columns' for DataFrame.

    Returns
    -------
    same type as input object

    Raises
    ------
    TypeError
        If none, or more than one, of `startswith`, `endswith` and
        `regex` is given.
    ValueError
        If `flags` is non-zero while `regex` is not given.

    See Also
    --------
    DataFrame.loc
    DataFrame.select_dtypes

    Notes
    -----
    ``axis`` defaults to the info axis that is used when indexing
    with ``[]``.

    Examples
    --------
    >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
    ...                   index=['mouse', 'rabbit'],
    ...                   columns=['one', 'two', 'three'])

    >>> # select columns whose label starts with 't'
    >>> df.select_str(startswith='t')
            two  three
    mouse     2      3
    rabbit    5      6

    >>> # select columns by regular expression
    >>> df.select_str(regex=r'e$', axis=1)
            one  three
    mouse     1      3
    rabbit    4      6

    >>> # select rows containing 'bbi'
    >>> df.select_str(regex=r'bbi', axis=0)
            one  two  three
    rabbit    4    5      6
    """
    import re

    # The three selectors are mutually exclusive and one is mandatory.
    num_kw = com.count_not_none(startswith, endswith, regex)
    if num_kw != 1:
        raise TypeError(
            "Exactly one of the keyword arguments `startswith`, `endswith` "
            "and `regex` must be used."
        )
    # `flags` is only meaningful when a regular expression is compiled.
    if regex is None and flags != 0:
        raise ValueError(
            "`flags` can only be used together with parameter `regex`"
        )

    if axis is None:
        axis = self._info_axis_name
    labels = self._get_axis(axis)

    if startswith is not None:
        mapped = labels.str.startswith(startswith)
    elif endswith is not None:
        # Fixed: the accessor method is `endswith`, not `endsswith`.
        mapped = labels.str.endswith(endswith)
    else:  # regex
        # Compile once so the pattern is not re-parsed for every label.
        matcher = re.compile(regex, flags=flags)

        def f(x):
            return matcher.search(x) is not None

        mapped = labels.map(f)
    # `mapped` is a boolean mask over `labels`; apply it along `axis`.
    return self.loc(axis=axis)[mapped]
4731+
46434732
def filter(self, items=None, like=None, regex=None, axis=None):
46444733
"""
46454734
Subset rows or columns of dataframe according to labels in

pandas/tests/frame/test_axis_select_reindex.py

+18
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,24 @@ def test_align_series_combinations(self):
806806
tm.assert_series_equal(res1, exp2)
807807
tm.assert_frame_equal(res2, exp1)
808808

809+
def test_select_str(self, float_frame):
    """Check that ``select_str(regex=...)`` keeps the matching columns."""
    frame = float_frame.copy()
    frame["AA"] = 1

    # regex: the character class matches the labels containing an "A"
    by_regex = frame.select_str(regex="[A]+")
    assert len(by_regex.columns) == 2
    assert "AA" in by_regex

    # the pattern may match anywhere in the label, not only at the start
    data = {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]}
    df = DataFrame(data)

    result = df.select_str(regex="BB")
    wanted = [label for label in df.columns if "BB" in label]
    exp = df[wanted]
    assert_frame_equal(result, exp)
826+
809827
def test_filter(self, float_frame, float_string_frame):
810828
# Items
811829
filtered = float_frame.filter(["A", "B", "E"])

0 commit comments

Comments
 (0)