Skip to content

Commit 0f64c13

Browse files
ENH: Implement DataFrame.select
1 parent 50e23e7 commit 0f64c13

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Other enhancements
3333
- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
3434
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
3535
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
36+
- Added new :meth:`DataFrame.select` method to select a subset of columns from the :class:`DataFrame` (:issue:`61522`)
3637
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
3738
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3839
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)

pandas/core/frame.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4479,6 +4479,119 @@ def _get_item(self, item: Hashable) -> Series:
44794479
# ----------------------------------------------------------------------
44804480
# Unsorted
44814481

4482+
def select(self, *args):
    """
    Select a subset of columns from the DataFrame.

    Select can be used to return a DataFrame with some specific columns.
    This can be used to remove unwanted columns, as well as to return a
    DataFrame with the columns sorted in a specific order.

    Parameters
    ----------
    *args : hashable or tuple of hashable
        The names of the columns to return. In general this will be strings,
        but pandas supports other types of column names, if they are hashable.

    Returns
    -------
    DataFrame
        The DataFrame with the selected columns.

    Raises
    ------
    ValueError
        If any of the arguments is a list. Column names must be passed as
        separate positional arguments.
    KeyError
        If any of the requested columns is not present in the DataFrame.

    See Also
    --------
    DataFrame.filter : Subset rows or columns according to the specified
        index labels.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "first_name": ["John", "Alice", "Bob"],
    ...         "last_name": ["Smith", "Cooper", "Marley"],
    ...         "age": [61, 22, 35],
    ...     }
    ... )

    Select a subset of columns:

    >>> df.select("first_name", "age")
      first_name  age
    0       John   61
    1      Alice   22
    2        Bob   35

    Selecting with a pattern can be done with Python expressions:

    >>> df.select(*[col for col in df.columns if col.endswith("_name")])
      first_name last_name
    0       John     Smith
    1      Alice    Cooper
    2        Bob    Marley

    All columns can be selected, but in a different order:

    >>> df.select("last_name", "first_name", "age")
      last_name first_name  age
    0     Smith       John   61
    1    Cooper      Alice   22
    2    Marley        Bob   35

    In case the columns are in a list, Python unpacking with star can be used:

    >>> columns = ["last_name", "age"]
    >>> df.select(*columns)
      last_name  age
    0     Smith   61
    1    Cooper   22
    2    Marley   35

    Note that a DataFrame is always returned. If a single column is requested, a
    DataFrame with a single column is returned, not a Series:

    >>> df.select("age")
       age
    0   61
    1   22
    2   35

    The ``select`` method also works when columns are a ``MultiIndex``:

    >>> df = pd.DataFrame(
    ...     [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)],
    ...     columns=pd.MultiIndex.from_tuples(
    ...         [("names", "first_name"), ("names", "last_name"), ("other", "age")]
    ...     ),
    ... )

    If just column names are provided, they will select from the first level of the
    ``MultiIndex``:

    >>> df.select("names")
           names
      first_name last_name
    0       John     Smith
    1      Alice    Cooper
    2        Bob    Marley

    To select from multiple or all levels, tuples can be provided:

    >>> df.select(("names", "last_name"), ("other", "age"))
          names  other
      last_name    age
    0     Smith     61
    1    Cooper     22
    2    Marley     35
    """
    # Reject a list in *any* position, not only the first one, so that a call
    # such as ``df.select("a", ["b", "c"])`` gets the same actionable message
    # instead of an unrelated error raised from deep inside the indexer.
    if any(isinstance(arg, list) for arg in args):
        raise ValueError(
            "`DataFrame.select` does not support a list. Please use "
            "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` "
            "instead"
        )

    # ``_get_indexer_strict`` returns a pair; the second element is the
    # positional indexer, which ``take`` then applies along the column axis.
    indexer = self.columns._get_indexer_strict(list(args), "columns")[1]
    return self.take(indexer, axis=1)
4594+
44824595
@overload
44834596
def query(
44844597
self,
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import pytest
2+
3+
import pandas as pd
4+
from pandas import DataFrame
5+
import pandas._testing as tm
6+
7+
8+
@pytest.fixture
def regular_df():
    """Return a two-row frame with four integer columns named ``a`` to ``d``."""
    data = {"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]}
    return DataFrame(data)
11+
12+
13+
@pytest.fixture
def multiindex_df():
    """Return a two-row frame whose columns form a two-level ``MultiIndex``."""
    column_index = pd.MultiIndex.from_tuples([("A", "c"), ("A", "d"), ("B", "e")])
    rows = [(0, 2, 4), (1, 3, 5)]
    return DataFrame(rows, columns=column_index)
19+
20+
21+
class TestSelect:
    """Tests for ``DataFrame.select`` on flat and ``MultiIndex`` columns."""

    def test_select_subset_cols(self, regular_df):
        # Only the requested columns are kept.
        selected = regular_df.select("a", "c")
        wanted = DataFrame({"a": [1, 2], "c": [5, 6]})
        tm.assert_frame_equal(selected, wanted)

    def test_single_value(self, regular_df):
        # A single name still yields a DataFrame, never a Series.
        selected = regular_df.select("a")
        wanted = DataFrame({"a": [1, 2]})
        assert isinstance(selected, DataFrame)
        tm.assert_frame_equal(selected, wanted)

    def test_select_change_order(self, regular_df):
        # Columns come back in the order they were requested.
        selected = regular_df.select("b", "d", "a", "c")
        wanted = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]})
        tm.assert_frame_equal(selected, wanted)

    def test_select_none(self, regular_df):
        # Calling with no names gives an empty frame.
        selected = regular_df.select()
        assert selected.empty

    def test_select_duplicated(self, regular_df):
        # Repeating a name repeats the column in the result.
        selected = regular_df.select("a", "d", "a")
        assert selected.columns.tolist() == ["a", "d", "a"]

    def test_select_list(self, regular_df):
        # Passing a list instead of unpacked names is rejected explicitly.
        names = ["a", "b"]
        with pytest.raises(ValueError, match="does not support a list"):
            regular_df.select(names)

    def test_select_missing(self, regular_df):
        pattern = r"None of .* are in the \[columns\]"
        with pytest.raises(KeyError, match=pattern):
            regular_df.select("z")

    def test_select_not_hashable(self, regular_df):
        with pytest.raises(TypeError, match="unhashable type"):
            regular_df.select(set())

    def test_select_multiindex_one_level(self, multiindex_df):
        # A bare name selects on the first level of the MultiIndex.
        selected = multiindex_df.select("A")
        wanted_columns = pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")])
        wanted = DataFrame([(0, 2), (1, 3)], columns=wanted_columns)
        tm.assert_frame_equal(selected, wanted)

    def test_select_multiindex_single_column(self, multiindex_df):
        # A full tuple key picks exactly one column and keeps the frame type.
        selected = multiindex_df.select(("A", "d"))
        wanted_columns = pd.MultiIndex.from_tuples([("A", "d")])
        wanted = DataFrame([(2,), (3,)], columns=wanted_columns)
        assert isinstance(selected, DataFrame)
        tm.assert_frame_equal(selected, wanted)

    def test_select_multiindex_multiple_columns(self, multiindex_df):
        selected = multiindex_df.select(("A", "c"), ("B", "e"))
        wanted_columns = pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")])
        wanted = DataFrame([(0, 4), (1, 5)], columns=wanted_columns)
        tm.assert_frame_equal(selected, wanted)

    def test_select_multiindex_missing(self, multiindex_df):
        with pytest.raises(KeyError, match="not in index"):
            multiindex_df.select("Z")

0 commit comments

Comments
 (0)