def select(self, *args: Hashable) -> DataFrame:
    """
    Select a subset of columns from the DataFrame.

    Select can be used to return a DataFrame with some specific columns.
    This can be used to remove unwanted columns, as well as to return a
    DataFrame with the columns sorted in a specific order.

    Parameters
    ----------
    *args : hashable or tuple of hashable
        The names of the columns to return. In general this will be strings,
        but pandas supports other types of column names, if they are hashable.

    Returns
    -------
    DataFrame
        The DataFrame with the selected columns.

    Raises
    ------
    ValueError
        If any of the arguments is a list. Column names must be passed as
        separate arguments, or unpacked with a star.
    KeyError
        If any of the requested columns does not exist in the DataFrame.

    See Also
    --------
    DataFrame.filter : Subset rows or columns according to the specified
        index labels.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "first_name": ["John", "Alice", "Bob"],
    ...         "last_name": ["Smith", "Cooper", "Marley"],
    ...         "age": [61, 22, 35],
    ...     }
    ... )

    Select a subset of columns:

    >>> df.select("first_name", "age")
      first_name  age
    0       John   61
    1      Alice   22
    2        Bob   35

    Selecting with a pattern can be done with Python expressions:

    >>> df.select(*[col for col in df.columns if col.endswith("_name")])
      first_name last_name
    0       John     Smith
    1      Alice    Cooper
    2        Bob    Marley

    All columns can be selected, but in a different order:

    >>> df.select("last_name", "first_name", "age")
      last_name first_name  age
    0     Smith       John   61
    1    Cooper      Alice   22
    2    Marley        Bob   35

    In case the columns are in a list, Python unpacking with star can be used:

    >>> columns = ["last_name", "age"]
    >>> df.select(*columns)
      last_name  age
    0     Smith   61
    1    Cooper   22
    2    Marley   35

    Note that a DataFrame is always returned. If a single column is requested, a
    DataFrame with a single column is returned, not a Series:

    >>> df.select("age")
       age
    0   61
    1   22
    2   35

    The ``select`` method also works when columns are a ``MultiIndex``:

    >>> df = pd.DataFrame(
    ...     [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)],
    ...     columns=pd.MultiIndex.from_tuples(
    ...         [("names", "first_name"), ("names", "last_name"), ("other", "age")]
    ...     ),
    ... )

    If just column names are provided, they will select from the first level of the
    ``MultiIndex``:

    >>> df.select("names")
           names
      first_name last_name
    0       John     Smith
    1      Alice    Cooper
    2        Bob    Marley

    To select from multiple or all levels, tuples can be provided:

    >>> df.select(("names", "last_name"), ("other", "age"))
          names other
      last_name   age
    0     Smith    61
    1    Cooper    22
    2    Marley    35
    """
    # A list is unhashable and therefore can never be a column label, so reject
    # one in ANY position (not only the first) with the explanatory message
    # instead of letting it fail later with an unrelated indexing error.
    if any(isinstance(arg, list) for arg in args):
        raise ValueError(
            "`DataFrame.select` does not support a list. Please use "
            "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` "
            "instead"
        )

    # Resolve the labels to integer positions first and then take by position,
    # which keeps the requested order and any repeated labels.
    indexer = self.columns._get_indexer_strict(list(args), "columns")[1]
    return self.take(indexer, axis=1)
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm


@pytest.fixture
def regular_df():
    """Frame with four integer columns ``a`` to ``d`` and two rows."""
    data = {"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]}
    return DataFrame(data)


@pytest.fixture
def multiindex_df():
    """Two-row frame whose columns form a two-level ``MultiIndex``."""
    columns = pd.MultiIndex.from_tuples([("A", "c"), ("A", "d"), ("B", "e")])
    return DataFrame([(0, 2, 4), (1, 3, 5)], columns=columns)


class TestSelect:
    """Tests for ``DataFrame.select`` on flat and ``MultiIndex`` columns."""

    def test_select_subset_cols(self, regular_df):
        # Only the requested columns are kept.
        selected = regular_df.select("a", "c")
        tm.assert_frame_equal(selected, DataFrame({"a": [1, 2], "c": [5, 6]}))

    def test_single_value(self, regular_df):
        # One label still yields a DataFrame, never a Series.
        selected = regular_df.select("a")
        assert isinstance(selected, DataFrame)
        tm.assert_frame_equal(selected, DataFrame({"a": [1, 2]}))

    def test_select_change_order(self, regular_df):
        # Output columns follow the order of the arguments.
        selected = regular_df.select("b", "d", "a", "c")
        reordered = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]})
        tm.assert_frame_equal(selected, reordered)

    def test_select_none(self, regular_df):
        # No arguments selects no columns.
        assert regular_df.select().empty

    def test_select_duplicated(self, regular_df):
        # A label given twice appears twice in the result.
        selected = regular_df.select("a", "d", "a")
        assert selected.columns.tolist() == ["a", "d", "a"]

    def test_select_list(self, regular_df):
        # Passing the labels wrapped in a list is rejected explicitly.
        with pytest.raises(ValueError, match="does not support a list"):
            regular_df.select(["a", "b"])

    def test_select_missing(self, regular_df):
        pattern = r"None of .* are in the \[columns\]"
        with pytest.raises(KeyError, match=pattern):
            regular_df.select("z")

    def test_select_not_hashable(self, regular_df):
        with pytest.raises(TypeError, match="unhashable type"):
            regular_df.select(set())

    def test_select_multiindex_one_level(self, multiindex_df):
        # A bare label selects on the first level of the MultiIndex.
        selected = multiindex_df.select("A")
        columns = pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")])
        tm.assert_frame_equal(selected, DataFrame([(0, 2), (1, 3)], columns=columns))

    def test_select_multiindex_single_column(self, multiindex_df):
        # A full tuple key selects exactly one column, still as a DataFrame.
        selected = multiindex_df.select(("A", "d"))
        columns = pd.MultiIndex.from_tuples([("A", "d")])
        assert isinstance(selected, DataFrame)
        tm.assert_frame_equal(selected, DataFrame([(2,), (3,)], columns=columns))

    def test_select_multiindex_multiple_columns(self, multiindex_df):
        selected = multiindex_df.select(("A", "c"), ("B", "e"))
        columns = pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")])
        tm.assert_frame_equal(selected, DataFrame([(0, 4), (1, 5)], columns=columns))

    def test_select_multiindex_missing(self, multiindex_df):
        with pytest.raises(KeyError, match="not in index"):
            multiindex_df.select("Z")