ENH: Implement DataFrame.select

datapythonista · datapythonista · commit 0f64c13169f4 · 2025-05-31T17:01:14.000+04:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -33,6 +33,7 @@ Other enhancements
 - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
 - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
 - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
+- Added new :meth:`DataFrame.select` method to select a subset of columns from the :class:`DataFrame` (:issue:`61522`)
 - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4479,6 +4479,119 @@ def _get_item(self, item: Hashable) -> Series:
     # ----------------------------------------------------------------------
     # Unsorted
 
+    def select(self, *args):
+        """
+        Select a subset of columns from the DataFrame.
+
+        Select can be used to return a DataFrame with some specific columns.
+        This can be used to remove unwanted columns, as well as to return a
+        DataFrame with the columns sorted in a specific order.
+
+        Parameters
+        ----------
+        *args : hashable or tuple of hashable
+            The names or the columns to return. In general this will be strings,
+            but pandas supports other types of column names, if they are hashable.
+
+        Returns
+        -------
+        DataFrame
+            The DataFrame with the selected columns.
+
+        See Also
+        --------
+        DataFrame.filter : To return a subset of rows, instead of a subset of columns.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "first_name": ["John", "Alice", "Bob"],
+        ...         "last_name": ["Smith", "Cooper", "Marley"],
+        ...         "age": [61, 22, 35],
+        ...     }
+        ... )
+
+        Select a subset of columns:
+
+        >>> df.select("first_name", "age")
+          first_name  age
+        0       John   61
+        1      Alice   22
+        2        Bob   35
+
+        Selecting with a pattern can be done with Python expressions:
+
+        >>> df.select(*[col for col in df.columns if col.endswith("_name")])
+          first_name last_name
+        0       John     Smith
+        1      Alice    Cooper
+        2        Bob    Marley
+
+        All columns can be selected, but in a different order:
+
+        >>> df.select("last_name", "first_name", "age")
+          last_name first_name  age
+        0     Smith       John   61
+        1    Cooper      Alice   22
+        2    Marley        Bob   35
+
+        In case the columns are in a list, Python unpacking with star can be used:
+
+        >>> columns = ["last_name", "age"]
+        >>> df.select(*columns)
+                  last_name  age
+        0     Smith   61
+        1    Cooper   22
+        2    Marley   35
+
+        Note that a DataFrame is always returned. If a single column is requested, a
+        DataFrame with a single column is returned, not a Series:
+
+        >>> df.select("age")
+           age
+        0   61
+        1   22
+        2   35
+
+        The ``select`` method also works when columns are a ``MultiIndex``:
+
+        >>> df = pd.DataFrame(
+        ...     [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)],
+        ...     columns=pd.MultiIndex.from_tuples(
+        ...         [("names", "first_name"), ("names", "last_name"), ("other", "age")]
+        ...     ),
+        ... )
+
+        If just column names are provided, they will select from the first level of the
+        ``MultiIndex``:
+
+        >>> df.select("names")
+              names
+          first_name last_name
+        0       John     Smith
+        1      Alice    Cooper
+        2        Bob    Marley
+
+        To select from multiple or all levels, tuples can be provided:
+
+        >>> df.select(("names", "last_name"), ("other", "age"))
+              names other
+          last_name   age
+        0     Smith    61
+        1    Cooper    22
+        2    Marley    35
+        """
+        if args and isinstance(args[0], list):
+            raise ValueError(
+                "`DataFrame.select` does not support a list. Please use "
+                "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` "
+                "instead"
+            )
+
+        indexer = self.columns._get_indexer_strict(list(args), "columns")[1]
+        return self.take(indexer, axis=1)
+
     @overload
     def query(
         self,
diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py
@@ -0,0 +1,85 @@
+import pytest
+
+import pandas as pd
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+@pytest.fixture
+def regular_df():
+    return DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
+
+
+@pytest.fixture
+def multiindex_df():
+    return DataFrame(
+        [(0, 2, 4), (1, 3, 5)],
+        columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d"), ("B", "e")]),
+    )
+
+
+class TestSelect:
+    def test_select_subset_cols(self, regular_df):
+        expected = DataFrame({"a": [1, 2], "c": [5, 6]})
+        result = regular_df.select("a", "c")
+        tm.assert_frame_equal(result, expected)
+
+    def test_single_value(self, regular_df):
+        expected = DataFrame({"a": [1, 2]})
+        result = regular_df.select("a")
+        assert isinstance(result, DataFrame)
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_change_order(self, regular_df):
+        expected = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]})
+        result = regular_df.select("b", "d", "a", "c")
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_none(self, regular_df):
+        result = regular_df.select()
+        assert result.empty
+
+    def test_select_duplicated(self, regular_df):
+        expected = ["a", "d", "a"]
+        result = regular_df.select("a", "d", "a")
+        assert result.columns.tolist() == expected
+
+    def test_select_list(self, regular_df):
+        with pytest.raises(ValueError, match="does not support a list"):
+            regular_df.select(["a", "b"])
+
+    def test_select_missing(self, regular_df):
+        with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"):
+            regular_df.select("z")
+
+    def test_select_not_hashable(self, regular_df):
+        with pytest.raises(TypeError, match="unhashable type"):
+            regular_df.select(set())
+
+    def test_select_multiindex_one_level(self, multiindex_df):
+        expected = DataFrame(
+            [(0, 2), (1, 3)],
+            columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")]),
+        )
+        result = multiindex_df.select("A")
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_multiindex_single_column(self, multiindex_df):
+        expected = DataFrame(
+            [(2,), (3,)], columns=pd.MultiIndex.from_tuples([("A", "d")])
+        )
+        result = multiindex_df.select(("A", "d"))
+        assert isinstance(result, DataFrame)
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_multiindex_multiple_columns(self, multiindex_df):
+        expected = DataFrame(
+            [(0, 4), (1, 5)],
+            columns=pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")]),
+        )
+        result = multiindex_df.select(("A", "c"), ("B", "e"))
+        tm.assert_frame_equal(result, expected)
+
+    def test_select_multiindex_missing(self, multiindex_df):
+        with pytest.raises(KeyError, match="not in index"):
+            multiindex_df.select("Z")