Make select by substring

HoqueUM · HoqueUM · commit 64d2270522b7 · 2025-04-20T17:16:46.000-04:00
diff --git a/books.xml b/books.xml
@@ -0,0 +1,120 @@
+<?xml version="1.0"?>
+<catalog>
+   <book id="bk101">
+      <author>Gambardella, Matthew</author>
+      <title>XML Developer's Guide</title>
+      <genre>Computer</genre>
+      <price>44.95</price>
+      <publish_date>2000-10-01</publish_date>
+      <description>An in-depth look at creating applications 
+      with XML.</description>
+   </book>
+   <book id="bk102">
+      <author>Ralls, Kim</author>
+      <title>Midnight Rain</title>
+      <genre>Fantasy</genre>
+      <price>5.95</price>
+      <publish_date>2000-12-16</publish_date>
+      <description>A former architect battles corporate zombies, 
+      an evil sorceress, and her own childhood to become queen 
+      of the world.</description>
+   </book>
+   <book id="bk103">
+      <author>Corets, Eva</author>
+      <title>Maeve Ascendant</title>
+      <genre>Fantasy</genre>
+      <price>5.95</price>
+      <publish_date>2000-11-17</publish_date>
+      <description>After the collapse of a nanotechnology 
+      society in England, the young survivors lay the 
+      foundation for a new society.</description>
+   </book>
+   <book id="bk104">
+      <author>Corets, Eva</author>
+      <title>Oberon's Legacy</title>
+      <genre>Fantasy</genre>
+      <price>5.95</price>
+      <publish_date>2001-03-10</publish_date>
+      <description>In post-apocalypse England, the mysterious 
+      agent known only as Oberon helps to create a new life 
+      for the inhabitants of London. Sequel to Maeve 
+      Ascendant.</description>
+   </book>
+   <book id="bk105">
+      <author>Corets, Eva</author>
+      <title>The Sundered Grail</title>
+      <genre>Fantasy</genre>
+      <price>5.95</price>
+      <publish_date>2001-09-10</publish_date>
+      <description>The two daughters of Maeve, half-sisters, 
+      battle one another for control of England. Sequel to 
+      Oberon's Legacy.</description>
+   </book>
+   <book id="bk106">
+      <author>Randall, Cynthia</author>
+      <title>Lover Birds</title>
+      <genre>Romance</genre>
+      <price>4.95</price>
+      <publish_date>2000-09-02</publish_date>
+      <description>When Carla meets Paul at an ornithology 
+      conference, tempers fly as feathers get ruffled.</description>
+   </book>
+   <book id="bk107">
+      <author>Thurman, Paula</author>
+      <title>Splish Splash</title>
+      <genre>Romance</genre>
+      <price>4.95</price>
+      <publish_date>2000-11-02</publish_date>
+      <description>A deep sea diver finds true love twenty 
+      thousand leagues beneath the sea.</description>
+   </book>
+   <book id="bk108">
+      <author>Knorr, Stefan</author>
+      <title>Creepy Crawlies</title>
+      <genre>Horror</genre>
+      <price>4.95</price>
+      <publish_date>2000-12-06</publish_date>
+      <description>An anthology of horror stories about roaches,
+      centipedes, scorpions  and other insects.</description>
+   </book>
+   <book id="bk109">
+      <author>Kress, Peter</author>
+      <title>Paradox Lost</title>
+      <genre>Science Fiction</genre>
+      <price>6.95</price>
+      <publish_date>2000-11-02</publish_date>
+      <description>After an inadvertent trip through a Heisenberg
+      Uncertainty Device, James Salway discovers the problems 
+      of being quantum.</description>
+   </book>
+   <book id="bk110">
+      <author>O'Brien, Tim</author>
+      <title>Microsoft .NET: The Programming Bible</title>
+      <genre>Computer</genre>
+      <price>36.95</price>
+      <publish_date>2000-12-09</publish_date>
+      <description>Microsoft's .NET initiative is explored in 
+      detail in this deep programmer's reference.</description>
+   </book>
+   <book id="bk111">
+      <author>O'Brien, Tim</author>
+      <title>MSXML3: A Comprehensive Guide</title>
+      <genre>Computer</genre>
+      <price>36.95</price>
+      <publish_date>2000-12-01</publish_date>
+      <description>The Microsoft MSXML3 parser is covered in 
+      detail, with attention to XML DOM interfaces, XSLT processing, 
+      SAX and more.</description>
+   </book>
+   <book id="bk112">
+      <author>Galos, Mike</author>
+      <title>Visual Studio 7: A Comprehensive Guide</title>
+      <genre>Computer</genre>
+      <price>49.95</price>
+      <publish_date>2001-04-16</publish_date>
+      <description>Microsoft Visual Studio 7 is explored in depth,
+      looking at how Visual Basic, Visual C++, C#, and ASP+ are 
+      integrated into a comprehensive development 
+      environment.</description>
+   </book>
+</catalog>
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -31,8 +31,10 @@
     TYPE_CHECKING,
     Any,
     Literal,
+    List,
     cast,
     overload,
+    Union
 )
 import warnings
 
@@ -7716,6 +7718,86 @@ def nsmallest(
         Nauru         337000  182      NR
         """
         return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
+    
+    def select_by_substr(
+        self,
+        substr: Union[str, List[str]],
+        ignore_case: bool = True,
+    ) -> DataFrame:
+        """
+        Return columns whose names contain the specified substring(s).
+
+        Select and return all columns from the DataFrame whose names contain
+        the given substring or any of a list of substrings. By default, the
+        search is case-insensitive.
+
+        Parameters
+        ----------
+        substr : str or list of str
+            Substring or list of substrings to search for in column names.
+        ignore_case : bool, default True
+            Whether to ignore case when searching for substrings.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame containing only the columns whose names match the
+            specified substring(s), in their original order. The result has
+            no columns when nothing matches.
+
+        See Also
+        --------
+        DataFrame.filter : Subset the rows or columns according to labels.
+        DataFrame.loc : Access a group of rows and columns by label(s).
+
+        Notes
+        -----
+        A column is selected when its name contains at least one of the
+        provided substrings. Column labels that are not strings never match.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({
+        ...     "first_name": ["Alice", "Bob"],
+        ...     "last_name": ["Smith", "Jones"],
+        ...     "age": [25, 30],
+        ...     "city": ["NY", "LA"]
+        ... })
+        >>> df.select_by_substr("name")
+          first_name last_name
+        0      Alice     Smith
+        1        Bob     Jones
+
+        >>> df.select_by_substr(["name", "city"])
+          first_name last_name city
+        0      Alice     Smith   NY
+        1        Bob     Jones   LA
+
+        >>> df.select_by_substr("AGE", ignore_case=False)
+        Empty DataFrame
+        Columns: []
+        Index: [0, 1]
+
+        >>> df.select_by_substr("AGE", ignore_case=True)
+           age
+        0   25
+        1   30
+        """
+        substrings = [substr] if isinstance(substr, str) else list(substr)
+        if ignore_case:
+            substrings = [sub.casefold() for sub in substrings]
+
+        def _matches(label: object) -> bool:
+            # Non-string labels (ints, tuples, ...) have no substring to test.
+            if not isinstance(label, str):
+                return False
+            name = label.casefold() if ignore_case else label
+            return any(sub in name for sub in substrings)
+
+        # Select by position so that the original column order is kept and
+        # duplicate column names are not returned more than once.
+        positions = [i for i, col in enumerate(self.columns) if _matches(col)]
+        return self.iloc[:, positions]
 
     def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
         """
diff --git a/pandas/tests/frame/test_ b/pandas/tests/frame/test_
diff --git a/placeholder.txt b/placeholder.txt
@@ -0,0 +1,125 @@
+Return the first `n` rows ordered by `columns` in ascending order.
+
+        Return the first `n` rows with the smallest values in `columns`, in
+        ascending order. The columns that are not specified are returned as
+        well, but not used for ordering.
+
+        This method is equivalent to
+        ``df.sort_values(columns, ascending=True).head(n)``, but more
+        performant.
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve.
+        columns : list or str
+            Column name or names to order by.
+        keep : {'first', 'last', 'all'}, default 'first'
+            Where there are duplicate values:
+
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
+            - ``all`` : keep all the ties of the largest item even if it means
+              selecting more than ``n`` items.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with the first `n` rows ordered by `columns` in ascending order.
+
+        See Also
+        --------
+        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
+            descending order.
+        DataFrame.sort_values : Sort DataFrame by the values.
+        DataFrame.head : Return the first `n` rows without re-ordering.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "population": [
+        ...             59000000,
+        ...             65000000,
+        ...             434000,
+        ...             434000,
+        ...             434000,
+        ...             337000,
+        ...             337000,
+        ...             11300,
+        ...             11300,
+        ...         ],
+        ...         "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
+        ...         "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
+        ...     },
+        ...     index=[
+        ...         "Italy",
+        ...         "France",
+        ...         "Malta",
+        ...         "Maldives",
+        ...         "Brunei",
+        ...         "Iceland",
+        ...         "Nauru",
+        ...         "Tuvalu",
+        ...         "Anguilla",
+        ...     ],
+        ... )
+        >>> df
+                  population      GDP alpha-2
+        Italy       59000000  1937894      IT
+        France      65000000  2583560      FR
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+        Iceland       337000    17036      IS
+        Nauru         337000      182      NR
+        Tuvalu         11300       38      TV
+        Anguilla       11300      311      AI
+
+        In the following example, we will use ``nsmallest`` to select the
+        three rows having the smallest values in column "population".
+
+        >>> df.nsmallest(3, "population")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+
+        When using ``keep='last'``, ties are resolved in reverse order:
+
+        >>> df.nsmallest(3, "population", keep="last")
+                  population  GDP alpha-2
+        Anguilla       11300  311      AI
+        Tuvalu         11300   38      TV
+        Nauru         337000  182      NR
+
+        When using ``keep='all'``, the number of elements kept can go beyond ``n``
+        if there are duplicate values for the largest element; all the
+        ties are kept.
+
+        >>> df.nsmallest(3, "population", keep="all")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+        Nauru         337000    182      NR
+
+        However, ``nsmallest`` does not keep ``n`` distinct
+        smallest elements:
+
+        >>> df.nsmallest(4, "population", keep="all")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+        Nauru         337000    182      NR
+
+        To order by the smallest values in column "population" and then "GDP", we can
+        specify multiple columns like in the next example.
+
+        >>> df.nsmallest(3, ["population", "GDP"])
+                  population  GDP alpha-2
+        Tuvalu         11300   38      TV
+        Anguilla       11300  311      AI
+        Nauru         337000  182      NR
+        """
diff --git a/test.py b/test.py
@@ -0,0 +1,15 @@
+# contributing guide: https://pandas.pydata.org/docs/dev/development/contributing.html#pushing-your-changes
+import pandas as pd
+
+column_data = {
+    "yes": [5000, 2, 3],
+    "Byesyes": [4, 5, 6],
+    "no": [7, 8, 9],
+    "Byesno": [10, 11, 12],
+    "YES": [13, 14, 15],
+    "NO": [16, 17, 18],
+    "YESYES": [19, 20, 21],
+}
+df = pd.DataFrame(column_data)
+# Print the selection for a substring that appears in none of the column names
+print(df.select_by_substr("skibidi"))