From c7cb3394632b81222462ebd50785b80413526d7e Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 17 Feb 2023 09:34:03 -0800
Subject: [PATCH] REF: move SelectN from core.algorithms

---
 pandas/core/algorithms.py      | 226 ----------------------------
 pandas/core/frame.py           |   7 +-
 pandas/core/methods/selectn.py | 262 +++++++++++++++++++++++++++++++++
 pandas/core/series.py          |   5 +-
 4 files changed, 268 insertions(+), 232 deletions(-)
 create mode 100644 pandas/core/methods/selectn.py

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 1de7643cc96fc..42c513aaf5aa6 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,11 +8,8 @@
 from textwrap import dedent
 from typing import (
     TYPE_CHECKING,
-    Hashable,
     Literal,
-    Sequence,
     cast,
-    final,
 )
 import warnings
 
@@ -29,7 +26,6 @@
     ArrayLike,
     AxisInt,
     DtypeObj,
-    IndexLabel,
     TakeIndexer,
     npt,
 )
@@ -97,7 +93,6 @@
 
     from pandas import (
         Categorical,
-        DataFrame,
         Index,
         Series,
     )
@@ -1167,227 +1162,6 @@ def checked_add_with_arr(
     return result
 
 
-# --------------- #
-# select n        #
-# --------------- #
-
-
-class SelectN:
-    def __init__(self, obj, n: int, keep: str) -> None:
-        self.obj = obj
-        self.n = n
-        self.keep = keep
-
-        if self.keep not in ("first", "last", "all"):
-            raise ValueError('keep must be either "first", "last" or "all"')
-
-    def compute(self, method: str) -> DataFrame | Series:
-        raise NotImplementedError
-
-    @final
-    def nlargest(self):
-        return self.compute("nlargest")
-
-    @final
-    def nsmallest(self):
-        return self.compute("nsmallest")
-
-    @final
-    @staticmethod
-    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
-        """
-        Helper function to determine if dtype is valid for
-        nsmallest/nlargest methods
-        """
-        return (
-            not is_complex_dtype(dtype)
-            if is_numeric_dtype(dtype)
-            else needs_i8_conversion(dtype)
-        )
-
-
-class SelectNSeries(SelectN):
-    """
-    Implement n largest/smallest for Series
-
-    Parameters
-    ----------
-    obj : Series
-    n : int
-    keep : {'first', 'last'}, default 'first'
-
-    Returns
-    -------
-    nordered : Series
-    """
-
-    def compute(self, method: str) -> Series:
-        from pandas.core.reshape.concat import concat
-
-        n = self.n
-        dtype = self.obj.dtype
-        if not self.is_valid_dtype_n_method(dtype):
-            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")
-
-        if n <= 0:
-            return self.obj[[]]
-
-        dropped = self.obj.dropna()
-        nan_index = self.obj.drop(dropped.index)
-
-        # slow method
-        if n >= len(self.obj):
-            ascending = method == "nsmallest"
-            return self.obj.sort_values(ascending=ascending).head(n)
-
-        # fast method
-        new_dtype = dropped.dtype
-        arr = _ensure_data(dropped.values)
-        if method == "nlargest":
-            arr = -arr
-            if is_integer_dtype(new_dtype):
-                # GH 21426: ensure reverse ordering at boundaries
-                arr -= 1
-
-            elif is_bool_dtype(new_dtype):
-                # GH 26154: ensure False is smaller than True
-                arr = 1 - (-arr)
-
-        if self.keep == "last":
-            arr = arr[::-1]
-
-        nbase = n
-        narr = len(arr)
-        n = min(n, narr)
-
-        # arr passed into kth_smallest must be contiguous. We copy
-        # here because kth_smallest will modify its input
-        kth_val = algos.kth_smallest(arr.copy(order="C"), n - 1)
-        (ns,) = np.nonzero(arr <= kth_val)
-        inds = ns[arr[ns].argsort(kind="mergesort")]
-
-        if self.keep != "all":
-            inds = inds[:n]
-            findex = nbase
-        else:
-            if len(inds) < nbase <= len(nan_index) + len(inds):
-                findex = len(nan_index) + len(inds)
-            else:
-                findex = len(inds)
-
-        if self.keep == "last":
-            # reverse indices
-            inds = narr - 1 - inds
-
-        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
-
-
-class SelectNFrame(SelectN):
-    """
-    Implement n largest/smallest for DataFrame
-
-    Parameters
-    ----------
-    obj : DataFrame
-    n : int
-    keep : {'first', 'last'}, default 'first'
-    columns : list or str
-
-    Returns
-    -------
-    nordered : DataFrame
-    """
-
-    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
-        super().__init__(obj, n, keep)
-        if not is_list_like(columns) or isinstance(columns, tuple):
-            columns = [columns]
-
-        columns = cast(Sequence[Hashable], columns)
-        columns = list(columns)
-        self.columns = columns
-
-    def compute(self, method: str) -> DataFrame:
-        from pandas.core.api import Index
-
-        n = self.n
-        frame = self.obj
-        columns = self.columns
-
-        for column in columns:
-            dtype = frame[column].dtype
-            if not self.is_valid_dtype_n_method(dtype):
-                raise TypeError(
-                    f"Column {repr(column)} has dtype {dtype}, "
-                    f"cannot use method {repr(method)} with this dtype"
-                )
-
-        def get_indexer(current_indexer, other_indexer):
-            """
-            Helper function to concat `current_indexer` and `other_indexer`
-            depending on `method`
-            """
-            if method == "nsmallest":
-                return current_indexer.append(other_indexer)
-            else:
-                return other_indexer.append(current_indexer)
-
-        # Below we save and reset the index in case index contains duplicates
-        original_index = frame.index
-        cur_frame = frame = frame.reset_index(drop=True)
-        cur_n = n
-        indexer = Index([], dtype=np.int64)
-
-        for i, column in enumerate(columns):
-            # For each column we apply method to cur_frame[column].
-            # If it's the last column or if we have the number of
-            # results desired we are done.
-            # Otherwise there are duplicates of the largest/smallest
-            # value and we need to look at the rest of the columns
-            # to determine which of the rows with the largest/smallest
-            # value in the column to keep.
-            series = cur_frame[column]
-            is_last_column = len(columns) - 1 == i
-            values = getattr(series, method)(
-                cur_n, keep=self.keep if is_last_column else "all"
-            )
-
-            if is_last_column or len(values) <= cur_n:
-                indexer = get_indexer(indexer, values.index)
-                break
-
-            # Now find all values which are equal to
-            # the (nsmallest: largest)/(nlargest: smallest)
-            # from our series.
-            border_value = values == values[values.index[-1]]
-
-            # Some of these values are among the top-n
-            # some aren't.
-            unsafe_values = values[border_value]
-
-            # These values are definitely among the top-n
-            safe_values = values[~border_value]
-            indexer = get_indexer(indexer, safe_values.index)
-
-            # Go on and separate the unsafe_values on the remaining
-            # columns.
-            cur_frame = cur_frame.loc[unsafe_values.index]
-            cur_n = n - len(indexer)
-
-        frame = frame.take(indexer)
-
-        # Restore the index on frame
-        frame.index = original_index.take(indexer)
-
-        # If there is only one column, the frame is already sorted.
-        if len(columns) == 1:
-            return frame
-
-        ascending = method == "nsmallest"
-
-        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
-
-
 # ---- #
 # take #
 # ---- #
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 79f423b17383d..b85913ee4716e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -213,6 +213,7 @@
     to_arrays,
     treat_as_nested,
 )
+from pandas.core.methods import selectn
 from pandas.core.reshape.melt import melt
 from pandas.core.series import Series
 from pandas.core.shared_docs import _shared_docs
@@ -7178,7 +7179,7 @@ def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFram
         Italy     59000000  1937894      IT
         Brunei      434000    12128      BN
         """
-        return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
+        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
 
     def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
         """
@@ -7276,9 +7277,7 @@ def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFra
         Anguilla       11300  311      AI
         Nauru         337000  182      NR
         """
-        return algorithms.SelectNFrame(
-            self, n=n, keep=keep, columns=columns
-        ).nsmallest()
+        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
 
     @doc(
         Series.swaplevel,
diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py
new file mode 100644
index 0000000000000..241d55aa663f7
--- /dev/null
+++ b/pandas/core/methods/selectn.py
@@ -0,0 +1,262 @@
+"""
+Implementation of nlargest and nsmallest.
+"""
+
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Hashable,
+    Sequence,
+    cast,
+    final,
+)
+
+import numpy as np
+
+from pandas._libs import algos as libalgos
+from pandas._typing import (
+    DtypeObj,
+    IndexLabel,
+)
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_complex_dtype,
+    is_integer_dtype,
+    is_list_like,
+    is_numeric_dtype,
+    needs_i8_conversion,
+)
+from pandas.core.dtypes.dtypes import BaseMaskedDtype
+
+if TYPE_CHECKING:
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+class SelectN:
+    def __init__(self, obj, n: int, keep: str) -> None:
+        self.obj = obj
+        self.n = n
+        self.keep = keep
+
+        if self.keep not in ("first", "last", "all"):
+            raise ValueError('keep must be either "first", "last" or "all"')
+
+    def compute(self, method: str) -> DataFrame | Series:
+        raise NotImplementedError
+
+    @final
+    def nlargest(self):
+        return self.compute("nlargest")
+
+    @final
+    def nsmallest(self):
+        return self.compute("nsmallest")
+
+    @final
+    @staticmethod
+    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
+        """
+        Helper function to determine if dtype is valid for
+        nsmallest/nlargest methods
+        """
+        if is_numeric_dtype(dtype):
+            return not is_complex_dtype(dtype)
+        return needs_i8_conversion(dtype)
+
+
+class SelectNSeries(SelectN):
+    """
+    Implement n largest/smallest for Series
+
+    Parameters
+    ----------
+    obj : Series
+    n : int
+    keep : {'first', 'last'}, default 'first'
+
+    Returns
+    -------
+    nordered : Series
+    """
+
+    def compute(self, method: str) -> Series:
+        from pandas.core.reshape.concat import concat
+
+        n = self.n
+        dtype = self.obj.dtype
+        if not self.is_valid_dtype_n_method(dtype):
+            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")
+
+        if n <= 0:
+            return self.obj[[]]
+
+        dropped = self.obj.dropna()
+        nan_index = self.obj.drop(dropped.index)
+
+        # slow method
+        if n >= len(self.obj):
+            ascending = method == "nsmallest"
+            return self.obj.sort_values(ascending=ascending).head(n)
+
+        # fast method
+        new_dtype = dropped.dtype
+
+        # Similar to algorithms._ensure_data
+        arr = dropped._values
+        if needs_i8_conversion(arr.dtype):
+            arr = arr.view("i8")
+        elif isinstance(arr.dtype, BaseMaskedDtype):
+            arr = arr._data
+        else:
+            arr = np.asarray(arr)
+        if arr.dtype.kind == "b":
+            arr = arr.view(np.uint8)
+
+        if method == "nlargest":
+            arr = -arr
+            if is_integer_dtype(new_dtype):
+                # GH 21426: ensure reverse ordering at boundaries
+                arr -= 1
+
+            elif is_bool_dtype(new_dtype):
+                # GH 26154: ensure False is smaller than True
+                arr = 1 - (-arr)
+
+        if self.keep == "last":
+            arr = arr[::-1]
+
+        nbase = n
+        narr = len(arr)
+        n = min(n, narr)
+
+        # arr passed into kth_smallest must be contiguous. We copy
+        # here because kth_smallest will modify its input
+        kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
+        (ns,) = np.nonzero(arr <= kth_val)
+        inds = ns[arr[ns].argsort(kind="mergesort")]
+
+        if self.keep != "all":
+            inds = inds[:n]
+            findex = nbase
+        else:
+            if len(inds) < nbase <= len(nan_index) + len(inds):
+                findex = len(nan_index) + len(inds)
+            else:
+                findex = len(inds)
+
+        if self.keep == "last":
+            # reverse indices
+            inds = narr - 1 - inds
+
+        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+
+
+class SelectNFrame(SelectN):
+    """
+    Implement n largest/smallest for DataFrame
+
+    Parameters
+    ----------
+    obj : DataFrame
+    n : int
+    keep : {'first', 'last'}, default 'first'
+    columns : list or str
+
+    Returns
+    -------
+    nordered : DataFrame
+    """
+
+    def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
+        super().__init__(obj, n, keep)
+        if not is_list_like(columns) or isinstance(columns, tuple):
+            columns = [columns]
+
+        columns = cast(Sequence[Hashable], columns)
+        columns = list(columns)
+        self.columns = columns
+
+    def compute(self, method: str) -> DataFrame:
+        from pandas.core.api import Index
+
+        n = self.n
+        frame = self.obj
+        columns = self.columns
+
+        for column in columns:
+            dtype = frame[column].dtype
+            if not self.is_valid_dtype_n_method(dtype):
+                raise TypeError(
+                    f"Column {repr(column)} has dtype {dtype}, "
+                    f"cannot use method {repr(method)} with this dtype"
+                )
+
+        def get_indexer(current_indexer, other_indexer):
+            """
+            Helper function to concat `current_indexer` and `other_indexer`
+            depending on `method`
+            """
+            if method == "nsmallest":
+                return current_indexer.append(other_indexer)
+            else:
+                return other_indexer.append(current_indexer)
+
+        # Below we save and reset the index in case index contains duplicates
+        original_index = frame.index
+        cur_frame = frame = frame.reset_index(drop=True)
+        cur_n = n
+        indexer = Index([], dtype=np.int64)
+
+        for i, column in enumerate(columns):
+            # For each column we apply method to cur_frame[column].
+            # If it's the last column or if we have the number of
+            # results desired we are done.
+            # Otherwise there are duplicates of the largest/smallest
+            # value and we need to look at the rest of the columns
+            # to determine which of the rows with the largest/smallest
+            # value in the column to keep.
+            series = cur_frame[column]
+            is_last_column = len(columns) - 1 == i
+            values = getattr(series, method)(
+                cur_n, keep=self.keep if is_last_column else "all"
+            )
+
+            if is_last_column or len(values) <= cur_n:
+                indexer = get_indexer(indexer, values.index)
+                break
+
+            # Now find all values which are equal to
+            # the (nsmallest: largest)/(nlargest: smallest)
+            # from our series.
+            border_value = values == values[values.index[-1]]
+
+            # Some of these values are among the top-n
+            # some aren't.
+            unsafe_values = values[border_value]
+
+            # These values are definitely among the top-n
+            safe_values = values[~border_value]
+            indexer = get_indexer(indexer, safe_values.index)
+
+            # Go on and separate the unsafe_values on the remaining
+            # columns.
+            cur_frame = cur_frame.loc[unsafe_values.index]
+            cur_n = n - len(indexer)
+
+        frame = frame.take(indexer)
+
+        # Restore the index on frame
+        frame.index = original_index.take(indexer)
+
+        # If there is only one column, the frame is already sorted.
+        if len(columns) == 1:
+            return frame
+
+        ascending = method == "nsmallest"
+
+        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d69c057c85783..574700fb513ef 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -158,6 +158,7 @@
     SingleArrayManager,
     SingleBlockManager,
 )
+from pandas.core.methods import selectn
 from pandas.core.shared_docs import _shared_docs
 from pandas.core.sorting import (
     ensure_key_mapped,
@@ -3948,7 +3949,7 @@ def nlargest(
         Brunei        434000
         dtype: int64
         """
-        return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
+        return selectn.SelectNSeries(self, n=n, keep=keep).nlargest()
 
     def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
         """
@@ -4045,7 +4046,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
         Anguilla    11300
         dtype: int64
         """
-        return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest()
+        return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest()
 
     @doc(
         klass=_shared_doc_kwargs["klass"],