REF: cast x and bins to Index early in cut, qcut (#54919)

jbrockmendel · web-flow · commit b5854c406d0e · 2023-09-01T10:05:18.000-07:00
* REF: cast x and bins to Index early in cut, qcut

* mypy fixup

* troubleshoot builds
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -43,7 +43,6 @@
     to_datetime,
     to_timedelta,
 )
-from pandas.core import nanops
 import pandas.core.algorithms as algos
 
 if TYPE_CHECKING:
@@ -243,43 +242,18 @@ def cut(
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
     original = x
-    x = _preprocess_for_cut(x)
-    x, dtype = _coerce_to_type(x)
+    x_idx = _preprocess_for_cut(x)
+    x_idx, dtype = _coerce_to_type(x_idx)
 
     if not np.iterable(bins):
-        if is_scalar(bins) and bins < 1:
-            raise ValueError("`bins` should be a positive integer.")
-
-        sz = x.size
-
-        if sz == 0:
-            raise ValueError("Cannot cut empty array")
-
-        rng = (nanops.nanmin(x), nanops.nanmax(x))
-        mn, mx = (mi + 0.0 for mi in rng)
-
-        if np.isinf(mn) or np.isinf(mx):
-            # GH 24314
-            raise ValueError(
-                "cannot specify integer `bins` when input data contains infinity"
-            )
-        if mn == mx:  # adjust end points before binning
-            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
-            mx += 0.001 * abs(mx) if mx != 0 else 0.001
-            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
-        else:  # adjust end points after binning
-            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
-            adj = (mx - mn) * 0.001  # 0.1% of the range
-            if right:
-                bins[0] -= adj
-            else:
-                bins[-1] += adj
+        bins = _nbins_to_bins(x_idx, bins, right)
 
     elif isinstance(bins, IntervalIndex):
         if bins.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
 
     else:
+        bins = Index(bins)
         if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype):
             bins = np.asarray(bins, dtype=DT64NS_DTYPE)
         else:
@@ -289,9 +263,10 @@ def cut(
         # GH 26045: cast to float64 to avoid an overflow
         if (np.diff(bins.astype("float64")) < 0).any():
             raise ValueError("bins must increase monotonically.")
+        bins = Index(bins)
 
     fac, bins = _bins_to_cuts(
-        x,
+        x_idx,
         bins,
         right=right,
         labels=labels,
@@ -367,18 +342,18 @@ def qcut(
     array([0, 0, 1, 2, 3])
     """
     original = x
-    x = _preprocess_for_cut(x)
-    x, dtype = _coerce_to_type(x)
+    x_idx = _preprocess_for_cut(x)
+    x_idx, dtype = _coerce_to_type(x_idx)
 
     quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
 
-    x_np = np.asarray(x)
+    x_np = np.asarray(x_idx)
     x_np = x_np[~np.isnan(x_np)]
     bins = np.quantile(x_np, quantiles)
 
     fac, bins = _bins_to_cuts(
-        x,
-        bins,
+        x_idx,
+        Index(bins),
         labels=labels,
         precision=precision,
         include_lowest=True,
@@ -389,9 +364,44 @@ def qcut(
     return _postprocess_for_cut(fac, bins, retbins, dtype, original)
 
 
+def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
+    """
+    If a user passed an integer N for bins, convert this to a sequence of N
+    equal(ish)-sized bins.
+    """
+    if is_scalar(nbins) and nbins < 1:
+        raise ValueError("`bins` should be a positive integer.")
+
+    if x_idx.size == 0:
+        raise ValueError("Cannot cut empty array")
+
+    rng = (x_idx.min(), x_idx.max())
+    mn, mx = rng
+
+    if np.isinf(mn) or np.isinf(mx):
+        # GH#24314
+        raise ValueError(
+            "cannot specify integer `bins` when input data contains infinity"
+        )
+
+    if mn == mx:  # adjust end points before binning
+        mn -= 0.001 * abs(mn) if mn != 0 else 0.001
+        mx += 0.001 * abs(mx) if mx != 0 else 0.001
+        bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
+    else:  # adjust end points after binning
+        bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
+        adj = (mx - mn) * 0.001  # 0.1% of the range
+        if right:
+            bins[0] -= adj
+        else:
+            bins[-1] += adj
+
+    return Index(bins)
+
+
 def _bins_to_cuts(
-    x,
-    bins: np.ndarray,
+    x: Index,
+    bins: Index,
     right: bool = True,
     labels=None,
     precision: int = 3,
@@ -408,6 +418,8 @@ def _bins_to_cuts(
             "invalid value for 'duplicates' parameter, valid options are: raise, drop"
         )
 
+    result: Categorical | np.ndarray
+
     if isinstance(bins, IntervalIndex):
         # we have a fast-path here
         ids = bins.get_indexer(x)
@@ -474,7 +486,7 @@ def _bins_to_cuts(
     return result, bins
 
 
-def _coerce_to_type(x):
+def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
     """
     if the passed data is of datetime/timedelta, bool or nullable int type,
     this method converts it to numeric so that cut or qcut method can
@@ -498,11 +510,13 @@ def _coerce_to_type(x):
     # https://github.com/pandas-dev/pandas/pull/31290
     # https://github.com/pandas-dev/pandas/issues/31389
     elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype):
-        x = x.to_numpy(dtype=np.float64, na_value=np.nan)
+        x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan)
+        x = Index(x_arr)
 
     if dtype is not None:
         # GH 19768: force NaT to NaN during integer conversion
-        x = np.where(x.notna(), x.view(np.int64), np.nan)
+        x_arr = np.where(x.notna(), x.view(np.int64), np.nan)
+        x = Index(x_arr)
 
     return x, dtype
 
@@ -564,7 +578,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
 
 
 def _format_labels(
-    bins,
+    bins: Index,
     precision: int,
     right: bool = True,
     include_lowest: bool = False,
@@ -597,7 +611,7 @@ def _format_labels(
     return IntervalIndex.from_breaks(breaks, closed=closed)
 
 
-def _preprocess_for_cut(x):
+def _preprocess_for_cut(x) -> Index:
     """
     handles preprocessing for cut where we convert passed
     input to array, strip the index information and store it
@@ -611,7 +625,7 @@ def _preprocess_for_cut(x):
     if x.ndim != 1:
         raise ValueError("Input array must be 1 dimensional")
 
-    return x
+    return Index(x)
 
 
 def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original):
@@ -627,6 +641,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
         return fac
 
     bins = _convert_bin_to_datelike_type(bins, dtype)
+    if isinstance(bins, Index) and is_numeric_dtype(bins.dtype):
+        bins = bins._values
 
     return fac, bins
 
@@ -646,7 +662,7 @@ def _round_frac(x, precision: int):
         return np.around(x, digits)
 
 
-def _infer_precision(base_precision: int, bins) -> int:
+def _infer_precision(base_precision: int, bins: Index) -> int:
     """
     Infer an appropriate precision for _round_frac
     """