43
43
to_datetime ,
44
44
to_timedelta ,
45
45
)
46
- from pandas .core import nanops
47
46
import pandas .core .algorithms as algos
48
47
49
48
if TYPE_CHECKING :
@@ -243,43 +242,18 @@ def cut(
243
242
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
244
243
245
244
original = x
246
- x = _preprocess_for_cut (x )
247
- x , dtype = _coerce_to_type (x )
245
+ x_idx = _preprocess_for_cut (x )
246
+ x_idx , dtype = _coerce_to_type (x_idx )
248
247
249
248
if not np .iterable (bins ):
250
- if is_scalar (bins ) and bins < 1 :
251
- raise ValueError ("`bins` should be a positive integer." )
252
-
253
- sz = x .size
254
-
255
- if sz == 0 :
256
- raise ValueError ("Cannot cut empty array" )
257
-
258
- rng = (nanops .nanmin (x ), nanops .nanmax (x ))
259
- mn , mx = (mi + 0.0 for mi in rng )
260
-
261
- if np .isinf (mn ) or np .isinf (mx ):
262
- # GH 24314
263
- raise ValueError (
264
- "cannot specify integer `bins` when input data contains infinity"
265
- )
266
- if mn == mx : # adjust end points before binning
267
- mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
268
- mx += 0.001 * abs (mx ) if mx != 0 else 0.001
269
- bins = np .linspace (mn , mx , bins + 1 , endpoint = True )
270
- else : # adjust end points after binning
271
- bins = np .linspace (mn , mx , bins + 1 , endpoint = True )
272
- adj = (mx - mn ) * 0.001 # 0.1% of the range
273
- if right :
274
- bins [0 ] -= adj
275
- else :
276
- bins [- 1 ] += adj
249
+ bins = _nbins_to_bins (x_idx , bins , right )
277
250
278
251
elif isinstance (bins , IntervalIndex ):
279
252
if bins .is_overlapping :
280
253
raise ValueError ("Overlapping IntervalIndex is not accepted." )
281
254
282
255
else :
256
+ bins = Index (bins )
283
257
if isinstance (getattr (bins , "dtype" , None ), DatetimeTZDtype ):
284
258
bins = np .asarray (bins , dtype = DT64NS_DTYPE )
285
259
else :
@@ -289,9 +263,10 @@ def cut(
289
263
# GH 26045: cast to float64 to avoid an overflow
290
264
if (np .diff (bins .astype ("float64" )) < 0 ).any ():
291
265
raise ValueError ("bins must increase monotonically." )
266
+ bins = Index (bins )
292
267
293
268
fac , bins = _bins_to_cuts (
294
- x ,
269
+ x_idx ,
295
270
bins ,
296
271
right = right ,
297
272
labels = labels ,
@@ -367,18 +342,18 @@ def qcut(
367
342
array([0, 0, 1, 2, 3])
368
343
"""
369
344
original = x
370
- x = _preprocess_for_cut (x )
371
- x , dtype = _coerce_to_type (x )
345
+ x_idx = _preprocess_for_cut (x )
346
+ x_idx , dtype = _coerce_to_type (x_idx )
372
347
373
348
quantiles = np .linspace (0 , 1 , q + 1 ) if is_integer (q ) else q
374
349
375
- x_np = np .asarray (x )
350
+ x_np = np .asarray (x_idx )
376
351
x_np = x_np [~ np .isnan (x_np )]
377
352
bins = np .quantile (x_np , quantiles )
378
353
379
354
fac , bins = _bins_to_cuts (
380
- x ,
381
- bins ,
355
+ x_idx ,
356
+ Index ( bins ) ,
382
357
labels = labels ,
383
358
precision = precision ,
384
359
include_lowest = True ,
@@ -389,9 +364,44 @@ def qcut(
389
364
return _postprocess_for_cut (fac , bins , retbins , dtype , original )
390
365
391
366
367
+ def _nbins_to_bins (x_idx : Index , nbins : int , right : bool ) -> Index :
368
+ """
369
+ If a user passed an integer N for bins, convert this to a sequence of N
370
+ equal(ish)-sized bins.
371
+ """
372
+ if is_scalar (nbins ) and nbins < 1 :
373
+ raise ValueError ("`bins` should be a positive integer." )
374
+
375
+ if x_idx .size == 0 :
376
+ raise ValueError ("Cannot cut empty array" )
377
+
378
+ rng = (x_idx .min (), x_idx .max ())
379
+ mn , mx = rng
380
+
381
+ if np .isinf (mn ) or np .isinf (mx ):
382
+ # GH#24314
383
+ raise ValueError (
384
+ "cannot specify integer `bins` when input data contains infinity"
385
+ )
386
+
387
+ if mn == mx : # adjust end points before binning
388
+ mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
389
+ mx += 0.001 * abs (mx ) if mx != 0 else 0.001
390
+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
391
+ else : # adjust end points after binning
392
+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
393
+ adj = (mx - mn ) * 0.001 # 0.1% of the range
394
+ if right :
395
+ bins [0 ] -= adj
396
+ else :
397
+ bins [- 1 ] += adj
398
+
399
+ return Index (bins )
400
+
401
+
392
402
def _bins_to_cuts (
393
- x ,
394
- bins : np . ndarray ,
403
+ x : Index ,
404
+ bins : Index ,
395
405
right : bool = True ,
396
406
labels = None ,
397
407
precision : int = 3 ,
@@ -408,6 +418,8 @@ def _bins_to_cuts(
408
418
"invalid value for 'duplicates' parameter, valid options are: raise, drop"
409
419
)
410
420
421
+ result : Categorical | np .ndarray
422
+
411
423
if isinstance (bins , IntervalIndex ):
412
424
# we have a fast-path here
413
425
ids = bins .get_indexer (x )
@@ -474,7 +486,7 @@ def _bins_to_cuts(
474
486
return result , bins
475
487
476
488
477
- def _coerce_to_type (x ) :
489
+ def _coerce_to_type (x : Index ) -> tuple [ Index , DtypeObj | None ] :
478
490
"""
479
491
if the passed data is of datetime/timedelta, bool or nullable int type,
480
492
this method converts it to numeric so that cut or qcut method can
@@ -498,11 +510,13 @@ def _coerce_to_type(x):
498
510
# https://github.com/pandas-dev/pandas/pull/31290
499
511
# https://github.com/pandas-dev/pandas/issues/31389
500
512
elif isinstance (x .dtype , ExtensionDtype ) and is_numeric_dtype (x .dtype ):
501
- x = x .to_numpy (dtype = np .float64 , na_value = np .nan )
513
+ x_arr = x .to_numpy (dtype = np .float64 , na_value = np .nan )
514
+ x = Index (x_arr )
502
515
503
516
if dtype is not None :
504
517
# GH 19768: force NaT to NaN during integer conversion
505
- x = np .where (x .notna (), x .view (np .int64 ), np .nan )
518
+ x_arr = np .where (x .notna (), x .view (np .int64 ), np .nan )
519
+ x = Index (x_arr )
506
520
507
521
return x , dtype
508
522
@@ -564,7 +578,7 @@ def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
564
578
565
579
566
580
def _format_labels (
567
- bins ,
581
+ bins : Index ,
568
582
precision : int ,
569
583
right : bool = True ,
570
584
include_lowest : bool = False ,
@@ -597,7 +611,7 @@ def _format_labels(
597
611
return IntervalIndex .from_breaks (breaks , closed = closed )
598
612
599
613
600
- def _preprocess_for_cut (x ):
614
+ def _preprocess_for_cut (x ) -> Index :
601
615
"""
602
616
handles preprocessing for cut where we convert passed
603
617
input to array, strip the index information and store it
@@ -611,7 +625,7 @@ def _preprocess_for_cut(x):
611
625
if x .ndim != 1 :
612
626
raise ValueError ("Input array must be 1 dimensional" )
613
627
614
- return x
628
+ return Index ( x )
615
629
616
630
617
631
def _postprocess_for_cut (fac , bins , retbins : bool , dtype : DtypeObj | None , original ):
@@ -627,6 +641,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
627
641
return fac
628
642
629
643
bins = _convert_bin_to_datelike_type (bins , dtype )
644
+ if isinstance (bins , Index ) and is_numeric_dtype (bins .dtype ):
645
+ bins = bins ._values
630
646
631
647
return fac , bins
632
648
@@ -646,7 +662,7 @@ def _round_frac(x, precision: int):
646
662
return np .around (x , digits )
647
663
648
664
649
- def _infer_precision (base_precision : int , bins ) -> int :
665
+ def _infer_precision (base_precision : int , bins : Index ) -> int :
650
666
"""
651
667
Infer an appropriate precision for _round_frac
652
668
"""
0 commit comments