17
17
Timestamp ,
18
18
lib ,
19
19
)
20
- from pandas ._libs .lib import infer_dtype
21
20
22
21
from pandas .core .dtypes .common import (
23
- DT64NS_DTYPE ,
24
22
ensure_platform_int ,
25
23
is_bool_dtype ,
26
24
is_integer ,
@@ -243,7 +241,7 @@ def cut(
243
241
244
242
original = x
245
243
x_idx = _preprocess_for_cut (x )
246
- x_idx , dtype = _coerce_to_type (x_idx )
244
+ x_idx , _ = _coerce_to_type (x_idx )
247
245
248
246
if not np .iterable (bins ):
249
247
bins = _nbins_to_bins (x_idx , bins , right )
@@ -254,16 +252,8 @@ def cut(
254
252
255
253
else :
256
254
bins = Index (bins )
257
- if isinstance (getattr (bins , "dtype" , None ), DatetimeTZDtype ):
258
- bins = np .asarray (bins , dtype = DT64NS_DTYPE )
259
- else :
260
- bins = np .asarray (bins )
261
- bins = _convert_bin_to_numeric_type (bins , dtype )
262
-
263
- # GH 26045: cast to float64 to avoid an overflow
264
- if (np .diff (bins .astype ("float64" )) < 0 ).any ():
255
+ if not bins .is_monotonic_increasing :
265
256
raise ValueError ("bins must increase monotonically." )
266
- bins = Index (bins )
267
257
268
258
fac , bins = _bins_to_cuts (
269
259
x_idx ,
@@ -272,12 +262,11 @@ def cut(
272
262
labels = labels ,
273
263
precision = precision ,
274
264
include_lowest = include_lowest ,
275
- dtype = dtype ,
276
265
duplicates = duplicates ,
277
266
ordered = ordered ,
278
267
)
279
268
280
- return _postprocess_for_cut (fac , bins , retbins , dtype , original )
269
+ return _postprocess_for_cut (fac , bins , retbins , original )
281
270
282
271
283
272
def qcut (
@@ -343,25 +332,22 @@ def qcut(
343
332
"""
344
333
original = x
345
334
x_idx = _preprocess_for_cut (x )
346
- x_idx , dtype = _coerce_to_type (x_idx )
335
+ x_idx , _ = _coerce_to_type (x_idx )
347
336
348
337
quantiles = np .linspace (0 , 1 , q + 1 ) if is_integer (q ) else q
349
338
350
- x_np = np .asarray (x_idx )
351
- x_np = x_np [~ np .isnan (x_np )]
352
- bins = np .quantile (x_np , quantiles )
339
+ bins = x_idx .to_series ().dropna ().quantile (quantiles )
353
340
354
341
fac , bins = _bins_to_cuts (
355
342
x_idx ,
356
343
Index (bins ),
357
344
labels = labels ,
358
345
precision = precision ,
359
346
include_lowest = True ,
360
- dtype = dtype ,
361
347
duplicates = duplicates ,
362
348
)
363
349
364
- return _postprocess_for_cut (fac , bins , retbins , dtype , original )
350
+ return _postprocess_for_cut (fac , bins , retbins , original )
365
351
366
352
367
353
def _nbins_to_bins (x_idx : Index , nbins : int , right : bool ) -> Index :
@@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
378
364
rng = (x_idx .min (), x_idx .max ())
379
365
mn , mx = rng
380
366
381
- if np .isinf (mn ) or np .isinf (mx ):
367
+ is_dt_or_td = lib .is_np_dtype (x_idx .dtype , "mM" ) or isinstance (
368
+ x_idx .dtype , DatetimeTZDtype
369
+ )
370
+
371
+ if is_numeric_dtype (x_idx .dtype ) and (np .isinf (mn ) or np .isinf (mx )):
382
372
# GH#24314
383
373
raise ValueError (
384
374
"cannot specify integer `bins` when input data contains infinity"
385
375
)
386
376
387
377
if mn == mx : # adjust end points before binning
388
- mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
389
- mx += 0.001 * abs (mx ) if mx != 0 else 0.001
390
- bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
378
+ if is_dt_or_td :
379
+ # using seconds=1 is pretty arbitrary here
380
+ td = Timedelta (seconds = 1 )
381
+ # Use DatetimeArray/TimedeltaArray method instead of linspace
382
+ # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
383
+ # has no attribute "_generate_range"
384
+ bins = x_idx ._values ._generate_range ( # type: ignore[union-attr]
385
+ start = mn - td , end = mx + td , periods = nbins + 1 , freq = None
386
+ )
387
+ else :
388
+ mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
389
+ mx += 0.001 * abs (mx ) if mx != 0 else 0.001
390
+
391
+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
391
392
else : # adjust end points after binning
392
- bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
393
+ if is_dt_or_td :
394
+ # Use DatetimeArray/TimedeltaArray method instead of linspace
395
+ # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
396
+ # has no attribute "_generate_range"
397
+ bins = x_idx ._values ._generate_range ( # type: ignore[union-attr]
398
+ start = mn , end = mx , periods = nbins + 1 , freq = None
399
+ )
400
+ else :
401
+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
393
402
adj = (mx - mn ) * 0.001 # 0.1% of the range
394
403
if right :
395
404
bins [0 ] -= adj
@@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
400
409
401
410
402
411
def _bins_to_cuts (
403
- x : Index ,
412
+ x_idx : Index ,
404
413
bins : Index ,
405
414
right : bool = True ,
406
415
labels = None ,
407
416
precision : int = 3 ,
408
417
include_lowest : bool = False ,
409
- dtype : DtypeObj | None = None ,
410
418
duplicates : str = "raise" ,
411
419
ordered : bool = True ,
412
420
):
@@ -422,7 +430,7 @@ def _bins_to_cuts(
422
430
423
431
if isinstance (bins , IntervalIndex ):
424
432
# we have a fast-path here
425
- ids = bins .get_indexer (x )
433
+ ids = bins .get_indexer (x_idx )
426
434
cat_dtype = CategoricalDtype (bins , ordered = True )
427
435
result = Categorical .from_codes (ids , dtype = cat_dtype , validate = False )
428
436
return result , bins
@@ -437,12 +445,29 @@ def _bins_to_cuts(
437
445
bins = unique_bins
438
446
439
447
side : Literal ["left" , "right" ] = "left" if right else "right"
440
- ids = ensure_platform_int (bins .searchsorted (x , side = side ))
448
+
449
+ try :
450
+ ids = bins .searchsorted (x_idx , side = side )
451
+ except TypeError as err :
452
+ # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
453
+ # is integers
454
+ if x_idx .dtype .kind == "m" :
455
+ raise ValueError ("bins must be of timedelta64 dtype" ) from err
456
+ elif x_idx .dtype .kind == bins .dtype .kind == "M" :
457
+ raise ValueError (
458
+ "Cannot use timezone-naive bins with timezone-aware values, "
459
+ "or vice-versa"
460
+ ) from err
461
+ elif x_idx .dtype .kind == "M" :
462
+ raise ValueError ("bins must be of datetime64 dtype" ) from err
463
+ else :
464
+ raise
465
+ ids = ensure_platform_int (ids )
441
466
442
467
if include_lowest :
443
- ids [np . asarray ( x ) == bins [0 ]] = 1
468
+ ids [x_idx == bins [0 ]] = 1
444
469
445
- na_mask = isna (x ) | (ids == len (bins )) | (ids == 0 )
470
+ na_mask = isna (x_idx ) | (ids == len (bins )) | (ids == 0 )
446
471
has_nas = na_mask .any ()
447
472
448
473
if labels is not False :
@@ -454,7 +479,7 @@ def _bins_to_cuts(
454
479
455
480
if labels is None :
456
481
labels = _format_labels (
457
- bins , precision , right = right , include_lowest = include_lowest , dtype = dtype
482
+ bins , precision , right = right , include_lowest = include_lowest
458
483
)
459
484
elif ordered and len (set (labels )) != len (labels ):
460
485
raise ValueError (
@@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
513
538
x_arr = x .to_numpy (dtype = np .float64 , na_value = np .nan )
514
539
x = Index (x_arr )
515
540
516
- if dtype is not None :
517
- # GH 19768: force NaT to NaN during integer conversion
518
- x_arr = np .where (x .notna (), x .view (np .int64 ), np .nan )
519
- x = Index (x_arr )
520
-
521
- return x , dtype
522
-
523
-
524
- def _convert_bin_to_numeric_type (bins , dtype : DtypeObj | None ):
525
- """
526
- if the passed bin is of datetime/timedelta type,
527
- this method converts it to integer
528
-
529
- Parameters
530
- ----------
531
- bins : list-like of bins
532
- dtype : dtype of data
533
-
534
- Raises
535
- ------
536
- ValueError if bins are not of a compat dtype to dtype
537
- """
538
- bins_dtype = infer_dtype (bins , skipna = False )
539
- if lib .is_np_dtype (dtype , "m" ):
540
- if bins_dtype in ["timedelta" , "timedelta64" ]:
541
- bins = to_timedelta (bins ).view (np .int64 )
542
- else :
543
- raise ValueError ("bins must be of timedelta64 dtype" )
544
- elif lib .is_np_dtype (dtype , "M" ) or isinstance (dtype , DatetimeTZDtype ):
545
- if bins_dtype in ["datetime" , "datetime64" ]:
546
- bins = to_datetime (bins )
547
- if lib .is_np_dtype (bins .dtype , "M" ):
548
- # As of 2.0, to_datetime may give non-nano, so we need to convert
549
- # here until the rest of this file recognizes non-nano
550
- bins = bins .astype ("datetime64[ns]" , copy = False )
551
- bins = bins .view (np .int64 )
552
- else :
553
- raise ValueError ("bins must be of datetime64 dtype" )
554
-
555
- return bins
556
-
557
-
558
- def _convert_bin_to_datelike_type (bins , dtype : DtypeObj | None ):
559
- """
560
- Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
561
- datelike
562
-
563
- Parameters
564
- ----------
565
- bins : list-like of bins
566
- dtype : dtype of data
567
-
568
- Returns
569
- -------
570
- bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
571
- datelike
572
- """
573
- if isinstance (dtype , DatetimeTZDtype ):
574
- bins = to_datetime (bins .astype (np .int64 ), utc = True ).tz_convert (dtype .tz )
575
- elif lib .is_np_dtype (dtype , "mM" ):
576
- bins = Index (bins .astype (np .int64 ), dtype = dtype )
577
- return bins
541
+ return Index (x ), dtype
578
542
579
543
580
544
def _format_labels (
581
545
bins : Index ,
582
546
precision : int ,
583
547
right : bool = True ,
584
548
include_lowest : bool = False ,
585
- dtype : DtypeObj | None = None ,
586
549
):
587
550
"""based on the dtype, return our labels"""
588
551
closed : IntervalLeftRight = "right" if right else "left"
589
552
590
553
formatter : Callable [[Any ], Timestamp ] | Callable [[Any ], Timedelta ]
591
554
592
- if isinstance (dtype , DatetimeTZDtype ):
593
- formatter = lambda x : Timestamp ( x , tz = dtype . tz )
555
+ if isinstance (bins . dtype , DatetimeTZDtype ):
556
+ formatter = lambda x : x
594
557
adjust = lambda x : x - Timedelta ("1ns" )
595
- elif lib .is_np_dtype (dtype , "M" ):
596
- formatter = Timestamp
558
+ elif lib .is_np_dtype (bins . dtype , "M" ):
559
+ formatter = lambda x : x
597
560
adjust = lambda x : x - Timedelta ("1ns" )
598
- elif lib .is_np_dtype (dtype , "m" ):
599
- formatter = Timedelta
561
+ elif lib .is_np_dtype (bins . dtype , "m" ):
562
+ formatter = lambda x : x
600
563
adjust = lambda x : x - Timedelta ("1ns" )
601
564
else :
602
565
precision = _infer_precision (precision , bins )
@@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
628
591
return Index (x )
629
592
630
593
631
- def _postprocess_for_cut (fac , bins , retbins : bool , dtype : DtypeObj | None , original ):
594
+ def _postprocess_for_cut (fac , bins , retbins : bool , original ):
632
595
"""
633
596
handles post processing for the cut method where
634
597
we combine the index information if the originally passed
@@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
640
603
if not retbins :
641
604
return fac
642
605
643
- bins = _convert_bin_to_datelike_type (bins , dtype )
644
606
if isinstance (bins , Index ) and is_numeric_dtype (bins .dtype ):
645
607
bins = bins ._values
646
608
0 commit comments