-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: support cut/qcut for datetime/timedelta (GH14714) #14737
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a6a4a11
3e22a77
01c6713
21a8b09
4404f11
a981b70
d50b38b
5c9ef9c
f68ed02
323ab7a
49854aa
0b77044
7406207
890252e
f44316c
9d9e3f9
8e4adbb
d324fd5
65eae67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,8 @@ | |
import pandas.core.algorithms as algos | ||
import pandas.core.nanops as nanops | ||
from pandas.compat import zip | ||
from pandas import to_timedelta, to_datetime | ||
from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you put this one above the numpy import? (to keep the pandas imports grouped like it was before) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have addressed this review comment |
||
|
||
import numpy as np | ||
|
||
|
@@ -81,14 +83,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, | |
array([1, 1, 1, 1, 1], dtype=int64) | ||
""" | ||
# NOTE: this binning code is changed a bit from histogram for var(x) == 0 | ||
|
||
# for handling the cut for datetime and timedelta objects | ||
x_is_series, series_index, name, x = _preprocess_for_cut(x) | ||
x, dtype = _coerce_to_type(x) | ||
|
||
if not np.iterable(bins): | ||
if is_scalar(bins) and bins < 1: | ||
raise ValueError("`bins` should be a positive integer.") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche you are referring to the try except in the pd.cut method, specifically these lines
Is that correct? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes (I couldn't comment on those lines) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche thanks, that is what i thought! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have addressed this review comment |
||
try: # for array-like | ||
sz = x.size | ||
except AttributeError: | ||
x = np.asarray(x) | ||
sz = x.size | ||
|
||
sz = x.size | ||
|
||
if sz == 0: | ||
raise ValueError('Cannot cut empty array') | ||
# handle empty arrays. Can't determine range, so use 0-1. | ||
|
@@ -114,9 +119,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, | |
if (np.diff(bins) < 0).any(): | ||
raise ValueError('bins must increase monotonically.') | ||
|
||
return _bins_to_cuts(x, bins, right=right, labels=labels, | ||
retbins=retbins, precision=precision, | ||
include_lowest=include_lowest) | ||
fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, | ||
precision=precision, | ||
include_lowest=include_lowest, dtype=dtype) | ||
|
||
return _postprocess_for_cut(fac, bins, retbins, x_is_series, | ||
series_index, name) | ||
|
||
|
||
def qcut(x, q, labels=None, retbins=False, precision=3): | ||
|
@@ -166,26 +174,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3): | |
>>> pd.qcut(range(5), 4, labels=False) | ||
array([0, 0, 1, 2, 3], dtype=int64) | ||
""" | ||
x_is_series, series_index, name, x = _preprocess_for_cut(x) | ||
|
||
x, dtype = _coerce_to_type(x) | ||
|
||
if is_integer(q): | ||
quantiles = np.linspace(0, 1, q + 1) | ||
else: | ||
quantiles = q | ||
bins = algos.quantile(x, quantiles) | ||
return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, | ||
precision=precision, include_lowest=True) | ||
fac, bins = _bins_to_cuts(x, bins, labels=labels, | ||
precision=precision, include_lowest=True, | ||
dtype=dtype) | ||
|
||
return _postprocess_for_cut(fac, bins, retbins, x_is_series, | ||
series_index, name) | ||
|
||
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, | ||
precision=3, name=None, include_lowest=False): | ||
x_is_series = isinstance(x, Series) | ||
series_index = None | ||
|
||
if x_is_series: | ||
series_index = x.index | ||
if name is None: | ||
name = x.name | ||
|
||
x = np.asarray(x) | ||
def _bins_to_cuts(x, bins, right=True, labels=None, | ||
precision=3, include_lowest=False, | ||
dtype=None): | ||
|
||
side = 'left' if right else 'right' | ||
ids = bins.searchsorted(x, side=side) | ||
|
@@ -205,7 +213,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, | |
while True: | ||
try: | ||
levels = _format_levels(bins, precision, right=right, | ||
include_lowest=include_lowest) | ||
include_lowest=include_lowest, | ||
dtype=dtype) | ||
except ValueError: | ||
increases += 1 | ||
precision += 1 | ||
|
@@ -229,18 +238,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, | |
fac = fac.astype(np.float64) | ||
np.putmask(fac, na_mask, np.nan) | ||
|
||
if x_is_series: | ||
fac = Series(fac, index=series_index, name=name) | ||
|
||
if not retbins: | ||
return fac | ||
|
||
return fac, bins | ||
|
||
|
||
def _format_levels(bins, prec, right=True, | ||
include_lowest=False): | ||
fmt = lambda v: _format_label(v, precision=prec) | ||
include_lowest=False, dtype=None): | ||
fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) | ||
if right: | ||
levels = [] | ||
for a, b in zip(bins, bins[1:]): | ||
|
@@ -258,12 +261,16 @@ def _format_levels(bins, prec, right=True, | |
else: | ||
levels = ['[%s, %s)' % (fmt(a), fmt(b)) | ||
for a, b in zip(bins, bins[1:])] | ||
|
||
return levels | ||
|
||
|
||
def _format_label(x, precision=3): | ||
def _format_label(x, precision=3, dtype=None): | ||
fmt_str = '%%.%dg' % precision | ||
|
||
if is_datetime64_dtype(dtype): | ||
return to_datetime(x, unit='ns') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i would like to see thur format_labels so to do all labels in 1 go There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback can you please explain, i am not getting what you are trying to say |
||
if is_timedelta64_dtype(dtype): | ||
return to_timedelta(x, unit='ns') | ||
if np.isinf(x): | ||
return str(x) | ||
elif is_float(x): | ||
|
@@ -300,3 +307,55 @@ def _trim_zeros(x): | |
if len(x) > 1 and x[-1] == '.': | ||
x = x[:-1] | ||
return x | ||
|
||
|
||
def _coerce_to_type(x): | ||
""" | ||
if the passed data is of datetime/timedelta type, | ||
this method converts it to integer so that cut method can | ||
handle it | ||
""" | ||
dtype = None | ||
|
||
if is_timedelta64_dtype(x): | ||
x = to_timedelta(x).view(np.int64) | ||
dtype = np.timedelta64 | ||
elif is_datetime64_dtype(x): | ||
x = to_datetime(x).view(np.int64) | ||
dtype = np.datetime64 | ||
|
||
return x, dtype | ||
|
||
|
||
def _preprocess_for_cut(x): | ||
""" | ||
handles preprocessing for cut where we convert passed | ||
input to array, strip the index information and store it | ||
seperately | ||
""" | ||
x_is_series = isinstance(x, Series) | ||
series_index = None | ||
name = None | ||
|
||
if x_is_series: | ||
series_index = x.index | ||
name = x.name | ||
|
||
x = np.asarray(x) | ||
|
||
return x_is_series, series_index, name, x | ||
|
||
|
||
def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): | ||
""" | ||
handles post processing for the cut method where | ||
we combine the index information if the originally passed | ||
datatype was a series | ||
""" | ||
if x_is_series: | ||
fac = Series(fac, index=series_index, name=name) | ||
|
||
if not retbins: | ||
return fac | ||
|
||
return fac, bins |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can u also test:
list
ofnp.datetime64
ndarray
ofdatetime64
dtypeDatetimeIndex
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sinhrks I have added tests for list of np.datetime64 and ndarray. I did not get the DatetimeIndex part, to test this do we want to have a series/dataframe with datetime as the index and datetime as the data?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sinhrks added test for DatetimeIndex cut in be9b2fd