diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6fe0ad8092a03..5e94a95e38cbb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,6 +50,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`) .. _whatsnew_0200.api_breaking: diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index e5b9c65b515d6..33d2a01b1256e 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -12,6 +12,7 @@ from pandas.core.algorithms import quantile from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod +from pandas import to_datetime, DatetimeIndex class TestCut(tm.TestCase): @@ -283,6 +284,35 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + def test_datetime_cut(self): + # GH 14714 + # testing for time data to be present as series + data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + tm.assert_series_equal(result, expected) + + # testing for time data to be present as list + data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')] + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + + # testing for time data to be present as ndarray + data = np.array([np.datetime64('2013-01-01'), + np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')]) + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + + # testing for time data to be present as datetime 
index + data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) + result, bins = cut(data, 3, retbins=True) + tm.assert_series_equal(Series(result), expected) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index ef75f2f84779b..f62bac9e951a7 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -11,6 +11,8 @@ import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip +from pandas import to_timedelta, to_datetime +from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype import numpy as np @@ -81,14 +83,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + + # for handling the cut for datetime and timedelta objects + x_is_series, series_index, name, x = _preprocess_for_cut(x) + x, dtype = _coerce_to_type(x) + if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - try: # for array-like - sz = x.size - except AttributeError: - x = np.asarray(x) - sz = x.size + + sz = x.size + if sz == 0: raise ValueError('Cannot cut empty array') # handle empty arrays. Can't determine range, so use 0-1. 
@@ -114,9 +119,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - return _bins_to_cuts(x, bins, right=right, labels=labels, - retbins=retbins, precision=precision, - include_lowest=include_lowest) + fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, + precision=precision, + include_lowest=include_lowest, dtype=dtype) + + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) def qcut(x, q, labels=None, retbins=False, precision=3): @@ -166,26 +174,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ + x_is_series, series_index, name, x = _preprocess_for_cut(x) + + x, dtype = _coerce_to_type(x) + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True) + fac, bins = _bins_to_cuts(x, bins, labels=labels, + precision=precision, include_lowest=True, + dtype=dtype) + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) -def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, - precision=3, name=None, include_lowest=False): - x_is_series = isinstance(x, Series) - series_index = None - - if x_is_series: - series_index = x.index - if name is None: - name = x.name - x = np.asarray(x) +def _bins_to_cuts(x, bins, right=True, labels=None, + precision=3, include_lowest=False, + dtype=None): side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) @@ -205,7 +213,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, while True: try: levels = _format_levels(bins, precision, right=right, - include_lowest=include_lowest) + include_lowest=include_lowest, + dtype=dtype) except ValueError: increases += 1 precision += 
1 @@ -229,18 +238,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, fac = fac.astype(np.float64) np.putmask(fac, na_mask, np.nan) - if x_is_series: - fac = Series(fac, index=series_index, name=name) - - if not retbins: - return fac - return fac, bins def _format_levels(bins, prec, right=True, - include_lowest=False): - fmt = lambda v: _format_label(v, precision=prec) + include_lowest=False, dtype=None): + fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) if right: levels = [] for a, b in zip(bins, bins[1:]): @@ -258,12 +261,16 @@ def _format_levels(bins, prec, right=True, else: levels = ['[%s, %s)' % (fmt(a), fmt(b)) for a, b in zip(bins, bins[1:])] - return levels -def _format_label(x, precision=3): +def _format_label(x, precision=3, dtype=None): fmt_str = '%%.%dg' % precision + + if is_datetime64_dtype(dtype): + return to_datetime(x, unit='ns') + if is_timedelta64_dtype(dtype): + return to_timedelta(x, unit='ns') if np.isinf(x): return str(x) elif is_float(x): @@ -300,3 +307,55 @@ def _trim_zeros(x): if len(x) > 1 and x[-1] == '.': x = x[:-1] return x + + +def _coerce_to_type(x): + """ + if the passed data is of datetime/timedelta type, + this method converts it to integer so that cut method can + handle it + """ + dtype = None + + if is_timedelta64_dtype(x): + x = to_timedelta(x).view(np.int64) + dtype = np.timedelta64 + elif is_datetime64_dtype(x): + x = to_datetime(x).view(np.int64) + dtype = np.datetime64 + + return x, dtype + + +def _preprocess_for_cut(x): + """ + handles preprocessing for cut where we convert passed + input to array, strip the index information and store it + separately + """ + x_is_series = isinstance(x, Series) + series_index = None + name = None + + if x_is_series: + series_index = x.index + name = x.name + + x = np.asarray(x) + + return x_is_series, series_index, name, x + + +def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): + """ + handles post processing for the cut 
method where + we combine the index information if the originally passed + datatype was a series + """ + if x_is_series: + fac = Series(fac, index=series_index, name=name) + + if not retbins: + return fac + + return fac, bins