ENH: support cut/qcut for datetime/timedelta (GH14714) (#14737)

aileronajay · jorisvandenbossche · commit 56c3aaeea3c5 · 2016-12-03T11:11:59.000+01:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -50,6 +50,7 @@ Other enhancements
 
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
 
+- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
 
 .. _whatsnew_0200.api_breaking:
 
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
@@ -12,6 +12,7 @@
 from pandas.core.algorithms import quantile
 from pandas.tools.tile import cut, qcut
 import pandas.tools.tile as tmod
+from pandas import to_datetime, DatetimeIndex
 
 
 class TestCut(tm.TestCase):
@@ -283,6 +284,35 @@ def test_single_bin(self):
         result = cut(s, 1, labels=False)
         tm.assert_series_equal(result, expected)
 
+    def test_datetime_cut(self):
+        # GH 14714
+        # testing for time data to be present as series
+        data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03']))
+        result, bins = cut(data, 3, retbins=True)
+        expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]',
+                          '(2013-01-01 16:00:00, 2013-01-02 08:00:00]',
+                           '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'],
+                          ).astype("category", ordered=True)
+        tm.assert_series_equal(result, expected)
+
+        # testing for time data to be present as list
+        data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'),
+                np.datetime64('2013-01-03')]
+        result, bins = cut(data, 3, retbins=True)
+        tm.assert_series_equal(Series(result), expected)
+
+        # testing for time data to be present as ndarray
+        data = np.array([np.datetime64('2013-01-01'),
+                        np.datetime64('2013-01-02'),
+                        np.datetime64('2013-01-03')])
+        result, bins = cut(data, 3, retbins=True)
+        tm.assert_series_equal(Series(result), expected)
+
+        # testing for time data to be present as datetime index
+        data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03'])
+        result, bins = cut(data, 3, retbins=True)
+        tm.assert_series_equal(Series(result), expected)
+
 
 def curpath():
     pth, _ = os.path.split(os.path.abspath(__file__))
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -11,6 +11,8 @@
 import pandas.core.algorithms as algos
 import pandas.core.nanops as nanops
 from pandas.compat import zip
+from pandas import to_timedelta, to_datetime
+from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype
 
 import numpy as np
 
@@ -81,14 +83,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     array([1, 1, 1, 1, 1], dtype=int64)
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
+
+    # for handling the cut for datetime and timedelta objects
+    x_is_series, series_index, name, x = _preprocess_for_cut(x)
+    x, dtype = _coerce_to_type(x)
+
     if not np.iterable(bins):
         if is_scalar(bins) and bins < 1:
             raise ValueError("`bins` should be a positive integer.")
-        try:  # for array-like
-            sz = x.size
-        except AttributeError:
-            x = np.asarray(x)
-            sz = x.size
+
+        sz = x.size
+
         if sz == 0:
             raise ValueError('Cannot cut empty array')
             # handle empty arrays. Can't determine range, so use 0-1.
@@ -114,9 +119,12 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         if (np.diff(bins) < 0).any():
             raise ValueError('bins must increase monotonically.')
 
-    return _bins_to_cuts(x, bins, right=right, labels=labels,
-                         retbins=retbins, precision=precision,
-                         include_lowest=include_lowest)
+    fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
+                              precision=precision,
+                              include_lowest=include_lowest, dtype=dtype)
+
+    return _postprocess_for_cut(fac, bins, retbins, x_is_series,
+                                series_index, name)
 
 
 def qcut(x, q, labels=None, retbins=False, precision=3):
@@ -166,26 +174,26 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
     >>> pd.qcut(range(5), 4, labels=False)
     array([0, 0, 1, 2, 3], dtype=int64)
     """
+    x_is_series, series_index, name, x = _preprocess_for_cut(x)
+
+    x, dtype = _coerce_to_type(x)
+
     if is_integer(q):
         quantiles = np.linspace(0, 1, q + 1)
     else:
         quantiles = q
     bins = algos.quantile(x, quantiles)
-    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
-                         precision=precision, include_lowest=True)
+    fac, bins = _bins_to_cuts(x, bins, labels=labels,
+                              precision=precision, include_lowest=True,
+                              dtype=dtype)
 
+    return _postprocess_for_cut(fac, bins, retbins, x_is_series,
+                                series_index, name)
 
-def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
-                  precision=3, name=None, include_lowest=False):
-    x_is_series = isinstance(x, Series)
-    series_index = None
-
-    if x_is_series:
-        series_index = x.index
-        if name is None:
-            name = x.name
 
-    x = np.asarray(x)
+def _bins_to_cuts(x, bins, right=True, labels=None,
+                  precision=3, include_lowest=False,
+                  dtype=None):
 
     side = 'left' if right else 'right'
     ids = bins.searchsorted(x, side=side)
@@ -205,7 +213,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
             while True:
                 try:
                     levels = _format_levels(bins, precision, right=right,
-                                            include_lowest=include_lowest)
+                                            include_lowest=include_lowest,
+                                            dtype=dtype)
                 except ValueError:
                     increases += 1
                     precision += 1
@@ -229,18 +238,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
             fac = fac.astype(np.float64)
             np.putmask(fac, na_mask, np.nan)
 
-    if x_is_series:
-        fac = Series(fac, index=series_index, name=name)
-
-    if not retbins:
-        return fac
-
     return fac, bins
 
 
 def _format_levels(bins, prec, right=True,
-                   include_lowest=False):
-    fmt = lambda v: _format_label(v, precision=prec)
+                   include_lowest=False, dtype=None):
+    fmt = lambda v: _format_label(v, precision=prec, dtype=dtype)
     if right:
         levels = []
         for a, b in zip(bins, bins[1:]):
@@ -258,12 +261,16 @@ def _format_levels(bins, prec, right=True,
     else:
         levels = ['[%s, %s)' % (fmt(a), fmt(b))
                   for a, b in zip(bins, bins[1:])]
-
     return levels
 
 
-def _format_label(x, precision=3):
+def _format_label(x, precision=3, dtype=None):
     fmt_str = '%%.%dg' % precision
+
+    if is_datetime64_dtype(dtype):
+        return to_datetime(x, unit='ns')
+    if is_timedelta64_dtype(dtype):
+        return to_timedelta(x, unit='ns')
     if np.isinf(x):
         return str(x)
     elif is_float(x):
@@ -300,3 +307,55 @@ def _trim_zeros(x):
     if len(x) > 1 and x[-1] == '.':
         x = x[:-1]
     return x
+
+
+def _coerce_to_type(x):
+    """
+    if the passed data is of datetime/timedelta type,
+    this method converts it to integer so that cut method can
+    handle it
+    """
+    dtype = None
+
+    if is_timedelta64_dtype(x):
+        x = to_timedelta(x).view(np.int64)
+        dtype = np.timedelta64
+    elif is_datetime64_dtype(x):
+        x = to_datetime(x).view(np.int64)
+        dtype = np.datetime64
+
+    return x, dtype
+
+
+def _preprocess_for_cut(x):
+    """
+    handles preprocessing for cut where we convert passed
+    input to array, strip the index information and store it
+    separately
+    """
+    x_is_series = isinstance(x, Series)
+    series_index = None
+    name = None
+
+    if x_is_series:
+        series_index = x.index
+        name = x.name
+
+    x = np.asarray(x)
+
+    return x_is_series, series_index, name, x
+
+
+def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name):
+    """
+    handles post processing for the cut method where
+    we combine the index information if the originally passed
+    datatype was a series
+    """
+    if x_is_series:
+        fac = Series(fac, index=series_index, name=name)
+
+    if not retbins:
+        return fac
+
+    return fac, bins

Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,7 @@ Other enhancements`
`50`	`50`
`51`	`51`	- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
`52`	`52`
	`53`	+- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
`53`	`54`
`54`	`55`	`.. _whatsnew_0200.api_breaking:`
`55`	`56`