From a6a4a11660761c2ea56599bef035165fd43d16b7 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Fri, 25 Nov 2016 01:41:40 -0500 Subject: [PATCH 01/19] initial iteration for pd cut enhance to support time date type --- pandas/tools/tile.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index ef75f2f84779b..1b47b47605cf4 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -11,7 +11,8 @@ import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip - +from pandas.tseries.timedeltas import to_timedelta +from pandas.types.common import (needs_i8_conversion) import numpy as np @@ -81,6 +82,13 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + # for handling the cut for datetime and timedelta objects + if needs_i8_conversion(x): + x = x.values.view('i8') + time_data = True + else: + time_data = False + if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") @@ -116,7 +124,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, return _bins_to_cuts(x, bins, right=right, labels=labels, retbins=retbins, precision=precision, - include_lowest=include_lowest) + include_lowest=include_lowest, time_data=time_data) def qcut(x, q, labels=None, retbins=False, precision=3): @@ -176,7 +184,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, - precision=3, name=None, include_lowest=False): + precision=3, name=None, include_lowest=False, + time_data=False): x_is_series = isinstance(x, Series) series_index = None @@ -205,7 +214,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, while True: try: levels = _format_levels(bins, 
precision, right=right, - include_lowest=include_lowest) + include_lowest=include_lowest, + time_data=time_data) except ValueError: increases += 1 precision += 1 @@ -239,7 +249,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, def _format_levels(bins, prec, right=True, - include_lowest=False): + include_lowest=False, time_data=False): fmt = lambda v: _format_label(v, precision=prec) if right: levels = [] @@ -249,16 +259,24 @@ def _format_levels(bins, prec, right=True, if a != b and fa == fb: raise ValueError('precision too low') - formatted = '(%s, %s]' % (fa, fb) + if time_data: + formatted = '(%s, %s]' % (to_timedelta(float(fa), unit='ns'), + to_timedelta(float(fb), unit='ns')) + else: + formatted = '(%s, %s]' % (fa, fb) levels.append(formatted) if include_lowest: levels[0] = '[' + levels[0][1:] else: - levels = ['[%s, %s)' % (fmt(a), fmt(b)) - for a, b in zip(bins, bins[1:])] - + if time_data: + levels = ['[%s, %s)' % (to_timedelta(float(fmt(fa)), unit='ns'), + to_timedelta(float(fmt(b)), unit='ns')) + for a, b in zip(bins, bins[1:])] + else: + levels = ['[%s, %s)' % (fmt(a), fmt(b)) + for a, b in zip(bins, bins[1:])] return levels From 3e22a77a46ebfd5f2f8c6617d2f13033bbda9fb0 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Fri, 25 Nov 2016 16:13:31 -0500 Subject: [PATCH 02/19] added datetime handling and passing object type --- pandas/tools/tile.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 1b47b47605cf4..6d0ab39beb854 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -12,8 +12,9 @@ import pandas.core.nanops as nanops from pandas.compat import zip from pandas.tseries.timedeltas import to_timedelta -from pandas.types.common import (needs_i8_conversion) +from pandas import to_datetime import numpy as np +from pandas.types.common import (is_datetime64_dtype, is_timedelta64_dtype) def cut(x, bins, right=True, 
labels=None, retbins=False, precision=3, @@ -83,11 +84,15 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 # for handling the cut for datetime and timedelta objects - if needs_i8_conversion(x): - x = x.values.view('i8') - time_data = True - else: - time_data = False + + dtype = None + if is_timedelta64_dtype(x): + x = x.astype(np.int64) + dtype = np.timedelta64 + + if is_datetime64_dtype(x): + x = x.astype(np.int64) + dtype = np.datetime64 if not np.iterable(bins): if is_scalar(bins) and bins < 1: @@ -124,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, return _bins_to_cuts(x, bins, right=right, labels=labels, retbins=retbins, precision=precision, - include_lowest=include_lowest, time_data=time_data) + include_lowest=include_lowest, dtype=dtype) def qcut(x, q, labels=None, retbins=False, precision=3): @@ -185,7 +190,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, precision=3, name=None, include_lowest=False, - time_data=False): + dtype=None): x_is_series = isinstance(x, Series) series_index = None @@ -215,7 +220,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, try: levels = _format_levels(bins, precision, right=right, include_lowest=include_lowest, - time_data=time_data) + dtype=dtype) except ValueError: increases += 1 precision += 1 @@ -249,7 +254,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, def _format_levels(bins, prec, right=True, - include_lowest=False, time_data=False): + include_lowest=False, dtype=None): fmt = lambda v: _format_label(v, precision=prec) if right: levels = [] @@ -259,7 +264,10 @@ def _format_levels(bins, prec, right=True, if a != b and fa == fb: raise ValueError('precision too low') - if time_data: + if dtype == np.datetime64: + formatted = '(%s, %s]' % (to_datetime(float(fa), 
unit='ns'), + to_datetime(float(fb), unit='ns')) + elif dtype == np.timedelta64: formatted = '(%s, %s]' % (to_timedelta(float(fa), unit='ns'), to_timedelta(float(fb), unit='ns')) else: @@ -270,7 +278,11 @@ def _format_levels(bins, prec, right=True, if include_lowest: levels[0] = '[' + levels[0][1:] else: - if time_data: + if dtype == np.datetime64: + levels = ['[%s, %s)' % (to_datetime(float(fmt(fa)), unit='ns'), + to_datetime(float(fmt(b)), unit='ns')) + for a, b in zip(bins, bins[1:])] + elif dtype == np.timedelta64: levels = ['[%s, %s)' % (to_timedelta(float(fmt(fa)), unit='ns'), to_timedelta(float(fmt(b)), unit='ns')) for a, b in zip(bins, bins[1:])] From 01c6713af9fe6ebce3f2177b00e84eb46b0b0f29 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Fri, 25 Nov 2016 16:50:12 -0500 Subject: [PATCH 03/19] modified the formatter to handle time objects --- pandas/tools/tile.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 6d0ab39beb854..75ce91374183a 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -255,7 +255,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, def _format_levels(bins, prec, right=True, include_lowest=False, dtype=None): - fmt = lambda v: _format_label(v, precision=prec) + fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) if right: levels = [] for a, b in zip(bins, bins[1:]): @@ -264,36 +264,25 @@ def _format_levels(bins, prec, right=True, if a != b and fa == fb: raise ValueError('precision too low') - if dtype == np.datetime64: - formatted = '(%s, %s]' % (to_datetime(float(fa), unit='ns'), - to_datetime(float(fb), unit='ns')) - elif dtype == np.timedelta64: - formatted = '(%s, %s]' % (to_timedelta(float(fa), unit='ns'), - to_timedelta(float(fb), unit='ns')) - else: - formatted = '(%s, %s]' % (fa, fb) + formatted = '(%s, %s]' % (fa, fb) levels.append(formatted) if include_lowest: levels[0] = '[' + 
levels[0][1:] else: - if dtype == np.datetime64: - levels = ['[%s, %s)' % (to_datetime(float(fmt(fa)), unit='ns'), - to_datetime(float(fmt(b)), unit='ns')) - for a, b in zip(bins, bins[1:])] - elif dtype == np.timedelta64: - levels = ['[%s, %s)' % (to_timedelta(float(fmt(fa)), unit='ns'), - to_timedelta(float(fmt(b)), unit='ns')) - for a, b in zip(bins, bins[1:])] - else: - levels = ['[%s, %s)' % (fmt(a), fmt(b)) - for a, b in zip(bins, bins[1:])] + levels = ['[%s, %s)' % (fmt(a), fmt(b)) + for a, b in zip(bins, bins[1:])] return levels -def _format_label(x, precision=3): +def _format_label(x, precision=3, dtype=None): fmt_str = '%%.%dg' % precision + + if dtype == np.datetime64: + return to_datetime(x, unit='ns') + if dtype == np.timedelta64: + return to_timedelta(x, unit='ns') if np.isinf(x): return str(x) elif is_float(x): From 21a8b0949b0847abcdec0b216aec00f6522160ed Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Sat, 26 Nov 2016 22:09:07 -0500 Subject: [PATCH 04/19] added testcase for datetime cut --- pandas/tools/tests/test_tile.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index e5b9c65b515d6..252cc7a914851 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -12,6 +12,7 @@ from pandas.core.algorithms import quantile from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod +from pandas import to_datetime class TestCut(tm.TestCase): @@ -283,6 +284,16 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + def test_datetime_cut(self): + data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result = cut(data, 3) + self.assertEqual(result[0], + '(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]') + self.assertEqual(result[1], + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]') + self.assertEqual(result[2], + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]') + def 
curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) From 4404f114021a7cec95b3e7c9427ec6e5c4e25739 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Sun, 27 Nov 2016 23:31:41 -0500 Subject: [PATCH 05/19] added changes to qcut code --- pandas/tools/tile.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 75ce91374183a..af5f3aa505541 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -179,13 +179,23 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ + dtype = None + if is_timedelta64_dtype(x): + x = x.astype(np.int64) + dtype = np.timedelta64 + + if is_datetime64_dtype(x): + x = x.astype(np.int64) + dtype = np.datetime64 + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True) + precision=precision, include_lowest=True, + dtype=dtype) def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, From a981b7076072f5f9e27685f77d285c893ac441b2 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Mon, 28 Nov 2016 02:22:01 -0500 Subject: [PATCH 06/19] code review comments --- pandas/tools/tile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index af5f3aa505541..15c637d04020b 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -87,11 +87,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, dtype = None if is_timedelta64_dtype(x): - x = x.astype(np.int64) + x = x.view(np.int64) dtype = np.timedelta64 if is_datetime64_dtype(x): - x = x.astype(np.int64) + x = x.view(np.int64) dtype = np.datetime64 if not np.iterable(bins): @@ -181,11 +181,11 @@ def qcut(x, q, labels=None, retbins=False, precision=3): """ dtype = None if 
is_timedelta64_dtype(x): - x = x.astype(np.int64) + x = x.view(np.int64) dtype = np.timedelta64 if is_datetime64_dtype(x): - x = x.astype(np.int64) + x = x.view(np.int64) dtype = np.datetime64 if is_integer(q): @@ -289,9 +289,9 @@ def _format_levels(bins, prec, right=True, def _format_label(x, precision=3, dtype=None): fmt_str = '%%.%dg' % precision - if dtype == np.datetime64: + if is_datetime64_dtype(dtype): return to_datetime(x, unit='ns') - if dtype == np.timedelta64: + if is_timedelta64_dtype(dtype): return to_timedelta(x, unit='ns') if np.isinf(x): return str(x) From d50b38bc7b049cd03fa090d7cf0e3bbe5121d26b Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Tue, 29 Nov 2016 02:15:03 -0500 Subject: [PATCH 07/19] code review comments --- pandas/tools/tests/test_tile.py | 14 +++++++------- pandas/tools/tile.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 252cc7a914851..2b213540326b8 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -285,14 +285,14 @@ def test_single_bin(self): tm.assert_series_equal(result, expected) def test_datetime_cut(self): + # GH 14714 data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) - result = cut(data, 3) - self.assertEqual(result[0], - '(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]') - self.assertEqual(result[1], - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]') - self.assertEqual(result[2], - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]') + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + tm.assert_series_equal(result, expected) def curpath(): diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 15c637d04020b..0178bd7b1925e 100644 --- a/pandas/tools/tile.py +++ 
b/pandas/tools/tile.py @@ -11,7 +11,7 @@ import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip -from pandas.tseries.timedeltas import to_timedelta +from pandas import to_timedelta from pandas import to_datetime import numpy as np from pandas.types.common import (is_datetime64_dtype, is_timedelta64_dtype) @@ -90,7 +90,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, x = x.view(np.int64) dtype = np.timedelta64 - if is_datetime64_dtype(x): + elif is_datetime64_dtype(x): x = x.view(np.int64) dtype = np.datetime64 @@ -184,7 +184,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): x = x.view(np.int64) dtype = np.timedelta64 - if is_datetime64_dtype(x): + elif is_datetime64_dtype(x): x = x.view(np.int64) dtype = np.datetime64 From 5c9ef9c25bd7a84d607bbf29e656d13dbbc023d5 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Tue, 29 Nov 2016 19:25:33 -0500 Subject: [PATCH 08/19] added private method for coercing data --- pandas/tools/tile.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 0178bd7b1925e..79742f43d8460 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -11,10 +11,9 @@ import pandas.core.algorithms as algos import pandas.core.nanops as nanops from pandas.compat import zip -from pandas import to_timedelta -from pandas import to_datetime +from pandas import to_timedelta, to_datetime import numpy as np -from pandas.types.common import (is_datetime64_dtype, is_timedelta64_dtype) +from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype def cut(x, bins, right=True, labels=None, retbins=False, precision=3, @@ -85,14 +84,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, # NOTE: this binning code is changed a bit from histogram for var(x) == 0 # for handling the cut for datetime and timedelta objects - dtype = None - if 
is_timedelta64_dtype(x): - x = x.view(np.int64) - dtype = np.timedelta64 - - elif is_datetime64_dtype(x): - x = x.view(np.int64) - dtype = np.datetime64 + original, x, dtype = _coerce_to_type(x) if not np.iterable(bins): if is_scalar(bins) and bins < 1: @@ -179,14 +171,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ - dtype = None - if is_timedelta64_dtype(x): - x = x.view(np.int64) - dtype = np.timedelta64 - - elif is_datetime64_dtype(x): - x = x.view(np.int64) - dtype = np.datetime64 + original, x, dtype = _coerce_to_type(x) if is_integer(q): quantiles = np.linspace(0, 1, q + 1) @@ -329,3 +314,16 @@ def _trim_zeros(x): if len(x) > 1 and x[-1] == '.': x = x[:-1] return x + + +def _coerce_to_type(x): + dtype = None + original = x + if is_timedelta64_dtype(x): + x = x.view(np.int64) + dtype = np.timedelta64 + + elif is_datetime64_dtype(x): + x = x.view(np.int64) + dtype = np.datetime64 + return original, x, dtype From f68ed02b3120b8b6fb4ff40c56e9a06184e7c6ba Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Tue, 29 Nov 2016 21:39:10 -0500 Subject: [PATCH 09/19] added private methods for pre and post processing --- pandas/tools/tile.py | 59 ++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 79742f43d8460..43a5d503f3dd3 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -119,9 +119,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - return _bins_to_cuts(x, bins, right=right, labels=labels, - retbins=retbins, precision=precision, - include_lowest=include_lowest, dtype=dtype) + x_is_series, series_index, name, x = _preprocess_for_cut(x) + + fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, + retbins=retbins, precision=precision, + 
include_lowest=include_lowest, dtype=dtype) + + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) def qcut(x, q, labels=None, retbins=False, precision=3): @@ -173,28 +178,24 @@ def qcut(x, q, labels=None, retbins=False, precision=3): """ original, x, dtype = _coerce_to_type(x) + x_is_series, series_index, name, x = _preprocess_for_cut(x) + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, - precision=precision, include_lowest=True, - dtype=dtype) + fac, bins = _bins_to_cuts(x, bins, labels=labels, retbins=retbins, + precision=precision, include_lowest=True, + dtype=dtype) + + return _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name) def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, precision=3, name=None, include_lowest=False, dtype=None): - x_is_series = isinstance(x, Series) - series_index = None - - if x_is_series: - series_index = x.index - if name is None: - name = x.name - - x = np.asarray(x) side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) @@ -239,12 +240,6 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, fac = fac.astype(np.float64) np.putmask(fac, na_mask, np.nan) - if x_is_series: - fac = Series(fac, index=series_index, name=name) - - if not retbins: - return fac - return fac, bins @@ -327,3 +322,25 @@ def _coerce_to_type(x): x = x.view(np.int64) dtype = np.datetime64 return original, x, dtype + + +def _preprocess_for_cut(x): + x_is_series = isinstance(x, Series) + series_index = None + + name = None + if x_is_series: + series_index = x.index + if name is None: + name = x.name + return x_is_series, series_index, name, x + + +def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): + if x_is_series: + fac = Series(fac, index=series_index, name=name) + + if not retbins: + return fac + + return 
fac, bins From 323ab7a59e58e53f3a9a6019177a2edbd62bc2ce Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Tue, 29 Nov 2016 22:15:45 -0500 Subject: [PATCH 10/19] moving preprocess prior to coerce to type --- pandas/tools/tile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 43a5d503f3dd3..4830b490f7ae4 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -83,6 +83,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 # for handling the cut for datetime and timedelta objects + x_is_series, series_index, name, x = _preprocess_for_cut(x) original, x, dtype = _coerce_to_type(x) @@ -119,8 +120,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') - x_is_series, series_index, name, x = _preprocess_for_cut(x) - fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, retbins=retbins, precision=precision, include_lowest=include_lowest, dtype=dtype) @@ -176,10 +175,10 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ - original, x, dtype = _coerce_to_type(x) - x_is_series, series_index, name, x = _preprocess_for_cut(x) + original, x, dtype = _coerce_to_type(x) + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: From 49854aa86e99a278bd94fb7b5b0716182790e270 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Tue, 29 Nov 2016 22:32:12 -0500 Subject: [PATCH 11/19] rectified preprocess method array conversion missing --- pandas/tools/tile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 4830b490f7ae4..9a0239cda3851 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -332,6 +332,9 @@ def _preprocess_for_cut(x): series_index = 
x.index if name is None: name = x.name + + x = np.asarray(x) + return x_is_series, series_index, name, x From 0b77044438997ab9137abcd3dd0700dac93613ca Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Wed, 30 Nov 2016 00:15:32 -0500 Subject: [PATCH 12/19] added tests and modified preprocessing code --- pandas/tools/tests/test_tile.py | 23 +++++++++++++++++++++++ pandas/tools/tile.py | 4 ++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 2b213540326b8..fc4c64862b781 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -294,6 +294,29 @@ def test_datetime_cut(self): ).astype("category", ordered=True) tm.assert_series_equal(result, expected) + def test_datetime_list_cut(self): + # GH 14714 + data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')] + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + tm.assert_almost_equal(Series(result), expected) + + def test_datetime_ndarray_cut(self): + # GH 14714 + data = np.array([np.datetime64('2013-01-01'), + np.datetime64('2013-01-02'), + np.datetime64('2013-01-03')]) + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + tm.assert_almost_equal(Series(result), expected) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 9a0239cda3851..ba9e7b7051278 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -314,11 +314,11 @@ def _coerce_to_type(x): dtype = None original = x if 
is_timedelta64_dtype(x): - x = x.view(np.int64) + x = to_timedelta(x).view(np.int64) dtype = np.timedelta64 elif is_datetime64_dtype(x): - x = x.view(np.int64) + x = to_datetime(x).view(np.int64) dtype = np.datetime64 return original, x, dtype From 7406207737b4af75b8b879915c087fb998343ec4 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Wed, 30 Nov 2016 00:59:50 -0500 Subject: [PATCH 13/19] added test for datetime index --- pandas/tools/tests/test_tile.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index fc4c64862b781..8a22c80ad1502 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -12,7 +12,7 @@ from pandas.core.algorithms import quantile from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod -from pandas import to_datetime +from pandas import to_datetime, DatetimeIndex class TestCut(tm.TestCase): @@ -303,7 +303,7 @@ def test_datetime_list_cut(self): '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], ).astype("category", ordered=True) - tm.assert_almost_equal(Series(result), expected) + tm.assert_series_equal(Series(result), expected) def test_datetime_ndarray_cut(self): # GH 14714 @@ -315,7 +315,17 @@ def test_datetime_ndarray_cut(self): '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], ).astype("category", ordered=True) - tm.assert_almost_equal(Series(result), expected) + tm.assert_series_equal(Series(result), expected) + + def test_datetime_index_cut(self): + # GH 14714 + data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) + result, bins = cut(data, 3, retbins=True) + expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', + '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', + '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], + ).astype("category", ordered=True) + 
tm.assert_series_equal(Series(result), expected) def curpath(): From 890252e79e5cbb0e0f024412dbb061fb176b7152 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Wed, 30 Nov 2016 14:02:07 -0500 Subject: [PATCH 14/19] added docstring to new methods. Merged all datetime tests into one test, updated whatsnew --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tools/tests/test_tile.py | 23 +++++------------------ pandas/tools/tile.py | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6fe0ad8092a03..34192c0e07aca 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,6 +50,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) +ENH: support cut/qcut for datetime/timedelta (GH14714) .. _whatsnew_0200.api_breaking: diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 8a22c80ad1502..fb6e650bd26db 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -286,6 +286,7 @@ def test_single_bin(self): def test_datetime_cut(self): # GH 14714 + # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) result, bins = cut(data, 3, retbins=True) expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', @@ -294,37 +295,23 @@ def test_datetime_cut(self): ).astype("category", ordered=True) tm.assert_series_equal(result, expected) - def test_datetime_list_cut(self): - # GH 14714 + # testing for time data to be present as list data = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')] result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", 
ordered=True) tm.assert_series_equal(Series(result), expected) - def test_datetime_ndarray_cut(self): - # GH 14714 + # testing for time data to be present as ndarray + data = np.array([np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')]) result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", ordered=True) tm.assert_series_equal(Series(result), expected) - def test_datetime_index_cut(self): - # GH 14714 + # testing for time data to be present as datetime index data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", ordered=True) tm.assert_series_equal(Series(result), expected) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index ba9e7b7051278..93782cd7df84e 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -311,6 +311,11 @@ def _trim_zeros(x): def _coerce_to_type(x): + ''' + if the passed data is of datetime/timedelta type, + this method converts it to integer so that cut method can + handle it + ''' dtype = None original = x if is_timedelta64_dtype(x): @@ -324,6 +329,11 @@ def _coerce_to_type(x): def _preprocess_for_cut(x): + ''' + handles preprocessing for cut where we convert passed + input to array, strip the index information and store it + seperately + ''' x_is_series = isinstance(x, Series) series_index = None @@ -339,6 +349,11 @@ def _preprocess_for_cut(x): def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): + ''' + handles post processing for the cut method where + we combine the index information if the originally passed + datatype was a series + ''' if 
x_is_series: fac = Series(fac, index=series_index, name=name) From f44316cebb2011d919ece983aed91c7218f153e8 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Wed, 30 Nov 2016 17:30:45 -0500 Subject: [PATCH 15/19] modified the whatsnew message --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 34192c0e07aca..101f616ea8278 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,7 +50,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -ENH: support cut/qcut for datetime/timedelta (GH14714) +- pd.cut and qcut now support datetime64 and timedelta64 dtypes (issue:`14714`) .. _whatsnew_0200.api_breaking: From 9d9e3f941a79e7cd0a64982f4bd697e58e1271d5 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Wed, 30 Nov 2016 20:59:38 -0500 Subject: [PATCH 16/19] changed docstring and whatsnew --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tools/tile.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 101f616ea8278..5e94a95e38cbb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -50,7 +50,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) -- pd.cut and qcut now support datetime64 and timedelta64 dtypes (issue:`14714`) +- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`) ..
_whatsnew_0200.api_breaking: diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 93782cd7df84e..2b2f145f1857d 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -311,11 +311,11 @@ def _trim_zeros(x): def _coerce_to_type(x): - ''' + """ if the passed data is of datetime/timedelta type, this method converts it to integer so that cut method can handle it - ''' + """ dtype = None original = x if is_timedelta64_dtype(x): @@ -329,11 +329,11 @@ def _coerce_to_type(x): def _preprocess_for_cut(x): - ''' + """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it seperately - ''' + """ x_is_series = isinstance(x, Series) series_index = None @@ -349,11 +349,11 @@ def _preprocess_for_cut(x): def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): - ''' + """ handles post processing for the cut method where we combine the index information if the originally passed datatype was a series - ''' + """ if x_is_series: fac = Series(fac, index=series_index, name=name) From 8e4adbb6cf1f87c95440c4eb89fc61262645ffe9 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Thu, 1 Dec 2016 11:24:53 -0500 Subject: [PATCH 17/19] removed trailing whitespace from test_tile file --- pandas/tools/tests/test_tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index fb6e650bd26db..ce634ddfc275b 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -303,7 +303,7 @@ def test_datetime_cut(self): # testing for time data to be present as ndarray - data = np.array([np.datetime64('2013-01-01'), + data = np.array([np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')]) result, bins = cut(data, 3, retbins=True) From d324fd57b1de75f682a2811622bc54b8652262d3 Mon Sep 17 00:00:00 2001 From: Ajay Saxena Date: Fri, 2 Dec 2016 17:52:48 -0500 Subject: [PATCH 18/19] code 
review comments --- pandas/tools/tile.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 2b2f145f1857d..e316b87563d16 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -12,9 +12,10 @@ import pandas.core.nanops as nanops from pandas.compat import zip from pandas import to_timedelta, to_datetime -import numpy as np from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype +import numpy as np + def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): @@ -85,16 +86,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, # for handling the cut for datetime and timedelta objects x_is_series, series_index, name, x = _preprocess_for_cut(x) - original, x, dtype = _coerce_to_type(x) + x, dtype = _coerce_to_type(x) if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - try: # for array-like - sz = x.size - except AttributeError: - x = np.asarray(x) - sz = x.size + + sz = x.size + if sz == 0: raise ValueError('Cannot cut empty array') # handle empty arrays. Can't determine range, so use 0-1. 
@@ -121,7 +120,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, - retbins=retbins, precision=precision, + precision=precision, include_lowest=include_lowest, dtype=dtype) return _postprocess_for_cut(fac, bins, retbins, x_is_series, @@ -177,14 +176,14 @@ def qcut(x, q, labels=None, retbins=False, precision=3): """ x_is_series, series_index, name, x = _preprocess_for_cut(x) - original, x, dtype = _coerce_to_type(x) + x, dtype = _coerce_to_type(x) if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q bins = algos.quantile(x, quantiles) - fac, bins = _bins_to_cuts(x, bins, labels=labels, retbins=retbins, + fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype) @@ -192,8 +191,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): series_index, name) -def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, - precision=3, name=None, include_lowest=False, +def _bins_to_cuts(x, bins, right=True, labels=None, + precision=3, include_lowest=False, dtype=None): side = 'left' if right else 'right' @@ -317,7 +316,6 @@ def _coerce_to_type(x): handle it """ dtype = None - original = x if is_timedelta64_dtype(x): x = to_timedelta(x).view(np.int64) dtype = np.timedelta64 @@ -325,7 +323,7 @@ def _coerce_to_type(x): elif is_datetime64_dtype(x): x = to_datetime(x).view(np.int64) dtype = np.datetime64 - return original, x, dtype + return x, dtype def _preprocess_for_cut(x): @@ -340,8 +338,7 @@ def _preprocess_for_cut(x): name = None if x_is_series: series_index = x.index - if name is None: - name = x.name + name = x.name x = np.asarray(x) From 65eae672e7fe503a580a9c0900d96aeb893a50f3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 3 Dec 2016 11:07:47 +0100 Subject: [PATCH 19/19] some whitespace --- pandas/tools/tests/test_tile.py | 1 - 
pandas/tools/tile.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index ce634ddfc275b..33d2a01b1256e 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -302,7 +302,6 @@ def test_datetime_cut(self): tm.assert_series_equal(Series(result), expected) # testing for time data to be present as ndarray - data = np.array([np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03')]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index e316b87563d16..f62bac9e951a7 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -83,9 +83,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + # for handling the cut for datetime and timedelta objects x_is_series, series_index, name, x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) if not np.iterable(bins): @@ -316,13 +316,14 @@ def _coerce_to_type(x): handle it """ dtype = None + if is_timedelta64_dtype(x): x = to_timedelta(x).view(np.int64) dtype = np.timedelta64 - elif is_datetime64_dtype(x): x = to_datetime(x).view(np.int64) dtype = np.datetime64 + return x, dtype @@ -334,8 +335,8 @@ def _preprocess_for_cut(x): """ x_is_series = isinstance(x, Series) series_index = None - name = None + if x_is_series: series_index = x.index name = x.name