From 2bc5d24cb9aea21fdb08f4658854d7e10e110ee3 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 4 May 2018 22:25:13 -0700 Subject: [PATCH 1/6] ENH: Return DatetimeIndex or TimedeltaIndex bins for q/cut when input is datelike --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/tile.py | 25 ++++++++++++++++++++ pandas/tests/reshape/test_tile.py | 38 ++++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 979fbb5ddfdd0..5386e6c7cfb9d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -524,6 +524,7 @@ Other Enhancements - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) - :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`) +- :func:`cut` and :func:`qcut` now returns a ``DatetimeIndex`` or ``TimedeltaIndex`` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) .. 
_whatsnew_0230.api_breaking: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 118198ea0320d..941da4eb0304c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -332,6 +332,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, result = result.astype(np.float64) np.putmask(result, na_mask, np.nan) + bins = _convert_bin_to_datelike_type(bins, dtype) + return result, bins @@ -396,6 +398,29 @@ def _convert_bin_to_numeric_type(bins, dtype): return bins +def _convert_bin_to_datelike_type(bins, dtype): + """ + Box bins in Timestamp/Timedelta if the orginal dtype is datelike + + Parameters + ---------- + bins : list-like of bins + dtype : dtype of data + + Returns + ------- + bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is + datelike + """ + if is_datetime64tz_dtype(dtype): + bins = to_datetime(bins, utc=True).tz_convert(dtype.tz) + elif is_datetime64_dtype(dtype): + bins = to_datetime(bins) + elif is_timedelta64_dtype(dtype): + bins = to_timedelta(bins) + return bins + + def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 8d093f2784ba1..d42300f465115 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -6,7 +6,8 @@ from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, Timestamp, Interval, IntervalIndex, Categorical, - cut, qcut, date_range, NaT, TimedeltaIndex) + cut, qcut, date_range, timedelta_range, NaT, + TimedeltaIndex) from pandas.tseries.offsets import Nano, Day import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -589,3 +590,38 @@ def f(): mask = result.isna() tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) + + @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Pacific']) + def 
test_datetime_cut_roundtrip(self, tz): + # GH 19891 + s = Series(date_range('20180101', periods=3, tz=tz)) + result, result_bins = cut(s, 2, retbins=True) + expected = cut(s, result_bins) + tm.assert_series_equal(result, expected) + expected_bins = DatetimeIndex(['2017-12-31 23:57:07.200000', + '2018-01-02 00:00:00', + '2018-01-03 00:00:00']) + expected_bins = expected_bins.tz_localize(tz) + tm.assert_index_equal(result_bins, expected_bins) + + def test_timedelta_cut_roundtrip(self): + # GH 19891 + s = Series(timedelta_range('1day', periods=3)) + result, result_bins = cut(s, 2, retbins=True) + expected = cut(s, result_bins) + tm.assert_series_equal(result, expected) + expected_bins = TimedeltaIndex(['0 days 23:57:07.200000', + '2 days 00:00:00', + '3 days 00:00:00']) + tm.assert_index_equal(result_bins, expected_bins) + + @pytest.mark.parametrize('arg, expected_bins', [ + [timedelta_range('1day', periods=3), + TimedeltaIndex(['1 days', '2 days', '3 days'])], + [date_range('20180101', periods=3), + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'])]]) + def test_datelike_qcut_bins(self, arg, expected_bins): + # GH 19891 + s = Series(arg) + result, result_bins = qcut(s, 2, retbins=True) + tm.assert_index_equal(result_bins, expected_bins) From 17ebfbe17242b5362f3ed88dc55ac5a472d7fe26 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 4 May 2018 22:28:18 -0700 Subject: [PATCH 2/6] Clarify docstring --- pandas/core/reshape/tile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 941da4eb0304c..6acf7500bd211 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -400,7 +400,8 @@ def _convert_bin_to_numeric_type(bins, dtype): def _convert_bin_to_datelike_type(bins, dtype): """ - Box bins in Timestamp/Timedelta if the orginal dtype is datelike + Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is + datelike Parameters ---------- From 
326258d81ff4a432d692dfd59594f88811c55cb8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 5 May 2018 22:43:48 -0700 Subject: [PATCH 3/6] Address comments --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/reshape/tile.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c7c271c2c6523..1296a0992b96f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -526,7 +526,7 @@ Other Enhancements - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) - :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`) -- :func:`cut` and :func:`qcut` now returns a ``DatetimeIndex`` or ``TimedeltaIndex`` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) .. 
_whatsnew_0230.api_breaking: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 6acf7500bd211..d111173adcc29 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -18,7 +18,7 @@ from pandas._libs.lib import infer_dtype from pandas import (to_timedelta, to_datetime, Categorical, Timestamp, Timedelta, - Series, Interval, IntervalIndex) + Series, Index, Interval, IntervalIndex) import numpy as np @@ -413,12 +413,11 @@ def _convert_bin_to_datelike_type(bins, dtype): bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is datelike """ + # Can be simplified once GH 20964 is fixed. if is_datetime64tz_dtype(dtype): bins = to_datetime(bins, utc=True).tz_convert(dtype.tz) - elif is_datetime64_dtype(dtype): - bins = to_datetime(bins) - elif is_timedelta64_dtype(dtype): - bins = to_timedelta(bins) + elif is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype): + bins = Index(bins, dtype=dtype) return bins From c70bf40b062ea626d5ab62d2cc33e29995d5ebef Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 14 May 2018 22:43:23 -0700 Subject: [PATCH 4/6] fix merge --- doc/source/whatsnew/v0.23.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1144b8f6c24c8..d7c870af92561 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -525,7 +525,6 @@ Other Enhancements library. (:issue:`20564`) - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) -- :func:`date_range` now returns a linearly spaced ``DatetimeIndex`` if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. 
(:issue:`20808`) - :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`) - :func:`date_range`, :func:`timedelta_range`, and :func:`interval_range` now return a linearly spaced index if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`, :issue:`20976`) - :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) From 876d9ee23ed6a9971b44cde8865783b857d2a7c4 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Jun 2018 21:36:48 -0700 Subject: [PATCH 5/6] Address review --- doc/source/whatsnew/v0.23.0.txt | 1 - doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/reshape/tile.py | 9 +++------ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b6830c35ba5df..2430b6ac2bbd4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -550,7 +550,6 @@ Other Enhancements - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) - :func:`cut` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`20947`) - :func:`date_range`, :func:`timedelta_range`, and :func:`interval_range` now return a linearly spaced index if ``start``, ``stop``, and ``periods`` are specified, but ``freq`` is not. (:issue:`20808`, :issue:`20983`, :issue:`20976`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) .. 
_whatsnew_0230.api_breaking: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 406ca9ba045c9..62cc9d32d77a1 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -74,6 +74,7 @@ Datetimelike API Changes - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) - :class:`DateOffset` objects are now immutable. Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :func:`cut` and :func:`qcut` now return :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) .. _whatsnew_0240.api.other: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index cff1938a78a76..bbab56aea01cd 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -11,6 +11,7 @@ is_datetime64_dtype, is_timedelta64_dtype, is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, _ensure_int64) import pandas.core.algorithms as algos @@ -312,7 +313,6 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): - if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") @@ -445,11 +445,8 @@ def _convert_bin_to_datelike_type(bins, dtype): bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is datelike """ - # Can be simplified once GH 20964 is fixed. 
- if is_datetime64tz_dtype(dtype): - bins = to_datetime(bins, utc=True).tz_convert(dtype.tz) - elif is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype): - bins = Index(bins, dtype=dtype) + if is_datetime64tz_dtype(dtype) or is_datetime_or_timedelta_dtype(dtype): + bins = Index(bins.astype(np.int64), dtype=dtype) return bins From 4ef7309abbcf6947a9bb54e064146d0f807d80d1 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Jun 2018 21:39:18 -0700 Subject: [PATCH 6/6] Undo removed space --- pandas/core/reshape/tile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index bbab56aea01cd..863ebc6354136 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -313,6 +313,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): + if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop")