Skip to content

BUG: Retain timezone dtype with cut and qcut #19890

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,7 @@ Reshaping
- Bug in :func:`DataFrame.iterrows`, which would infer strings not compliant with `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ as datetimes (:issue:`19671`)
- Bug in :class:`Series` constructor with ``Categorical`` where a ``ValueError`` is not raised when an index of different length is given (:issue:`19342`)
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)

Other
^^^^^
Expand Down
30 changes: 22 additions & 8 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Quantilization functions and related stuff
"""
from functools import partial

from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.common import (
Expand All @@ -9,6 +10,7 @@
is_categorical_dtype,
is_datetime64_dtype,
is_timedelta64_dtype,
is_datetime64tz_dtype,
_ensure_int64)

import pandas.core.algorithms as algos
Expand Down Expand Up @@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
ids = _ensure_int64(bins.searchsorted(x, side=side))

if include_lowest:
ids[x == bins[0]] = 1
# Numpy 1.9 support: ensure this mask is a Numpy array
ids[np.asarray(x == bins[0])] = 1

na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()
Expand Down Expand Up @@ -284,12 +287,14 @@ def _coerce_to_type(x):
"""
dtype = None

if is_timedelta64_dtype(x):
x = to_timedelta(x)
dtype = np.timedelta64
if is_datetime64tz_dtype(x):
dtype = x.dtype
elif is_datetime64_dtype(x):
x = to_datetime(x)
dtype = np.datetime64
elif is_timedelta64_dtype(x):
x = to_timedelta(x)
dtype = np.timedelta64

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
Expand All @@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype):

Parameters
----------
bins : list-liek of bins
bins : list-like of bins
dtype : dtype of data

Raises
Expand All @@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
bins = to_timedelta(bins).view(np.int64)
else:
raise ValueError("bins must be of timedelta64 dtype")
elif is_datetime64_dtype(dtype):
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
if bins_dtype in ['datetime', 'datetime64']:
bins = to_datetime(bins).view(np.int64)
else:
Expand All @@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True,

closed = 'right' if right else 'left'

if is_datetime64_dtype(dtype):
if is_datetime64tz_dtype(dtype):
formatter = partial(Timestamp, tz=dtype.tz)
adjust = lambda x: x - Timedelta('1ns')
elif is_datetime64_dtype(dtype):
formatter = Timestamp
adjust = lambda x: x - Timedelta('1ns')
elif is_timedelta64_dtype(dtype):
Expand Down Expand Up @@ -372,7 +380,13 @@ def _preprocess_for_cut(x):
series_index = x.index
name = x.name

x = np.asarray(x)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can convert it then simply check the ndim <= 1 no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np.asarray here will strip tz information if a Series or DatetimeIndex is passed, so we should convert only if the input is not already a NumPy or pandas object.

In [3]: tz = 'UTC'

In [4]: s = pd.Series(pd.date_range('20130101', periods=3, tz=tz))

In [5]: s
Out[5]:
0   2013-01-01 00:00:00+00:00
1   2013-01-02 00:00:00+00:00
2   2013-01-03 00:00:00+00:00
dtype: datetime64[ns, UTC]

In [6]: np.asarray(s)
Out[6]:
array(['2013-01-01T00:00:00.000000000', '2013-01-02T00:00:00.000000000',
       '2013-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in that case, I'd prefer not to do this here; rather, move any logic needed to _convert_bin_to_numeric_type.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well the main issue is that _preprocess_for_cut is called first in both cut and qcut, and it drops the timezone dtype when trying to convert the input array to a numpy structure before the dtype can be set to a variable in the next function _coerce_to_type

_convert_bin_to_numeric_type looks to be applicable only to cut, since it's for converting bins.

# Check that the passed array is a Pandas or Numpy object
# We don't want to strip away a Pandas data-type here (e.g. datetimetz)
ndim = getattr(x, 'ndim', None)
if ndim is None:
x = np.asarray(x)
if x.ndim != 1:
raise ValueError("Input array must be 1 dimensional")

return x_is_series, series_index, name, x

Expand Down
108 changes: 79 additions & 29 deletions pandas/tests/reshape/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from pandas.compat import zip

from pandas import (Series, isna, to_datetime, DatetimeIndex,
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
Timestamp, Interval, IntervalIndex, Categorical,
cut, qcut, date_range, NaT, TimedeltaIndex)
from pandas.tseries.offsets import Nano, Day
Expand Down Expand Up @@ -104,6 +104,12 @@ def test_cut_corner(self):

pytest.raises(ValueError, cut, [1, 2, 3], 0.5)

@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for an existing test, make sure bins are tested for (should work, but covering bases)

scalar, 0-dim ndarray, 1-d ndarray, Series, list, Index

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to test various bins in this specific test (test_cut_not_1d_arg)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean find a good test (or 2) and make sure that we are exercising all array-likes as input

@pytest.mark.parametrize('cut_func', [cut, qcut])
def test_cut_not_1d_arg(self, arg, cut_func):
with pytest.raises(ValueError):
cut_func(arg, 2)

def test_cut_out_of_range_more(self):
# #1511
s = Series([0, -1, 0, 1, -3], name='x')
Expand Down Expand Up @@ -251,18 +257,6 @@ def test_qcut_nas(self):
result = qcut(arr, 4)
assert isna(result[:20]).all()

@pytest.mark.parametrize('s', [
Series(DatetimeIndex(['20180101', NaT, '20180103'])),
Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
ids=lambda x: str(x.dtype))
def test_qcut_nat(self, s):
# GH 19768
intervals = IntervalIndex.from_tuples(
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
expected = Series(Categorical(intervals, ordered=True))
result = qcut(s, 2)
tm.assert_series_equal(result, expected)

def test_qcut_index(self):
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
Expand Down Expand Up @@ -452,6 +446,37 @@ def test_single_bin(self):
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"array_1_writeable, array_2_writeable",
[(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable

array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable

hundred_elements = np.arange(100)

tm.assert_categorical_equal(cut(hundred_elements, array_1),
cut(hundred_elements, array_2))


class TestDatelike(object):

@pytest.mark.parametrize('s', [
Series(DatetimeIndex(['20180101', NaT, '20180103'])),
Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
ids=lambda x: str(x.dtype))
def test_qcut_nat(self, s):
# GH 19768
intervals = IntervalIndex.from_tuples(
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
expected = Series(Categorical(intervals, ordered=True))
result = qcut(s, 2)
tm.assert_series_equal(result, expected)

def test_datetime_cut(self):
# GH 14714
# testing for time data to be present as series
Expand Down Expand Up @@ -488,6 +513,47 @@ def test_datetime_cut(self):
result, bins = cut(data, 3, retbins=True)
tm.assert_series_equal(Series(result), expected)

@pytest.mark.parametrize('bins', [
3, [Timestamp('2013-01-01 04:57:07.200000'),
Timestamp('2013-01-01 21:00:00'),
Timestamp('2013-01-02 13:00:00'),
Timestamp('2013-01-03 05:00:00')]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke While trying to address #47772, I'm getting a failure in this test, because I think the bins here need to be tz-aware for this to make sense; without tz-aware bins, I think this should raise. But if I pass tz="US/Eastern" to the Timestamp constructor, then I just get a regular failure. Please advise.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like if I do Timestamp(..., tz="UTC").tz_convert("US/Eastern") in the bins, then the rest of the test passes. Is that what you would expect?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have expected that just passing tz="US/Eastern" into Timestamp would also pass this test, without needing to localize to UTC first. Is the failure with all box types?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the failure with all box types?

Yes

@pytest.mark.parametrize('box', [list, np.array, Index, Series])
def test_datetimetz_cut(self, bins, box):
# GH 19872
tz = 'US/Eastern'
s = Series(date_range('20130101', periods=3, tz=tz))
if not isinstance(bins, int):
bins = box(bins)
result = cut(s, bins)
expected = (
Series(IntervalIndex([
Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz),
Timestamp('2013-01-01 16:00:00', tz=tz)),
Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
Timestamp('2013-01-02 08:00:00', tz=tz)),
Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
Timestamp('2013-01-03 00:00:00', tz=tz))]))
.astype(CDT(ordered=True)))
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
def test_datetimetz_qcut(self, bins):
# GH 19872
tz = 'US/Eastern'
s = Series(date_range('20130101', periods=3, tz=tz))
result = qcut(s, bins)
expected = (
Series(IntervalIndex([
Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz),
Timestamp('2013-01-01 16:00:00', tz=tz)),
Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
Timestamp('2013-01-02 08:00:00', tz=tz)),
Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
Timestamp('2013-01-03 00:00:00', tz=tz))]))
.astype(CDT(ordered=True)))
tm.assert_series_equal(result, expected)

def test_datetime_bin(self):
data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
Expand Down Expand Up @@ -523,19 +589,3 @@ def f():
mask = result.isna()
tm.assert_numpy_array_equal(
mask, np.array([False, True, True, True, True]))

@pytest.mark.parametrize(
"array_1_writeable, array_2_writeable",
[(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable

array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable

hundred_elements = np.arange(100)

tm.assert_categorical_equal(cut(hundred_elements, array_1),
cut(hundred_elements, array_2))