-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Retain timezone dtype with cut and qcut #19890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
import numpy as np | ||
from pandas.compat import zip | ||
|
||
from pandas import (Series, isna, to_datetime, DatetimeIndex, | ||
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, | ||
Timestamp, Interval, IntervalIndex, Categorical, | ||
cut, qcut, date_range, NaT, TimedeltaIndex) | ||
from pandas.tseries.offsets import Nano, Day | ||
|
@@ -104,6 +104,12 @@ def test_cut_corner(self): | |
|
||
pytest.raises(ValueError, cut, [1, 2, 3], 0.5) | ||
|
||
@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for an existing test, make sure bins are tested for (should work, but covering bases) scalar, 0-dim ndarray, 1-d ndarray, Series, list, Index There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you mean to test various bins in this specific test ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean find a good test (or 2) and make sure that we are exercising all array-likes as input |
||
@pytest.mark.parametrize('cut_func', [cut, qcut]) | ||
def test_cut_not_1d_arg(self, arg, cut_func): | ||
with pytest.raises(ValueError): | ||
cut_func(arg, 2) | ||
|
||
def test_cut_out_of_range_more(self): | ||
# #1511 | ||
s = Series([0, -1, 0, 1, -3], name='x') | ||
|
@@ -251,18 +257,6 @@ def test_qcut_nas(self): | |
result = qcut(arr, 4) | ||
assert isna(result[:20]).all() | ||
|
||
@pytest.mark.parametrize('s', [ | ||
Series(DatetimeIndex(['20180101', NaT, '20180103'])), | ||
Series(TimedeltaIndex(['0 days', NaT, '2 days']))], | ||
ids=lambda x: str(x.dtype)) | ||
def test_qcut_nat(self, s): | ||
# GH 19768 | ||
intervals = IntervalIndex.from_tuples( | ||
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) | ||
expected = Series(Categorical(intervals, ordered=True)) | ||
result = qcut(s, 2) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_qcut_index(self): | ||
result = qcut([0, 2], 2) | ||
intervals = [Interval(-0.001, 1), Interval(1, 2)] | ||
|
@@ -452,6 +446,37 @@ def test_single_bin(self): | |
result = cut(s, 1, labels=False) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"array_1_writeable, array_2_writeable", | ||
[(True, True), (True, False), (False, False)]) | ||
def test_cut_read_only(self, array_1_writeable, array_2_writeable): | ||
# issue 18773 | ||
array_1 = np.arange(0, 100, 10) | ||
array_1.flags.writeable = array_1_writeable | ||
|
||
array_2 = np.arange(0, 100, 10) | ||
array_2.flags.writeable = array_2_writeable | ||
|
||
hundred_elements = np.arange(100) | ||
|
||
tm.assert_categorical_equal(cut(hundred_elements, array_1), | ||
cut(hundred_elements, array_2)) | ||
|
||
|
||
class TestDatelike(object): | ||
|
||
@pytest.mark.parametrize('s', [ | ||
Series(DatetimeIndex(['20180101', NaT, '20180103'])), | ||
Series(TimedeltaIndex(['0 days', NaT, '2 days']))], | ||
ids=lambda x: str(x.dtype)) | ||
def test_qcut_nat(self, s): | ||
# GH 19768 | ||
intervals = IntervalIndex.from_tuples( | ||
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) | ||
expected = Series(Categorical(intervals, ordered=True)) | ||
result = qcut(s, 2) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_datetime_cut(self): | ||
# GH 14714 | ||
# testing for time data to be present as series | ||
|
@@ -488,6 +513,47 @@ def test_datetime_cut(self): | |
result, bins = cut(data, 3, retbins=True) | ||
tm.assert_series_equal(Series(result), expected) | ||
|
||
@pytest.mark.parametrize('bins', [ | ||
3, [Timestamp('2013-01-01 04:57:07.200000'), | ||
Timestamp('2013-01-01 21:00:00'), | ||
Timestamp('2013-01-02 13:00:00'), | ||
Timestamp('2013-01-03 05:00:00')]]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mroeschke trying to address #47772 im getting a failure in this test bc i think the bins here need to be tzaware for this to make sense. without them being tzaware i think this should raise. but if i pass tz="US/Eastern" to the Timestamp constructor then i just get a regular failure. pls advise There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like if i do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would have expected just passing tz="US/Eastern" into There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes |
||
@pytest.mark.parametrize('box', [list, np.array, Index, Series]) | ||
def test_datetimetz_cut(self, bins, box): | ||
# GH 19872 | ||
tz = 'US/Eastern' | ||
s = Series(date_range('20130101', periods=3, tz=tz)) | ||
if not isinstance(bins, int): | ||
bins = box(bins) | ||
result = cut(s, bins) | ||
expected = ( | ||
Series(IntervalIndex([ | ||
Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz), | ||
Timestamp('2013-01-01 16:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-01 16:00:00', tz=tz), | ||
Timestamp('2013-01-02 08:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-02 08:00:00', tz=tz), | ||
Timestamp('2013-01-03 00:00:00', tz=tz))])) | ||
.astype(CDT(ordered=True))) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) | ||
def test_datetimetz_qcut(self, bins): | ||
# GH 19872 | ||
tz = 'US/Eastern' | ||
s = Series(date_range('20130101', periods=3, tz=tz)) | ||
result = qcut(s, bins) | ||
expected = ( | ||
Series(IntervalIndex([ | ||
Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), | ||
Timestamp('2013-01-01 16:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-01 16:00:00', tz=tz), | ||
Timestamp('2013-01-02 08:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-02 08:00:00', tz=tz), | ||
Timestamp('2013-01-03 00:00:00', tz=tz))])) | ||
.astype(CDT(ordered=True))) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_datetime_bin(self): | ||
data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] | ||
bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] | ||
|
@@ -523,19 +589,3 @@ def f(): | |
mask = result.isna() | ||
tm.assert_numpy_array_equal( | ||
mask, np.array([False, True, True, True, True])) | ||
|
||
@pytest.mark.parametrize( | ||
"array_1_writeable, array_2_writeable", | ||
[(True, True), (True, False), (False, False)]) | ||
def test_cut_read_only(self, array_1_writeable, array_2_writeable): | ||
# issue 18773 | ||
array_1 = np.arange(0, 100, 10) | ||
array_1.flags.writeable = array_1_writeable | ||
|
||
array_2 = np.arange(0, 100, 10) | ||
array_2.flags.writeable = array_2_writeable | ||
|
||
hundred_elements = np.arange(100) | ||
|
||
tm.assert_categorical_equal(cut(hundred_elements, array_1), | ||
cut(hundred_elements, array_2)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can convert it and then simply check that ndim <= 1, no?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`np.asarray` here will strip tz information if a Series or DatetimeIndex is passed, so we should convert only if it's not a numpy or pandas object.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So in that case, prefer not to do this here; rather, move any logic needed to `_convert_bin_to_numeric_type`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, the main issue is that `_preprocess_for_cut` is called first in both `cut` and `qcut`, and it drops the timezone dtype when trying to convert the input array to a numpy structure, before the dtype can be set to a variable in the next function, `_coerce_to_type`.
`_convert_bin_to_numeric_type` looks to be applicable only to `cut`, since it is for converting bins.