diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index dc8ed4c9f5aac..109ed8b286c22 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -233,8 +233,8 @@ Enhancements - - +- Bug in ``tslib.tz_convert`` and ``tslib.tz_convert_single`` may return different results (:issue:`7798`) +- Bug in ``DatetimeIndex.intersection`` of non-overlapping timestamps with tz raises ``IndexError`` (:issue:`7880`) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1b5baf1bfe9da..88a86da27daf9 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12636,23 +12636,6 @@ def test_consolidate_datetime64(self): assert_array_equal(df.starting.values, ser_starting.index.values) assert_array_equal(df.ending.values, ser_ending.index.values) - def test_tslib_tz_convert_trans_pos_plus_1__bug(self): - # Regression test for tslib.tz_convert(vals, tz1, tz2). - # See https://github.com/pydata/pandas/issues/4496 for details. - idx = pd.date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq='1min') - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') - - test_vector = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 5], dtype=int) - - hours = idx.hour - - np.testing.assert_equal(hours, test_vector.values) - def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 9bbcc781ca9d6..edc7b075da6f8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -683,25 +683,6 @@ def infer_freq(index, warn=True): _ONE_HOUR = 60 * 
_ONE_MINUTE _ONE_DAY = 24 * _ONE_HOUR -def _tz_convert_with_transitions(values, to_tz, from_tz): - """ - convert i8 values from the specificed timezone to the to_tz zone, taking - into account DST transitions - """ - - # vectorization is slow, so tests if we can do this via the faster tz_convert - f = lambda x: tslib.tz_convert_single(x, to_tz, from_tz) - - if len(values) > 2: - first_slow, last_slow = f(values[0]),f(values[-1]) - - first_fast, last_fast = tslib.tz_convert(np.array([values[0],values[-1]],dtype='i8'),to_tz,from_tz) - - # don't cross a DST, so ok - if first_fast == first_slow and last_fast == last_slow: - return tslib.tz_convert(values,to_tz,from_tz) - - return np.vectorize(f)(values) class _FrequencyInferer(object): """ @@ -713,7 +694,7 @@ def __init__(self, index, warn=True): self.values = np.asarray(index).view('i8') if index.tz is not None: - self.values = _tz_convert_with_transitions(self.values,'UTC',index.tz) + self.values = tslib.tz_convert(self.values, 'UTC', index.tz) self.warn = warn diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 518bb4180ec89..5f7c93d38653a 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -14,7 +14,7 @@ from pandas.compat import u from pandas.tseries.frequencies import ( infer_freq, to_offset, get_period_alias, - Resolution, _tz_convert_with_transitions) + Resolution) from pandas.core.base import DatetimeIndexOpsMixin from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay from pandas.tseries.tools import parse_time_string, normalize_date @@ -1569,7 +1569,7 @@ def insert(self, loc, item): new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)) if self.tz is not None: - new_dates = _tz_convert_with_transitions(new_dates,'UTC',self.tz) + new_dates = tslib.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) except (AttributeError, TypeError): @@ -1606,7 +1606,7 @@ def 
delete(self, loc): freq = self.freq if self.tz is not None: - new_dates = _tz_convert_with_transitions(new_dates, 'UTC', self.tz) + new_dates = tslib.tz_convert(new_dates, 'UTC', self.tz) return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) def _view_like(self, ndarray): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 9d5f45735feb5..c54c133dd2afe 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3203,8 +3203,8 @@ def test_union(self): def test_intersection(self): # GH 4690 (with tz) - for tz in [None, 'Asia/Tokyo']: - rng = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']: + base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') # if target has the same name, it is preserved rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') @@ -3214,16 +3214,18 @@ def test_intersection(self): rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') expected3 = date_range('6/1/2000', '6/20/2000', freq='D', name=None) - result2 = rng.intersection(rng2) - result3 = rng.intersection(rng3) - for (result, expected) in [(result2, expected2), (result3, expected3)]: + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') + expected4 = DatetimeIndex([], name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: + result = base.intersection(rng) self.assertTrue(result.equals(expected)) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) self.assertEqual(result.tz, expected.tz) # non-monotonic - rng = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-02', '2011-01-03'], + base = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-02', '2011-01-03'], tz=tz, name='idx') rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', '2011-02-02', '2011-02-03'], @@ -3234,10 +3236,12 @@ 
def test_intersection(self): tz=tz, name='other') expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'], tz=tz, name=None) - result2 = rng.intersection(rng2) - result3 = rng.intersection(rng3) - for (result, expected) in [(result2, expected2), (result3, expected3)]: - print(result, expected) + # GH 7880 + rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, name='idx') + expected4 = DatetimeIndex([], tz=tz, name='idx') + + for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: + result = base.intersection(rng) self.assertTrue(result.equals(expected)) self.assertEqual(result.name, expected.name) self.assertIsNone(result.freq) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 21f915cb50e21..ab969f13289ac 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -787,6 +787,64 @@ def test_utc_with_system_utc(self): # check that the time hasn't changed. self.assertEqual(ts, ts.tz_convert(dateutil.tz.tzutc())) + def test_tslib_tz_convert_trans_pos_plus_1__bug(self): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See https://github.com/pydata/pandas/issues/4496 for details. 
+ for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize('UTC') + idx = idx.tz_convert('Europe/Moscow') + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + self.assert_numpy_array_equal(idx.hour, expected) + + def test_tslib_tz_convert_dst(self): + for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + # Start DST + idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + self.assert_numpy_array_equal(idx.hour, expected) + + idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + self.assert_numpy_array_equal(idx.hour, expected) + + # End DST + idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + self.assert_numpy_array_equal(idx.hour, expected) + + idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1])) + self.assert_numpy_array_equal(idx.hour, expected) + + # daily + # Start DST + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', tz='UTC') + idx = idx.tz_convert('US/Eastern') + self.assert_numpy_array_equal(idx.hour, np.array([19, 19])) + + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', tz='US/Eastern') + idx = idx.tz_convert('UTC') + self.assert_numpy_array_equal(idx.hour, np.array([5, 5])) + 
+ # End DST + idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', tz='UTC') + idx = idx.tz_convert('US/Eastern') + self.assert_numpy_array_equal(idx.hour, np.array([20, 20])) + + idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', tz='US/Eastern') + idx = idx.tz_convert('UTC') + self.assert_numpy_array_equal(idx.hour, np.array([4, 4])) + class TestTimeZoneCacheKey(tm.TestCase): def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index a47d6a178f8b2..79eaa97d50322 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -425,6 +425,44 @@ def test_period_ordinal_business_day(self): # Tuesday self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, get_freq('B'))) + def test_tslib_tz_convert(self): + def compare_utc_to_local(tz_didx, utc_didx): + f = lambda x: tslib.tz_convert_single(x, 'UTC', tz_didx.tz) + result = tslib.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz) + result_single = np.vectorize(f)(tz_didx.asi8) + self.assert_numpy_array_equal(result, result_single) + + def compare_local_to_utc(tz_didx, utc_didx): + f = lambda x: tslib.tz_convert_single(x, tz_didx.tz, 'UTC') + result = tslib.tz_convert(utc_didx.asi8, tz_didx.tz, 'UTC') + result_single = np.vectorize(f)(utc_didx.asi8) + self.assert_numpy_array_equal(result, result_single) + + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'Europe/Moscow']: + # US: 2014-03-09 - 2014-11-02 + # MOSCOW: 2014-10-26 / 2014-12-31 + tz_didx = date_range('2014-03-01', '2015-01-10', freq='H', tz=tz) + utc_didx = date_range('2014-03-01', '2015-01-10', freq='H') + compare_utc_to_local(tz_didx, utc_didx) + # local tz to UTC can differ in hourly (or higher) freqs because of DST + compare_local_to_utc(tz_didx, utc_didx) + + tz_didx = date_range('2000-01-01', '2020-01-01', freq='D', tz=tz) + utc_didx = date_range('2000-01-01', '2020-01-01', freq='D') + 
compare_utc_to_local(tz_didx, utc_didx) + compare_local_to_utc(tz_didx, utc_didx) + + tz_didx = date_range('2000-01-01', '2100-01-01', freq='A', tz=tz) + utc_didx = date_range('2000-01-01', '2100-01-01', freq='A') + compare_utc_to_local(tz_didx, utc_didx) + compare_local_to_utc(tz_didx, utc_didx) + + # Check empty array + result = tslib.tz_convert(np.array([], dtype=np.int64), + tslib.maybe_get_tz('US/Eastern'), + tslib.maybe_get_tz('Asia/Tokyo')) + self.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) + class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index dc9f3fa258985..b8342baae16bd 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1907,10 +1907,14 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): Py_ssize_t i, pos, n = len(vals) int64_t v, offset pandas_datetimestruct dts + Py_ssize_t trans_len if not have_pytz: import pytz + if len(vals) == 0: + return np.array([], dtype=np.int64) + # Convert to UTC if _get_zone(tz1) != 'UTC': @@ -1927,6 +1931,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): else: deltas = _get_deltas(tz1) trans = _get_transitions(tz1) + trans_len = len(trans) pos = trans.searchsorted(vals[0]) - 1 if pos < 0: raise ValueError('First time before start of DST info') @@ -1934,7 +1939,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): offset = deltas[pos] for i in range(n): v = vals[i] - if v >= [pos + 1]: + while pos + 1 < trans_len and v >= trans[pos + 1]: pos += 1 offset = deltas[pos] utc_dates[i] = v - offset @@ -1957,29 +1962,23 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): # Convert UTC to other timezone trans = _get_transitions(tz2) + trans_len = len(trans) deltas = _get_deltas(tz2) - pos = trans.searchsorted(utc_dates[0]) - if pos == 0: + pos = 
trans.searchsorted(utc_dates[0]) - 1 + if pos < 0: raise ValueError('First time before start of DST info') - elif pos == len(trans): - return utc_dates + deltas[-1] # TODO: this assumed sortedness :/ - pos -= 1 - offset = deltas[pos] - cdef Py_ssize_t trans_len = len(trans) - for i in range(n): v = utc_dates[i] if vals[i] == NPY_NAT: result[i] = vals[i] else: - if (pos + 1) < trans_len and v >= trans[pos + 1]: + while pos + 1 < trans_len and v >= trans[pos + 1]: pos += 1 offset = deltas[pos] result[i] = v + offset - return result def tz_convert_single(int64_t val, object tz1, object tz2): @@ -2005,7 +2004,7 @@ def tz_convert_single(int64_t val, object tz1, object tz2): elif _get_zone(tz1) != 'UTC': deltas = _get_deltas(tz1) trans = _get_transitions(tz1) - pos = trans.searchsorted(val) - 1 + pos = trans.searchsorted(val, side='right') - 1 if pos < 0: raise ValueError('First time before start of DST info') offset = deltas[pos] @@ -2024,7 +2023,7 @@ def tz_convert_single(int64_t val, object tz1, object tz2): # Convert UTC to other timezone trans = _get_transitions(tz2) deltas = _get_deltas(tz2) - pos = trans.searchsorted(utc_date) - 1 + pos = trans.searchsorted(utc_date, side='right') - 1 if pos < 0: raise ValueError('First time before start of DST info')