Skip to content

Commit 0a4c3c1

Browse files
Merge pull request pandas-dev#3 in DATA/arctic from MDP-448_tickstore_timezone_inconsistency to master
* commit 'fc8eafdf5528a6fe52d1f471a9cdf9454e81f6e6':
  - MDP-448 to_dt should respect default_tz for ms-since-epoch datetimes
  - Add unit tests
  - MDP-448 Make ms_to_datetime always return a non-naive datetime.datetime
  - MDP-448 Remove debug print
  - MDP-448 Fix arctic_copy_data. If the VersionStore data has no TimeZone then don't slice the original_data using a timezone during --splice
  - MDP-448 to_pandas_closed_closed now does an implicit to_dt
  - MDP-448 Ensure we use non-naive datetimes for the Mongo query in the read path
  - MDP-448 Ensure we set a TimeZone on the returned DataFrame on tickstore.read. We store time as ms since epoch, and this will prevent confusion when interpreting naive datetimes on read.
2 parents c7a5c8d + fc8eafd commit 0a4c3c1

File tree

10 files changed

+200
-62
lines changed

10 files changed

+200
-62
lines changed

Diff for: arctic/date/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from ._daterange import DateRange
22
from ._generalslice import OPEN_CLOSED, CLOSED_OPEN, OPEN_OPEN, CLOSED_CLOSED
33
from ._util import datetime_to_ms, ms_to_datetime
4-
from ._util import string_to_daterange, to_pandas_closed_closed
4+
from ._util import string_to_daterange, to_pandas_closed_closed, to_dt
55
from ._mktz import mktz, TimezoneError

Diff for: arctic/date/_util.py

+34-2
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,34 @@ def string_to_daterange(str_range, delimiter='-', as_dates=False, interval=CLOSE
7878
return DateRange(d[0], d[1], oc)
7979

8080

81+
def to_dt(date, default_tz=None):
82+
"""
83+
Returns a non-naive datetime.datetime.
84+
85+
Interprets numbers as ms-since-epoch.
86+
87+
Parameters
88+
----------
89+
date : `int` or `datetime.datetime`
90+
The datetime to convert
91+
92+
default_tz : tzinfo
93+
The TimeZone to use if none is found. If not supplied, and the
94+
datetime doesn't have a timezone, then we raise ValueError
95+
96+
Returns
97+
-------
98+
Non-naive datetime
99+
"""
100+
if isinstance(date, (int, long)):
101+
return ms_to_datetime(date, default_tz)
102+
elif date.tzinfo is None:
103+
if default_tz is None:
104+
raise ValueError("Must specify a TimeZone on incoming data")
105+
return date.replace(tzinfo=default_tz)
106+
return date
107+
108+
81109
def to_pandas_closed_closed(date_range):
82110
"""
83111
Pandas DateRange slicing is CLOSED-CLOSED inclusive at both ends.
@@ -86,12 +114,16 @@ def to_pandas_closed_closed(date_range):
86114
"""
87115
if not date_range:
88116
return None
117+
89118
start = date_range.start
90119
end = date_range.end
91120
if start:
121+
start = to_dt(start, mktz()) # Ensure they have timezones
92122
if date_range.startopen:
93123
start += timedelta(milliseconds=1)
124+
94125
if end:
126+
end = to_dt(end, mktz()) # Ensure they have timezones
95127
if date_range.endopen:
96128
end -= timedelta(milliseconds=1)
97129
return DateRange(start, end)
@@ -102,8 +134,8 @@ def ms_to_datetime(ms, tzinfo=None):
102134
if not isinstance(ms, (int, long)):
103135
raise TypeError('expected integer, not %s' % type(ms))
104136

105-
if tzinfo in (None, mktz()):
106-
return datetime.datetime.fromtimestamp(ms * 1e-3, mktz()).replace(tzinfo=None)
137+
if tzinfo is None:
138+
tzinfo = mktz()
107139

108140
return datetime.datetime.fromtimestamp(ms * 1e-3, tzinfo)
109141

Diff for: arctic/scripts/arctic_copy_data.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from arctic.store.audit import ArcticTransaction
99

1010
from ..hosts import get_arctic_lib
11-
from ..date import DateRange, to_pandas_closed_closed, CLOSED_OPEN, OPEN_CLOSED
11+
from ..date import DateRange, to_pandas_closed_closed, CLOSED_OPEN, OPEN_CLOSED, mktz
1212
from .utils import setup_logging
1313

1414
logger = logging.getLogger(__name__)
@@ -38,12 +38,17 @@ def _copy_symbol(symbols):
3838

3939
if existing_data and splice:
4040
original_data = dest.read(symbol).data
41-
before = original_data.ix[:to_pandas_closed_closed(DateRange(None,
42-
new_data.index[0].to_pydatetime(),
43-
interval=CLOSED_OPEN)).end]
44-
after = original_data.ix[to_pandas_closed_closed(DateRange(new_data.index[-1].to_pydatetime(),
45-
None,
46-
interval=OPEN_CLOSED)).start:]
41+
preserve_start = to_pandas_closed_closed(DateRange(None, new_data.index[0].to_pydatetime(),
42+
interval=CLOSED_OPEN)).end
43+
preserve_end = to_pandas_closed_closed(DateRange(new_data.index[-1].to_pydatetime(),
44+
None,
45+
interval=OPEN_CLOSED)).start
46+
if not original_data.index.tz:
47+
# The original data has no timezone, so compare using naive bounds. Should we even allow this?
48+
preserve_start = preserve_start.replace(tzinfo=None)
49+
preserve_end = preserve_end.replace(tzinfo=None)
50+
before = original_data.ix[:preserve_start]
51+
after = original_data.ix[preserve_end:]
4752
new_data = before.append(new_data).append(after)
4853

4954
mt.write(symbol, new_data, metadata=version.metadata)

Diff for: arctic/store/version_store.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ def list_versions(self, symbol=None, snapshot=None, latest_only=False):
237237
continue
238238
seen_symbols.add(version['symbol'])
239239
versions.append({'symbol': version['symbol'], 'version': version['version'],
240-
# We return naive datetimes in London Time.
240+
# ms_to_datetime now returns non-naive datetimes; with no tzinfo passed, 'date' is in the local (default) time zone.
241241
'date': ms_to_datetime(datetime_to_ms(version['_id'].generation_time)),
242242
'snapshots': self._find_snapshots(version.get('parent', []))})
243243
return versions

Diff for: arctic/tickstore/tickstore.py

+16-23
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,9 @@
99
import pymongo
1010
from pymongo.errors import OperationFailure
1111

12-
from ..date import DateRange, to_pandas_closed_closed, mktz, datetime_to_ms, ms_to_datetime
12+
from ..date import DateRange, to_pandas_closed_closed, mktz, datetime_to_ms, CLOSED_CLOSED, to_dt
1313
from ..decorators import mongo_retry
1414
from ..exceptions import OverlappingDataException, NoDataFoundException, UnhandledDtypeException, ArcticException
15-
from .._util import indent
1615

1716
logger = logging.getLogger(__name__)
1817

@@ -127,12 +126,8 @@ def delete(self, symbol, date_range=None):
127126
date_range = to_pandas_closed_closed(date_range)
128127
if date_range is not None:
129128
assert date_range.start and date_range.end
130-
if date_range.start:
131-
start = self._to_dt(date_range.start)
132-
if date_range.end:
133-
end = self._to_dt(date_range.end)
134-
query[START] = {'$gte': start}
135-
query[END] = {'$lte': end}
129+
query[START] = {'$gte': date_range.start}
130+
query[END] = {'$lte': date_range.end}
136131
self._collection.delete_many(query)
137132

138133
def list_symbols(self, date_range=None):
@@ -143,10 +138,14 @@ def _mongo_date_range_query(self, symbol, date_range):
143138
if not date_range:
144139
date_range = DateRange()
145140

141+
# We're assuming CLOSED_CLOSED on these Mongo queries
142+
assert date_range.interval == CLOSED_CLOSED
143+
146144
# Find the start bound
147145
start_range = {}
148146
first = last = None
149147
if date_range.start:
148+
assert date_range.start.tzinfo
150149
start = date_range.start
151150
startq = self._symbol_query(symbol)
152151
startq.update({START: {'$lte': start}})
@@ -159,6 +158,7 @@ def _mongo_date_range_query(self, symbol, date_range):
159158

160159
# Find the end bound
161160
if date_range.end:
161+
assert date_range.end.tzinfo
162162
end = date_range.end
163163
endq = self._symbol_query(symbol)
164164
endq.update({START: {'$gt': end}})
@@ -258,7 +258,7 @@ def read(self, symbol, date_range=None, columns=None, include_images=False, _tar
258258
raise NoDataFoundException("No Data found for {} in range: {}".format(symbol, date_range))
259259
rtn = self._pad_and_fix_dtypes(rtn, column_dtypes)
260260

261-
index = pd.to_datetime(np.concatenate(rtn[INDEX]), unit='ms')
261+
index = pd.to_datetime(np.concatenate(rtn[INDEX]), utc=True, unit='ms')
262262
if columns is None:
263263
columns = [x for x in rtn.keys() if x not in (INDEX, 'SYMBOL')]
264264
if multiple_symbols and 'SYMBOL' not in columns:
@@ -278,6 +278,8 @@ def read(self, symbol, date_range=None, columns=None, include_images=False, _tar
278278
logger.info("Got data in %s secs, creating DataFrame..." % t)
279279
mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=None)
280280
rtn = pd.DataFrame(mgr)
281+
# Present data in the user's default TimeZone
282+
rtn.index.tz = mktz()
281283

282284
t = (dt.now() - perf_start).total_seconds()
283285
ticks = len(rtn)
@@ -465,7 +467,7 @@ def write(self, symbol, data):
465467
pandas = True
466468
else:
467469
raise UnhandledDtypeException("Can't persist type %s to tickstore" % type(data))
468-
self._assert_nonoverlapping_data(symbol, self._to_dt(start), self._to_dt(end))
470+
self._assert_nonoverlapping_data(symbol, to_dt(start), to_dt(end))
469471

470472
if pandas:
471473
buckets = self._pandas_to_buckets(data, symbol)
@@ -498,15 +500,6 @@ def _to_ms(self, date):
498500
return datetime_to_ms(date)
499501
return date
500502

501-
def _to_dt(self, date, default_tz=None):
502-
if isinstance(date, (int, long)):
503-
return ms_to_datetime(date, mktz('UTC'))
504-
elif date.tzinfo is None:
505-
if default_tz is None:
506-
raise ValueError("Must specify a TimeZone on incoming data")
507-
return date.replace(tzinfo=default_tz)
508-
return date
509-
510503
def _str_dtype(self, dtype):
511504
"""
512505
Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order.
@@ -540,8 +533,8 @@ def _ensure_supported_dtypes(self, array):
540533
return array
541534

542535
def _pandas_to_bucket(self, df, symbol):
543-
start = self._to_dt(df.index[0].to_datetime())
544-
end = self._to_dt(df.index[0].to_datetime())
536+
start = to_dt(df.index[0].to_datetime())
537+
end = to_dt(df.index[0].to_datetime())
545538
rtn = {START: start, END: end, SYMBOL: symbol}
546539
rtn[VERSION] = CHUNK_VERSION_NUMBER
547540
rtn[COUNT] = len(df)
@@ -566,8 +559,8 @@ def _pandas_to_bucket(self, df, symbol):
566559
def _to_bucket(self, ticks, symbol):
567560
data = {}
568561
rowmask = {}
569-
start = self._to_dt(ticks[0]['index'])
570-
end = self._to_dt(ticks[-1]['index'])
562+
start = to_dt(ticks[0]['index'])
563+
end = to_dt(ticks[-1]['index'])
571564
for i, t in enumerate(ticks):
572565
for k, v in t.iteritems():
573566
try:

Diff for: tests/integration/tickstore/test_ts_read.py

+53-17
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from numpy.testing.utils import assert_array_equal
55
from pandas.util.testing import assert_frame_equal
66
import pandas as pd
7+
from pandas.tseries.index import DatetimeIndex
78
import pytest
89
import pytz
910

@@ -123,11 +124,11 @@ def test_read_all_cols_all_dtypes(tickstore_lib, chunk_size):
123124
# Treat missing strings as None
124125
data[0]['ns'] = None
125126
data[1]['os'] = None
126-
# Strip TZ from the data for the moment
127-
data[0]['index'] = dt(1970, 1, 1)
128-
data[1]['index'] = dt(1970, 1, 1, 0, 0, 1)
129-
expected = pd.DataFrame(data)
130-
expected = expected.set_index('index')
127+
index = DatetimeIndex([dt(1970, 1, 1, tzinfo=mktz('UTC')),
128+
dt(1970, 1, 1, 0, 0, 1, tzinfo=mktz('UTC'))],
129+
)
130+
index.tz = mktz()
131+
expected = pd.DataFrame(data, index=index)
131132
expected = expected[df.columns]
132133
assert_frame_equal(expected, df, check_names=False)
133134

@@ -229,6 +230,41 @@ def test_date_range_end_not_in_range(tickstore_lib):
229230
assert tickstore_lib._collection.find(f.call_args_list[-1][0][0]).count() == 1
230231

231232

233+
@pytest.mark.parametrize('tz_name', ['UTC',
234+
'Europe/London', # Sometimes ahead of UTC
235+
'America/New_York', # Behind UTC
236+
])
237+
def test_date_range_default_timezone(tickstore_lib, tz_name):
238+
"""
239+
We assume naive datetimes are user-local
240+
"""
241+
DUMMY_DATA = [
242+
{'a': 1.,
243+
'b': 2.,
244+
'index': dt(2013, 1, 1, tzinfo=mktz(tz_name))
245+
},
246+
# Half-way through the year
247+
{'b': 3.,
248+
'c': 4.,
249+
'index': dt(2013, 7, 1, tzinfo=mktz(tz_name))
250+
},
251+
]
252+
253+
with patch('arctic.date._mktz.DEFAULT_TIME_ZONE_NAME', tz_name):
254+
tickstore_lib.chunk_size = 1
255+
tickstore_lib.write('SYM', DUMMY_DATA)
256+
df = tickstore_lib.read('SYM', date_range=DateRange(20130101, 20130701), columns=None)
257+
assert len(df) == 2
258+
assert df.index[1] == dt(2013, 7, 1, tzinfo=mktz(tz_name))
259+
assert df.index.tz == mktz(tz_name)
260+
261+
df = tickstore_lib.read('SYM', date_range=DateRange(20130101, 20130101), columns=None)
262+
assert len(df) == 1
263+
264+
df = tickstore_lib.read('SYM', date_range=DateRange(20130701, 20130701), columns=None)
265+
assert len(df) == 1
266+
267+
232268
def test_date_range_no_bounds(tickstore_lib):
233269
DUMMY_DATA = [
234270
{'a': 1.,
@@ -387,31 +423,31 @@ def test_read_with_image(tickstore_lib):
387423
assert_array_equal(df['a'].values, np.array([37, 1, np.nan]))
388424
assert_array_equal(df['b'].values, np.array([np.nan, np.nan, 4]))
389425
assert_array_equal(df['c'].values, np.array([2, np.nan, np.nan]))
390-
assert df.index[0] == dt(2013, 1, 1, 10)
391-
assert df.index[1] == dt(2013, 1, 1, 11)
392-
assert df.index[2] == dt(2013, 1, 1, 12)
426+
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
427+
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
428+
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))
393429

394430
# Read just columns from the updates
395431
df = tickstore_lib.read('SYM', columns=('a', 'b'), date_range=dr, include_images=True)
396432
assert set(df.columns) == set(('a', 'b'))
397433
assert_array_equal(df['a'].values, np.array([37, 1, np.nan]))
398434
assert_array_equal(df['b'].values, np.array([np.nan, np.nan, 4]))
399-
assert df.index[0] == dt(2013, 1, 1, 10)
400-
assert df.index[1] == dt(2013, 1, 1, 11)
401-
assert df.index[2] == dt(2013, 1, 1, 12)
435+
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
436+
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
437+
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))
402438

403439
# Read one column from the updates
404440
df = tickstore_lib.read('SYM', columns=('a',), date_range=dr, include_images=True)
405441
assert set(df.columns) == set(('a',))
406442
assert_array_equal(df['a'].values, np.array([37, 1, np.nan]))
407-
assert df.index[0] == dt(2013, 1, 1, 10)
408-
assert df.index[1] == dt(2013, 1, 1, 11)
409-
assert df.index[2] == dt(2013, 1, 1, 12)
443+
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
444+
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
445+
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))
410446

411447
# Read just the image column
412448
df = tickstore_lib.read('SYM', columns=['c'], date_range=dr, include_images=True)
413449
assert set(df.columns) == set(['c'])
414450
assert_array_equal(df['c'].values, np.array([2, np.nan, np.nan]))
415-
assert df.index[0] == dt(2013, 1, 1, 10)
416-
assert df.index[1] == dt(2013, 1, 1, 11)
417-
assert df.index[2] == dt(2013, 1, 1, 12)
451+
assert df.index[0] == dt(2013, 1, 1, 10, tzinfo=mktz('Europe/London'))
452+
assert df.index[1] == dt(2013, 1, 1, 11, tzinfo=mktz('Europe/London'))
453+
assert df.index[2] == dt(2013, 1, 1, 12, tzinfo=mktz('Europe/London'))

Diff for: tests/integration/tickstore/test_ts_write.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
},
3030
{'b': 9.,
3131
'c': 10.,
32-
'index': dt(2013, 1, 5, tzinfo=mktz('Europe/London'))
32+
'index': dt(2013, 7, 5, tzinfo=mktz('Europe/London'))
3333
},
3434
]
3535

@@ -69,9 +69,11 @@ def test_ts_write_pandas(tickstore_lib):
6969
data = DUMMY_DATA
7070
tickstore_lib.write('SYM', data)
7171

72-
data = tickstore_lib.read('SYM', columns=None).tz_localize(mktz('Europe/London'))
72+
data = tickstore_lib.read('SYM', columns=None)
73+
assert data.index[0] == dt(2013, 1, 1, tzinfo=mktz('Europe/London'))
74+
assert data.a[0] == 1
7375
tickstore_lib.delete('SYM')
7476
tickstore_lib.write('SYM', data)
7577

76-
read = tickstore_lib.read('SYM', columns=None).tz_localize(mktz('Europe/London'))
78+
read = tickstore_lib.read('SYM', columns=None)
7779
assert_frame_equal(read, data, check_names=False)

Diff for: tests/unit/date/test_datetime_to_ms_roundtrip.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ def assert_roundtrip(tz):
1111

1212
ts1 = ts.replace(tzinfo=tz)
1313
ts2 = ms_to_datetime(datetime_to_ms(ts1.astimezone(mktz("UTC"))), tz)
14-
ts1 = ts1.replace(tzinfo=None) if tz == mktz() else ts1
1514
#logger.info(ts2.tzinfo)
1615

1716
assert(ts2.hour == ts1.hour)
@@ -53,22 +52,22 @@ def test_mktz_London():
5352

5453
def test_datetime_roundtrip_local_no_tz():
5554
pdt = datetime.datetime(2012, 6, 12, 12, 12, 12, 123000)
56-
pdt2 = ms_to_datetime(datetime_to_ms(pdt))
55+
pdt2 = ms_to_datetime(datetime_to_ms(pdt)).replace(tzinfo=None)
5756
assert pdt2 == pdt
5857

5958
pdt = datetime.datetime(2012, 1, 12, 12, 12, 12, 123000)
60-
pdt2 = ms_to_datetime(datetime_to_ms(pdt))
59+
pdt2 = ms_to_datetime(datetime_to_ms(pdt)).replace(tzinfo=None)
6160
assert pdt2 == pdt
6261

6362

6463
def test_datetime_roundtrip_local_tz():
6564
pdt = datetime.datetime(2012, 6, 12, 12, 12, 12, 123000, tzinfo=mktz(DEFAULT_TIME_ZONE_NAME))
6665
pdt2 = ms_to_datetime(datetime_to_ms(pdt))
67-
assert pdt2 == pdt.replace(tzinfo=None)
66+
assert pdt2 == pdt
6867

6968
pdt = datetime.datetime(2012, 1, 12, 12, 12, 12, 123000, tzinfo=mktz(DEFAULT_TIME_ZONE_NAME))
7069
pdt2 = ms_to_datetime(datetime_to_ms(pdt))
71-
assert pdt2 == pdt.replace(tzinfo=None)
70+
assert pdt2 == pdt
7271

7372

7473
def test_datetime_roundtrip_est_tz():

0 commit comments

Comments
 (0)