
Commit d75100d

Issue 20927 fix resolves read_sas error for dates/datetimes greater than 2262-04-11 (#28047)
1 parent f5ab5a8 commit d75100d

File tree: 4 files changed (+130 -9 lines)

doc/source/whatsnew/v1.1.0.rst (+1)

@@ -781,6 +781,7 @@ I/O
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`)
 - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`)
 - :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`)
 - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
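For context, a minimal sketch of what the new behaviour looks like from the user's side. The file name here is hypothetical; any SAS file with date/datetime values past 2262-04-11 would do. Before this change pd.read_sas raised an out-of-bounds timestamp error on such files; now the affected columns are returned as object dtype holding datetime.datetime values.

    import pandas as pd

    # Hypothetical SAS file whose date column contains values beyond
    # pandas.Timestamp.max (2262-04-11).
    df = pd.read_sas("far_future_dates.sas7bdat", encoding="iso-8859-1")

    # Columns whose values all fit in the Timestamp range stay datetime64[ns];
    # a column containing any out-of-range value comes back as object dtype
    # holding datetime.datetime instances instead of raising.
    print(df.dtypes)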

pandas/io/sas/sas7bdat.py (+37 -9)

@@ -14,12 +14,12 @@
 http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
 """
 from collections import abc
-from datetime import datetime
+from datetime import datetime, timedelta
 import struct

 import numpy as np

-from pandas.errors import EmptyDataError
+from pandas.errors import EmptyDataError, OutOfBoundsDatetime

 import pandas as pd

@@ -29,6 +29,39 @@
 from pandas.io.sas.sasreader import ReaderBase


+def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
+    """
+    Convert to Timestamp if possible, otherwise to datetime.datetime.
+    SAS float64 lacks precision for more than ms resolution so the fit
+    to datetime.datetime is ok.
+
+    Parameters
+    ----------
+    sas_datetimes : {Series, Sequence[float]}
+        Dates or datetimes in SAS
+    unit : {str}
+        "d" if the floats represent dates, "s" for datetimes
+
+    Returns
+    -------
+    Series
+        Series of datetime64 dtype or datetime.datetime.
+    """
+    try:
+        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
+    except OutOfBoundsDatetime:
+        if unit == "s":
+            return sas_datetimes.apply(
+                lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float)
+            )
+        elif unit == "d":
+            return sas_datetimes.apply(
+                lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float)
+            )
+        else:
+            raise ValueError("unit must be 'd' or 's'")
+
+
 class _subheader_pointer:
     pass
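For illustration only (not part of the diff): a sketch of how the new helper behaves, using the day counts stored in the test data added by this commit. A column whose values all fit within the Timestamp range is converted to datetime64[ns] as before; a column containing any value past Timestamp.max falls back to object dtype holding datetime.datetime.

    import pandas as pd

    from pandas.io.sas.sas7bdat import _convert_datetimes

    # SAS dates are day counts from the 1960-01-01 epoch.
    # 21762.0 -> 2019-08-01 (in range); 2936547.0 -> 9999-12-29 (out of range).
    in_range = pd.Series([21762.0])
    mixed = pd.Series([21762.0, 2936547.0])

    print(_convert_datetimes(in_range, "d").dtype)  # datetime64[ns]

    converted = _convert_datetimes(mixed, "d")
    print(converted.dtype)     # object
    print(converted.tolist())  # [datetime.datetime(2019, 8, 1, 0, 0), datetime.datetime(9999, 12, 29, 0, 0)]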

@@ -706,15 +739,10 @@ def _chunk_to_dataframe(self):
                 rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
                 rslt[name] = np.asarray(rslt[name], dtype=np.float64)
                 if self.convert_dates:
-                    unit = None
                     if self.column_formats[j] in const.sas_date_formats:
-                        unit = "d"
+                        rslt[name] = _convert_datetimes(rslt[name], "d")
                     elif self.column_formats[j] in const.sas_datetime_formats:
-                        unit = "s"
-                    if unit:
-                        rslt[name] = pd.to_datetime(
-                            rslt[name], unit=unit, origin="1960-01-01"
-                        )
+                        rslt[name] = _convert_datetimes(rslt[name], "s")
                 jb += 1
             elif self._column_types[j] == b"s":
                 rslt[name] = self._string_chunk[js, :]
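The dispatch above relies on how SAS encodes temporal values: columns with a date format store a count of days since the 1960-01-01 SAS epoch (hence unit "d"), and columns with a datetime format store a count of seconds since that epoch (hence unit "s"). A quick sanity check against the "normal" row of the test data added in this commit:

    from datetime import datetime, timedelta

    SAS_EPOCH = datetime(1960, 1, 1)

    # date_as_float / dt_as_float values from the "normal" row of the test file.
    print(SAS_EPOCH + timedelta(days=21762.0))            # 2019-08-01 00:00:00
    print(SAS_EPOCH + timedelta(seconds=1880323199.999))  # 2019-08-01 23:59:59.999000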
Binary file (384 KB) not shown (SAS test data read by the tests below).

pandas/tests/io/sas/test_sas7bdat.py (+92)

@@ -3,6 +3,7 @@
 import os
 from pathlib import Path

+import dateutil.parser
 import numpy as np
 import pytest

@@ -214,3 +215,94 @@ def test_zero_variables(datapath):
     fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
     with pytest.raises(EmptyDataError):
         pd.read_sas(fname)
+
+
+def round_datetime_to_ms(ts):
+    if isinstance(ts, datetime):
+        return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
+    elif isinstance(ts, str):
+        _ts = dateutil.parser.parse(timestr=ts)
+        return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000)
+    else:
+        return ts
+
+
+def test_max_sas_date(datapath):
+    # GH 20927
+    # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
+    # but this is read as 29DEC9999:23:59:59.998993 by a buggy
+    # sas7bdat module
+    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
+    df = pd.read_sas(fname, encoding="iso-8859-1")
+
+    # SAS likes to left pad strings with spaces - lstrip before comparing
+    df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
+    # GH 19732: Timestamps imported from sas will incur floating point errors
+    try:
+        df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
+    except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
+        df = df.applymap(round_datetime_to_ms)
+    except AttributeError:
+        df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
+    # if there are any date/times > pandas.Timestamp.max then ALL in that chunk
+    # are returned as datetime.datetime
+    expected = pd.DataFrame(
+        {
+            "text": ["max", "normal"],
+            "dt_as_float": [253717747199.999, 1880323199.999],
+            "dt_as_dt": [
+                datetime(9999, 12, 29, 23, 59, 59, 999000),
+                datetime(2019, 8, 1, 23, 59, 59, 999000),
+            ],
+            "date_as_float": [2936547.0, 21762.0],
+            "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
+        },
+        columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
+    )
+    tm.assert_frame_equal(df, expected)
+
+
+def test_max_sas_date_iterator(datapath):
+    # GH 20927
+    # when called as an iterator, only those chunks with a date > pd.Timestamp.max
+    # are returned as datetime.datetime, if this happens that whole chunk is returned
+    # as datetime.datetime
+    col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
+    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
+    results = []
+    for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
+        # SAS likes to left pad strings with spaces - lstrip before comparing
+        df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
+        # GH 19732: Timestamps imported from sas will incur floating point errors
+        try:
+            df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
+        except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
+            df = df.applymap(round_datetime_to_ms)
+        except AttributeError:
+            df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
+        df.reset_index(inplace=True, drop=True)
+        results.append(df)
+    expected = [
+        pd.DataFrame(
+            {
+                "text": ["max"],
+                "dt_as_float": [253717747199.999],
+                "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
+                "date_as_float": [2936547.0],
+                "date_as_date": [datetime(9999, 12, 29)],
+            },
+            columns=col_order,
+        ),
+        pd.DataFrame(
+            {
+                "text": ["normal"],
+                "dt_as_float": [1880323199.999],
+                "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
+                "date_as_float": [21762.0],
+                "date_as_date": [np.datetime64("2019-08-01")],
+            },
+            columns=col_order,
+        ),
+    ]
+    for result, expected in zip(results, expected):
+        tm.assert_frame_equal(result, expected)
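One consequence the iterator test pins down: the fallback decision is made per column within each chunk, so with chunksize the same column can come back with different dtypes in different chunks. A sketch against the test data file added by this commit (path relative to a pandas source checkout):

    import pandas as pd

    fname = "pandas/tests/io/sas/data/max_sas_date.sas7bdat"
    for chunk in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
        # The chunk holding the year-9999 row is object dtype of datetime.datetime;
        # the chunk holding only in-range values stays datetime64[ns].
        print(chunk["text"].iloc[0].strip(), chunk["dt_as_dt"].dtype)
    # max object
    # normal datetime64[ns]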
