def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
    """
    Convert a Series of SAS date/datetime floats to datetimes.

    Values are converted to Timestamp when they fit within the pandas
    datetime64[ns] bounds; otherwise the whole Series is converted
    element-wise to datetime.datetime.  SAS float64 storage lacks
    precision beyond millisecond resolution, so falling back to
    datetime.datetime (microsecond resolution) loses no information.

    Parameters
    ----------
    sas_datetimes : pd.Series
        Floats counting days ("d") or seconds ("s") from the SAS
        epoch, 1960-01-01.
    unit : {"d", "s"}
        "d" if the floats represent dates, "s" for datetimes.

    Returns
    -------
    pd.Series
        Series of datetime64 dtype, or of object dtype holding
        datetime.datetime when any value exceeds Timestamp.max.

    Raises
    ------
    ValueError
        If ``unit`` is neither "d" nor "s".
    """
    # Validate up front: previously a bad unit was only rejected after an
    # OutOfBoundsDatetime, so an in-range call with e.g. unit="h" silently
    # produced wrong results instead of raising.
    td_arg = {"d": "days", "s": "seconds"}
    if unit not in td_arg:
        raise ValueError("unit must be 'd' or 's'")
    try:
        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
    except OutOfBoundsDatetime:
        # Beyond Timestamp.max (year 2262): fall back to datetime.datetime,
        # which supports years up to 9999.
        epoch = datetime(1960, 1, 1)
        return sas_datetimes.apply(
            lambda sas_float: epoch + timedelta(**{td_arg[unit]: sas_float})
        )
pytest @@ -214,3 +215,94 @@ def test_zero_variables(datapath): fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) + + +def round_datetime_to_ms(ts): + if isinstance(ts, datetime): + return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000) + elif isinstance(ts, str): + _ts = dateutil.parser.parse(timestr=ts) + return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000) + else: + return ts + + +def test_max_sas_date(datapath): + # GH 20927 + # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999 + # but this is read as 29DEC9999:23:59:59.998993 by a buggy + # sas7bdat module + fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") + df = pd.read_sas(fname, encoding="iso-8859-1") + + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating point errors + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.applymap(round_datetime_to_ms) + except AttributeError: + df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) + # if there are any date/times > pandas.Timestamp.max then ALL in that chunk + # are returned as datetime.datetime + expected = pd.DataFrame( + { + "text": ["max", "normal"], + "dt_as_float": [253717747199.999, 1880323199.999], + "dt_as_dt": [ + datetime(9999, 12, 29, 23, 59, 59, 999000), + datetime(2019, 8, 1, 23, 59, 59, 999000), + ], + "date_as_float": [2936547.0, 21762.0], + "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)], + }, + columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], + ) + tm.assert_frame_equal(df, expected) + + +def test_max_sas_date_iterator(datapath): + # GH 20927 + # when called as an iterator, only those chunks with a date > pd.Timestamp.max + # are returned 
as datetime.datetime, if this happens that whole chunk is returned + # as datetime.datetime + col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] + fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") + results = [] + for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating point errors + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.applymap(round_datetime_to_ms) + except AttributeError: + df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) + df.reset_index(inplace=True, drop=True) + results.append(df) + expected = [ + pd.DataFrame( + { + "text": ["max"], + "dt_as_float": [253717747199.999], + "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)], + "date_as_float": [2936547.0], + "date_as_date": [datetime(9999, 12, 29)], + }, + columns=col_order, + ), + pd.DataFrame( + { + "text": ["normal"], + "dt_as_float": [1880323199.999], + "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")], + "date_as_float": [21762.0], + "date_as_date": [np.datetime64("2019-08-01")], + }, + columns=col_order, + ), + ] + for result, expected in zip(results, expected): + tm.assert_frame_equal(result, expected)