diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 99ae60859b68c..11f8a5b6c2499 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -390,6 +390,7 @@ I/O - Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`) - :meth:`read_sql` returned an empty generator if ``chunksize`` was no-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`) - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) +- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) Period ^^^^^^ diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index fe06f103e6c5e..9853fa41d3fb9 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -23,6 +23,7 @@ from pandas.errors import EmptyDataError, OutOfBoundsDatetime import pandas as pd +from pandas import isna from pandas.io.common import get_handle from pandas.io.sas._sas import Parser @@ -30,6 +31,20 @@ from pandas.io.sas.sasreader import ReaderBase +def _parse_datetime(sas_datetime: float, unit: str): + if isna(sas_datetime): + return pd.NaT + + if unit == "s": + return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime) + + elif unit == "d": + return datetime(1960, 1, 1) + timedelta(days=sas_datetime) + + else: + raise ValueError("unit must be 'd' or 's'") + + def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: """ Convert to Timestamp if possible, otherwise to datetime.datetime. 
def test_null_date(datapath):
    """Check that null SAS date/datetime cells are read back as ``NaT``."""
    path = datapath("io", "sas", "data", "dates_null.sas7bdat")
    result = pd.read_sas(path, encoding="utf-8")

    # Each expected column pairs one populated value with one null value.
    date_values = [datetime(9999, 12, 29), pd.NaT]
    datetime_values = [datetime(9999, 12, 29, 23, 59, 59, 998993), pd.NaT]
    expected = pd.DataFrame(
        {"datecol": date_values, "datetimecol": datetime_values}
    )

    tm.assert_frame_equal(result, expected)