From d10bc9ed8aee932cc5ff6241fea38eb52666178b Mon Sep 17 00:00:00 2001
From: wertha
Date: Wed, 10 Feb 2021 10:55:04 -0800
Subject: [PATCH 1/5] Validate null dates

---
 pandas/io/sas/sas7bdat.py                    |  4 ++++
 pandas/tests/io/sas/data/dates_null.sas7bdat | Bin 0 -> 131072 bytes
 pandas/tests/io/sas/test_sas7bdat.py         | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+)
 create mode 100644 pandas/tests/io/sas/data/dates_null.sas7bdat

diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index fe06f103e6c5e..e31adcbb45d99 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -54,12 +54,16 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     if unit == "s":
         s_series = sas_datetimes.apply(
             lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float)
+            if pd.notnull(sas_float)
+            else pd.NaT
         )
         s_series = cast(pd.Series, s_series)
         return s_series
     elif unit == "d":
         d_series = sas_datetimes.apply(
             lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float)
+            if pd.notnull(sas_float)
+            else pd.NaT
         )
         d_series = cast(pd.Series, d_series)
         return d_series
diff --git a/pandas/tests/io/sas/data/dates_null.sas7bdat b/pandas/tests/io/sas/data/dates_null.sas7bdat
new file mode 100644
index 0000000000000000000000000000000000000000..beadf1a34f42ed484a6ecb52e55bd92a9f488f94
GIT binary patch
literal 131072
[truncated base85-encoded binary data for dates_null.sas7bdat]

literal 0
HcmV?d00001

From 14377d04b3b83ca9ae06a181ba90a471a527bbb7 Mon Sep 17 00:00:00 2001
From: wertha
Date: Thu, 11 Feb 2021 08:55:59 +0000
Subject: [PATCH 2/5] Changing notnull to notna

---
 doc/source/whatsnew/v1.3.0.rst | 1 +
 pandas/io/sas/sas7bdat.py      | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 99ae60859b68c..11f8a5b6c2499 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -390,6 +390,7 @@ I/O
 - Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`)
 - :meth:`read_sql` returned an empty generator if ``chunksize`` was no-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
 - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
+- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index e31adcbb45d99..b348a8591bc12 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -23,6 +23,7 @@
 from pandas.errors import EmptyDataError, OutOfBoundsDatetime
 
 import pandas as pd
+from pandas import notna
 
 from pandas.io.common import get_handle
 from pandas.io.sas._sas import Parser
@@ -54,7 +55,7 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     if unit == "s":
         s_series = sas_datetimes.apply(
             lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float)
-            if pd.notnull(sas_float)
+            if notna(sas_float)
             else pd.NaT
         )
         s_series = cast(pd.Series, s_series)
@@ -62,7 +63,7 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     elif unit == "d":
         d_series = sas_datetimes.apply(
             lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float)
-            if pd.notnull(sas_float)
+            if notna(sas_float)
             else pd.NaT
         )
         d_series = cast(pd.Series, d_series)

From bb29f1b896582530005c956bd97ef6dfea8c7684 Mon Sep 17 00:00:00 2001
From: wertha
Date: Thu, 11 Feb 2021 14:17:04 -0800
Subject: [PATCH 3/5] Refactoring lambda function

---
 pandas/io/sas/sas7bdat.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index b348a8591bc12..8d3e84569156b 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -23,7 +23,7 @@
 from pandas.errors import EmptyDataError, OutOfBoundsDatetime
 
 import pandas as pd
-from pandas import notna
+from pandas import isnull
 
 from pandas.io.common import get_handle
 from pandas.io.sas._sas import Parser
@@ -31,6 +31,17 @@
 from pandas.io.sas.sasreader import ReaderBase
 
 
+def _parse_datetime(sas_datetime: float, unit: str):
+    if isnull(sas_datetime):
+        return pd.NaT
+
+    if unit == "s":
+        return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
+
+    if unit == "d":
+        return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
+
+
 def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     """
     Convert to Timestamp if possible, otherwise to datetime.datetime.
@@ -52,22 +63,10 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     try:
         return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
     except OutOfBoundsDatetime:
-        if unit == "s":
-            s_series = sas_datetimes.apply(
-                lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float)
-                if notna(sas_float)
-                else pd.NaT
-            )
+        if unit in ["s", "d"]:
+            s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
             s_series = cast(pd.Series, s_series)
             return s_series
-        elif unit == "d":
-            d_series = sas_datetimes.apply(
-                lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float)
-                if notna(sas_float)
-                else pd.NaT
-            )
-            d_series = cast(pd.Series, d_series)
-            return d_series
         else:
             raise ValueError("unit must be 'd' or 's'")
 

From 1b214675419d9746930e71903cbccaca9ed3a3bd Mon Sep 17 00:00:00 2001
From: wertha
Date: Fri, 12 Feb 2021 00:31:40 -0800
Subject: [PATCH 4/5] Use isna instead of isnull

---
 pandas/io/sas/sas7bdat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 8d3e84569156b..d33eb8d01c0fe 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -23,7 +23,7 @@
 from pandas.errors import EmptyDataError, OutOfBoundsDatetime
 
 import pandas as pd
-from pandas import isnull
+from pandas import isna
 
 from pandas.io.common import get_handle
 from pandas.io.sas._sas import Parser
@@ -32,7 +32,7 @@
 
 
 def _parse_datetime(sas_datetime: float, unit: str):
-    if isnull(sas_datetime):
+    if isna(sas_datetime):
         return pd.NaT
 
     if unit == "s":

From 6aa6f6415140ac52f168aa3616260b4d00f1b255 Mon Sep 17 00:00:00 2001
From: wertha
Date: Fri, 12 Feb 2021 19:15:39 -0800
Subject: [PATCH 5/5] Move exception for unit inside helper function

---
 pandas/io/sas/sas7bdat.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index d33eb8d01c0fe..9853fa41d3fb9 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -38,9 +38,12 @@ def _parse_datetime(sas_datetime: float, unit: str):
     if unit == "s":
         return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
 
-    if unit == "d":
+    elif unit == "d":
         return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
 
+    else:
+        raise ValueError("unit must be 'd' or 's'")
+
 
 def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     """
@@ -63,12 +66,9 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     try:
         return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
     except OutOfBoundsDatetime:
-        if unit in ["s", "d"]:
-            s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
-            s_series = cast(pd.Series, s_series)
-            return s_series
-        else:
-            raise ValueError("unit must be 'd' or 's'")
+        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
+        s_series = cast(pd.Series, s_series)
+        return s_series
 
 
 class _SubheaderPointer:
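
For context, a minimal usage sketch of the behaviour the series converges on (illustration only, not part of any patch above). It assumes a pandas checkout with the patched pandas/io/sas/sas7bdat.py importable; _parse_datetime and _convert_datetimes are private helpers, used here only to show that null SAS dates now become NaT instead of raising.

# Illustration only: assumes the patched sas7bdat.py from the series above.
import numpy as np
import pandas as pd

from pandas.io.sas.sas7bdat import _convert_datetimes, _parse_datetime

# Null SAS values now map to NaT instead of raising (GH 39725).
print(_parse_datetime(np.nan, "s"))   # NaT
print(_parse_datetime(0.0, "d"))      # 1960-01-01 00:00:00
print(_parse_datetime(86400.0, "s"))  # 1960-01-02 00:00:00

# _convert_datetimes takes the vectorized pd.to_datetime path when values fit
# and only falls back to the element-wise helper on OutOfBoundsDatetime.
print(_convert_datetimes(pd.Series([0.0, np.nan, 52.0]), "d"))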