ENH: non-nano datetime64s for read_sas (#56127)

jbrockmendel · web-flow · commit 33ff3d97e6a6 · 2023-12-05T12:39:47.000-08:00
* ENH: non-nano datetime64s for read_sas

* GH ref

* edit expected for 32bit

* troubleshoot 32bit build

* troubleshoot 32bit build

* troubleshoot 32bit builds

* troubleshoot 32bit build

* troubleshoot 32bit build

* typo fixup
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -219,6 +219,7 @@ Other enhancements
 - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend
 - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
+- :func:`read_sas` now returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS (``datetime64[s]`` for dates, ``datetime64[ms]`` for datetimes), and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
 - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
 - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
@@ -9,4 +9,6 @@ DT64NS_DTYPE: np.dtype
 TD64NS_DTYPE: np.dtype
 
 def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
-def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...
+def cast_from_unit_vectorized(
+    values: np.ndarray, unit: str, out_unit: str = ...
+) -> np.ndarray: ...
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
@@ -97,6 +97,7 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
 def cast_from_unit_vectorized(
     ndarray values,
     str unit,
+    str out_unit="ns",
 ):
     """
     Vectorized analogue to cast_from_unit.
@@ -122,11 +123,11 @@ def cast_from_unit_vectorized(
         # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
         #  and 150 we'd get 2120-01-01 09:00:00
         values = values.astype(f"M8[{unit}]")
-        dtype = np.dtype("M8[ns]")
+        dtype = np.dtype(f"M8[{out_unit}]")
         return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")
 
     in_reso = abbrev_to_npy_unit(unit)
-    out_reso = abbrev_to_npy_unit("ns")
+    out_reso = abbrev_to_npy_unit(out_unit)
     m, p = precision_from_unit(in_reso, out_reso)
 
     cdef:
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -21,10 +21,7 @@
     timedelta,
 )
 import sys
-from typing import (
-    TYPE_CHECKING,
-    cast,
-)
+from typing import TYPE_CHECKING
 
 import numpy as np
 
@@ -39,14 +36,13 @@
     Parser,
     get_subheader_index,
 )
-from pandas.errors import (
-    EmptyDataError,
-    OutOfBoundsDatetime,
-)
+from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
+from pandas.errors import EmptyDataError
 
 import pandas as pd
 from pandas import (
     DataFrame,
+    Timestamp,
     isna,
 )
 
@@ -62,6 +58,10 @@
     )
 
 
+_unix_origin = Timestamp("1970-01-01")
+_sas_origin = Timestamp("1960-01-01")
+
+
 def _parse_datetime(sas_datetime: float, unit: str):
     if isna(sas_datetime):
         return pd.NaT
@@ -94,12 +94,16 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     Series
        Series of datetime64 dtype or datetime.datetime.
     """
-    try:
-        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
-    except OutOfBoundsDatetime:
-        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
-        s_series = cast(pd.Series, s_series)
-        return s_series
+    td = (_sas_origin - _unix_origin).as_unit("s")
+    if unit == "s":
+        millis = cast_from_unit_vectorized(
+            sas_datetimes._values, unit="s", out_unit="ms"
+        )
+        dt64ms = millis.view("M8[ms]") + td
+        return pd.Series(dt64ms, index=sas_datetimes.index)
+    else:
+        vals = np.array(sas_datetimes, dtype="M8[D]") + td
+        return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index)
 
 
 class _Column:
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -4,10 +4,10 @@
 import os
 from pathlib import Path
 
-import dateutil.parser
 import numpy as np
 import pytest
 
+from pandas.compat import IS64
 from pandas.errors import EmptyDataError
 import pandas.util._test_decorators as td
 
@@ -27,9 +27,9 @@ def data_test_ix(request, dirpath):
     df = pd.read_csv(fname)
     epoch = datetime(1960, 1, 1)
     t1 = pd.to_timedelta(df["Column4"], unit="d")
-    df["Column4"] = epoch + t1
+    df["Column4"] = (epoch + t1).astype("M8[s]")
     t2 = pd.to_timedelta(df["Column12"], unit="d")
-    df["Column12"] = epoch + t2
+    df["Column12"] = (epoch + t2).astype("M8[s]")
     for k in range(df.shape[1]):
         col = df.iloc[:, k]
         if col.dtype == np.int64:
@@ -59,7 +59,7 @@ def test_from_buffer(self, dirpath, data_test_ix):
                 buf, format="sas7bdat", iterator=True, encoding="utf-8"
             ) as rdr:
                 df = rdr.read()
-            tm.assert_frame_equal(df, df0, check_exact=False)
+            tm.assert_frame_equal(df, df0)
 
     @pytest.mark.slow
     def test_from_iterator(self, dirpath, data_test_ix):
@@ -157,6 +157,8 @@ def test_productsales(datapath):
     df0 = pd.read_csv(fname, parse_dates=["MONTH"])
     vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
     df0[vn] = df0[vn].astype(np.float64)
+
+    df0["MONTH"] = df0["MONTH"].astype("M8[s]")
     tm.assert_frame_equal(df, df0)
 
 
@@ -175,7 +177,7 @@ def test_airline(datapath):
     fname = datapath("io", "sas", "data", "airline.csv")
     df0 = pd.read_csv(fname)
     df0 = df0.astype(np.float64)
-    tm.assert_frame_equal(df, df0, check_exact=False)
+    tm.assert_frame_equal(df, df0)
 
 
 def test_date_time(datapath):
@@ -191,14 +193,20 @@ def test_date_time(datapath):
     #  access to SAS to read the sas7bdat file. We are really just testing
     #  that we are "close". This only seems to be an issue near the
     #  implementation bounds.
-    res = df.iloc[:, 3].dt.round("us").copy()
 
-    # the first and last elements are near the implementation bounds, where we
-    #  would expect floating point error to occur.
-    res.iloc[0] -= pd.Timedelta(microseconds=1)
-    res.iloc[-1] += pd.Timedelta(microseconds=1)
+    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
+    df0["Date1"] = df0["Date1"].astype("M8[s]")
+    df0["Date2"] = df0["Date2"].astype("M8[s]")
+    df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
+    df0["Taiw"] = df0["Taiw"].astype("M8[s]")
 
-    df["DateTimeHi"] = res
+    res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
+    df0["DateTimeHi"] = res.astype("M8[ms]")
+
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
+        df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
     tm.assert_frame_equal(df, df0)
 
 
@@ -258,16 +266,6 @@ def test_corrupt_read(datapath):
         pd.read_sas(fname)
 
 
-def round_datetime_to_ms(ts):
-    if isinstance(ts, datetime):
-        return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
-    elif isinstance(ts, str):
-        _ts = dateutil.parser.parse(timestr=ts)
-        return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000)
-    else:
-        return ts
-
-
 def test_max_sas_date(datapath):
     # GH 20927
     # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
@@ -276,30 +274,33 @@ def test_max_sas_date(datapath):
     fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
     df = pd.read_sas(fname, encoding="iso-8859-1")
 
-    # SAS likes to left pad strings with spaces - lstrip before comparing
-    df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
-    # GH 19732: Timestamps imported from sas will incur floating point errors
-    try:
-        df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
-    except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
-        df = df.map(round_datetime_to_ms)
-    except AttributeError:
-        df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
-    # if there are any date/times > pandas.Timestamp.max then ALL in that chunk
-    # are returned as datetime.datetime
     expected = pd.DataFrame(
         {
             "text": ["max", "normal"],
             "dt_as_float": [253717747199.999, 1880323199.999],
-            "dt_as_dt": [
-                datetime(9999, 12, 29, 23, 59, 59, 999000),
-                datetime(2019, 8, 1, 23, 59, 59, 999000),
-            ],
+            "dt_as_dt": np.array(
+                [
+                    datetime(9999, 12, 29, 23, 59, 59, 999000),
+                    datetime(2019, 8, 1, 23, 59, 59, 999000),
+                ],
+                dtype="M8[ms]",
+            ),
             "date_as_float": [2936547.0, 21762.0],
-            "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
+            "date_as_date": np.array(
+                [
+                    datetime(9999, 12, 29),
+                    datetime(2019, 8, 1),
+                ],
+                dtype="M8[s]",
+            ),
         },
         columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
     )
+
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")
+
     tm.assert_frame_equal(df, expected)
 
 
@@ -312,41 +313,40 @@ def test_max_sas_date_iterator(datapath):
     fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
     results = []
     for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
-        # SAS likes to left pad strings with spaces - lstrip before comparing
-        df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
         # GH 19732: Timestamps imported from sas will incur floating point errors
-        try:
-            df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
-        except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
-            df = df.map(round_datetime_to_ms)
-        except AttributeError:
-            df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
         df.reset_index(inplace=True, drop=True)
         results.append(df)
     expected = [
         pd.DataFrame(
             {
                 "text": ["max"],
                 "dt_as_float": [253717747199.999],
-                "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
+                "dt_as_dt": np.array(
+                    [datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
+                ),
                 "date_as_float": [2936547.0],
-                "date_as_date": [datetime(9999, 12, 29)],
+                "date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
             },
             columns=col_order,
         ),
         pd.DataFrame(
             {
                 "text": ["normal"],
                 "dt_as_float": [1880323199.999],
-                "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
+                "dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
                 "date_as_float": [21762.0],
-                "date_as_date": [np.datetime64("2019-08-01")],
+                "date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
             },
             columns=col_order,
         ),
     ]
-    for result, expected in zip(results, expected):
-        tm.assert_frame_equal(result, expected)
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
+        expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
+
+    tm.assert_frame_equal(results[0], expected[0])
+    tm.assert_frame_equal(results[1], expected[1])
 
 
 def test_null_date(datapath):
@@ -355,16 +355,25 @@ def test_null_date(datapath):
 
     expected = pd.DataFrame(
         {
-            "datecol": [
-                datetime(9999, 12, 29),
-                pd.NaT,
-            ],
-            "datetimecol": [
-                datetime(9999, 12, 29, 23, 59, 59, 998993),
-                pd.NaT,
-            ],
+            "datecol": np.array(
+                [
+                    datetime(9999, 12, 29),
+                    np.datetime64("NaT"),
+                ],
+                dtype="M8[s]",
+            ),
+            "datetimecol": np.array(
+                [
+                    datetime(9999, 12, 29, 23, 59, 59, 999000),
+                    np.datetime64("NaT"),
+                ],
+                dtype="M8[ms]",
+            ),
         },
     )
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
     tm.assert_frame_equal(df, expected)