Skip to content

Commit 33ff3d9

Browse files
authored
ENH: non-nano datetime64s for read_sas (#56127)
* ENH: non-nano datetime64s for read_sas * GH ref * edit expected for 32bit * troubleshoot 32bit build * troubleshoot 32bit build * troubleshoot 32bit builds * troubleshoot 32bit build * troubleshoot 32bit build * typo fixup
1 parent 0b9e784 commit 33ff3d9

File tree

5 files changed

+93
-76
lines changed

5 files changed

+93
-76
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ Other enhancements
219219
- :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend
220220
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
221221
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
222+
- :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
222223
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
223224
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
224225
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)

pandas/_libs/tslibs/conversion.pyi

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,6 @@ DT64NS_DTYPE: np.dtype
99
TD64NS_DTYPE: np.dtype
1010

1111
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
12-
def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...
12+
def cast_from_unit_vectorized(
13+
values: np.ndarray, unit: str, out_unit: str = ...
14+
) -> np.ndarray: ...

pandas/_libs/tslibs/conversion.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
9797
def cast_from_unit_vectorized(
9898
ndarray values,
9999
str unit,
100+
str out_unit="ns",
100101
):
101102
"""
102103
Vectorized analogue to cast_from_unit.
@@ -122,11 +123,11 @@ def cast_from_unit_vectorized(
122123
# GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
123124
# and 150 we'd get 2120-01-01 09:00:00
124125
values = values.astype(f"M8[{unit}]")
125-
dtype = np.dtype("M8[ns]")
126+
dtype = np.dtype(f"M8[{out_unit}]")
126127
return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")
127128

128129
in_reso = abbrev_to_npy_unit(unit)
129-
out_reso = abbrev_to_npy_unit("ns")
130+
out_reso = abbrev_to_npy_unit(out_unit)
130131
m, p = precision_from_unit(in_reso, out_reso)
131132

132133
cdef:

pandas/io/sas/sas7bdat.py

+18-14
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,7 @@
2121
timedelta,
2222
)
2323
import sys
24-
from typing import (
25-
TYPE_CHECKING,
26-
cast,
27-
)
24+
from typing import TYPE_CHECKING
2825

2926
import numpy as np
3027

@@ -39,14 +36,13 @@
3936
Parser,
4037
get_subheader_index,
4138
)
42-
from pandas.errors import (
43-
EmptyDataError,
44-
OutOfBoundsDatetime,
45-
)
39+
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
40+
from pandas.errors import EmptyDataError
4641

4742
import pandas as pd
4843
from pandas import (
4944
DataFrame,
45+
Timestamp,
5046
isna,
5147
)
5248

@@ -62,6 +58,10 @@
6258
)
6359

6460

61+
_unix_origin = Timestamp("1970-01-01")
62+
_sas_origin = Timestamp("1960-01-01")
63+
64+
6565
def _parse_datetime(sas_datetime: float, unit: str):
6666
if isna(sas_datetime):
6767
return pd.NaT
@@ -94,12 +94,16 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
9494
Series
9595
Series of datetime64 dtype or datetime.datetime.
9696
"""
97-
try:
98-
return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
99-
except OutOfBoundsDatetime:
100-
s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
101-
s_series = cast(pd.Series, s_series)
102-
return s_series
97+
td = (_sas_origin - _unix_origin).as_unit("s")
98+
if unit == "s":
99+
millis = cast_from_unit_vectorized(
100+
sas_datetimes._values, unit="s", out_unit="ms"
101+
)
102+
dt64ms = millis.view("M8[ms]") + td
103+
return pd.Series(dt64ms, index=sas_datetimes.index)
104+
else:
105+
vals = np.array(sas_datetimes, dtype="M8[D]") + td
106+
return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index)
103107

104108

105109
class _Column:

pandas/tests/io/sas/test_sas7bdat.py

+68-59
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
import os
55
from pathlib import Path
66

7-
import dateutil.parser
87
import numpy as np
98
import pytest
109

10+
from pandas.compat import IS64
1111
from pandas.errors import EmptyDataError
1212
import pandas.util._test_decorators as td
1313

@@ -27,9 +27,9 @@ def data_test_ix(request, dirpath):
2727
df = pd.read_csv(fname)
2828
epoch = datetime(1960, 1, 1)
2929
t1 = pd.to_timedelta(df["Column4"], unit="d")
30-
df["Column4"] = epoch + t1
30+
df["Column4"] = (epoch + t1).astype("M8[s]")
3131
t2 = pd.to_timedelta(df["Column12"], unit="d")
32-
df["Column12"] = epoch + t2
32+
df["Column12"] = (epoch + t2).astype("M8[s]")
3333
for k in range(df.shape[1]):
3434
col = df.iloc[:, k]
3535
if col.dtype == np.int64:
@@ -59,7 +59,7 @@ def test_from_buffer(self, dirpath, data_test_ix):
5959
buf, format="sas7bdat", iterator=True, encoding="utf-8"
6060
) as rdr:
6161
df = rdr.read()
62-
tm.assert_frame_equal(df, df0, check_exact=False)
62+
tm.assert_frame_equal(df, df0)
6363

6464
@pytest.mark.slow
6565
def test_from_iterator(self, dirpath, data_test_ix):
@@ -157,6 +157,8 @@ def test_productsales(datapath):
157157
df0 = pd.read_csv(fname, parse_dates=["MONTH"])
158158
vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
159159
df0[vn] = df0[vn].astype(np.float64)
160+
161+
df0["MONTH"] = df0["MONTH"].astype("M8[s]")
160162
tm.assert_frame_equal(df, df0)
161163

162164

@@ -175,7 +177,7 @@ def test_airline(datapath):
175177
fname = datapath("io", "sas", "data", "airline.csv")
176178
df0 = pd.read_csv(fname)
177179
df0 = df0.astype(np.float64)
178-
tm.assert_frame_equal(df, df0, check_exact=False)
180+
tm.assert_frame_equal(df, df0)
179181

180182

181183
def test_date_time(datapath):
@@ -191,14 +193,20 @@ def test_date_time(datapath):
191193
# access to SAS to read the sas7bdat file. We are really just testing
192194
# that we are "close". This only seems to be an issue near the
193195
# implementation bounds.
194-
res = df.iloc[:, 3].dt.round("us").copy()
195196

196-
# the first and last elements are near the implementation bounds, where we
197-
# would expect floating point error to occur.
198-
res.iloc[0] -= pd.Timedelta(microseconds=1)
199-
res.iloc[-1] += pd.Timedelta(microseconds=1)
197+
df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
198+
df0["Date1"] = df0["Date1"].astype("M8[s]")
199+
df0["Date2"] = df0["Date2"].astype("M8[s]")
200+
df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
201+
df0["Taiw"] = df0["Taiw"].astype("M8[s]")
200202

201-
df["DateTimeHi"] = res
203+
res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
204+
df0["DateTimeHi"] = res.astype("M8[ms]")
205+
206+
if not IS64:
207+
# No good reason for this, just what we get on the CI
208+
df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
209+
df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
202210
tm.assert_frame_equal(df, df0)
203211

204212

@@ -258,16 +266,6 @@ def test_corrupt_read(datapath):
258266
pd.read_sas(fname)
259267

260268

261-
def round_datetime_to_ms(ts):
262-
if isinstance(ts, datetime):
263-
return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
264-
elif isinstance(ts, str):
265-
_ts = dateutil.parser.parse(timestr=ts)
266-
return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000)
267-
else:
268-
return ts
269-
270-
271269
def test_max_sas_date(datapath):
272270
# GH 20927
273271
# NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
@@ -276,30 +274,33 @@ def test_max_sas_date(datapath):
276274
fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
277275
df = pd.read_sas(fname, encoding="iso-8859-1")
278276

279-
# SAS likes to left pad strings with spaces - lstrip before comparing
280-
df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
281-
# GH 19732: Timestamps imported from sas will incur floating point errors
282-
try:
283-
df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
284-
except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
285-
df = df.map(round_datetime_to_ms)
286-
except AttributeError:
287-
df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
288-
# if there are any date/times > pandas.Timestamp.max then ALL in that chunk
289-
# are returned as datetime.datetime
290277
expected = pd.DataFrame(
291278
{
292279
"text": ["max", "normal"],
293280
"dt_as_float": [253717747199.999, 1880323199.999],
294-
"dt_as_dt": [
295-
datetime(9999, 12, 29, 23, 59, 59, 999000),
296-
datetime(2019, 8, 1, 23, 59, 59, 999000),
297-
],
281+
"dt_as_dt": np.array(
282+
[
283+
datetime(9999, 12, 29, 23, 59, 59, 999000),
284+
datetime(2019, 8, 1, 23, 59, 59, 999000),
285+
],
286+
dtype="M8[ms]",
287+
),
298288
"date_as_float": [2936547.0, 21762.0],
299-
"date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
289+
"date_as_date": np.array(
290+
[
291+
datetime(9999, 12, 29),
292+
datetime(2019, 8, 1),
293+
],
294+
dtype="M8[s]",
295+
),
300296
},
301297
columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
302298
)
299+
300+
if not IS64:
301+
# No good reason for this, just what we get on the CI
302+
expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")
303+
303304
tm.assert_frame_equal(df, expected)
304305

305306

@@ -312,41 +313,40 @@ def test_max_sas_date_iterator(datapath):
312313
fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
313314
results = []
314315
for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
315-
# SAS likes to left pad strings with spaces - lstrip before comparing
316-
df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
317316
# GH 19732: Timestamps imported from sas will incur floating point errors
318-
try:
319-
df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
320-
except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
321-
df = df.map(round_datetime_to_ms)
322-
except AttributeError:
323-
df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
324317
df.reset_index(inplace=True, drop=True)
325318
results.append(df)
326319
expected = [
327320
pd.DataFrame(
328321
{
329322
"text": ["max"],
330323
"dt_as_float": [253717747199.999],
331-
"dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
324+
"dt_as_dt": np.array(
325+
[datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
326+
),
332327
"date_as_float": [2936547.0],
333-
"date_as_date": [datetime(9999, 12, 29)],
328+
"date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
334329
},
335330
columns=col_order,
336331
),
337332
pd.DataFrame(
338333
{
339334
"text": ["normal"],
340335
"dt_as_float": [1880323199.999],
341-
"dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
336+
"dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
342337
"date_as_float": [21762.0],
343-
"date_as_date": [np.datetime64("2019-08-01")],
338+
"date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
344339
},
345340
columns=col_order,
346341
),
347342
]
348-
for result, expected in zip(results, expected):
349-
tm.assert_frame_equal(result, expected)
343+
if not IS64:
344+
# No good reason for this, just what we get on the CI
345+
expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
346+
expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
347+
348+
tm.assert_frame_equal(results[0], expected[0])
349+
tm.assert_frame_equal(results[1], expected[1])
350350

351351

352352
def test_null_date(datapath):
@@ -355,16 +355,25 @@ def test_null_date(datapath):
355355

356356
expected = pd.DataFrame(
357357
{
358-
"datecol": [
359-
datetime(9999, 12, 29),
360-
pd.NaT,
361-
],
362-
"datetimecol": [
363-
datetime(9999, 12, 29, 23, 59, 59, 998993),
364-
pd.NaT,
365-
],
358+
"datecol": np.array(
359+
[
360+
datetime(9999, 12, 29),
361+
np.datetime64("NaT"),
362+
],
363+
dtype="M8[s]",
364+
),
365+
"datetimecol": np.array(
366+
[
367+
datetime(9999, 12, 29, 23, 59, 59, 999000),
368+
np.datetime64("NaT"),
369+
],
370+
dtype="M8[ms]",
371+
),
366372
},
367373
)
374+
if not IS64:
375+
# No good reason for this, just what we get on the CI
376+
expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
368377
tm.assert_frame_equal(df, expected)
369378

370379

0 commit comments

Comments
 (0)