|
3 | 3 | import os
|
4 | 4 | from pathlib import Path
|
5 | 5 |
|
| 6 | +import dateutil.parser |
6 | 7 | import numpy as np
|
7 | 8 | import pytest
|
8 | 9 |
|
@@ -214,3 +215,94 @@ def test_zero_variables(datapath):
|
214 | 215 | fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
|
215 | 216 | with pytest.raises(EmptyDataError):
|
216 | 217 | pd.read_sas(fname)
|
| 218 | + |
| 219 | + |
def round_datetime_to_ms(ts):
    """
    Round a timestamp to the nearest millisecond.

    Parameters
    ----------
    ts : datetime, str or object
        ``datetime`` instances (subclasses included) are rounded directly.
        Strings are first parsed with ``dateutil.parser.parse``.  Any other
        object is handed back untouched.

    Returns
    -------
    datetime or object
        The rounded timestamp, or ``ts`` itself when it is neither a
        ``datetime`` nor a ``str``.

    Notes
    -----
    Errors raised by ``dateutil.parser.parse`` for a string it cannot
    interpret are not caught here.  Rounding up from the last half
    millisecond of a second carries into the seconds field; when that carry
    would step past the largest representable value, the overflow error of
    the timestamp type propagates.
    """
    # Imported locally so this helper does not rely on ``timedelta`` being
    # imported at module level.
    from datetime import timedelta

    if isinstance(ts, str):
        ts = dateutil.parser.parse(timestr=ts)
    elif not isinstance(ts, datetime):
        return ts

    # round() on an int with ndigits=-3 gives a multiple of 1000 between
    # 0 and 1000000 inclusive.
    rounded_us = round(ts.microsecond, -3)
    if rounded_us < 1000000:
        return ts.replace(microsecond=rounded_us)
    # replace() rejects microsecond=1000000, so carry it over as one second.
    return ts.replace(microsecond=0) + timedelta(seconds=1)
| 228 | + |
| 229 | + |
def test_max_sas_date(datapath):
    # GH 20927
    # Read max_sas_date.sas7bdat in one go and compare the whole frame with
    # the expected values.
    # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
    #    but this is read as 29DEC9999:23:59:59.998993 by a buggy
    #    sas7bdat module
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    df = pd.read_sas(fname, encoding="iso-8859-1")

    # SAS likes to left pad strings with spaces - lstrip before comparing
    df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
    # GH 19732: Timestamps imported from sas will incur floating point errors
    try:
        df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
    except (pd._libs.tslibs.np_datetime.OutOfBoundsDatetime, AttributeError):
        # Fall back to rounding element by element.  Only the datetime column
        # is rounded, so the strings in the "text" column are never handed to
        # the date parser inside round_datetime_to_ms.
        df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
    # if there are any date/times > pandas.Timestamp.max then ALL in that chunk
    # are returned as datetime.datetime
    expected = pd.DataFrame(
        {
            "text": ["max", "normal"],
            "dt_as_float": [253717747199.999, 1880323199.999],
            "dt_as_dt": [
                datetime(9999, 12, 29, 23, 59, 59, 999000),
                datetime(2019, 8, 1, 23, 59, 59, 999000),
            ],
            "date_as_float": [2936547.0, 21762.0],
            "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
        },
        columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
    )
    tm.assert_frame_equal(df, expected)
| 263 | + |
| 264 | + |
def test_max_sas_date_iterator(datapath):
    # GH 20927
    # Read max_sas_date.sas7bdat one row per chunk and compare every chunk
    # with its expected frame.
    # when called as an iterator, only those chunks with a date > pd.Timestamp.max
    # are returned as datetime.datetime, if this happens that whole chunk is returned
    # as datetime.datetime
    col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    results = []
    for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
        # SAS likes to left pad strings with spaces - lstrip before comparing
        df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
        # GH 19732: Timestamps imported from sas will incur floating point errors
        try:
            df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
        except (pd._libs.tslibs.np_datetime.OutOfBoundsDatetime, AttributeError):
            # Fall back to rounding element by element.  Only the datetime
            # column is rounded, so the strings in the "text" column are
            # never handed to the date parser inside round_datetime_to_ms.
            df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
        # every chunk is compared against a frame indexed from 0
        df = df.reset_index(drop=True)
        results.append(df)
    expected = [
        pd.DataFrame(
            {
                "text": ["max"],
                "dt_as_float": [253717747199.999],
                "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
                "date_as_float": [2936547.0],
                "date_as_date": [datetime(9999, 12, 29)],
            },
            columns=col_order,
        ),
        pd.DataFrame(
            {
                "text": ["normal"],
                "dt_as_float": [1880323199.999],
                "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
                "date_as_float": [21762.0],
                "date_as_date": [np.datetime64("2019-08-01")],
            },
            columns=col_order,
        ),
    ]
    # zip() stops at the shorter input, so without this check a reader that
    # yields too few chunks (or none) would let the test pass.
    assert len(results) == len(expected)
    for result, expected_chunk in zip(results, expected):
        tm.assert_frame_equal(result, expected_chunk)
0 commit comments