From 31b877004b20098e810490e7332c2920029b94cd Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 21 Aug 2019 00:34:28 +0200 Subject: [PATCH 01/23] BUG GH20927 Fixes read_sas error for dates/datetimes greater than 2262-04-11 --- pandas/tests/io/sas/data/max_sas_date.sas7bdat | Bin 0 -> 393216 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/io/sas/data/max_sas_date.sas7bdat diff --git a/pandas/tests/io/sas/data/max_sas_date.sas7bdat b/pandas/tests/io/sas/data/max_sas_date.sas7bdat new file mode 100644 index 0000000000000000000000000000000000000000..b7838ebdcfeead0cf86efa1b5bb8af7b21201946 GIT binary patch literal 393216 zcmeIz&x=)6902h1-i$S&`2!?ehz=^@VjA9abh03rmot(u1L4fV+KdCE6^vRr!L%u( zAOt~a)vCY1UE2nt7Dby@MT?1O8$u|QY3l2~`_6f9-o-m3)gtM0#COj3d%ow~dq4Nb zgwSnY{PNz(Pq%OTcKM@$@Yq-NOFMRN-8Ql(jEq#O^-8C#uhNqrsq7sI`$HJ)>y%74 zrU#E6o1BUYoBu6uy49HL)+wZpfS}o!SkdKiqAGEK}`S9m1Ws{zS|V#y&rO zf44fn>VZn<>hrk261K+rG{2(|$1v~Wp_;W$awt2mRy!Ov=KXlwe6czUa(r^I(8e1xjp><*7Z*=0E$%z=LgRS0W`1#Y?)>cOg>!R@aXi16 z)lJMCo0^!PTRhdB&1z%Z>L(75*6QPp?onH#h?mZ#eJ!Ezg!V4^C()+(P^WuViJPTo zr5GoxluzeZjMIHd`SgSpRgF;r|3#G+kZL#d%0>4LjnW{5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAn?!&R4efkHb3;2;sOW|AV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly@UIE{ E1y>kE2mk;8 literal 0 HcmV?d00001 From 41ea3124b4f79989c26f8a579923263e28727cc4 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 21 Aug 2019 00:43:36 +0200 Subject: [PATCH 02/23] BUG GH20927 Fixes read_sas error for dates/datetimes greater than 2262-04-11 --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/io/sas/sas7bdat.py | 33 ++++++++++++++++++++-------- pandas/tests/io/sas/test_sas7bdat.py | 31 ++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 463dcef9feab6..b8d702064336e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -105,6 +105,7 @@ I/O ^^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) +- :func:`read_sas()` now handles dates and datetimes larger pd.Timestamp.max returning them as datetime.date/datetime objects (:issue:`20927`) - Plotting diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7cc9dc11a8ccc..fce57c3724e02 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,12 +13,12 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ -from datetime import datetime +from datetime import date, datetime, timedelta import struct import numpy as np -from pandas.errors import EmptyDataError +from pandas.errors import EmptyDataError, OutOfBoundsDatetime import pandas as pd @@ -703,15 +703,30 @@ def _chunk_to_dataframe(self): rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: - unit = None if self.column_formats[j] in const.sas_date_formats: - unit = "d" + try: + rslt[name] = pd.to_datetime( + rslt[name], unit="d", origin="1960-01-01" + ) + except OutOfBoundsDatetime: + # convert to datetime.date rather than np.datetime64 + rslt[name] = rslt[name].apply( + lambda sas_date_float: date(1960, 1, 1) + + timedelta(days=sas_date_float) + ) elif self.column_formats[j] in const.sas_datetime_formats: - unit = "s" - if unit: - rslt[name] = pd.to_datetime( - rslt[name], unit=unit, origin="1960-01-01" - ) + try: + rslt[name] = pd.to_datetime( + rslt[name], unit="s", origin="1960-01-01" + ) + except OutOfBoundsDatetime: + # convert to datetime.date rather than np.datetime64 + # SAS float64 lacks precision for more than ms + # resolution so the fit to datetime.datetime is ok + rslt[name] = rslt[name].apply( + lambda sas_date_float: datetime(1960, 1, 1) + + timedelta(seconds=sas_date_float) + ) jb += 1 elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index e37561c865c7a..132983e205010 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,3 +1,4 @@ +from datetime import date, datetime import io import os @@ -217,3 +218,33 @@ def test_zero_variables(datapath): fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) + + +def test_max_sas_date(datapath): + # GH 20927 + # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999 + # but this is read as 29DEC9999:23:59:59.998993 by a buggy + # sas7bdat module + fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") + df = pd.read_sas(fname, encoding="iso-8859-1") + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating point errors + df = df.applymap( + lambda x: x.replace(microsecond=round(x.microsecond, -3)) + if isinstance(x, datetime) + else x + ) + expected = pd.DataFrame( + { + "text": ["max", "normal"], + "dt_as_float": [253717747199.999, 1880323199.999], + "dt_as_dt": [ + datetime.strptime("9999-12-29 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), + datetime.strptime("2019-08-01 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), + ], + "date_as_float": [float(2936547), float(21762)], + "date_as_date": [date(9999, 12, 29), date(2019, 8, 1)], + } + ) + tm.assert_frame_equal(df, expected) From 1e3a71ac4d3088f8ba757f3f3247ae7414e26414 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 21 Aug 2019 01:02:01 +0200 Subject: [PATCH 03/23] DOC fixed typo --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index b8d702064336e..8f6095cbe19ed 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -105,7 +105,7 @@ I/O ^^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) -- :func:`read_sas()` now handles dates and datetimes larger pd.Timestamp.max returning them as datetime.date/datetime objects (:issue:`20927`) +- :func:`read_sas()` now handles dates and datetimes larger than pd.Timestamp.max returning them as datetime.date/datetime objects (:issue:`20927`) - Plotting From 0b4a97e89d99d1ee848ee8aab0f30b3a247b8df4 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 22 Aug 2019 17:15:56 +0200 Subject: [PATCH 04/23] fixed failing test due to column ordering --- pandas/tests/io/sas/test_sas7bdat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 132983e205010..5d6ec325982b2 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -243,8 +243,8 @@ def test_max_sas_date(datapath): datetime.strptime("9999-12-29 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), datetime.strptime("2019-08-01 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), ], - "date_as_float": [float(2936547), float(21762)], "date_as_date": [date(9999, 12, 29), date(2019, 8, 1)], - } + "date_as_float": [float(2936547), float(21762)], + }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] ) tm.assert_frame_equal(df, expected) From d7ad4e7e9f8ae9b6be3fc1db2319c450735df538 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 22 Aug 2019 17:18:05 +0200 Subject: [PATCH 05/23] black style fix --- pandas/tests/io/sas/test_sas7bdat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 5d6ec325982b2..586af29320a37 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -245,6 +245,7 @@ def test_max_sas_date(datapath): ], "date_as_date": [date(9999, 12, 29), date(2019, 8, 1)], "date_as_float": [float(2936547), float(21762)], - }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] + }, + columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], ) tm.assert_frame_equal(df, expected) From 1aca08aa60838a275436dda43b537753303d5d18 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 22 Aug 2019 22:58:30 +0200 Subject: [PATCH 06/23] Return datetime.datetime and raise UserWarning for all dates/datetimes > pd.Timestamp.max Also added test for iterator behaviour, chunks that have all dates/datetimes < pd.Timestamp.max will return np.datetime64 as usual, chunks with any date/datetime > pd.Timestamp.max will return datetime.datetime --- pandas/io/sas/sas7bdat.py | 12 +++-- pandas/tests/io/sas/test_sas7bdat.py | 69 ++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index fce57c3724e02..bcd27cb0b1be1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,7 +13,7 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ -from datetime import date, datetime, timedelta +from datetime import datetime, timedelta import struct import numpy as np @@ -26,6 +26,8 @@ from pandas.io.sas._sas import Parser import pandas.io.sas.sas_constants as const +from warnings import warn + class _subheader_pointer: pass @@ -703,15 +705,18 @@ def _chunk_to_dataframe(self): rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: + warn_msg = "date > pandas.Timestamp.max, returning datetime.datetime objects instead" if self.column_formats[j] in const.sas_date_formats: try: rslt[name] = pd.to_datetime( rslt[name], unit="d", origin="1960-01-01" ) except OutOfBoundsDatetime: - # convert to datetime.date rather than np.datetime64 + # convert to datetime.datetime rather than np.datetime64 + # nb generally better support in pandas for datetime than for date + warn(warn_msg) rslt[name] = rslt[name].apply( - lambda sas_date_float: date(1960, 1, 1) + lambda sas_date_float: datetime(1960, 1, 1) + timedelta(days=sas_date_float) ) elif self.column_formats[j] in const.sas_datetime_formats: @@ -723,6 +728,7 @@ def _chunk_to_dataframe(self): # convert to datetime.date rather than np.datetime64 # SAS float64 lacks precision for more than ms # resolution so the fit to datetime.datetime is ok + warn(warn_msg) rslt[name] = rslt[name].apply( lambda sas_date_float: datetime(1960, 1, 1) + timedelta(seconds=sas_date_float) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 586af29320a37..0dd43b58cb85f 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -220,32 +220,83 @@ def test_zero_variables(datapath): pd.read_sas(fname) +def round_datetime_to_ms(ts): + if isinstance(ts, datetime): + return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000) + else: + return ts + + def test_max_sas_date(datapath): # GH 20927 # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999 # but this is read as 29DEC9999:23:59:59.998993 by a buggy # sas7bdat module fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") - df = pd.read_sas(fname, encoding="iso-8859-1") + with pytest.warns(UserWarning): + df = pd.read_sas(fname, encoding="iso-8859-1") # SAS likes to left pad strings with spaces - lstrip before comparing df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) # GH 19732: Timestamps imported from sas will incur floating point errors - df = df.applymap( - lambda x: x.replace(microsecond=round(x.microsecond, -3)) - if isinstance(x, datetime) - else x - ) + df = df.applymap(round_datetime_to_ms) + # if there are any date/times > pandas.Timestamp.max then ALL in that chunk + # are returned as datetime.datetime expected = pd.DataFrame( { "text": ["max", "normal"], "dt_as_float": [253717747199.999, 1880323199.999], "dt_as_dt": [ - datetime.strptime("9999-12-29 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), - datetime.strptime("2019-08-01 23:59:59.999", "%Y-%m-%d %H:%M:%S.%f"), + datetime(9999, 12, 29, 23, 59, 59, 999000), + datetime(2019, 8, 1, 23, 59, 59, 999000), ], - "date_as_date": [date(9999, 12, 29), date(2019, 8, 1)], "date_as_float": [float(2936547), float(21762)], + "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)], }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], ) tm.assert_frame_equal(df, expected) + + +def test_max_sas_date_iterator(datapath): + # GH 20927 + # when called as an iterator, only those chunks with a date > pd.Timestamp.max + # are returned as datetime.datetime, if this happens that whole chunk is returned + # as datetime.datetime + col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] + fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") + results = [] + with pytest.warns(UserWarning): + for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating point errors + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.applymap(round_datetime_to_ms) + df.reset_index(inplace=True, drop=True) + results.append(df) + expected = [ + pd.DataFrame( + { + "text": ["max"], + "dt_as_float": [253717747199.999], + "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)], + "date_as_float": [float(2936547)], + "date_as_date": [datetime(9999, 12, 29)], + }, + columns=col_order, + ), + pd.DataFrame( + { + "text": ["normal"], + "dt_as_float": [1880323199.999], + "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")], + "date_as_float": [float(21762)], + "date_as_date": [np.datetime64("2019-08-01")], + }, + columns=col_order, + ), + ] + for r, e in zip(results, expected): + tm.assert_frame_equal(r, e) From f9dfb440c4dd14232b6379d1d518b35537047222 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 22 Aug 2019 23:39:33 +0200 Subject: [PATCH 07/23] fixed PEP8 formating issue --- pandas/io/sas/sas7bdat.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index bcd27cb0b1be1..718bb90f0710a 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -705,7 +705,10 @@ def _chunk_to_dataframe(self): rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: - warn_msg = "date > pandas.Timestamp.max, returning datetime.datetime objects instead" + warn_msg = ( + "date > pandas.Timestamp.max, returning " + "datetime.datetime objects instead" + ) if self.column_formats[j] in const.sas_date_formats: try: rslt[name] = pd.to_datetime( @@ -713,7 +716,8 @@ def _chunk_to_dataframe(self): ) except OutOfBoundsDatetime: # convert to datetime.datetime rather than np.datetime64 - # nb generally better support in pandas for datetime than for date + # nb generally better support in pandas for datetime than + # date warn(warn_msg) rslt[name] = rslt[name].apply( lambda sas_date_float: datetime(1960, 1, 1) From 1865cb3c9a399a23cb30152b0f0c8bb6b27da412 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Tue, 27 Aug 2019 22:16:58 +0200 Subject: [PATCH 08/23] DOC fixed attribute and class markdown --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index c5904cea82b9e..2e257685e0ac1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -72,7 +72,7 @@ I/O - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) -- :func:`read_sas()` now handles dates and datetimes larger than pd.Timestamp.max returning them as datetime.datetime objects (:issue:`20927`) +- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the HTML repr in the notebook (:issue:`27991`). Plotting From 6881114c45028b3a4ec61658e7e03688a45227e0 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Tue, 27 Aug 2019 23:56:04 +0200 Subject: [PATCH 09/23] fixed linting issue --- pandas/tests/io/sas/test_sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 0dd43b58cb85f..331379464471e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,4 +1,4 @@ -from datetime import date, datetime +from datetime import datetime import io import os From b0976afdd3ae374aeb86fedec80cc4d80d484334 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Fri, 25 Oct 2019 17:13:04 +0200 Subject: [PATCH 10/23] CLN: minor refactor for reading sas files, updated whatsnew --- doc/source/whatsnew/v0.25.1.rst | 1 - doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/sas/sas7bdat.py | 65 ++++++++++++++++++--------------- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 82d7868c2ca06..2e9524fea89b1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -72,7 +72,6 @@ I/O - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) -- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the HTML repr in the notebook (:issue:`27991`). Plotting diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 736264a1196cf..3de76696e1a57 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -382,6 +382,7 @@ I/O - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) - Bug in :meth:`pandas.io.formats.style.Styler` formatting for floating values not displaying decimals correctly (:issue:`13257`) +- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) Plotting ^^^^^^^^ diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index dd6badf3c62c2..2c0f3873d050e 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -689,6 +689,39 @@ def _read_next_page(self): return False + @staticmethod + def _convert_dates(sas_dates): + try: + return pd.to_datetime(sas_dates, unit="d", origin="1960-01-01") + except OutOfBoundsDatetime: + # convert to datetime.datetime rather than np.datetime64 + # NB generally better support in pandas for datetime than date + warn( + "SAS date > pandas.Timestamp.max, returning " + "datetime.datetime objects instead" + ) + return sas_dates.apply( + lambda sas_date_float: datetime(1960, 1, 1) + + timedelta(days=sas_date_float) + ) + + @staticmethod + def _convert_datetimes(sas_datetimes): + try: + return pd.to_datetime(sas_datetimes, unit="s", origin="1960-01-01") + except OutOfBoundsDatetime: + # convert to datetime.datetime rather than np.datetime64 + # SAS float64 lacks precision for more than ms resolution so the + # fit to datetime.datetime is ok + warn( + "SAS datetime > pandas.Timestamp.max, returning " + "datetime.datetime objects instead" + ) + return sas_datetimes.apply( + lambda sas_dt_float: datetime(1960, 1, 1) + + timedelta(seconds=sas_dt_float) + ) + def _chunk_to_dataframe(self): n = self._current_row_in_chunk_index @@ -705,38 +738,10 @@ def _chunk_to_dataframe(self): rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: - warn_msg = ( - "date > pandas.Timestamp.max, returning " - "datetime.datetime objects instead" - ) if self.column_formats[j] in const.sas_date_formats: - try: - rslt[name] = pd.to_datetime( - rslt[name], unit="d", origin="1960-01-01" - ) - except OutOfBoundsDatetime: - # convert to datetime.datetime rather than np.datetime64 - # nb generally better support in pandas for datetime than - # date - warn(warn_msg) - rslt[name] = rslt[name].apply( - lambda sas_date_float: datetime(1960, 1, 1) - + timedelta(days=sas_date_float) - ) + rslt[name] = self._convert_dates(rslt[name]) elif self.column_formats[j] in const.sas_datetime_formats: - try: - rslt[name] = pd.to_datetime( - rslt[name], unit="s", origin="1960-01-01" - ) - except OutOfBoundsDatetime: - # convert to datetime.date rather than np.datetime64 - # SAS float64 lacks precision for more than ms - # resolution so the fit to datetime.datetime is ok - warn(warn_msg) - rslt[name] = rslt[name].apply( - lambda sas_date_float: datetime(1960, 1, 1) - + timedelta(seconds=sas_date_float) - ) + rslt[name] = self._convert_datetimes(rslt[name]) jb += 1 elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] From 096ae1cfe74f083387495135bb7a9a2d3ffd217e Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 11 Dec 2019 13:40:30 +0100 Subject: [PATCH 11/23] removed warnings --- pandas/io/sas/sas7bdat.py | 49 ++++++++++------------------ pandas/tests/io/sas/test_sas7bdat.py | 24 +++++++------- 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 2c0f3873d050e..ab129c17c1ee3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -690,37 +690,24 @@ def _read_next_page(self): return False @staticmethod - def _convert_dates(sas_dates): + def _convert_datetimes(sas_datetimes, unit): + """Converts to np.datetime64 if possible, otherwise to dateime.datetime + (n.b. generally better support in pandas for datetime than date). + SAS float64 lacks precision for more than ms resolution so the fit + to datetime.datetime is ok + """ try: - return pd.to_datetime(sas_dates, unit="d", origin="1960-01-01") + return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") except OutOfBoundsDatetime: - # convert to datetime.datetime rather than np.datetime64 - # NB generally better support in pandas for datetime than date - warn( - "SAS date > pandas.Timestamp.max, returning " - "datetime.datetime objects instead" - ) - return sas_dates.apply( - lambda sas_date_float: datetime(1960, 1, 1) - + timedelta(days=sas_date_float) - ) - - @staticmethod - def _convert_datetimes(sas_datetimes): - try: - return pd.to_datetime(sas_datetimes, unit="s", origin="1960-01-01") - except OutOfBoundsDatetime: - # convert to datetime.datetime rather than np.datetime64 - # SAS float64 lacks precision for more than ms resolution so the - # fit to datetime.datetime is ok - warn( - "SAS datetime > pandas.Timestamp.max, returning " - "datetime.datetime objects instead" - ) - return sas_datetimes.apply( - lambda sas_dt_float: datetime(1960, 1, 1) - + timedelta(seconds=sas_dt_float) - ) + if unit == "s": + return sas_datetimes.apply( + lambda sas_float: datetime(1960, 1, 1) + + timedelta(seconds=sas_float) + ) + elif unit == "d": + return sas_datetimes.apply( + lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float) + ) def _chunk_to_dataframe(self): @@ -739,9 +726,9 @@ def _chunk_to_dataframe(self): rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: if self.column_formats[j] in const.sas_date_formats: - rslt[name] = self._convert_dates(rslt[name]) + rslt[name] = self._convert_datetimes(rslt[name], "d") elif self.column_formats[j] in const.sas_datetime_formats: - rslt[name] = self._convert_datetimes(rslt[name]) + rslt[name] = self._convert_datetimes(rslt[name], "s") jb += 1 elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 331379464471e..900509f9beed2 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -233,8 +233,7 @@ def test_max_sas_date(datapath): # but this is read as 29DEC9999:23:59:59.998993 by a buggy # sas7bdat module fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") - with pytest.warns(UserWarning): - df = pd.read_sas(fname, encoding="iso-8859-1") + df = pd.read_sas(fname, encoding="iso-8859-1") # SAS likes to left pad strings with spaces - lstrip before comparing df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) # GH 19732: Timestamps imported from sas will incur floating point errors @@ -265,17 +264,16 @@ def test_max_sas_date_iterator(datapath): col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") results = [] - with pytest.warns(UserWarning): - for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): - # SAS likes to left pad strings with spaces - lstrip before comparing - df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) - # GH 19732: Timestamps imported from sas will incur floating point errors - try: - df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") - except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: - df = df.applymap(round_datetime_to_ms) - df.reset_index(inplace=True, drop=True) - results.append(df) + for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating point errors + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.applymap(round_datetime_to_ms) + df.reset_index(inplace=True, drop=True) + results.append(df) expected = [ pd.DataFrame( { From 922c9d3cc07bd4cb5f89ff84a9caa614b6d53a21 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 11 Dec 2019 16:34:56 +0100 Subject: [PATCH 12/23] tidied trailing whitespace --- pandas/io/sas/sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 856933364c2f2..5501684630ac1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -691,7 +691,7 @@ def _read_next_page(self): def _convert_datetimes(sas_datetimes, unit): """Converts to np.datetime64 if possible, otherwise to dateime.datetime (n.b. generally better support in pandas for datetime than date). - SAS float64 lacks precision for more than ms resolution so the fit + SAS float64 lacks precision for more than ms resolution so the fit to datetime.datetime is ok """ try: From 828e6875a4000f4bc23fc9e98fd1073c1335bc88 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 27 Jan 2020 15:50:23 +0100 Subject: [PATCH 13/23] moved static method to module level function --- pandas/io/sas/sas7bdat.py | 49 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 5501684630ac1..97644e7a946d1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -27,6 +27,31 @@ import pandas.io.sas.sas_constants as const +def _convert_datetimes(sas_datetimes: pd.Series, unit: str): + """ + Convert to Timestamp if possible, otherwise to datetime.datetime. + SAS float64 lacks precision for more than ms resolution so the fit + to datetime.datetime is ok. + + Parameters + ---------- + sas_datetimes : Series of 64bit floats representing dates or datetimes + in SAS + unit : "d" if the floats represent dates, "t" for datetimes + """ + try: + return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") + except OutOfBoundsDatetime: + if unit == "s": + return sas_datetimes.apply( + lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float) + ) + elif unit == "d": + return sas_datetimes.apply( + lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float) + ) + + class _subheader_pointer: pass @@ -687,26 +712,6 @@ def _read_next_page(self): return False - @staticmethod - def _convert_datetimes(sas_datetimes, unit): - """Converts to np.datetime64 if possible, otherwise to dateime.datetime - (n.b. generally better support in pandas for datetime than date). - SAS float64 lacks precision for more than ms resolution so the fit - to datetime.datetime is ok - """ - try: - return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") - except OutOfBoundsDatetime: - if unit == "s": - return sas_datetimes.apply( - lambda sas_float: datetime(1960, 1, 1) - + timedelta(seconds=sas_float) - ) - elif unit == "d": - return sas_datetimes.apply( - lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float) - ) - def _chunk_to_dataframe(self): n = self._current_row_in_chunk_index @@ -724,9 +729,9 @@ def _chunk_to_dataframe(self): rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: if self.column_formats[j] in const.sas_date_formats: - rslt[name] = self._convert_datetimes(rslt[name], "d") + rslt[name] = _convert_datetimes(rslt[name], "d") elif self.column_formats[j] in const.sas_datetime_formats: - rslt[name] = self._convert_datetimes(rslt[name], "s") + rslt[name] = _convert_datetimes(rslt[name], "s") jb += 1 elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] From 6279ea7d063c15cb684b0e55b26905ae59b40cbc Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Wed, 29 Jan 2020 08:06:40 +0100 Subject: [PATCH 14/23] fixed import sorting --- pandas/io/sas/sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index f5e0e9e83989a..24608bd55b5d4 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,8 +13,8 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ -from datetime import datetime, timedelta from collections import abc +from datetime import datetime, timedelta import struct import numpy as np From b38c1656a300a3aee99be4e04352d75539685347 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 10 Feb 2020 16:46:20 +0100 Subject: [PATCH 15/23] moved whatsnew to 1.1, added space before comment --- doc/source/whatsnew/v1.0.0.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 1 + pandas/tests/io/sas/test_sas7bdat.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ebc13afe0c747..6597b764581a4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1186,7 +1186,6 @@ I/O - Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) - :func:`read_excel` now accepts binary data (:issue:`15914`) - Bug in :meth:`read_csv` in which encoding handling was limited to just the string `utf-16` for the C engine (:issue:`24130`) -- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) Plotting ^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40abb8f83de2f..daf3806cd2188 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -186,6 +186,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue: `20927`) Plotting ^^^^^^^^ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 8e0c6da577070..f184200ace50a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -230,6 +230,7 @@ def test_max_sas_date(datapath): # sas7bdat module fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") df = pd.read_sas(fname, encoding="iso-8859-1") + # SAS likes to left pad strings with spaces - lstrip before comparing df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) # GH 19732: Timestamps imported from sas will incur floating point errors From 3eaa34d2c949a0fe52fbfd6030f859437b07c2ca Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 10 Feb 2020 19:55:14 +0100 Subject: [PATCH 16/23] tidied float constants --- pandas/tests/io/sas/test_sas7bdat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index f184200ace50a..ea65cfe6a6560 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -245,7 +245,7 @@ def test_max_sas_date(datapath): datetime(9999, 12, 29, 23, 59, 59, 999000), datetime(2019, 8, 1, 23, 59, 59, 999000), ], - "date_as_float": [float(2936547), float(21762)], + "date_as_float": [2936547., 21762.], "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)], }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], @@ -277,7 +277,7 @@ def test_max_sas_date_iterator(datapath): "text": ["max"], "dt_as_float": [253717747199.999], "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)], - "date_as_float": [float(2936547)], + "date_as_float": [2936547.], "date_as_date": [datetime(9999, 12, 29)], }, columns=col_order, @@ -287,7 +287,7 @@ def test_max_sas_date_iterator(datapath): "text": ["normal"], "dt_as_float": [1880323199.999], "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")], - "date_as_float": [float(21762)], + "date_as_float": [21762.], "date_as_date": [np.datetime64("2019-08-01")], }, columns=col_order, From deb313cb2776029003e743e3894d6971fb1b4cf7 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 10 Feb 2020 21:12:59 +0100 Subject: [PATCH 17/23] fixed linting --- pandas/tests/io/sas/test_sas7bdat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index ea65cfe6a6560..427faf93ea06d 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -245,7 +245,7 @@ def test_max_sas_date(datapath): datetime(9999, 12, 29, 23, 59, 59, 999000), datetime(2019, 8, 1, 23, 59, 59, 999000), ], - "date_as_float": [2936547., 21762.], + "date_as_float": [2936547.0, 21762.0], "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)], }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], @@ -277,7 +277,7 @@ def test_max_sas_date_iterator(datapath): "text": ["max"], "dt_as_float": [253717747199.999], "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)], - "date_as_float": [2936547.], + "date_as_float": [2936547.0], "date_as_date": [datetime(9999, 12, 29)], }, columns=col_order, @@ -287,7 +287,7 @@ def test_max_sas_date_iterator(datapath): "text": ["normal"], "dt_as_float": [1880323199.999], "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")], - "date_as_float": [21762.], + "date_as_float": [21762.0], "date_as_date": [np.datetime64("2019-08-01")], }, columns=col_order, From 80acc7d1c88c61caf1af1ce289c236acdf2b9cdc Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 21 May 2020 18:44:37 +0200 Subject: [PATCH 18/23] fixed failing test --- pandas/io/sas/sas7bdat.py | 7 ++++--- pandas/tests/io/sas/test_sas7bdat.py | 6 ++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index be9e97d7b0406..78f32702957db 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -37,9 +37,10 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str): Parameters ---------- - sas_datetimes : Series of 64bit floats representing dates or datetimes - in SAS - unit : "d" if the floats represent dates, "t" for datetimes + sas_datetimes : {Series, Sequence[float]} + Dates or datetimes in SAS + unit : {str} + "d" if the floats represent dates, "s" for datetimes """ try: return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 427faf93ea06d..9a9d8c2b8d1d1 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,4 +1,5 @@ from datetime import datetime +import dateutil.parser import io import os from pathlib import Path @@ -219,6 +220,9 @@ def test_zero_variables(datapath): def round_datetime_to_ms(ts): if isinstance(ts, datetime): return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000) + elif isinstance(ts, str): + _ts = dateutil.parser.parse(timestr=ts) + return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000) else: return ts @@ -269,6 +273,8 @@ def test_max_sas_date_iterator(datapath): df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: df = df.applymap(round_datetime_to_ms) + except AttributeError as e: + df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) df.reset_index(inplace=True, drop=True) results.append(df) expected = [ From e6403baab125c76848c44e8e8813cfacc934448a Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 21 May 2020 19:14:15 +0200 Subject: [PATCH 19/23] fixed failing test --- pandas/tests/io/sas/test_sas7bdat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 9a9d8c2b8d1d1..92ea3ac3de0bb 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -238,7 +238,12 @@ def test_max_sas_date(datapath): # SAS likes to left pad strings with spaces - lstrip before comparing df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x) # GH 19732: Timestamps imported from sas will incur floating point errors - df = df.applymap(round_datetime_to_ms) + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.applymap(round_datetime_to_ms) + except AttributeError as e: + df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) # if there are any date/times > pandas.Timestamp.max then ALL in that chunk # are returned as datetime.datetime expected = pd.DataFrame( From 7b7984a12866da4121b82250961cf050b92db011 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 21 May 2020 20:07:27 +0200 Subject: [PATCH 20/23] fixed linting --- pandas/io/sas/sas7bdat.py | 2 +- pandas/tests/io/sas/test_sas7bdat.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 78f32702957db..c0db90c27c76e 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -37,7 +37,7 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str): Parameters ---------- - sas_datetimes : {Series, Sequence[float]} + sas_datetimes : {Series, Sequence[float]} Dates or datetimes in SAS unit : {str} "d" if the floats represent dates, "s" for datetimes diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 92ea3ac3de0bb..9459b8b7d0ae9 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -242,7 +242,7 @@ def test_max_sas_date(datapath): df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: df = df.applymap(round_datetime_to_ms) - except AttributeError as e: + except AttributeError: df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) # if there are any date/times > pandas.Timestamp.max then ALL in that chunk # are returned as datetime.datetime @@ -278,7 +278,7 @@ def test_max_sas_date_iterator(datapath): df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: df = df.applymap(round_datetime_to_ms) - except AttributeError as e: + except AttributeError: df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) df.reset_index(inplace=True, drop=True) results.append(df) @@ -304,5 +304,5 @@ def test_max_sas_date_iterator(datapath): columns=col_order, ), ] - for r, e in zip(results, expected): - tm.assert_frame_equal(r, e) + for result, expected in zip(results, expected): + tm.assert_frame_equal(result, expected) From 6ca157711523d8a81e5eef2e90e20e2ed1e0b89f Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Thu, 21 May 2020 20:43:38 +0200 Subject: [PATCH 21/23] fixed import order --- pandas/tests/io/sas/test_sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 9459b8b7d0ae9..8c14f9de9f61c 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,9 +1,9 @@ from datetime import datetime -import dateutil.parser import io import os from pathlib import Path +import dateutil.parser import numpy as np import pytest From 17b6515d226cce70635b8bd6856f798a63ff524d Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 25 May 2020 21:12:14 +0200 Subject: [PATCH 22/23] Added return type. Removed space before issue number. --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/sas/sas7bdat.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cb2820f2de985..278ba91b2632e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -781,7 +781,7 @@ I/O timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue: `20927`) +- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) - :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c0db90c27c76e..5412395c812e1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -29,7 +29,7 @@ from pandas.io.sas.sasreader import ReaderBase -def _convert_datetimes(sas_datetimes: pd.Series, unit: str): +def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: """ Convert to Timestamp if possible, otherwise to datetime.datetime. SAS float64 lacks precision for more than ms resolution so the fit @@ -41,6 +41,11 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str): Dates or datetimes in SAS unit : {str} "d" if the floats represent dates, "s" for datetimes + + Returns + ------- + Series + Series of datetime64 dtype or datetime.datetime. """ try: return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") From a654327c617df5665363a7b5f7455e038825a031 Mon Sep 17 00:00:00 2001 From: Paul Lilley Date: Mon, 25 May 2020 22:23:06 +0200 Subject: [PATCH 23/23] Fixed mypy error. --- pandas/io/sas/sas7bdat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 5412395c812e1..c8f1336bcec60 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -58,6 +58,8 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: return sas_datetimes.apply( lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float) ) + else: + raise ValueError("unit must be 'd' or 's'") class _subheader_pointer: