From f5293f70c2e4d92249f7fe707728c33bca6fd0bf Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Thu, 11 Aug 2022 23:44:44 +0200 Subject: [PATCH 1/6] ENH: allow user to infer SAS file encoding; use detected encoding; add correct encoding names --- pandas/io/sas/sas7bdat.py | 12 ++++++++---- pandas/io/sas/sas_constants.py | 17 ++++++++++++----- pandas/tests/io/sas/test_sas7bdat.py | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7282affe1b5e6..7af411ad094d3 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -147,8 +147,10 @@ class SAS7BDATReader(ReaderBase, abc.Iterator): chunksize : int, defaults to None Return SAS7BDATReader object for iterations, returns chunks with given number of lines. - encoding : string, defaults to None - String encoding. + encoding : str, 'infer', defaults to None + String encoding acc. to python standard encodings, + encoding='infer' tries to detect the encoding from the file header, + encoding=None will leave the data in binary format. convert_text : bool, defaults to True If False, text variables are left as raw bytes. convert_header_text : bool, defaults to True @@ -265,9 +267,11 @@ def _get_properties(self) -> None: # Get encoding information buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] if buf in const.encoding_names: - self.file_encoding = const.encoding_names[buf] + self.inferred_encoding = const.encoding_names[buf] + if self.encoding == "infer": + self.encoding = self.inferred_encoding else: - self.file_encoding = f"unknown (code={buf})" + self.inferred_encoding = f"unknown (code={buf})" # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index 69bc16e6d294f..23dfd237cb90f 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -107,14 +107,21 @@ compression_literals: Final = [rle_compression, rdc_compression] # Incomplete list of encodings, using SAS nomenclature: -# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm +# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html +# corresponding to the Python documentation of standard encodings +# https://docs.python.org/3/library/codecs.html#standard-encodings encoding_names: Final = { - 29: "latin1", 20: "utf-8", + 29: "latin1", + 30: "latin2", + 31: "latin3", + 32: "latin4", 33: "cyrillic", - 60: "wlatin2", - 61: "wcyrillic", - 62: "wlatin1", + 34: "arabic", + 35: "greek", + 60: "cp1250", # wlatin2 + 61: "cp1251", # wcyrillic + 62: "cp1252", # wlatin1 90: "ebcdic870", } diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 2b7ecbcdf9f80..d21098d94aff7 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -136,6 +136,28 @@ def test_encoding_options(datapath): assert x == y.decode() +def test_encoding_infer(datapath): + from pandas.io.sas.sas7bdat import SAS7BDATReader + + fname = datapath("io", "sas", "data", "test1.sas7bdat") + + # check if inferred correctly + df1_reader: SAS7BDATReader = pd.read_sas(fname, encoding="infer", iterator=True) + assert ( + df1_reader.inferred_encoding == "cp1252" + ), f""" + Encoding has been inferred incorrectly: + {df1_reader.inferred_encoding} instead of 'cp1252' + """ + + # check if the reader reads correctly with encoding + df1 = df1_reader.read() + df2_reader: SAS7BDATReader = pd.read_sas(fname, encoding="cp1252", iterator=True) + df2 = df2_reader.read() + + tm.assert_frame_equal(df1, df2) + + def test_productsales(datapath): fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") From 87fb92c2d01b542c81dfa931c970f4d070f58ea7 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Sat, 13 Aug 2022 00:46:09 +0200 Subject: [PATCH 2/6] remove typing annotation and assert error message --- pandas/tests/io/sas/test_sas7bdat.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index d21098d94aff7..b1f43e873189d 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -142,13 +142,8 @@ def test_encoding_infer(datapath): fname = datapath("io", "sas", "data", "test1.sas7bdat") # check if inferred correctly - df1_reader: SAS7BDATReader = pd.read_sas(fname, encoding="infer", iterator=True) - assert ( - df1_reader.inferred_encoding == "cp1252" - ), f""" - Encoding has been inferred incorrectly: - {df1_reader.inferred_encoding} instead of 'cp1252' - """ + df1_reader = pd.read_sas(fname, encoding="infer", iterator=True) + assert df1_reader.inferred_encoding == "cp1252" # check if the reader reads correctly with encoding df1 = df1_reader.read() From f637661831f730aa2c6ef3f3af44b1d914fe568f Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Sat, 13 Aug 2022 01:00:29 +0200 Subject: [PATCH 3/6] include initial release documentation --- doc/source/whatsnew/v1.4.4.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 56b1254d8a359..54f386739c8b1 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -9,6 +9,17 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- +.. _whatsnew_150.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_150.enhancements.read_sas-encoding: + +read_sas infers encoding from file +^^^^^^^^ + +:func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`) .. _whatsnew_144.regressions: From b4d7346fe1d151780fc0506cc0678eb7637e0b70 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 16 Aug 2022 19:58:35 +0200 Subject: [PATCH 4/6] rewrite test: no type hinting + context manager call --- pandas/tests/io/sas/test_sas7bdat.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b1f43e873189d..cee416ac218de 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -137,19 +137,17 @@ def test_encoding_options(datapath): def test_encoding_infer(datapath): - from pandas.io.sas.sas7bdat import SAS7BDATReader - fname = datapath("io", "sas", "data", "test1.sas7bdat") - # check if inferred correctly - df1_reader = pd.read_sas(fname, encoding="infer", iterator=True) - assert df1_reader.inferred_encoding == "cp1252" + with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader: + # check: is encoding inferred correctly from file + assert df1_reader.inferred_encoding == "cp1252" + df1 = df1_reader.read() - # check if the reader reads correctly with encoding - df1 = df1_reader.read() - df2_reader: SAS7BDATReader = pd.read_sas(fname, encoding="cp1252", iterator=True) - df2 = df2_reader.read() + with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader: + df2 = df2_reader.read() + # check: reader reads correct information tm.assert_frame_equal(df1, df2) From 0d93338da52d38b4f7d045066fb0948df21072d9 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Tue, 16 Aug 2022 21:42:01 +0200 Subject: [PATCH 5/6] add encoding constant for whole documentation -- incl. explicit not found references --- pandas/io/sas/sas_constants.py | 50 +++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index 23dfd237cb90f..a090b8a1acb3c 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -119,10 +119,52 @@ 33: "cyrillic", 34: "arabic", 35: "greek", - 60: "cp1250", # wlatin2 - 61: "cp1251", # wcyrillic - 62: "cp1252", # wlatin1 - 90: "ebcdic870", + 36: "hebrew", + 37: "latin5", + 38: "latin6", + 39: "cp874", + 40: "latin9", + 41: "cp437", + 42: "cp850", + 43: "cp852", + 44: "cp857", + 45: "cp858", + 46: "cp862", + 47: "cp864", + 48: "cp865", + 49: "cp866", + 50: "cp869", + 51: "cp874", + # 52: "", # not found + # 53: "", # not found + # 54: "", # not found + 55: "cp720", + 56: "cp737", + 57: "cp775", + 58: "cp860", + 59: "cp863", + 60: "cp1250", + 61: "cp1251", + 62: "cp1252", + 63: "cp1253", + 64: "cp1254", + 65: "cp1255", + 66: "cp1256", + 67: "cp1257", + 68: "cp1258", + 118: "cp950", + # 119: "", # not found + 123: "big5", + 125: "gb2312", + 126: "cp936", + 134: "euc_jp", + 136: "cp932", + 138: "shift_jis", + 140: "euc-kr", + 141: "cp949", + 227: "latin8", + # 228: "", # not found + # 229: "" # not found } From 4f77c09bf088b4a1007a4f027dbe77725fc21374 Mon Sep 17 00:00:00 2001 From: Yasin Tatar Date: Wed, 31 Aug 2022 18:20:45 +0200 Subject: [PATCH 6/6] moved release note to 1.6.0 --- doc/source/whatsnew/v1.4.4.rst | 11 ----------- doc/source/whatsnew/v1.6.0.rst | 3 +-- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 54f386739c8b1..56b1254d8a359 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -9,17 +9,6 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- -.. _whatsnew_150.enhancements: - -Enhancements -~~~~~~~~~~~~ - -.. _whatsnew_150.enhancements.read_sas-encoding: - -read_sas infers encoding from file -^^^^^^^^ - -:func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`) .. _whatsnew_144.regressions: diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 83dfacb46784b..24e7cd58d8d35 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -28,8 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- -- +- :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`) .. --------------------------------------------------------------------------- .. _whatsnew_160.notable_bug_fixes: