diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 405b8cc0a5ded..252a618150aff 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -28,6 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the SAS file. (:issue:`48048`) - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index bdefb6f42d8bd..91c5e6b227c35 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -147,8 +147,10 @@ class SAS7BDATReader(ReaderBase, abc.Iterator): chunksize : int, defaults to None Return SAS7BDATReader object for iterations, returns chunks with given number of lines. - encoding : string, defaults to None - String encoding. + encoding : str, 'infer', defaults to None + String encoding according to Python's standard encodings, + encoding='infer' tries to detect the encoding from the file header, + encoding=None will leave the data in binary format. convert_text : bool, defaults to True If False, text variables are left as raw bytes. 
convert_header_text : bool, defaults to True @@ -265,9 +267,11 @@ def _get_properties(self) -> None: # Get encoding information buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] if buf in const.encoding_names: - self.file_encoding = const.encoding_names[buf] + self.inferred_encoding = const.encoding_names[buf] + if self.encoding == "infer": + self.encoding = self.inferred_encoding else: - self.file_encoding = f"unknown (code={buf})" + self.inferred_encoding = f"unknown (code={buf})" # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index 69bc16e6d294f..a090b8a1acb3c 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -107,15 +107,64 @@ compression_literals: Final = [rle_compression, rdc_compression] # Incomplete list of encodings, using SAS nomenclature: -# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm +# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html +# corresponding to the Python documentation of standard encodings +# https://docs.python.org/3/library/codecs.html#standard-encodings encoding_names: Final = { - 29: "latin1", 20: "utf-8", + 29: "latin1", + 30: "latin2", + 31: "latin3", + 32: "latin4", 33: "cyrillic", - 60: "wlatin2", - 61: "wcyrillic", - 62: "wlatin1", - 90: "ebcdic870", + 34: "arabic", + 35: "greek", + 36: "hebrew", + 37: "latin5", + 38: "latin6", + 39: "cp874", + 40: "latin9", + 41: "cp437", + 42: "cp850", + 43: "cp852", + 44: "cp857", + 45: "cp858", + 46: "cp862", + 47: "cp864", + 48: "cp865", + 49: "cp866", + 50: "cp869", + 51: "cp874", + # 52: "", # not found + # 53: "", # not found + # 54: "", # not found + 55: "cp720", + 56: "cp737", + 57: "cp775", + 58: "cp860", + 59: "cp863", + 60: "cp1250", + 61: "cp1251", + 62: "cp1252", + 63: "cp1253", + 64: "cp1254", + 65: 
"cp1255", + 66: "cp1256", + 67: "cp1257", + 68: "cp1258", + 118: "cp950", + # 119: "", # not found + 123: "big5", + 125: "gb2312", + 126: "cp936", + 134: "euc_jp", + 136: "cp932", + 138: "shift_jis", + 140: "euc-kr", + 141: "cp949", + 227: "latin8", + # 228: "", # not found + # 229: "" # not found } diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 2b7ecbcdf9f80..cee416ac218de 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -136,6 +136,21 @@ def test_encoding_options(datapath): assert x == y.decode() +def test_encoding_infer(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") + + with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader: + # check: is encoding inferred correctly from file + assert df1_reader.inferred_encoding == "cp1252" + df1 = df1_reader.read() + + with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader: + df2 = df2_reader.read() + + # check: reader reads correct information + tm.assert_frame_equal(df1, df2) + + def test_productsales(datapath): fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding="utf-8")