ENH: allow user to infer SAS file encoding; add correct encoding names (pandas-dev#48050)

YYYasin19 · noatamir · commit 20fe7d49c890 · 2022-11-09T22:58:18.000+01:00
* ENH: allow user to infer SAS file encoding; use detected encoding; add correct encoding names

* remove typing annotation and assert error message

* include initial release documentation

* rewrite test: no type hinting + context manager call

* add encoding constants for the whole SAS encoding documentation -- incl. explicit "not found" placeholders

* moved release note to 1.6.0
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -28,6 +28,7 @@ enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- :func:`read_sas` now supports ``encoding='infer'`` to detect and use the encoding specified in the SAS file header. (:issue:`48048`)
 - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -147,8 +147,10 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
     chunksize : int, defaults to None
         Return SAS7BDATReader object for iterations, returns chunks
         with given number of lines.
-    encoding : string, defaults to None
-        String encoding.
+    encoding : str, 'infer', defaults to None
+        String encoding according to Python's standard encodings.
+        encoding='infer' tries to detect the encoding from the file header;
+        encoding=None leaves text data as raw bytes.
     convert_text : bool, defaults to True
         If False, text variables are left as raw bytes.
     convert_header_text : bool, defaults to True
@@ -265,9 +267,11 @@ def _get_properties(self) -> None:
         # Get encoding information
         buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
         if buf in const.encoding_names:
-            self.file_encoding = const.encoding_names[buf]
+            self.inferred_encoding = const.encoding_names[buf]
+            if self.encoding == "infer":
+                self.encoding = self.inferred_encoding
         else:
-            self.file_encoding = f"unknown (code={buf})"
+            self.inferred_encoding = f"unknown (code={buf})"
 
         # Get platform information
         buf = self._read_bytes(const.platform_offset, const.platform_length)
diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py
@@ -107,15 +107,64 @@
 compression_literals: Final = [rle_compression, rdc_compression]
 
 # Incomplete list of encodings, using SAS nomenclature:
-# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
+# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
+# mapped to the corresponding codec names from Python's standard encodings:
+# https://docs.python.org/3/library/codecs.html#standard-encodings
 encoding_names: Final = {
-    29: "latin1",
     20: "utf-8",
+    29: "latin1",
+    30: "latin2",
+    31: "latin3",
+    32: "latin4",
     33: "cyrillic",
-    60: "wlatin2",
-    61: "wcyrillic",
-    62: "wlatin1",
-    90: "ebcdic870",
+    34: "arabic",
+    35: "greek",
+    36: "hebrew",
+    37: "latin5",
+    38: "latin6",
+    39: "cp874",
+    40: "latin9",
+    41: "cp437",
+    42: "cp850",
+    43: "cp852",
+    44: "cp857",
+    45: "cp858",
+    46: "cp862",
+    47: "cp864",
+    48: "cp865",
+    49: "cp866",
+    50: "cp869",
+    51: "cp874",
+    # 52: "",  # not found
+    # 53: "",  # not found
+    # 54: "",  # not found
+    55: "cp720",
+    56: "cp737",
+    57: "cp775",
+    58: "cp860",
+    59: "cp863",
+    60: "cp1250",
+    61: "cp1251",
+    62: "cp1252",
+    63: "cp1253",
+    64: "cp1254",
+    65: "cp1255",
+    66: "cp1256",
+    67: "cp1257",
+    68: "cp1258",
+    118: "cp950",
+    # 119: "",  # not found
+    123: "big5",
+    125: "gb2312",
+    126: "cp936",
+    134: "euc_jp",
+    136: "cp932",
+    138: "shift_jis",
+    140: "euc-kr",
+    141: "cp949",
+    227: "latin8",
+    # 228: "",  # not found
+    # 229: "",  # not found
 }
 
 
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -136,6 +136,21 @@ def test_encoding_options(datapath):
         assert x == y.decode()
 
 
+def test_encoding_infer(datapath):
+    fname = datapath("io", "sas", "data", "test1.sas7bdat")
+
+    with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader:
+        # check: is encoding inferred correctly from file
+        assert df1_reader.inferred_encoding == "cp1252"
+        df1 = df1_reader.read()
+
+    with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader:
+        df2 = df2_reader.read()
+
+    # check: reader reads correct information
+    tm.assert_frame_equal(df1, df2)
+
+
 def test_productsales(datapath):
     fname = datapath("io", "sas", "data", "productsales.sas7bdat")
     df = pd.read_sas(fname, encoding="utf-8")