Skip to content

Commit 20fe7d4

Browse files
YYYasin19 authored and noatamir committed
ENH: allow user to infer SAS file encoding; add correct encoding names (pandas-dev#48050)
* ENH: allow user to infer SAS file encoding; use detected encoding; add correct encoding names
* remove typing annotation and assert error message
* include initial release documentation
* rewrite test: no type hinting + context manager call
* add encoding constant for whole documentation -- incl. explicit not found references
* moved release note to 1.6.0
1 parent 72e6d44 commit 20fe7d4

File tree

4 files changed

+79
-10
lines changed

4 files changed

+79
-10
lines changed

doc/source/whatsnew/v1.6.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ enhancement2
2828

2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
31+
- :func:`read_sas` now supports ``encoding='infer'``, which detects the encoding recorded in the SAS file header and uses it to decode the file's text (:issue:`48048`)
3132
- :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
3233
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
3334
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)

pandas/io/sas/sas7bdat.py

+8-4
Original file line number | Diff line number | Diff line change
@@ -147,8 +147,10 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
147147
chunksize : int, defaults to None
148148
Return SAS7BDATReader object for iterations, returns chunks
149149
with given number of lines.
150-
encoding : string, defaults to None
151-
String encoding.
150+
encoding : str, 'infer', defaults to None
151+
String encoding, given as one of Python's standard encoding names,
152+
encoding='infer' tries to detect the encoding from the file header,
153+
encoding=None will leave the data in binary format.
152154
convert_text : bool, defaults to True
153155
If False, text variables are left as raw bytes.
154156
convert_header_text : bool, defaults to True
@@ -265,9 +267,11 @@ def _get_properties(self) -> None:
265267
# Get encoding information
266268
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
267269
if buf in const.encoding_names:
268-
self.file_encoding = const.encoding_names[buf]
270+
self.inferred_encoding = const.encoding_names[buf]
271+
if self.encoding == "infer":
272+
self.encoding = self.inferred_encoding
269273
else:
270-
self.file_encoding = f"unknown (code={buf})"
274+
self.inferred_encoding = f"unknown (code={buf})"
271275

272276
# Get platform information
273277
buf = self._read_bytes(const.platform_offset, const.platform_length)

pandas/io/sas/sas_constants.py

+55-6
Original file line number | Diff line number | Diff line change
@@ -107,15 +107,64 @@
107107
compression_literals: Final = [rle_compression, rdc_compression]
108108

109109
# Incomplete list of SAS encodings, keyed by SAS encoding code and named by the matching Python codec:
110-
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
110+
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
111+
# corresponding to the Python documentation of standard encodings
112+
# https://docs.python.org/3/library/codecs.html#standard-encodings
111113
encoding_names: Final = {
112-
29: "latin1",
113114
20: "utf-8",
115+
29: "latin1",
116+
30: "latin2",
117+
31: "latin3",
118+
32: "latin4",
114119
33: "cyrillic",
115-
60: "wlatin2",
116-
61: "wcyrillic",
117-
62: "wlatin1",
118-
90: "ebcdic870",
120+
34: "arabic",
121+
35: "greek",
122+
36: "hebrew",
123+
37: "latin5",
124+
38: "latin6",
125+
39: "cp874",
126+
40: "latin9",
127+
41: "cp437",
128+
42: "cp850",
129+
43: "cp852",
130+
44: "cp857",
131+
45: "cp858",
132+
46: "cp862",
133+
47: "cp864",
134+
48: "cp865",
135+
49: "cp866",
136+
50: "cp869",
137+
51: "cp874",
138+
# 52: "", # not found
139+
# 53: "", # not found
140+
# 54: "", # not found
141+
55: "cp720",
142+
56: "cp737",
143+
57: "cp775",
144+
58: "cp860",
145+
59: "cp863",
146+
60: "cp1250",
147+
61: "cp1251",
148+
62: "cp1252",
149+
63: "cp1253",
150+
64: "cp1254",
151+
65: "cp1255",
152+
66: "cp1256",
153+
67: "cp1257",
154+
68: "cp1258",
155+
118: "cp950",
156+
# 119: "", # not found
157+
123: "big5",
158+
125: "gb2312",
159+
126: "cp936",
160+
134: "euc_jp",
161+
136: "cp932",
162+
138: "shift_jis",
163+
140: "euc-kr",
164+
141: "cp949",
165+
227: "latin8",
166+
# 228: "", # not found
167+
# 229: "" # not found
119168
}
120169

121170

pandas/tests/io/sas/test_sas7bdat.py

+15
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,21 @@ def test_encoding_options(datapath):
136136
assert x == y.decode()
137137

138138

139+
def test_encoding_infer(datapath):
    """Check that ``encoding="infer"`` reads the same frame as the explicit codec."""
    sas_file = datapath("io", "sas", "data", "test1.sas7bdat")

    # Let the reader pick the encoding up from the file itself.
    with pd.read_sas(sas_file, encoding="infer", iterator=True) as inferring_reader:
        # The detected codec is checked before any rows are read.
        assert inferring_reader.inferred_encoding == "cp1252"
        inferred_frame = inferring_reader.read()

    # Baseline: the same file with the codec spelled out by the caller.
    with pd.read_sas(sas_file, encoding="cp1252", iterator=True) as explicit_reader:
        expected_frame = explicit_reader.read()

    # Both routes must produce identical data.
    tm.assert_frame_equal(inferred_frame, expected_frame)
152+
153+
139154
def test_productsales(datapath):
140155
fname = datapath("io", "sas", "data", "productsales.sas7bdat")
141156
df = pd.read_sas(fname, encoding="utf-8")

0 commit comments

Comments
 (0)