Skip to content

Commit 1af73b3

Browse files
committed
Further encoding work
1 parent c26d22b commit 1af73b3

File tree

2 files changed

+36
-13
lines changed

2 files changed

+36
-13
lines changed

pandas/io/sas/sas7bdat.py

+17-13
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
7373
self.convert_text = convert_text
7474
self.convert_header_text = convert_header_text
7575

76+
self.default_encoding = "latin-1"
7677
self.compression = ""
7778
self.column_names_strings = []
7879
self.column_names = []
@@ -149,12 +150,12 @@ def _get_properties(self):
149150
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
150151
self.name = buf.rstrip(b'\x00 ')
151152
if self.convert_header_text:
152-
self.name = self.name.decode(self.encoding)
153+
self.name = self.name.decode(self.encoding or self.default_encoding)
153154

154155
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
155156
self.file_type = buf.rstrip(b'\x00 ')
156157
if self.convert_header_text:
157-
self.file_type = self.file_type.decode(self.encoding)
158+
self.file_type = self.file_type.decode(self.encoding or self.default_encoding)
158159

159160
# Timestamp is epoch 01/01/1960
160161
epoch = pd.datetime(1960, 1, 1)
@@ -183,31 +184,31 @@ def _get_properties(self):
183184
const.sas_release_length)
184185
self.sas_release = buf.rstrip(b'\x00 ')
185186
if self.convert_header_text:
186-
self.sas_release = self.sas_release.decode(self.encoding)
187+
self.sas_release = self.sas_release.decode(self.encoding or self.default_encoding)
187188

188189
buf = self._read_bytes(const.sas_server_type_offset + total_align,
189190
const.sas_server_type_length)
190191
self.server_type = buf.rstrip(b'\x00 ')
191192
if self.convert_header_text:
192-
self.server_type = self.server_type.decode(self.encoding)
193+
self.server_type = self.server_type.decode(self.encoding or self.default_encoding)
193194

194195
buf = self._read_bytes(const.os_version_number_offset + total_align,
195196
const.os_version_number_length)
196197
self.os_version = buf.rstrip(b'\x00 ')
197198
if self.convert_header_text:
198-
self.os_version = self.os_version.decode(self.encoding)
199+
self.os_version = self.os_version.decode(self.encoding or self.default_encoding)
199200

200201
buf = self._read_bytes(const.os_name_offset + total_align,
201202
const.os_name_length)
202203
buf = buf.rstrip(b'\x00 ')
203204
if len(buf) > 0:
204-
self.os_name = buf.decode(self.encoding)
205+
self.os_name = buf.decode(self.encoding or self.default_encoding)
205206
else:
206207
buf = self._read_bytes(const.os_maker_offset + total_align,
207208
const.os_maker_length)
208209
self.os_name = buf.rstrip(b'\x00 ')
209210
if self.convert_header_text:
210-
self.os_name = self.os_name.decode(self.encoding)
211+
self.os_name = self.os_name.decode(self.encoding or self.default_encoding)
211212

212213
# Read a single float of the given width (4 or 8).
213214
def _read_float(self, offset, width):
@@ -401,14 +402,14 @@ def _process_columntext_subheader(self, offset, length):
401402
buf = self._read_bytes(offset, text_block_size)
402403
cname = buf[0:text_block_size].rstrip(b"\x00 ")
403404
if self.convert_header_text:
404-
cname = cname.decode(self.encoding)
405+
cname = cname.decode(self.encoding or self.default_encoding)
405406
self.column_names_strings.append(cname)
406407

407408
if len(self.column_names_strings) == 1:
408409
column_name = self.column_names_strings[0]
409410
compression_literal = ""
410411
for cl in const.compression_literals:
411-
if cl in column_name:
412+
if cl in str(column_name):
412413
compression_literal = cl
413414
self.compression = compression_literal
414415
offset -= self._int_length
@@ -425,20 +426,23 @@ def _process_columntext_subheader(self, offset, length):
425426
if self.U64:
426427
offset1 += 4
427428
buf = self._read_bytes(offset1, self._lcp)
428-
self.creator_proc = buf[0:self._lcp].decode()
429+
self.creator_proc = buf[0:self._lcp]
429430
elif compression_literal == const.rle_compression:
430431
offset1 = offset + 40
431432
if self.U64:
432433
offset1 += 4
433434
buf = self._read_bytes(offset1, self._lcp)
434-
self.creator_proc = buf[0:self._lcp].decode()
435+
self.creator_proc = buf[0:self._lcp]
435436
elif self._lcs > 0:
436437
self._lcp = 0
437438
offset1 = offset + 16
438439
if self.U64:
439440
offset1 += 4
440441
buf = self._read_bytes(offset1, self._lcs)
441-
self.creator_proc = buf[0:self._lcp].decode()
442+
self.creator_proc = buf[0:self._lcp]
443+
if self.convert_header_text:
444+
if hasattr(self, "creator_proc"):
445+
self.creator_proc = self.creator_proc.decode(self.encoding or self.default_encoding)
442446

443447
def _process_columnname_subheader(self, offset, length):
444448
int_len = self._int_length
@@ -624,7 +628,7 @@ def _chunk_to_dataframe(self):
624628
elif self.column_types[j] == b's':
625629
rslt[name] = self._string_chunk[js, :]
626630
if self.convert_text and (self.encoding is not None):
627-
rslt[name] = rslt[name].str.decode(self.encoding)
631+
rslt[name] = rslt[name].str.decode(self.encoding or self.default_encoding)
628632
if self.blank_missing:
629633
ii = rslt[name].str.len() == 0
630634
rslt.loc[ii, name] = np.nan

pandas/io/tests/sas/test_sas7bdat.py

+19
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,25 @@ def test_from_iterator(self):
6464
tm.assert_frame_equal(df, df0.iloc[2:5, :])
6565

6666

67+
def test_encoding_options():
68+
dirpath = tm.get_data_path()
69+
fname = os.path.join(dirpath, "test1.sas7bdat")
70+
df1 = pd.read_sas(fname)
71+
df2 = pd.read_sas(fname, encoding='utf-8')
72+
for col in df1.columns:
73+
try:
74+
df1[col] = df1[col].str.decode('utf-8')
75+
except AttributeError:
76+
pass
77+
tm.assert_frame_equal(df1, df2)
78+
79+
from pandas.io.sas.sas7bdat import SAS7BDATReader
80+
rdr = SAS7BDATReader(fname, convert_header_text=False)
81+
df3 = rdr.read()
82+
for x,y in zip(df1.columns, df3.columns):
83+
assert(x == y.decode())
84+
85+
6786
def test_productsales():
6887
dirpath = tm.get_data_path()
6988
fname = os.path.join(dirpath, "productsales.sas7bdat")

0 commit comments

Comments
 (0)