Skip to content

Commit 3f711fc

Browse files
committed
MAINT: Refactor decode
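Merge _null_terminate into _decode so that every string read from the file is truncated at the first null byte and decoded using the file encoding (with a latin-1 fallback)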
1 parent 2aff757 commit 3f711fc

File tree

1 file changed

+18
-29
lines changed

1 file changed

+18
-29
lines changed

pandas/io/stata.py

+18-29
Original file line numberDiff line numberDiff line change
@@ -1136,7 +1136,7 @@ def _get_varlist(self):
11361136
elif self.format_version == 118:
11371137
b = 129
11381138

1139-
return [self._null_terminate(self.path_or_buf.read(b))
1139+
return [self._decode(self.path_or_buf.read(b))
11401140
for i in range(self.nvar)]
11411141

11421142
# Returns the format list
@@ -1150,7 +1150,7 @@ def _get_fmtlist(self):
11501150
else:
11511151
b = 7
11521152

1153-
return [self._null_terminate(self.path_or_buf.read(b))
1153+
return [self._decode(self.path_or_buf.read(b))
11541154
for i in range(self.nvar)]
11551155

11561156
# Returns the label list
@@ -1161,18 +1161,18 @@ def _get_lbllist(self):
11611161
b = 33
11621162
else:
11631163
b = 9
1164-
return [self._null_terminate(self.path_or_buf.read(b))
1164+
return [self._decode(self.path_or_buf.read(b))
11651165
for i in range(self.nvar)]
11661166

11671167
def _get_variable_labels(self):
11681168
if self.format_version == 118:
11691169
vlblist = [self._decode(self.path_or_buf.read(321))
11701170
for i in range(self.nvar)]
11711171
elif self.format_version > 105:
1172-
vlblist = [self._null_terminate(self.path_or_buf.read(81))
1172+
vlblist = [self._decode(self.path_or_buf.read(81))
11731173
for i in range(self.nvar)]
11741174
else:
1175-
vlblist = [self._null_terminate(self.path_or_buf.read(32))
1175+
vlblist = [self._decode(self.path_or_buf.read(32))
11761176
for i in range(self.nvar)]
11771177
return vlblist
11781178

@@ -1191,21 +1191,21 @@ def _get_data_label(self):
11911191
return self._decode(self.path_or_buf.read(strlen))
11921192
elif self.format_version == 117:
11931193
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
1194-
return self._null_terminate(self.path_or_buf.read(strlen))
1194+
return self._decode(self.path_or_buf.read(strlen))
11951195
elif self.format_version > 105:
1196-
return self._null_terminate(self.path_or_buf.read(81))
1196+
return self._decode(self.path_or_buf.read(81))
11971197
else:
1198-
return self._null_terminate(self.path_or_buf.read(32))
1198+
return self._decode(self.path_or_buf.read(32))
11991199

12001200
def _get_time_stamp(self):
12011201
if self.format_version == 118:
12021202
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
12031203
return self.path_or_buf.read(strlen).decode("utf-8")
12041204
elif self.format_version == 117:
12051205
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
1206-
return self._null_terminate(self.path_or_buf.read(strlen))
1206+
return self._decode(self.path_or_buf.read(strlen))
12071207
elif self.format_version > 104:
1208-
return self._null_terminate(self.path_or_buf.read(18))
1208+
return self._decode(self.path_or_buf.read(18))
12091209
else:
12101210
raise ValueError()
12111211

@@ -1266,10 +1266,10 @@ def _read_old_header(self, first_char):
12661266
.format(','.join(str(x) for x in typlist)))
12671267

12681268
if self.format_version > 108:
1269-
self.varlist = [self._null_terminate(self.path_or_buf.read(33))
1269+
self.varlist = [self._decode(self.path_or_buf.read(33))
12701270
for i in range(self.nvar)]
12711271
else:
1272-
self.varlist = [self._null_terminate(self.path_or_buf.read(9))
1272+
self.varlist = [self._decode(self.path_or_buf.read(9))
12731273
for i in range(self.nvar)]
12741274
self.srtlist = struct.unpack(
12751275
self.byteorder + ('h' * (self.nvar + 1)),
@@ -1326,20 +1326,13 @@ def _calcsize(self, fmt):
13261326
struct.calcsize(self.byteorder + fmt))
13271327

13281328
def _decode(self, s):
1329-
s = s.partition(b"\0")[0]
1330-
try:
1331-
return s.decode('utf-8')
1332-
except UnicodeDecodeError:
1333-
# GH 25960
1334-
return s.decode('latin-1')
1335-
1336-
def _null_terminate(self, s):
13371329
# have bytes not strings, so must decode
13381330
s = s.partition(b"\0")[0]
13391331
try:
13401332
return s.decode(self._encoding)
13411333
except UnicodeDecodeError:
1342-
# GH 25960
1334+
# GH 25960: fall back to latin-1 to handle the incorrect format produced
1335+
# when 117 files are converted to 118 files in Stata
13431336
return s.decode('latin-1')
13441337

13451338
def _read_value_labels(self):
@@ -1370,7 +1363,7 @@ def _read_value_labels(self):
13701363
if not slength:
13711364
break # end of value label table (format < 117)
13721365
if self.format_version <= 117:
1373-
labname = self._null_terminate(self.path_or_buf.read(33))
1366+
labname = self._decode(self.path_or_buf.read(33))
13741367
else:
13751368
labname = self._decode(self.path_or_buf.read(129))
13761369
self.path_or_buf.read(3) # padding
@@ -1392,12 +1385,8 @@ def _read_value_labels(self):
13921385
self.value_label_dict[labname] = dict()
13931386
for i in range(n):
13941387
end = off[i + 1] if i < n - 1 else txtlen
1395-
if self.format_version <= 117:
1396-
self.value_label_dict[labname][val[i]] = (
1397-
self._null_terminate(txt[off[i]:end]))
1398-
else:
1399-
self.value_label_dict[labname][val[i]] = (
1400-
self._decode(txt[off[i]:end]))
1388+
self.value_label_dict[labname][val[i]] = \
1389+
self._decode(txt[off[i]:end])
14011390
if self.format_version >= 117:
14021391
self.path_or_buf.read(6) # </lbl>
14031392
self._value_labels_read = True
@@ -1552,7 +1541,7 @@ def read(self, nrows=None, convert_dates=None,
15521541
for col, typ in zip(data, self.typlist):
15531542
if type(typ) is int:
15541543
data[col] = data[col].apply(
1555-
self._null_terminate, convert_dtype=True)
1544+
self._decode, convert_dtype=True)
15561545

15571546
data = self._insert_strls(data)
15581547

0 commit comments

Comments
 (0)