Skip to content

Commit b291a30

Browse files
author
Adrian Castravete
committed
BUG: Fix handling of encoding for the StataReader #21244
1 parent c85ab08 commit b291a30

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

pandas/io/stata.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@
3737
from pandas.util._decorators import deprecate_kwarg
3838

3939
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
40-
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
40+
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1',
41+
'utf-8', 'utf8')
4142

4243
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4344
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
@@ -1335,7 +1336,7 @@ def _calcsize(self, fmt):
13351336

13361337
def _decode(self, s):
13371338
s = s.partition(b"\0")[0]
1338-
return s.decode('utf-8')
1339+
return s.decode(self._encoding or self._default_encoding)
13391340

13401341
def _null_terminate(self, s):
13411342
if compat.PY3 or self._encoding is not None:

pandas/tests/io/test_stata.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -99,9 +99,9 @@ def setup_method(self, method):
9999

100100
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101101

102-
def read_dta(self, file):
102+
def read_dta(self, file, encoding='latin-1'):
103103
# Legacy default reader configuration
104-
return read_stata(file, convert_dates=True)
104+
return read_stata(file, convert_dates=True, encoding=encoding)
105105

106106
def read_csv(self, file):
107107
return read_csv(file, parse_dates=True)
@@ -268,7 +268,7 @@ def test_read_dta12(self):
268268
tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
269269

270270
def test_read_dta18(self):
271-
parsed_118 = self.read_dta(self.dta22_118)
271+
parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8')
272272
parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
273273
expected = DataFrame.from_records(
274274
[['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
@@ -283,7 +283,7 @@ def test_read_dta18(self):
283283
for col in parsed_118.columns:
284284
tm.assert_almost_equal(parsed_118[col], expected[col])
285285

286-
with StataReader(self.dta22_118) as rdr:
286+
with StataReader(self.dta22_118, encoding='utf-8') as rdr:
287287
vl = rdr.variable_labels()
288288
vl_expected = {u'Unicode_Cities_Strl':
289289
u'Here are some strls with Ünicode chars',
@@ -1358,7 +1358,7 @@ def test_invalid_encoding(self):
13581358
original = self.read_csv(self.csv3)
13591359
with pytest.raises(ValueError):
13601360
with tm.ensure_clean() as path:
1361-
original.to_stata(path, encoding='utf-8')
1361+
original.to_stata(path, encoding='pokemon')
13621362

13631363
def test_path_pathlib(self):
13641364
df = tm.makeDataFrame()

0 commit comments

Comments
 (0)