Skip to content

Commit cacb391

Browse files
author
Adrian Castravete
committed
BUG: Fix handling of encoding for the StataReader pandas-dev#21244
1 parent 3147a86 commit cacb391

File tree

2 files changed

+8
-7
lines changed

2 files changed

+8
-7
lines changed

pandas/io/stata.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@
3737
from pandas.util._decorators import deprecate_kwarg
3838

3939
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
40-
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
40+
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1',
41+
'utf-8', 'utf8')
4142

4243
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4344
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
@@ -1335,7 +1336,7 @@ def _calcsize(self, fmt):
13351336

13361337
def _decode(self, s):
13371338
s = s.partition(b"\0")[0]
1338-
return s.decode('utf-8')
1339+
return s.decode(self._encoding or self._default_encoding)
13391340

13401341
def _null_terminate(self, s):
13411342
if compat.PY3 or self._encoding is not None:

pandas/tests/io/test_stata.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -97,9 +97,9 @@ def setup_method(self, method):
9797

9898
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
9999

100-
def read_dta(self, file):
100+
def read_dta(self, file, encoding='latin-1'):
101101
# Legacy default reader configuration
102-
return read_stata(file, convert_dates=True)
102+
return read_stata(file, convert_dates=True, encoding=encoding)
103103

104104
def read_csv(self, file):
105105
return read_csv(file, parse_dates=True)
@@ -266,7 +266,7 @@ def test_read_dta12(self):
266266
tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
267267

268268
def test_read_dta18(self):
269-
parsed_118 = self.read_dta(self.dta22_118)
269+
parsed_118 = self.read_dta(self.dta22_118, encoding='utf-8')
270270
parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
271271
expected = DataFrame.from_records(
272272
[['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
@@ -281,7 +281,7 @@ def test_read_dta18(self):
281281
for col in parsed_118.columns:
282282
tm.assert_almost_equal(parsed_118[col], expected[col])
283283

284-
with StataReader(self.dta22_118) as rdr:
284+
with StataReader(self.dta22_118, encoding='utf-8') as rdr:
285285
vl = rdr.variable_labels()
286286
vl_expected = {u'Unicode_Cities_Strl':
287287
u'Here are some strls with Ünicode chars',
@@ -1356,7 +1356,7 @@ def test_invalid_encoding(self):
13561356
original = self.read_csv(self.csv3)
13571357
with pytest.raises(ValueError):
13581358
with tm.ensure_clean() as path:
1359-
original.to_stata(path, encoding='utf-8')
1359+
original.to_stata(path, encoding='pokemon')
13601360

13611361
def test_path_pathlib(self):
13621362
df = tm.makeDataFrame()

0 commit comments

Comments
 (0)