Skip to content

Commit dcbbb32

Browse files
committed
BUG: Fix encoding for Stata format 118 files
Ensure that Stata 118 files always use utf-8 encoding. Deprecate the encoding argument of read_stata and StataReader.
1 parent 4274b84 commit dcbbb32

File tree

4 files changed

+61
-45
lines changed

4 files changed

+61
-45
lines changed

doc/source/whatsnew/v0.23.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ I/O
9292

9393
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9494
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
95-
-
95+
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
9696

9797
Plotting
9898
^^^^^^^^

pandas/io/stata.py

+30-31
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,12 @@
3636
from pandas.util._decorators import Appender
3737
from pandas.util._decorators import deprecate_kwarg
3838

39-
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
39+
# Allowed encodings of Stata dta files. Preferred is first entry
40+
VALID_ENCODINGS = ('latin-1', 'latin_1', 'ascii', 'us-ascii', 'iso-8859-1',
4041
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
4142

43+
VALID_ENCODINGS_118 = ('utf8', 'utf-8')
44+
4245
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4346
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
4447
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -169,6 +172,7 @@
169172

170173

171174
@Appender(_read_stata_doc)
175+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
172176
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
173177
def read_stata(filepath_or_buffer, convert_dates=True,
174178
convert_categoricals=True, encoding=None, index_col=None,
@@ -182,7 +186,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182186
preserve_dtypes=preserve_dtypes,
183187
columns=columns,
184188
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
189+
chunksize=chunksize)
186190

187191
if iterator or chunksize:
188192
data = reader
@@ -399,16 +403,19 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
399403
elif infer_dtype(dates) == 'datetime':
400404
if delta:
401405
delta = dates.values - stata_epoch
402-
f = lambda x: \
403-
US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
406+
407+
def f(x):
408+
return US_PER_DAY * x.days + \
409+
1000000 * x.seconds + x.microseconds
404410
v = np.vectorize(f)
405411
d['delta'] = v(delta)
406412
if year:
407413
year_month = dates.apply(lambda x: 100 * x.year + x.month)
408414
d['year'] = year_month.values // 100
409415
d['month'] = (year_month.values - d['year'] * 100)
410416
if days:
411-
f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
417+
def f(x):
418+
return (x - datetime.datetime(x.year, 1, 1)).days
412419
v = np.vectorize(f)
413420
d['days'] = v(dates)
414421
else:
@@ -838,7 +845,6 @@ def get_base_missing_value(cls, dtype):
838845

839846

840847
class StataParser(object):
841-
_default_encoding = 'latin-1'
842848

843849
def __init__(self, encoding):
844850
if encoding is not None:
@@ -959,12 +965,13 @@ def __init__(self, encoding):
959965
class StataReader(StataParser, BaseIterator):
960966
__doc__ = _stata_reader_doc
961967

968+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
962969
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
963970
def __init__(self, path_or_buf, convert_dates=True,
964971
convert_categoricals=True, index_col=None,
965972
convert_missing=False, preserve_dtypes=True,
966973
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
974+
encoding=None, chunksize=None):
968975
super(StataReader, self).__init__(encoding)
969976
self.col_sizes = ()
970977

@@ -977,10 +984,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977984
self._preserve_dtypes = preserve_dtypes
978985
self._columns = columns
979986
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984987
self._encoding = encoding
985988
self._chunksize = chunksize
986989

@@ -998,18 +1001,13 @@ def __init__(self, path_or_buf, convert_dates=True,
9981001
path_or_buf = _stringify_path(path_or_buf)
9991002
if isinstance(path_or_buf, str):
10001003
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
1004+
path_or_buf)
10031005

10041006
if isinstance(path_or_buf, (str, text_type, bytes)):
10051007
self.path_or_buf = open(path_or_buf, 'rb')
10061008
else:
10071009
# Copy to BytesIO, and ensure no encoding
10081010
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
10131011
self.path_or_buf = BytesIO(contents)
10141012

10151013
self._read_header()
@@ -1030,6 +1028,15 @@ def close(self):
10301028
except IOError:
10311029
pass
10321030

1031+
def _set_encoding(self):
1032+
"""
1033+
Check validity of user-set encoding set the default encoding
1034+
"""
1035+
if self.format_version < 118:
1036+
self._encoding = 'latin-1'
1037+
else:
1038+
self._encoding = 'utf-8'
1039+
10331040
def _read_header(self):
10341041
first_char = self.path_or_buf.read(1)
10351042
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1056,7 @@ def _read_new_header(self, first_char):
10491056
self.format_version = int(self.path_or_buf.read(3))
10501057
if self.format_version not in [117, 118]:
10511058
raise ValueError(_version_error)
1059+
self._set_encoding()
10521060
self.path_or_buf.read(21) # </release><byteorder>
10531061
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541062
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1243,7 @@ def _read_old_header(self, first_char):
12351243
self.format_version = struct.unpack('b', first_char)[0]
12361244
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371245
raise ValueError(_version_error)
1246+
self._set_encoding()
12381247
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391248
0] == 0x1 and '>' or '<'
12401249
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1347,9 @@ def _decode(self, s):
13381347
return s.decode('utf-8')
13391348

13401349
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1350+
# have bytes not strings, so must decode
1351+
s = s.partition(b"\0")[0]
1352+
return s.decode(self._encoding)
13511353

13521354
def _read_value_labels(self):
13531355
if self._value_labels_read:
@@ -1433,10 +1435,7 @@ def _read_strls(self):
14331435
self.path_or_buf.read(4))[0]
14341436
va = self.path_or_buf.read(length)
14351437
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1438+
va = va[0:-1].decode(self._encoding)
14401439
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411440
self.GSO[str(v_o)] = va
14421441

pandas/tests/io/data/stata16_118.dta

4.51 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+30-13
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def setup_method(self, method):
9696
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
9797

9898
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
99+
self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
99100

100101
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101102

@@ -360,22 +361,20 @@ def test_encoding(self, version):
360361

361362
# GH 4626, proper encoding handling
362363
raw = read_stata(self.dta_encoding)
363-
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364+
with warnings.catch_warnings(record=True) as w:
365+
warnings.simplefilter("always")
366+
encoded = read_stata(self.dta_encoding, encoding="latin-1")
367+
assert len(w) == 1
364368
result = encoded.kreis1849[0]
365369

366-
if compat.PY3:
367-
expected = raw.kreis1849[0]
368-
assert result == expected
369-
assert isinstance(result, compat.string_types)
370-
else:
371-
expected = raw.kreis1849.str.decode("latin-1")[0]
372-
assert result == expected
373-
assert isinstance(result, unicode) # noqa
370+
expected = raw.kreis1849[0]
371+
assert result == expected
372+
assert isinstance(result, compat.string_types)
374373

375374
with tm.ensure_clean() as path:
376375
encoded.to_stata(path, encoding='latin-1',
377376
write_index=False, version=version)
378-
reread_encoded = read_stata(path, encoding='latin-1')
377+
reread_encoded = read_stata(path)
379378
tm.assert_frame_equal(encoded, reread_encoded)
380379

381380
def test_read_write_dta11(self):
@@ -520,7 +519,8 @@ def test_numeric_column_names(self):
520519
written_and_read_again = self.read_dta(path)
521520
written_and_read_again = written_and_read_again.set_index('index')
522521
columns = list(written_and_read_again.columns)
523-
convert_col_name = lambda x: int(x[1])
522+
523+
def convert_col_name(x): return int(x[1])
524524
written_and_read_again.columns = map(convert_col_name, columns)
525525
tm.assert_frame_equal(original, written_and_read_again)
526526

@@ -1363,14 +1363,16 @@ def test_invalid_encoding(self):
13631363
def test_path_pathlib(self):
13641364
df = tm.makeDataFrame()
13651365
df.index.name = 'index'
1366-
reader = lambda x: read_stata(x).set_index('index')
1366+
1367+
def reader(x): return read_stata(x).set_index('index')
13671368
result = tm.round_trip_pathlib(df.to_stata, reader)
13681369
tm.assert_frame_equal(df, result)
13691370

13701371
def test_pickle_path_localpath(self):
13711372
df = tm.makeDataFrame()
13721373
df.index.name = 'index'
1373-
reader = lambda x: read_stata(x).set_index('index')
1374+
1375+
def reader(x): return read_stata(x).set_index('index')
13741376
result = tm.round_trip_localpath(df.to_stata, reader)
13751377
tm.assert_frame_equal(df, result)
13761378

@@ -1500,3 +1502,18 @@ def test_gzip_writing(self):
15001502
with gzip.GzipFile(path, 'rb') as gz:
15011503
reread = pd.read_stata(gz, index_col='index')
15021504
tm.assert_frame_equal(df, reread)
1505+
1506+
def test_unicode_dta_118(self):
1507+
unicode_df = self.read_dta(self.dta25_118)
1508+
1509+
columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
1510+
values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
1511+
[u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
1512+
[u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
1513+
[' ', ' ', 'd', ' ', 'd'],
1514+
[' ', '', 'a', ' ', 'a'],
1515+
['', '', 's', '', 's'],
1516+
['', '', ' ', '', ' ']]
1517+
expected = pd.DataFrame(values, columns=columns)
1518+
1519+
tm.assert_frame_equal(unicode_df, expected)

0 commit comments

Comments
 (0)