Skip to content

Commit 1c9d46a

Browse files
bashtage authored and jreback committed
BUG: Enforce correct encoding in stata
Ensure StataReader and StataWriter have the correct encoding. Standardized default encoding to 'latin-1'. Closes pandas-dev#15723. Author: Kevin Sheppard <[email protected]> Closes pandas-dev#15768 from bashtage/limit-stata-encoding and squashes the following commits: 8278be7 [Kevin Sheppard] BUG: Fix limited key range on 32-bit platforms 2f02697 [Kevin Sheppard] BUG: Enforce correct encoding in stata
1 parent 163d18e commit 1c9d46a

File tree

3 files changed

+32
-7
lines changed

3 files changed

+32
-7
lines changed

doc/source/whatsnew/v0.20.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,8 @@ Bug Fixes
919919
- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
920920

921921
- Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which caused incorrectly formatted files to be produced for some locales (:issue:`13856`)
922+
- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`)
923+
922924
- Bug in ``pd.concat()`` in which concatenating with an empty dataframe with ``join='inner'`` was improperly handled (:issue:`15328`)
923925
- Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`)
924926

@@ -933,3 +935,4 @@ Bug Fixes
933935
- Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
934936
- Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
935937
- Bug in ``pd.read_msgpack`` which did not allow loading a dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
938+

pandas/io/stata.py

+22-7
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
from pandas._libs.lib import max_len_string_array, infer_dtype
3434
from pandas._libs.tslib import NaT, Timestamp
3535

36+
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
37+
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
38+
3639
_version_error = ("Version of given Stata file is not 104, 105, 108, "
3740
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
3841
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -45,7 +48,7 @@
4548

4649
_encoding_params = """\
4750
encoding : string, None or encoding
48-
Encoding used to parse the files. None defaults to iso-8859-1."""
51+
Encoding used to parse the files. None defaults to latin-1."""
4952

5053
_statafile_processing_params2 = """\
5154
index : identifier of index column
@@ -816,9 +819,14 @@ def get_base_missing_value(cls, dtype):
816819

817820

818821
class StataParser(object):
819-
_default_encoding = 'iso-8859-1'
822+
_default_encoding = 'latin-1'
820823

821824
def __init__(self, encoding):
825+
if encoding is not None:
826+
if encoding not in VALID_ENCODINGS:
827+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
828+
'supported.')
829+
822830
self._encoding = encoding
823831

824832
# type code.
@@ -936,7 +944,7 @@ def __init__(self, path_or_buf, convert_dates=True,
936944
convert_categoricals=True, index=None,
937945
convert_missing=False, preserve_dtypes=True,
938946
columns=None, order_categoricals=True,
939-
encoding='iso-8859-1', chunksize=None):
947+
encoding='latin-1', chunksize=None):
940948
super(StataReader, self).__init__(encoding)
941949
self.col_sizes = ()
942950

@@ -949,6 +957,10 @@ def __init__(self, path_or_buf, convert_dates=True,
949957
self._preserve_dtypes = preserve_dtypes
950958
self._columns = columns
951959
self._order_categoricals = order_categoricals
960+
if encoding is not None:
961+
if encoding not in VALID_ENCODINGS:
962+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
963+
'supported.')
952964
self._encoding = encoding
953965
self._chunksize = chunksize
954966

@@ -1362,7 +1374,8 @@ def _read_value_labels(self):
13621374

13631375
def _read_strls(self):
13641376
self.path_or_buf.seek(self.seek_strls)
1365-
self.GSO = {0: ''}
1377+
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1378+
self.GSO = {'0': ''}
13661379
while True:
13671380
if self.path_or_buf.read(3) != b'GSO':
13681381
break
@@ -1387,7 +1400,8 @@ def _read_strls(self):
13871400
if self.format_version == 117:
13881401
encoding = self._encoding or self._default_encoding
13891402
va = va[0:-1].decode(encoding)
1390-
self.GSO[v_o] = va
1403+
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1404+
self.GSO[str(v_o)] = va
13911405

13921406
# legacy
13931407
@Appender('DEPRECATED: ' + _data_method_doc)
@@ -1623,7 +1637,8 @@ def _insert_strls(self, data):
16231637
for i, typ in enumerate(self.typlist):
16241638
if typ != 'Q':
16251639
continue
1626-
data.iloc[:, i] = [self.GSO[k] for k in data.iloc[:, i]]
1640+
# Wrap each key k in a string to match the string keys stored in GSO (allows uint64 values as keys on 32bit OS)
1641+
data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
16271642
return data
16281643

16291644
def _do_select_columns(self, data, columns):
@@ -1855,7 +1870,7 @@ class StataWriter(StataParser):
18551870
write_index : bool
18561871
Write the index to Stata dataset.
18571872
encoding : str
1858-
Default is latin-1. Unicode is not supported
1873+
Default is latin-1. Only latin-1 and ascii are supported.
18591874
byteorder : str
18601875
Can be ">", "<", "little", or "big". default is `sys.byteorder`
18611876
time_stamp : datetime

pandas/tests/io/test_stata.py

+7
Original file line numberDiff line numberDiff line change
@@ -1276,3 +1276,10 @@ def test_out_of_range_float(self):
12761276
original.to_stata(path)
12771277
tm.assertTrue('ColumnTooBig' in cm.exception)
12781278
tm.assertTrue('infinity' in cm.exception)
1279+
1280+
def test_invalid_encoding(self):
1281+
# GH15723, validate encoding
1282+
original = self.read_csv(self.csv3)
1283+
with tm.assertRaises(ValueError):
1284+
with tm.ensure_clean() as path:
1285+
original.to_stata(path, encoding='utf-8')

0 commit comments

Comments
 (0)