Skip to content

Commit 8278be7

Browse files
committed
BUG: Fix limited key range on 32-bit platforms
Fix use of 64-bit integers as keys in general string objects (GSO) by wrapping in strings when used as dictionary keys
1 parent 2f02697 commit 8278be7

File tree

2 files changed

+17
-12
lines changed

2 files changed

+17
-12
lines changed

pandas/io/stata.py

+16-12
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@
156156

157157
@Appender(_read_stata_doc)
158158
def read_stata(filepath_or_buffer, convert_dates=True,
159-
convert_categoricals=True, encoding='latin-1', index=None,
159+
convert_categoricals=True, encoding=None, index=None,
160160
convert_missing=False, preserve_dtypes=True, columns=None,
161161
order_categoricals=True, chunksize=None, iterator=False):
162162

@@ -821,11 +821,11 @@ def get_base_missing_value(cls, dtype):
821821
class StataParser(object):
822822
_default_encoding = 'latin-1'
823823

824-
def __init__(self, encoding='latin-1'):
825-
826-
if encoding not in VALID_ENCODINGS:
827-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
828-
'supported.')
824+
def __init__(self, encoding):
825+
if encoding is not None:
826+
if encoding not in VALID_ENCODINGS:
827+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
828+
'supported.')
829829

830830
self._encoding = encoding
831831

@@ -957,9 +957,10 @@ def __init__(self, path_or_buf, convert_dates=True,
957957
self._preserve_dtypes = preserve_dtypes
958958
self._columns = columns
959959
self._order_categoricals = order_categoricals
960-
if encoding not in VALID_ENCODINGS:
961-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
962-
'supported.')
960+
if encoding is not None:
961+
if encoding not in VALID_ENCODINGS:
962+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
963+
'supported.')
963964
self._encoding = encoding
964965
self._chunksize = chunksize
965966

@@ -1373,7 +1374,8 @@ def _read_value_labels(self):
13731374

13741375
def _read_strls(self):
13751376
self.path_or_buf.seek(self.seek_strls)
1376-
self.GSO = {0: ''}
1377+
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1378+
self.GSO = {'0': ''}
13771379
while True:
13781380
if self.path_or_buf.read(3) != b'GSO':
13791381
break
@@ -1398,7 +1400,8 @@ def _read_strls(self):
13981400
if self.format_version == 117:
13991401
encoding = self._encoding or self._default_encoding
14001402
va = va[0:-1].decode(encoding)
1401-
self.GSO[v_o] = va
1403+
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1404+
self.GSO[str(v_o)] = va
14021405

14031406
# legacy
14041407
@Appender('DEPRECATED: ' + _data_method_doc)
@@ -1634,7 +1637,8 @@ def _insert_strls(self, data):
16341637
for i, typ in enumerate(self.typlist):
16351638
if typ != 'Q':
16361639
continue
1637-
data.iloc[:, i] = [self.GSO[k] for k in data.iloc[:, i]]
1640+
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1641+
data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
16381642
return data
16391643

16401644
def _do_select_columns(self, data, columns):

pandas/tests/io/test_stata.py

+1
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,7 @@ def test_out_of_range_float(self):
12771277
tm.assertTrue('ColumnTooBig' in cm.exception)
12781278
tm.assertTrue('infinity' in cm.exception)
12791279

1280+
# GH15723, validate encoding
12801281
def test_invalid_encoding(self):
12811282
original = self.read_csv(self.csv3)
12821283
with tm.assertRaises(ValueError):

0 commit comments

Comments
 (0)