Skip to content

Commit fe555db

Browse files
bashtage authored and jreback committed
ENH: Explicit range checking of floats when writing Stata
Add explicit error checking for out-of-range doubles when writing Stata files. Upcast float32 to float64 if out-of-range values are encountered. Test for infinite values and raise if found. Closes #14618, closes #14637.
1 parent 2fc0c68 commit fe555db

File tree

4 files changed

+66
-2
lines changed

4 files changed

+66
-2
lines changed

doc/source/whatsnew/v0.19.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -26,3 +26,4 @@ Bug Fixes
2626
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
2727
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
2828
- Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`)
29+
- Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`)

doc/source/whatsnew/v0.20.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -80,3 +80,4 @@ Performance Improvements
8080

8181
Bug Fixes
8282
~~~~~~~~~
83+

pandas/io/stata.py

+17
Original file line number | Diff line number | Diff line change
@@ -511,6 +511,9 @@ def _cast_to_stata_types(data):
511511
(np.uint16, np.int16, np.int32),
512512
(np.uint32, np.int32, np.int64))
513513

514+
float32_max = struct.unpack('<f', b'\xff\xff\xff\x7e')[0]
515+
float64_max = struct.unpack('<d', b'\xff\xff\xff\xff\xff\xff\xdf\x7f')[0]
516+
514517
for col in data:
515518
dtype = data[col].dtype
516519
# Cast from unsupported types to supported types
@@ -541,6 +544,19 @@ def _cast_to_stata_types(data):
541544
data[col] = data[col].astype(np.float64)
542545
if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
543546
ws = precision_loss_doc % ('int64', 'float64')
547+
elif dtype in (np.float32, np.float64):
548+
value = data[col].max()
549+
if np.isinf(value):
550+
msg = 'Column {0} has a maximum value of infinity which is ' \
551+
'outside the range supported by Stata.'
552+
raise ValueError(msg.format(col))
553+
if dtype == np.float32 and value > float32_max:
554+
data[col] = data[col].astype(np.float64)
555+
elif dtype == np.float64:
556+
if value > float64_max:
557+
msg = 'Column {0} has a maximum value ({1}) outside the ' \
558+
'range supported by Stata ({2})'
559+
raise ValueError(msg.format(col, value, float64_max))
544560

545561
if ws:
546562
import warnings
@@ -2048,6 +2064,7 @@ def _prepare_pandas(self, data):
20482064
data = self._check_column_names(data)
20492065

20502066
# Check columns for compatibility with stata, upcast if necessary
2067+
# Raise if outside the supported range
20512068
data = _cast_to_stata_types(data)
20522069

20532070
# Replace NaNs with Stata missing values

pandas/io/tests/test_stata.py

+47 -2
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,6 @@
1111

1212
import nose
1313
import numpy as np
14-
from pandas.tslib import NaT
15-
1614
import pandas as pd
1715
import pandas.util.testing as tm
1816
from pandas import compat
@@ -21,6 +19,7 @@
2119
from pandas.io.parsers import read_csv
2220
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
2321
PossiblePrecisionLoss, StataMissingValue)
22+
from pandas.tslib import NaT
2423
from pandas.types.common import is_categorical_dtype
2524

2625

@@ -1234,6 +1233,52 @@ def test_stata_111(self):
12341233
original = original[['y', 'x', 'w', 'z']]
12351234
tm.assert_frame_equal(original, df)
12361235

1236+
def test_out_of_range_double(self):
1237+
# GH 14618
1238+
df = DataFrame({'ColumnOk': [0.0,
1239+
np.finfo(np.double).eps,
1240+
4.49423283715579e+307],
1241+
'ColumnTooBig': [0.0,
1242+
np.finfo(np.double).eps,
1243+
np.finfo(np.double).max]})
1244+
with tm.assertRaises(ValueError) as cm:
1245+
with tm.ensure_clean() as path:
1246+
df.to_stata(path)
1247+
tm.assertTrue('ColumnTooBig' in cm.exception)
1248+
1249+
df.loc[2, 'ColumnTooBig'] = np.inf
1250+
with tm.assertRaises(ValueError) as cm:
1251+
with tm.ensure_clean() as path:
1252+
df.to_stata(path)
1253+
tm.assertTrue('ColumnTooBig' in cm.exception)
1254+
tm.assertTrue('infinity' in cm.exception)
1255+
1256+
def test_out_of_range_float(self):
1257+
original = DataFrame({'ColumnOk': [0.0,
1258+
np.finfo(np.float32).eps,
1259+
np.finfo(np.float32).max / 10.0],
1260+
'ColumnTooBig': [0.0,
1261+
np.finfo(np.float32).eps,
1262+
np.finfo(np.float32).max]})
1263+
original.index.name = 'index'
1264+
for col in original:
1265+
original[col] = original[col].astype(np.float32)
1266+
1267+
with tm.ensure_clean() as path:
1268+
original.to_stata(path)
1269+
reread = read_stata(path)
1270+
original['ColumnTooBig'] = original['ColumnTooBig'].astype(
1271+
np.float64)
1272+
tm.assert_frame_equal(original,
1273+
reread.set_index('index'))
1274+
1275+
original.loc[2, 'ColumnTooBig'] = np.inf
1276+
with tm.assertRaises(ValueError) as cm:
1277+
with tm.ensure_clean() as path:
1278+
original.to_stata(path)
1279+
tm.assertTrue('ColumnTooBig' in cm.exception)
1280+
tm.assertTrue('infinity' in cm.exception)
1281+
12371282

12381283
if __name__ == '__main__':
12391284
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)