ENH: Explicit range checking of floats when writing Stata

bashtage · jreback · commit fe555db3f178 · 2016-11-17T07:53:14.000-05:00
Add explicit error checking for out-of-range doubles when writing Stata files. Upcasts float32 to float64 if out-of-range values are encountered. Tests for infinite values and raises if found. Closes #14618. Closes #14637.
diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
@@ -26,3 +26,4 @@ Bug Fixes
 - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
 - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
 - Bug in ``pd.cut`` with negative values and a single bin (:issue:`14652`)
+- Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -80,3 +80,4 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -511,6 +511,9 @@ def _cast_to_stata_types(data):
                        (np.uint16, np.int16, np.int32),
                        (np.uint32, np.int32, np.int64))
 
+    float32_max = struct.unpack('<f', b'\xff\xff\xff\x7e')[0]
+    float64_max = struct.unpack('<d', b'\xff\xff\xff\xff\xff\xff\xdf\x7f')[0]
+
     for col in data:
         dtype = data[col].dtype
         # Cast from unsupported types to supported types
@@ -541,6 +544,19 @@ def _cast_to_stata_types(data):
                 data[col] = data[col].astype(np.float64)
                 if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
                     ws = precision_loss_doc % ('int64', 'float64')
+        elif dtype in (np.float32, np.float64):
+            # Only the column maximum is range-checked in this branch
+            value = data[col].max()
+            if np.isinf(value):
+                msg = 'Column {0} has a maximum value of infinity which is ' \
+                      'outside the range supported by Stata.'
+                raise ValueError(msg.format(col))
+            if dtype == np.float32 and value > float32_max:
+                data[col] = data[col].astype(np.float64)
+            elif dtype == np.float64 and value > float64_max:
+                msg = 'Column {0} has a maximum value ({1}) outside the ' \
+                      'range supported by Stata ({2})'
+                raise ValueError(msg.format(col, value, float64_max))
 
     if ws:
         import warnings
@@ -2048,6 +2064,7 @@ def _prepare_pandas(self, data):
         data = self._check_column_names(data)
 
         # Check columns for compatibility with stata, upcast if necessary
+        # Raise if outside the supported range
         data = _cast_to_stata_types(data)
 
         # Replace NaNs with Stata missing values
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -11,8 +11,6 @@
 
 import nose
 import numpy as np
-from pandas.tslib import NaT
-
 import pandas as pd
 import pandas.util.testing as tm
 from pandas import compat
@@ -21,6 +19,7 @@
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
                              PossiblePrecisionLoss, StataMissingValue)
+from pandas.tslib import NaT
 from pandas.types.common import is_categorical_dtype
 
 
@@ -1234,6 +1233,52 @@ def test_stata_111(self):
         original = original[['y', 'x', 'w', 'z']]
         tm.assert_frame_equal(original, df)
 
+    def test_out_of_range_double(self):
+        # GH 14618
+        # Doubles above Stata's limit, or infinite, must raise on write
+        df = DataFrame({'ColumnOk': [0.0,
+                                     np.finfo(np.double).eps,
+                                     4.49423283715579e+307],
+                        'ColumnTooBig': [0.0,
+                                         np.finfo(np.double).eps,
+                                         np.finfo(np.double).max]})
+        # The message is matched by the context manager itself: statements
+        # placed after the raising call inside the block would never run
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig'):
+            with tm.ensure_clean() as path:
+                df.to_stata(path)
+
+        df.loc[2, 'ColumnTooBig'] = np.inf
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig.*infinity'):
+            with tm.ensure_clean() as path:
+                df.to_stata(path)
+
+    def test_out_of_range_float(self):
+        # float32 columns above the float32 limit are upcast to float64
+        original = DataFrame({'ColumnOk': [0.0,
+                                           np.finfo(np.float32).eps,
+                                           np.finfo(np.float32).max / 10.0],
+                              'ColumnTooBig': [0.0,
+                                               np.finfo(np.float32).eps,
+                                               np.finfo(np.float32).max]})
+        original.index.name = 'index'
+        for col in original:
+            original[col] = original[col].astype(np.float32)
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path)
+            reread = read_stata(path)
+            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
+                np.float64)
+            tm.assert_frame_equal(original,
+                                  reread.set_index('index'))
+
+        # Infinite values are rejected; the regexp check runs on exit
+        original.loc[2, 'ColumnTooBig'] = np.inf
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig.*infinity'):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

Original file line number	Diff line number	Diff line change
`@@ -80,3 +80,4 @@ Performance Improvements`
`80`	`80`
`81`	`81`	`Bug Fixes`
`82`	`82`	`~~~~~~~~~`
	`83`	`+`