From 55a98f5d8f2e6c723a2c68198790d36eb3fbfbfe Mon Sep 17 00:00:00 2001
From: Kevin Sheppard <kevin.k.sheppard@gmail.com>
Date: Thu, 10 Nov 2016 22:45:56 +0000
Subject: [PATCH] ENH: Explicit range checking of floats when writing Stata

Add explicit error checking for out-of-range doubles when writing Stata files.
Upcast float32 columns to float64 if out-of-range values are encountered.
Test for infinite values and raise if any are found.

closes #14618
---
 doc/source/whatsnew/v0.19.2.txt |  1 +
 doc/source/whatsnew/v0.20.0.txt |  1 +
 pandas/io/stata.py              | 17 ++++++++++++
 pandas/io/tests/test_stata.py   | 49 +++++++++++++++++++++++++++++++--
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
index dc11dd17bfdd7..030315d5fbbba 100644
--- a/doc/source/whatsnew/v0.19.2.txt
+++ b/doc/source/whatsnew/v0.19.2.txt
@@ -25,3 +25,4 @@ Bug Fixes
 
 - compat with ``dateutil==2.6.0`` for testing (:issue:`14621`)
 - allow ``nanoseconds`` in ``Timestamp.replace`` kwargs (:issue:`14621`)
+- Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`)
\ No newline at end of file
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 660300e1814e8..8819a95f27b0d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -80,3 +80,4 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 14bd670862b41..c35e07be2c31a 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -511,6 +511,9 @@ def _cast_to_stata_types(data):
                        (np.uint16, np.int16, np.int32),
                        (np.uint32, np.int32, np.int64))
 
+    float32_max = struct.unpack('<f', b'\xff\xff\xff\x7e')[0]
+    float64_max = struct.unpack('<d', b'\xff\xff\xff\xff\xff\xff\xdf\x7f')[0]
+
     for col in data:
         dtype = data[col].dtype
         # Cast from unsupported types to supported types
@@ -541,6 +544,19 @@ def _cast_to_stata_types(data):
                 data[col] = data[col].astype(np.float64)
                 if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
                     ws = precision_loss_doc % ('int64', 'float64')
+        elif dtype in (np.float32, np.float64):
+            # Only the column maximum is checked against the upper limits
+            value = data[col].max()
+            if np.isinf(value):
+                msg = 'Column {0} has a maximum value of infinity which is ' \
+                      'outside the range supported by Stata.'
+                raise ValueError(msg.format(col))
+            if dtype == np.float32 and value > float32_max:
+                data[col] = data[col].astype(np.float64)
+            elif dtype == np.float64 and value > float64_max:
+                msg = 'Column {0} has a maximum value ({1}) outside the ' \
+                      'range supported by Stata ({2})'
+                raise ValueError(msg.format(col, value, float64_max))
 
     if ws:
         import warnings
@@ -2048,6 +2064,7 @@ def _prepare_pandas(self, data):
         data = self._check_column_names(data)
 
         # Check columns for compatibility with stata, upcast if necessary
+        # Raise if outside the supported range
         data = _cast_to_stata_types(data)
 
         # Replace NaNs with Stata missing values
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 1849b32a4a7c8..cd972868a6e32 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -11,8 +11,6 @@
 
 import nose
 import numpy as np
-from pandas.tslib import NaT
-
 import pandas as pd
 import pandas.util.testing as tm
 from pandas import compat
@@ -21,6 +19,7 @@
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
                              PossiblePrecisionLoss, StataMissingValue)
+from pandas.tslib import NaT
 from pandas.types.common import is_categorical_dtype
 
 
@@ -1234,6 +1233,52 @@ def test_stata_111(self):
         original = original[['y', 'x', 'w', 'z']]
         tm.assert_frame_equal(original, df)
 
+    def test_out_of_range_double(self):
+        # GH 14618
+        df = DataFrame({'ColumnOk': [0.0,
+                                     np.finfo(np.double).eps,
+                                     4.49423283715579e+307],
+                        'ColumnTooBig': [0.0,
+                                         np.finfo(np.double).eps,
+                                         np.finfo(np.double).max]})
+        # Writing the float64 maximum must fail, and the error message
+        # must name the offending column
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig'):
+            with tm.ensure_clean() as path:
+                df.to_stata(path)
+
+        df.loc[2, 'ColumnTooBig'] = np.inf
+        # Infinite values are rejected with a message mentioning infinity
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig.*infinity'):
+            with tm.ensure_clean() as path:
+                df.to_stata(path)
+
+    def test_out_of_range_float(self):
+        # GH 14618
+        original = DataFrame({'ColumnOk': [0.0,
+                                           np.finfo(np.float32).eps,
+                                           np.finfo(np.float32).max / 10.0],
+                              'ColumnTooBig': [0.0,
+                                               np.finfo(np.float32).eps,
+                                               np.finfo(np.float32).max]})
+        original.index.name = 'index'
+        for col in original:
+            original[col] = original[col].astype(np.float32)
+
+        with tm.ensure_clean() as path:
+            original.to_stata(path)
+            reread = read_stata(path)
+            # The out-of-range float32 column is expected back as float64
+            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
+                np.float64)
+            tm.assert_frame_equal(original,
+                                  reread.set_index('index'))
+
+        original.loc[2, 'ColumnTooBig'] = np.inf
+        with tm.assertRaisesRegexp(ValueError, 'ColumnTooBig.*infinity'):
+            with tm.ensure_clean() as path:
+                original.to_stata(path)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],