From 55a98f5d8f2e6c723a2c68198790d36eb3fbfbfe Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 10 Nov 2016 22:45:56 +0000 Subject: [PATCH] ENH: Explicit range checking of floats when writing Stata Add explicit error checking for out-of-range doubles when writing Stata files Upcasts float32 to float64 if out-of-range values encountered Tests for infinite values and raises if found closes #14618 --- doc/source/whatsnew/v0.19.2.txt | 1 + doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/stata.py | 17 ++++++++++++ pandas/io/tests/test_stata.py | 49 +++++++++++++++++++++++++++++++-- 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index dc11dd17bfdd7..030315d5fbbba 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -25,3 +25,4 @@ Bug Fixes - compat with ``dateutil==2.6.0`` for testing (:issue:`14621`) - allow ``nanoseconds`` in ``Timestamp.replace`` kwargs (:issue:`14621`) +- Explicit check in ``to_stata`` and ``StataWriter`` for out-of-range values when writing doubles (:issue:`14618`) \ No newline at end of file diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 660300e1814e8..8819a95f27b0d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -80,3 +80,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 14bd670862b41..c35e07be2c31a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -511,6 +511,9 @@ def _cast_to_stata_types(data): (np.uint16, np.int16, np.int32), (np.uint32, np.int32, np.int64)) + float32_max = struct.unpack('= 2 ** 53 or data[col].min() <= -2 ** 53: ws = precision_loss_doc % ('int64', 'float64') + elif dtype in (np.float32, np.float64): + value = data[col].max() + if np.isinf(value): + msg = 'Column {0} has a maximum value of infinity which is ' \ + 'outside the range supported by Stata.' 
+ raise ValueError(msg.format(col)) + if dtype == np.float32 and value > float32_max: + data[col] = data[col].astype(np.float64) + elif dtype == np.float64: + if value > float64_max: + msg = 'Column {0} has a maximum value ({1}) outside the ' \ + 'range supported by Stata ({2})' + raise ValueError(msg.format(col, value, float64_max)) if ws: import warnings @@ -2048,6 +2064,7 @@ def _prepare_pandas(self, data): data = self._check_column_names(data) # Check columns for compatibility with stata, upcast if necessary + # Raise if outside the supported range data = _cast_to_stata_types(data) # Replace NaNs with Stata missing values diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1849b32a4a7c8..cd972868a6e32 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -11,8 +11,6 @@ import nose import numpy as np -from pandas.tslib import NaT - import pandas as pd import pandas.util.testing as tm from pandas import compat @@ -21,6 +19,7 @@ from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) +from pandas.tslib import NaT from pandas.types.common import is_categorical_dtype @@ -1234,6 +1233,52 @@ def test_stata_111(self): original = original[['y', 'x', 'w', 'z']] tm.assert_frame_equal(original, df) + def test_out_of_range_double(self): + # GH 14618 + df = DataFrame({'ColumnOk': [0.0, + np.finfo(np.double).eps, + 4.49423283715579e+307], + 'ColumnTooBig': [0.0, + np.finfo(np.double).eps, + np.finfo(np.double).max]}) + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + df.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + + df.loc[2, 'ColumnTooBig'] = np.inf + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + df.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + tm.assertTrue('infinity' in cm.exception) + + def test_out_of_range_float(self): + 
original = DataFrame({'ColumnOk': [0.0, + np.finfo(np.float32).eps, + np.finfo(np.float32).max / 10.0], + 'ColumnTooBig': [0.0, + np.finfo(np.float32).eps, + np.finfo(np.float32).max]}) + original.index.name = 'index' + for col in original: + original[col] = original[col].astype(np.float32) + + with tm.ensure_clean() as path: + original.to_stata(path) + reread = read_stata(path) + original['ColumnTooBig'] = original['ColumnTooBig'].astype( + np.float64) + tm.assert_frame_equal(original, + reread.set_index('index')) + + original.loc[2, 'ColumnTooBig'] = np.inf + with tm.assertRaises(ValueError) as cm: + with tm.ensure_clean() as path: + original.to_stata(path) + tm.assertTrue('ColumnTooBig' in cm.exception) + tm.assertTrue('infinity' in cm.exception) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],