From 86a6c9c04008588363b4b67f1b49a99fa3a70147 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 19 Jul 2013 13:57:42 -0400 Subject: [PATCH] TST: raise an error json serialization of floats that cannot be accurately represented DOC: doc updates for small_float errors --- doc/source/io.rst | 5 +++ doc/source/release.rst | 3 ++ pandas/io/json.py | 46 ++++++++++++++++++------ pandas/io/tests/test_json/test_pandas.py | 12 +++++++ 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 653ac2cb10b69..a78075548b51d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1020,6 +1020,11 @@ Writing to a file, with a date index and a date column dfj2.to_json('test.json') open('test.json').read() +.. warning:: + + Currently ``ujson`` cannot format small float numbers (< 1e-15). A ``ValueError`` + will be raised in these cases. + Reading JSON ~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index a64b2a77b376c..3ac77ac14e2fe 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,8 @@ pandas 0.12 - Support for reading Amazon S3 files. (:issue:`3504`) - Added module for reading and writing Stata files: pandas.io.stata (:issue:`1512`) includes ``to_stata`` DataFrame method, and a ``read_stata`` top-level reader + - Added module for reading and writing JSON strings/files: pandas.io.json (:issue:`3876`) + includes ``to_json`` DataFrame/Series method, and a ``read_json`` top-level reader - Added support for writing in ``to_csv`` and reading in ``read_csv``, multi-index columns. The ``header`` option in ``read_csv`` now accepts a list of the rows from which to read the index.
Added the option, @@ -345,6 +347,7 @@ pandas 0.12 - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`) - Fixed bug where get_data_famafrench wasn't using the correct file edges (:issue:`4281`) + - Raise a ``ValueError`` if trying to format small floats with ``to_json`` (:issue:`4042`) pandas 0.11.0 ============= diff --git a/pandas/io/json.py b/pandas/io/json.py index ce95c3394ce2c..a5d06afff4e95 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -16,9 +16,9 @@ ### interface to/from ### def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True): - + if isinstance(obj, Series): - s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, + s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, ensure_ascii=force_ascii).write() elif isinstance(obj, DataFrame): s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, @@ -41,7 +41,7 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): if orient is None: orient = self._default_orient - + self.orient = orient self.date_format = date_format self.double_precision = double_precision @@ -64,19 +64,36 @@ def _format_to_date(self, data): if self._needs_to_date(data): return data.apply(lambda x: x.isoformat()) return data - + def copy_if_needed(self): """ copy myself if necessary """ if not self.is_copy: self.obj = self.obj.copy() self.is_copy = True + def _validate(self): + """ validate that we can accurately write the data """ + pass + + def _raise_on_small_floats(self): + raise ValueError("ujson currently cannot accurately format float data less\n" + "than 1e-15. 
A work-around is to multiply the data by\n" + "a large positive factor and divide on deseriliazation\n") + def write(self): + self._validate() return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii) class SeriesWriter(Writer): _default_orient = 'index' + def _validate(self): + if issubclass(self.obj.dtype.type, np.floating): + values = self.obj.values + values = values[values.nonzero()[0]] + if len(values) and (np.abs(values)<1e-15).any(): + self._raise_on_small_floats() + def _format_axes(self): if self._needs_to_date(self.obj.index): self.copy_if_needed() @@ -95,6 +112,13 @@ def _format_bools(self): class FrameWriter(Writer): _default_orient = 'columns' + def _validate(self): + cols = [ k for k, v in self.obj.dtypes.iteritems() if issubclass(v.type,np.floating) ] + values = self.obj.loc[:,cols].values.ravel() + values = values[values.nonzero()[0]] + if len(values) and (np.abs(values)<1e-15).any(): + self._raise_on_small_floats() + def _format_axes(self): """ try to axes if they are datelike """ if self.orient == 'columns': @@ -186,13 +210,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, return obj class Parser(object): - + def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False): self.json = json if orient is None: orient = self._default_orient - + self.orient = orient self.dtype = dtype @@ -207,7 +231,7 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr def parse(self): - # try numpy + # try numpy numpy = self.numpy if numpy: self._parse_numpy() @@ -269,7 +293,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): pass if data.dtype == 'float': - + # coerce floats to 64 try: data = data.astype('float64') @@ -291,7 +315,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): # coerce ints to 64 if data.dtype == 'int': - + # 
coerce floats to 64 try: data = data.astype('int64') @@ -322,7 +346,7 @@ def _try_convert_to_date(self, data): if issubclass(new_data.dtype.type,np.number): if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all(): return data, False - + try: new_data = to_datetime(new_data) except: @@ -342,7 +366,7 @@ class SeriesParser(Parser): _default_orient = 'index' def _parse_no_numpy(self): - + json = self.json orient = self.orient if orient == "split": diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index bc6ba1a45136c..32f951b3f17e1 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -409,6 +409,18 @@ def test_misc_example(self): expected = DataFrame([[1,2],[1,2]],columns=['a','b']) assert_frame_equal(result,expected) + def test_small_floats(self): + + # raise + df = DataFrame([[1e-16,'foo',1e-8]],columns=list('ABC')) + self.assertRaises(ValueError, df.to_json) + s = Series([1e-16]) + self.assertRaises(ValueError, s.to_json) + + # ok + df = DataFrame([[1e-15,'foo',1e-8]],columns=list('ABC')) + df.to_json() + @network @slow def test_round_trip_exception_(self):