PERF: changed default to numpy=False to have correct parsing using unordered JSON

jreback · jreback · commit 740b10fe1d5d · 2013-06-13T13:08:42.000-04:00
Eliminated fallback parsing with numpy=True; parsing will now raise ValueError
      if it fails (a known case is string data in the frame)
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -954,13 +954,21 @@ with optional parameters:
 
 - path_or_buf : the pathname or buffer to write the output
   This can be ``None`` in which case a JSON string is returned
-- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame``
+- orient :
 
-  * split   : dict like {index -> [index], columns -> [columns], data -> [values]}
-  * records : list like [{column -> value}, ... , {column -> value}]
-  * index   : dict like {index -> {column -> value}}
-  * columns : dict like {column -> {index -> value}}
-  * values  : just the values array
+  Series :
+    default is 'index', allowed values are: {'split','records','index'}
+
+  DataFrame :
+    default is 'columns', allowed values are: {'split','records','index','columns','values'}
+
+  The format of the JSON string
+
+    * split : dict like {index -> [index], columns -> [columns], data -> [values]}
+    * records : list like [{column -> value}, ... , {column -> value}]
+    * index : dict like {index -> {column -> value}}
+    * columns : dict like {column -> {index -> value}}
+    * values : just the values array
 
 - date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch
 - double_precision : The number of decimal places to use when encoding floating point values, default 10.
@@ -1007,17 +1015,28 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
   is expected. For instance, a local file could be
   file ://localhost/path/to/table.json
 - typ    : type of object to recover (series or frame), default 'frame'
-- orient : The format of the JSON string, one of the following
+- orient :
+
+  Series :
+    default is 'index', allowed values are: {'split','records','index'}
+
+  DataFrame :
+    default is 'columns', allowed values are: {'split','records','index','columns','values'}
+
+  The format of the JSON string
 
-  * split : dict like {index -> [index], name -> name, data -> [values]}
-  * records : list like [value, ... , value]
-  * index : dict like {index -> value}
+    * split : dict like {index -> [index], columns -> [columns], data -> [values]}
+    * records : list like [{column -> value}, ... , {column -> value}]
+    * index : dict like {index -> {column -> value}}
+    * columns : dict like {column -> {index -> value}}
+    * values : just the values array
 
 - dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data
 - convert_axes : boolean, try to convert the axes to the proper dtypes, default is True
 - convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True
 - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns
-- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs.
+- numpy: direct decoding to numpy arrays. default is False.
+  Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``.
 
 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is
 not parsable.
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -507,8 +507,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
         ----------
         path_or_buf : the path or buffer to write the result string
             if this is None, return a StringIO of the converted string
-        orient : {'split', 'records', 'index', 'columns', 'values'},
-            default is 'index' for Series, 'columns' for DataFrame
+        orient :
+
+            Series :
+              default is 'index'
+              allowed values are: {'split','records','index'}
+
+            DataFrame :
+              default is 'columns'
+              allowed values are: {'split','records','index','columns','values'}
 
             The format of the JSON string
             split : dict like
@@ -517,6 +524,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
             index : dict like {index -> {column -> value}}
             columns : dict like {column -> {index -> value}}
             values : just the values array
+
         date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601),
             default is epoch
         double_precision : The number of decimal places to use when encoding
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -119,7 +119,7 @@ def _format_dates(self):
                     self.obj[c] = self._format_to_date(self.obj[c])
 
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
-              convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True):
+              convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False):
     """
     Convert JSON string to pandas object
 
@@ -129,12 +129,22 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host
         is expected. For instance, a local file could be
         file ://localhost/path/to/table.json
-    orient : {'split', 'records', 'index'}, default 'index'
+    orient :
+        Series :
+          default is 'index'
+          allowed values are: {'split','records','index'}
+
+        DataFrame :
+          default is 'columns'
+          allowed values are: {'split','records','index','columns','values'}
+
         The format of the JSON string
-        split : dict like
-            {index -> [index], name -> name, data -> [values]}
-        records : list like [value, ... , value]
-        index : dict like {index -> value}
+          split : dict like {index -> [index], columns -> [columns], data -> [values]}
+          records : list like [{column -> value}, ... , {column -> value}]
+          index : dict like {index -> {column -> value}}
+          columns : dict like {column -> {index -> value}}
+          values : just the values array
+
     typ : type of object to recover (series or frame), default 'frame'
     dtype : if True, infer dtypes, if a dict of column to dtype, then use those,
         if False, then don't infer dtypes at all, default is True,
@@ -144,8 +154,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         default is True
     keep_default_dates : boolean, default True. If parsing dates,
         then parse the default datelike columns
-    numpy: direct decoding to numpy arrays. default True but falls back
-        to standard decoding if a problem occurs.
+    numpy: direct decoding to numpy arrays. default is False. Note that the JSON ordering MUST be the same
+        for each term if numpy=True.
 
     Returns
     -------
@@ -177,7 +187,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
 class Parser(object):
     
-    def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True):
+    def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
         self.json = json
 
         if orient is None:
@@ -196,7 +206,15 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr
         self.obj = None
 
     def parse(self):
-        self._parse()
+
+        # dispatch on the numpy flag; there is no fallback to the other parser
+        numpy = self.numpy
+        if numpy:
+            self._parse_numpy()
+
+        else:
+            self._parse_no_numpy()
+
         if self.obj is None: return None
         if self.convert_axes:
             self._convert_axes()
@@ -304,33 +322,30 @@ def _try_convert_dates(self):
 class SeriesParser(Parser):
     _default_orient = 'index'
 
-    def _parse(self):
+    def _parse_no_numpy(self):
+    
+        json = self.json
+        orient = self.orient
+        if orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            self.obj = Series(dtype=None, **decoded)
+        else:
+            self.obj = Series(loads(json), dtype=None)
+
+    def _parse_numpy(self):
 
         json = self.json
         orient = self.orient
-        numpy = self.numpy
-    
-        if numpy:
-            try:
-                if orient == "split":
-                    decoded = loads(json, dtype=None, numpy=True)
-                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
-                    self.obj = Series(**decoded)
-                elif orient == "columns" or orient == "index":
-                    self.obj = Series(*loads(json, dtype=None, numpy=True,
-                                             labelled=True))
-                else:
-                    self.obj = Series(loads(json, dtype=None, numpy=True))
-            except (ValueError,TypeError):
-                numpy = False
-
-        if not numpy:
-            if orient == "split":
-                decoded = dict((str(k), v)
-                               for k, v in loads(json).iteritems())
-                self.obj = Series(dtype=None, **decoded)
-            else:
-                self.obj = Series(loads(json), dtype=None)
+        if orient == "split":
+            decoded = loads(json, dtype=None, numpy=True)
+            decoded = dict((str(k), v) for k, v in decoded.iteritems())
+            self.obj = Series(**decoded)
+        elif orient == "columns" or orient == "index":
+            self.obj = Series(*loads(json, dtype=None, numpy=True,
+                                     labelled=True))
+        else:
+            self.obj = Series(loads(json, dtype=None, numpy=True))
 
     def _try_convert_types(self):
         if self.obj is None: return
@@ -341,42 +356,40 @@ def _try_convert_types(self):
 class FrameParser(Parser):
     _default_orient = 'columns'
 
-    def _parse(self):
+    def _parse_numpy(self):
 
         json = self.json
         orient = self.orient
-        numpy = self.numpy
 
-        if numpy:
-            try:
-                if orient == "columns":
-                    args = loads(json, dtype=None, numpy=True, labelled=True)
-                    if args:
-                        args = (args[0].T, args[2], args[1])
-                    self.obj = DataFrame(*args)
-                elif orient == "split":
-                    decoded = loads(json, dtype=None, numpy=True)
-                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
-                    self.obj = DataFrame(**decoded)
-                elif orient == "values":
-                    self.obj = DataFrame(loads(json, dtype=None, numpy=True))
-                else:
-                    self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
-                                         labelled=True))
-            except (ValueError,TypeError):
-                numpy = False
-
-        if not numpy:
-            if orient == "columns":
-                self.obj = DataFrame(loads(json), dtype=None)
-            elif orient == "split":
-                decoded = dict((str(k), v)
-                               for k, v in loads(json).iteritems())
-                self.obj = DataFrame(dtype=None, **decoded)
-            elif orient == "index":
-                self.obj = DataFrame(loads(json), dtype=None).T
-            else:
-                self.obj = DataFrame(loads(json), dtype=None)
+        if orient == "columns":
+            args = loads(json, dtype=None, numpy=True, labelled=True)
+            if args:
+                args = (args[0].T, args[2], args[1])
+            self.obj = DataFrame(*args)
+        elif orient == "split":
+            decoded = loads(json, dtype=None, numpy=True)
+            decoded = dict((str(k), v) for k, v in decoded.iteritems())
+            self.obj = DataFrame(**decoded)
+        elif orient == "values":
+            self.obj = DataFrame(loads(json, dtype=None, numpy=True))
+        else:
+            self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True))
+
+    def _parse_no_numpy(self):
+
+        json = self.json
+        orient = self.orient
+
+        if orient == "columns":
+            self.obj = DataFrame(loads(json), dtype=None)
+        elif orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            self.obj = DataFrame(dtype=None, **decoded)
+        elif orient == "index":
+            self.obj = DataFrame(loads(json), dtype=None).T
+        else:
+            self.obj = DataFrame(loads(json), dtype=None)
 
     def _try_convert_types(self):
         if self.obj is None: return
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py