Skip to content

Commit 740b10f

Browse files
committed
PERF: changed default to numpy=False to have correct parsing using unordered JSON
Eliminated fallback parsing with numpy=True; this will raise ValueError if it fails to parse (a known case is strings in the frame data)
1 parent cbaf1ae commit 740b10f

File tree

4 files changed

+156
-103
lines changed

4 files changed

+156
-103
lines changed

doc/source/io.rst

+30-11
Original file line numberDiff line numberDiff line change
@@ -954,13 +954,21 @@ with optional parameters:
954954

955955
- path_or_buf : the pathname or buffer to write the output
956956
This can be ``None`` in which case a JSON string is returned
957-
- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame``
957+
- orient :
958958

959-
* split : dict like {index -> [index], columns -> [columns], data -> [values]}
960-
* records : list like [{column -> value}, ... , {column -> value}]
961-
* index : dict like {index -> {column -> value}}
962-
* columns : dict like {column -> {index -> value}}
963-
* values : just the values array
959+
Series :
960+
default is 'index', allowed values are: {'split','records','index'}
961+
962+
DataFrame :
963+
default is 'columns', allowed values are: {'split','records','index','columns','values'}
964+
965+
The format of the JSON string
966+
967+
* split : dict like {index -> [index], columns -> [columns], data -> [values]}
968+
* records : list like [{column -> value}, ... , {column -> value}]
969+
* index : dict like {index -> {column -> value}}
970+
* columns : dict like {column -> {index -> value}}
971+
* values : just the values array
964972

965973
- date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch
966974
- double_precision : The number of decimal places to use when encoding floating point values, default 10.
@@ -1007,17 +1015,28 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
10071015
is expected. For instance, a local file could be
10081016
file://localhost/path/to/table.json
10091017
- typ : type of object to recover (series or frame), default 'frame'
1010-
- orient : The format of the JSON string, one of the following
1018+
- orient :
1019+
1020+
Series :
1021+
default is 'index', allowed values are: {'split','records','index'}
1022+
1023+
DataFrame :
1024+
default is 'columns', allowed values are: {'split','records','index','columns','values'}
1025+
1026+
The format of the JSON string
10111027

1012-
* split : dict like {index -> [index], name -> name, data -> [values]}
1013-
* records : list like [value, ... , value]
1014-
* index : dict like {index -> value}
1028+
* split : dict like {index -> [index], columns -> [columns], data -> [values]}
1029+
* records : list like [{column -> value}, ... , {column -> value}]
1030+
* index : dict like {index -> {column -> value}}
1031+
* columns : dict like {column -> {index -> value}}
1032+
* values : just the values array
10151033

10161034
- dtype : if True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all. Default is True. Applies only to the data
10171035
- convert_axes : boolean, try to convert the axes to the proper dtypes, default is True
10181036
- convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True
10191037
- keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns
1020-
- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs.
1038+
- numpy: direct decoding to numpy arrays. default is False;
1039+
Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
10211040

10221041
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is
10231042
not parsable.

pandas/core/generic.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -507,8 +507,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
507507
----------
508508
path_or_buf : the path or buffer to write the result string
509509
if this is None, return a StringIO of the converted string
510-
orient : {'split', 'records', 'index', 'columns', 'values'},
511-
default is 'index' for Series, 'columns' for DataFrame
510+
orient :
511+
512+
Series :
513+
default is 'index'
514+
allowed values are: {'split','records','index'}
515+
516+
DataFrame :
517+
default is 'columns'
518+
allowed values are: {'split','records','index','columns','values'}
512519
513520
The format of the JSON string
514521
split : dict like
@@ -517,6 +524,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
517524
index : dict like {index -> {column -> value}}
518525
columns : dict like {column -> {index -> value}}
519526
values : just the values array
527+
520528
date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601),
521529
default is epoch
522530
double_precision : The number of decimal places to use when encoding

pandas/io/json.py

+79-66
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def _format_dates(self):
119119
self.obj[c] = self._format_to_date(self.obj[c])
120120

121121
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
122-
convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True):
122+
convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False):
123123
"""
124124
Convert JSON string to pandas object
125125
@@ -129,12 +129,22 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
129129
a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host
130130
is expected. For instance, a local file could be
131131
file://localhost/path/to/table.json
132-
orient : {'split', 'records', 'index'}, default 'index'
132+
orient :
133+
Series :
134+
default is 'index'
135+
allowed values are: {'split','records','index'}
136+
137+
DataFrame :
138+
default is 'columns'
139+
allowed values are: {'split','records','index','columns','values'}
140+
133141
The format of the JSON string
134-
split : dict like
135-
{index -> [index], name -> name, data -> [values]}
136-
records : list like [value, ... , value]
137-
index : dict like {index -> value}
142+
split : dict like {index -> [index], columns -> [columns], data -> [values]}
143+
records : list like [{column -> value}, ... , {column -> value}]
144+
index : dict like {index -> {column -> value}}
145+
columns : dict like {column -> {index -> value}}
146+
values : just the values array
147+
138148
typ : type of object to recover (series or frame), default 'frame'
139149
dtype : if True, infer dtypes, if a dict of column to dtype, then use those,
140150
if False, then don't infer dtypes at all, default is True,
@@ -144,8 +154,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
144154
default is True
145155
keep_default_dates : boolean, default True. If parsing dates,
146156
then parse the default datelike columns
147-
numpy: direct decoding to numpy arrays. default True but falls back
148-
to standard decoding if a problem occurs.
157+
numpy: direct decoding to numpy arrays. default is False. Note that the JSON ordering MUST be the same
158+
for each term if numpy=True.
149159
150160
Returns
151161
-------
@@ -177,7 +187,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
177187

178188
class Parser(object):
179189

180-
def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True):
190+
def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
181191
self.json = json
182192

183193
if orient is None:
@@ -196,7 +206,15 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr
196206
self.obj = None
197207

198208
def parse(self):
199-
self._parse()
209+
210+
# try numpy
211+
numpy = self.numpy
212+
if numpy:
213+
self._parse_numpy()
214+
215+
else:
216+
self._parse_no_numpy()
217+
200218
if self.obj is None: return None
201219
if self.convert_axes:
202220
self._convert_axes()
@@ -304,33 +322,30 @@ def _try_convert_dates(self):
304322
class SeriesParser(Parser):
305323
_default_orient = 'index'
306324

307-
def _parse(self):
325+
def _parse_no_numpy(self):
326+
327+
json = self.json
328+
orient = self.orient
329+
if orient == "split":
330+
decoded = dict((str(k), v)
331+
for k, v in loads(json).iteritems())
332+
self.obj = Series(dtype=None, **decoded)
333+
else:
334+
self.obj = Series(loads(json), dtype=None)
335+
336+
def _parse_numpy(self):
308337

309338
json = self.json
310339
orient = self.orient
311-
numpy = self.numpy
312-
313-
if numpy:
314-
try:
315-
if orient == "split":
316-
decoded = loads(json, dtype=None, numpy=True)
317-
decoded = dict((str(k), v) for k, v in decoded.iteritems())
318-
self.obj = Series(**decoded)
319-
elif orient == "columns" or orient == "index":
320-
self.obj = Series(*loads(json, dtype=None, numpy=True,
321-
labelled=True))
322-
else:
323-
self.obj = Series(loads(json, dtype=None, numpy=True))
324-
except (ValueError,TypeError):
325-
numpy = False
326-
327-
if not numpy:
328-
if orient == "split":
329-
decoded = dict((str(k), v)
330-
for k, v in loads(json).iteritems())
331-
self.obj = Series(dtype=None, **decoded)
332-
else:
333-
self.obj = Series(loads(json), dtype=None)
340+
if orient == "split":
341+
decoded = loads(json, dtype=None, numpy=True)
342+
decoded = dict((str(k), v) for k, v in decoded.iteritems())
343+
self.obj = Series(**decoded)
344+
elif orient == "columns" or orient == "index":
345+
self.obj = Series(*loads(json, dtype=None, numpy=True,
346+
labelled=True))
347+
else:
348+
self.obj = Series(loads(json, dtype=None, numpy=True))
334349

335350
def _try_convert_types(self):
336351
if self.obj is None: return
@@ -341,42 +356,40 @@ def _try_convert_types(self):
341356
class FrameParser(Parser):
342357
_default_orient = 'columns'
343358

344-
def _parse(self):
359+
def _parse_numpy(self):
345360

346361
json = self.json
347362
orient = self.orient
348-
numpy = self.numpy
349363

350-
if numpy:
351-
try:
352-
if orient == "columns":
353-
args = loads(json, dtype=None, numpy=True, labelled=True)
354-
if args:
355-
args = (args[0].T, args[2], args[1])
356-
self.obj = DataFrame(*args)
357-
elif orient == "split":
358-
decoded = loads(json, dtype=None, numpy=True)
359-
decoded = dict((str(k), v) for k, v in decoded.iteritems())
360-
self.obj = DataFrame(**decoded)
361-
elif orient == "values":
362-
self.obj = DataFrame(loads(json, dtype=None, numpy=True))
363-
else:
364-
self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
365-
labelled=True))
366-
except (ValueError,TypeError):
367-
numpy = False
368-
369-
if not numpy:
370-
if orient == "columns":
371-
self.obj = DataFrame(loads(json), dtype=None)
372-
elif orient == "split":
373-
decoded = dict((str(k), v)
374-
for k, v in loads(json).iteritems())
375-
self.obj = DataFrame(dtype=None, **decoded)
376-
elif orient == "index":
377-
self.obj = DataFrame(loads(json), dtype=None).T
378-
else:
379-
self.obj = DataFrame(loads(json), dtype=None)
364+
if orient == "columns":
365+
args = loads(json, dtype=None, numpy=True, labelled=True)
366+
if args:
367+
args = (args[0].T, args[2], args[1])
368+
self.obj = DataFrame(*args)
369+
elif orient == "split":
370+
decoded = loads(json, dtype=None, numpy=True)
371+
decoded = dict((str(k), v) for k, v in decoded.iteritems())
372+
self.obj = DataFrame(**decoded)
373+
elif orient == "values":
374+
self.obj = DataFrame(loads(json, dtype=None, numpy=True))
375+
else:
376+
self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True))
377+
378+
def _parse_no_numpy(self):
379+
380+
json = self.json
381+
orient = self.orient
382+
383+
if orient == "columns":
384+
self.obj = DataFrame(loads(json), dtype=None)
385+
elif orient == "split":
386+
decoded = dict((str(k), v)
387+
for k, v in loads(json).iteritems())
388+
self.obj = DataFrame(dtype=None, **decoded)
389+
elif orient == "index":
390+
self.obj = DataFrame(loads(json), dtype=None).T
391+
else:
392+
self.obj = DataFrame(loads(json), dtype=None)
380393

381394
def _try_convert_types(self):
382395
if self.obj is None: return

0 commit comments

Comments
 (0)