@@ -1,4 +1,5 @@
 # pylint: disable-msg=E1101,W0613,W0603
+from itertools import islice
 import os
 import numpy as np
 
@@ -8,8 +9,10 @@
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
-                              _stringify_path)
+                              _stringify_path, BaseIterator)
+from pandas.io.parsers import _validate_integer
 from pandas.core.common import AbstractMethodError
+from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
 from .table_schema import build_table_schema
@@ -175,7 +178,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False):
+              lines=False, chunksize=None):
     """
     Convert a JSON string to pandas object
 
@@ -264,6 +267,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
         .. versionadded:: 0.19.0
 
+    chunksize : integer, default None
+        Return JsonReader object for iteration.
+        See the `line-delimited json docs
+        <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
+        for more information on ``chunksize``.
+        This can only be passed if `lines=True`.
+        If this is None, the file will be read into memory all at once.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
@@ -323,47 +336,167 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
     filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
                                                       encoding=encoding)
-    if isinstance(filepath_or_buffer, compat.string_types):
-        try:
-            exists = os.path.exists(filepath_or_buffer)
-
-        # if the filepath is too long will raise here
-        # 5874
-        except (TypeError, ValueError):
-            exists = False
-
-        if exists:
-            fh, handles = _get_handle(filepath_or_buffer, 'r',
-                                      encoding=encoding)
-            json = fh.read()
-            fh.close()
+
+    json_reader = JsonReader(
+        filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
+        convert_axes=convert_axes, convert_dates=convert_dates,
+        keep_default_dates=keep_default_dates, numpy=numpy,
+        precise_float=precise_float, date_unit=date_unit, encoding=encoding,
+        lines=lines, chunksize=chunksize
+    )
+
+    if chunksize:
+        return json_reader
+
+    return json_reader.read()
+
+
+class JsonReader(BaseIterator):
+    """
+    JsonReader provides an interface for reading in a JSON file.
+
+    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
+    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
+    whole document.
+    """
+    def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
+                 convert_dates, keep_default_dates, numpy, precise_float,
+                 date_unit, encoding, lines, chunksize):
+
+        self.path_or_buf = filepath_or_buffer
+        self.orient = orient
+        self.typ = typ
+        self.dtype = dtype
+        self.convert_axes = convert_axes
+        self.convert_dates = convert_dates
+        self.keep_default_dates = keep_default_dates
+        self.numpy = numpy
+        self.precise_float = precise_float
+        self.date_unit = date_unit
+        self.encoding = encoding
+        self.lines = lines
+        self.chunksize = chunksize
+        self.nrows_seen = 0
+        self.should_close = False
+
+        if self.chunksize is not None:
+            self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
+            if not self.lines:
+                raise ValueError("chunksize can only be passed if lines=True")
+
+        data = self._get_data_from_filepath(filepath_or_buffer)
+        self.data = self._preprocess_data(data)
+
+    def _preprocess_data(self, data):
+        """
+        At this point, the data either has a `read` attribute (e.g. a file
+        object or a StringIO) or is a string that is a JSON document.
+
+        If self.chunksize, we prepare the data for the `__next__` method.
+        Otherwise, we read it into memory for the `read` method.
+        """
+        if hasattr(data, 'read') and not self.chunksize:
+            data = data.read()
+        if not hasattr(data, 'read') and self.chunksize:
+            data = StringIO(data)
+
+        return data
+
+    def _get_data_from_filepath(self, filepath_or_buffer):
+        """
+        read_json accepts three input types:
+            1. filepath (string-like)
+            2. file-like object (e.g. open file object, StringIO)
+            3. JSON string
+
+        This method turns (1) into (2) to simplify the rest of the processing.
+        It returns input types (2) and (3) unchanged.
+        """
+
+        data = filepath_or_buffer
+
+        if isinstance(data, compat.string_types):
+            try:
+                exists = os.path.exists(filepath_or_buffer)
+
+            # gh-5874: if the filepath is too long will raise here
+            except (TypeError, ValueError):
+                pass
+
+            else:
+                if exists:
+                    data, _ = _get_handle(filepath_or_buffer, 'r',
+                                          encoding=self.encoding)
+                    self.should_close = True
+                    self.open_stream = data
+
+        return data
+
+    def _combine_lines(self, lines):
+        """Combines a list of JSON objects into one JSON object"""
+        lines = filter(None, map(lambda x: x.strip(), lines))
+        return '[' + ','.join(lines) + ']'
+
+    def read(self):
+        """Read the whole JSON input into a pandas object"""
+        if self.lines and self.chunksize:
+            obj = concat(self)
+        elif self.lines:
+            obj = self._get_object_parser(
+                self._combine_lines(self.data.split('\n'))
+            )
         else:
-            json = filepath_or_buffer
-    elif hasattr(filepath_or_buffer, 'read'):
-        json = filepath_or_buffer.read()
-    else:
-        json = filepath_or_buffer
+            obj = self._get_object_parser(self.data)
+        self.close()
+        return obj
+
+    def _get_object_parser(self, json):
+        """parses a json document into a pandas object"""
+        typ = self.typ
+        dtype = self.dtype
+        kwargs = {
+            "orient": self.orient, "dtype": self.dtype,
+            "convert_axes": self.convert_axes,
+            "convert_dates": self.convert_dates,
+            "keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
+            "precise_float": self.precise_float, "date_unit": self.date_unit
+        }
+        obj = None
+        if typ == 'frame':
+            obj = FrameParser(json, **kwargs).parse()
+
+        if typ == 'series' or obj is None:
+            if not isinstance(dtype, bool):
+                dtype = dict(data=dtype)
+            obj = SeriesParser(json, **kwargs).parse()
+
+        return obj
+
+    def close(self):
+        """
+        If we opened a stream earlier, in _get_data_from_filepath, we should
+        close it. If an open stream or file was passed, we leave it open.
+        """
+        if self.should_close:
+            try:
+                self.open_stream.close()
+            except (IOError, AttributeError):
+                pass
 
-    if lines:
-        # If given a json lines file, we break the string into lines, add
-        # commas and put it in a json list to make a valid json object.
-        lines = list(StringIO(json.strip()))
-        json = '[' + ','.join(lines) + ']'
-
-    obj = None
-    if typ == 'frame':
-        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
-                          keep_default_dates, numpy, precise_float,
-                          date_unit).parse()
-
-    if typ == 'series' or obj is None:
-        if not isinstance(dtype, bool):
-            dtype = dict(data=dtype)
-        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates,
-                           keep_default_dates, numpy, precise_float,
-                           date_unit).parse()
-
-    return obj
+    def __next__(self):
+        lines = list(islice(self.data, self.chunksize))
+        if lines:
+            lines_json = self._combine_lines(lines)
+            obj = self._get_object_parser(lines_json)
+
+            # Make sure that the returned objects have the right index.
+            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
+            self.nrows_seen += len(obj)
+
+            return obj
+
+        self.close()
+        raise StopIteration
 
 
 class Parser(object):
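The mechanics of `__next__` above are simple enough to exercise outside the class: slice at most ``chunksize`` lines off the stream with `islice`, join them into one JSON array the way `_combine_lines` does, and renumber the index so successive chunks continue from `nrows_seen` instead of restarting at 0. A rough standalone sketch of that loop (plain functions, not the actual `JsonReader`; note that the pandas of this era accepts a literal JSON string in `read_json`, while newer versions expect it wrapped in `StringIO`):

```python
from io import StringIO
from itertools import islice

import pandas as pd

data = StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n')
chunksize = 2
nrows_seen = 0

while True:
    # As in __next__: pull at most `chunksize` lines off the stream.
    lines = list(islice(data, chunksize))
    if not lines:
        break
    # As in _combine_lines: drop blanks and wrap the records in brackets
    # so they parse as a single JSON array.
    combined = '[' + ','.join(l.strip() for l in lines if l.strip()) + ']'
    chunk = pd.read_json(combined)
    # As in __next__: renumber so indices continue across chunks.
    chunk.index = range(nrows_seen, nrows_seen + len(chunk))
    nrows_seen += len(chunk)
    print(chunk)
```

This index bookkeeping is also what lets `read()` fall back to `concat(self)` in the chunked case and still return the same index as the unchunked path.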