
Commit 3b4121b

simongibbons authored and TomAugspurger committed
ENH: Add transparent compression to json reading/writing (#17798)
* ENH: Add transparent compression to json reading/writing. This works in the same way as the argument to ``read_csv`` and ``to_csv``. I've added tests confirming that it works with file paths as well as file URLs and S3 URLs.
* Fix PEP8 violations
* Add PR number to whatsnew entry
* Remove problematic Windows test (the S3 test hits the same edge case)
* Extract decompress_file function so that pytest.parametrize can be used cleanly
* Fix typo in whatsnew entry
1 parent 4379d04 commit 3b4121b
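
For quick orientation, here is a minimal round-trip using the API this commit adds (pandas 0.21.0+; the file name is illustrative):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

# Writing requires an explicit compression type; there is no 'infer' on write.
df.to_json('frame.json.gz', compression='gzip')

# Reading defaults to compression='infer', so the '.gz' suffix is enough.
roundtripped = pd.read_json('frame.json.gz')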

File tree

6 files changed, +174 -19 lines changed


doc/source/whatsnew/v0.21.0.txt

+1-1
@@ -195,7 +195,7 @@ Other Enhancements
 - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
 - Improved the import time of pandas by about 2.25x (:issue:`16764`)
-
+- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)

 .. _whatsnew_0210.api_breaking:

pandas/core/generic.py

+8-2
@@ -1258,7 +1258,7 @@ def _repr_latex_(self):

     def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False):
+                default_handler=None, lines=False, compression=None):
         """
         Convert the object to a JSON string.

@@ -1320,6 +1320,12 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

             .. versionadded:: 0.19.0

+        compression : {None, 'gzip', 'bz2', 'xz'}
+            A string representing the compression to use in the output file,
+            only used when the first argument is a filename
+
+            .. versionadded:: 0.21.0
+
         Returns
         -------
         same type as input object with filtered info axis

@@ -1372,7 +1378,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
                             double_precision=double_precision,
                             force_ascii=force_ascii, date_unit=date_unit,
                             default_handler=default_handler,
-                            lines=lines)
+                            lines=lines, compression=compression)

     def to_hdf(self, path_or_buf, key, **kwargs):
         """Write the contained data to an HDF5 file using HDFStore.
pandas/io/json/json.py

+31-15
@@ -9,7 +9,8 @@
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
-                              _stringify_path, BaseIterator)
+                              _infer_compression, _stringify_path,
+                              BaseIterator)
 from pandas.io.parsers import _validate_integer
 from pandas.core.common import AbstractMethodError
 from pandas.core.reshape.concat import concat

@@ -27,7 +28,7 @@
 # interface to/from
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms',
-            default_handler=None, lines=False):
+            default_handler=None, lines=False, compression=None):

     path_or_buf = _stringify_path(path_or_buf)
     if lines and orient != 'records':

@@ -54,8 +55,11 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
         s = _convert_to_line_delimits(s)

     if isinstance(path_or_buf, compat.string_types):
-        with open(path_or_buf, 'w') as fh:
+        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
+        try:
             fh.write(s)
+        finally:
+            fh.close()
     elif path_or_buf is None:
         return s
     else:
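
The write path now delegates file opening to pandas' internal _get_handle rather than a bare open(), so the compressed writers are selected in one place. A rough, standalone sketch of what that helper does for mode 'w' (the real implementation lives in pandas.io.common and also handles 'zip', encodings, and Python 2/3 differences; open_for_write is a hypothetical name):

import bz2
import gzip
import lzma


def open_for_write(path, compression=None):
    """Return a text-mode handle, wrapped in a compressor when requested."""
    if compression is None:
        return open(path, 'w')
    elif compression == 'gzip':
        return gzip.open(path, 'wt')
    elif compression == 'bz2':
        return bz2.open(path, 'wt')
    elif compression == 'xz':
        return lzma.open(path, 'wt')
    raise ValueError('Unrecognized compression type: {}'.format(compression))

The explicit try/finally in the hunk above replaces the with-block because _get_handle returns the handle directly rather than acting as a context manager here, and the handle must still be closed if the write fails.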
@@ -178,7 +182,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None):
+              lines=False, chunksize=None, compression='infer'):
     """
     Convert a JSON string to pandas object

@@ -277,6 +281,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

         .. versionadded:: 0.21.0

+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
+        otherwise. If using 'zip', the ZIP file must contain only one data
+        file to be read in. Set to None for no decompression.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
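
The 'infer' default means the compression is deduced from the file extension before the data is opened. A hypothetical sketch of that extension-based inference (the real helper is _infer_compression in pandas.io.common; infer_compression here is an illustrative stand-in):

# Map recognized file extensions to compression names.
_EXTENSION_MAP = {'.gz': 'gzip', '.bz2': 'bz2', '.zip': 'zip', '.xz': 'xz'}


def infer_compression(path_or_buf, compression):
    if compression != 'infer':
        return compression  # explicit choice (including None) wins
    if not isinstance(path_or_buf, str):
        return None  # buffers carry no filename to infer from
    for ext, name in _EXTENSION_MAP.items():
        if path_or_buf.endswith(ext):
            return name
    return None

For example, infer_compression('data.json.gz', 'infer') returns 'gzip', while infer_compression('data.json', 'infer') returns None.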
@@ -334,15 +347,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
     """

-    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
-                                                      encoding=encoding)
+    compression = _infer_compression(path_or_buf, compression)
+    filepath_or_buffer, _, compression = get_filepath_or_buffer(
+        path_or_buf, encoding=encoding, compression=compression,
+    )

     json_reader = JsonReader(
         filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
         convert_axes=convert_axes, convert_dates=convert_dates,
         keep_default_dates=keep_default_dates, numpy=numpy,
         precise_float=precise_float, date_unit=date_unit, encoding=encoding,
-        lines=lines, chunksize=chunksize
+        lines=lines, chunksize=chunksize, compression=compression,
     )

     if chunksize:

@@ -361,7 +376,7 @@ class JsonReader(BaseIterator):
     """
     def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
                  convert_dates, keep_default_dates, numpy, precise_float,
-                 date_unit, encoding, lines, chunksize):
+                 date_unit, encoding, lines, chunksize, compression):

         self.path_or_buf = filepath_or_buffer
         self.orient = orient

@@ -374,6 +389,7 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.compression = compression
         self.lines = lines
         self.chunksize = chunksize
         self.nrows_seen = 0

@@ -415,20 +431,20 @@ def _get_data_from_filepath(self, filepath_or_buffer):

         data = filepath_or_buffer

+        exists = False
         if isinstance(data, compat.string_types):
             try:
                 exists = os.path.exists(filepath_or_buffer)
-
             # gh-5874: if the filepath is too long will raise here
             except (TypeError, ValueError):
                 pass

-            else:
-                if exists:
-                    data, _ = _get_handle(filepath_or_buffer, 'r',
-                                          encoding=self.encoding)
-                    self.should_close = True
-                    self.open_stream = data
+        if exists or self.compression is not None:
+            data, _ = _get_handle(filepath_or_buffer, 'r',
+                                  encoding=self.encoding,
+                                  compression=self.compression)
+            self.should_close = True
+            self.open_stream = data

         return data
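
The key change in the last hunk is the condition: previously a handle was opened only when the string path existed on disk; now it is also opened whenever decompression was requested, and the reader records that it owns the handle so it can close it later. A hypothetical standalone sketch of that pattern, supporting gzip only for brevity (get_data_from_filepath here is an illustrative stand-in):

import gzip
import os


def get_data_from_filepath(path_or_buf, compression=None):
    """Open path_or_buf for reading, decompressing when asked."""
    data = path_or_buf
    should_close = False
    exists = False
    if isinstance(data, str):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            # gh-5874: overly long "paths" raise here; treat them as raw JSON
            pass
    if exists or compression is not None:
        # We opened this handle ourselves, so we are responsible for closing it.
        data = (gzip.open if compression == 'gzip' else open)(path_or_buf, 'rt')
        should_close = True
    return data, should_close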

pandas/tests/io/json/data/tsframe_v012.json.zip

436 Bytes
Binary file not shown.
pandas/tests/io/json/test_compression.py

+133
@@ -0,0 +1,133 @@
+import pytest
+import moto
+
+import pandas as pd
+from pandas import compat
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_raises_regex
+
+
+COMPRESSION_TYPES = [None, 'bz2', 'gzip', 'xz']
+
+
+def decompress_file(path, compression):
+    if compression is None:
+        f = open(path, 'rb')
+    elif compression == 'gzip':
+        import gzip
+        f = gzip.GzipFile(path, 'rb')
+    elif compression == 'bz2':
+        import bz2
+        f = bz2.BZ2File(path, 'rb')
+    elif compression == 'xz':
+        lzma = compat.import_lzma()
+        f = lzma.open(path, 'rb')
+    else:
+        msg = 'Unrecognized compression type: {}'.format(compression)
+        raise ValueError(msg)
+
+    result = f.read().decode('utf8')
+    f.close()
+    return result
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_compression_roundtrip(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        df.to_json(path, compression=compression)
+        assert_frame_equal(df, pd.read_json(path, compression=compression))
+
+        # explicitly ensure file was compressed.
+        uncompressed_content = decompress_file(path, compression)
+        assert_frame_equal(df, pd.read_json(uncompressed_content))
+
+
+def test_compress_zip_value_error():
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        import zipfile
+        pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
+
+
+def test_read_zipped_json():
+    uncompressed_path = tm.get_data_path("tsframe_v012.json")
+    uncompressed_df = pd.read_json(uncompressed_path)
+
+    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
+    compressed_df = pd.read_json(compressed_path, compression='zip')
+
+    assert_frame_equal(uncompressed_df, compressed_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_with_s3_url(compression):
+    boto3 = pytest.importorskip('boto3')
+    pytest.importorskip('s3fs')
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with moto.mock_s3():
+        conn = boto3.resource("s3", region_name="us-east-1")
+        bucket = conn.create_bucket(Bucket="pandas-test")
+
+        with tm.ensure_clean() as path:
+            df.to_json(path, compression=compression)
+            with open(path, 'rb') as f:
+                bucket.put_object(Key='test-1', Body=f)
+
+        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_lines_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+        roundtripped_df = pd.read_json(path, lines=True,
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_chunksize_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+
+        roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
+                                                 compression=compression))
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_write_unsupported_compression_type():
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, df.to_json,
+                            path, compression="unsupported")
+
+
+def test_read_unsupported_compression_type():
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, pd.read_json,
+                            path, compression="unsupported")
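
The new module covers round-trips for every supported codec, zip reading, S3 URLs (via moto), line-delimited JSON, chunked reads, and both error paths. Assuming the optional dependencies are installed (moto at import time; boto3 and s3fs for the S3 test), it can presumably be run in isolation with:

pytest pandas/tests/io/json/test_compression.py -v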

pandas/tests/io/json/test_readlines.py

+1-1
@@ -128,7 +128,7 @@ def test_readjson_chunks_closes(chunksize):
         path, orient=None, typ="frame", dtype=True, convert_axes=True,
         convert_dates=True, keep_default_dates=True, numpy=False,
         precise_float=False, date_unit=None, encoding=None,
-        lines=True, chunksize=chunksize)
+        lines=True, chunksize=chunksize, compression=None)
     reader.read()
     assert reader.open_stream.closed, "didn't close stream with \
         chunksize = %s" % chunksize
