
Commit 3b4121b

simongibbons authored and TomAugspurger committed
ENH: Add transparent compression to json reading/writing (#17798)
* ENH: Add transparent compression to json reading/writing. This works in the same way as the argument to ``read_csv`` and ``to_csv``. I've added tests confirming that it works with file paths as well as file URLs and S3 URLs.
* Fix PEP8 violations
* Add PR number to whatsnew entry
* Remove problematic Windows test (the S3 test hits the same edge case)
* Extract decompress_file function so that pytest.parametrize can be used cleanly
* Fix typo in whatsnew entry
1 parent 4379d04 commit 3b4121b
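
For quick orientation, here is a minimal round-trip using the API this commit adds (pandas 0.21.0+; the file name is illustrative):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

# Writing requires an explicit compression type; there is no 'infer' on write.
df.to_json('frame.json.gz', compression='gzip')

# Reading defaults to compression='infer', so the '.gz' suffix is enough.
roundtripped = pd.read_json('frame.json.gz')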

File tree

6 files changed, +174 -19 lines changed


doc/source/whatsnew/v0.21.0.txt

+1-1
@@ -195,7 +195,7 @@ Other Enhancements
 - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
 - Improved the import time of pandas by about 2.25x (:issue:`16764`)
-
+- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)

 .. _whatsnew_0210.api_breaking:

pandas/core/generic.py

+8-2
@@ -1258,7 +1258,7 @@ def _repr_latex_(self):

     def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False):
+                default_handler=None, lines=False, compression=None):
         """
         Convert the object to a JSON string.

@@ -1320,6 +1320,12 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

             .. versionadded:: 0.19.0

+        compression : {None, 'gzip', 'bz2', 'xz'}
+            A string representing the compression to use in the output file,
+            only used when the first argument is a filename
+
+            .. versionadded:: 0.21.0
+
         Returns
         -------
         same type as input object with filtered info axis

@@ -1372,7 +1378,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
                             double_precision=double_precision,
                             force_ascii=force_ascii, date_unit=date_unit,
                             default_handler=default_handler,
-                            lines=lines)
+                            lines=lines, compression=compression)

     def to_hdf(self, path_or_buf, key, **kwargs):
         """Write the contained data to an HDF5 file using HDFStore.
pandas/io/json/json.py

+31-15
@@ -9,7 +9,8 @@
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
-                              _stringify_path, BaseIterator)
+                              _infer_compression, _stringify_path,
+                              BaseIterator)
 from pandas.io.parsers import _validate_integer
 from pandas.core.common import AbstractMethodError
 from pandas.core.reshape.concat import concat

@@ -27,7 +28,7 @@
 # interface to/from
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms',
-            default_handler=None, lines=False):
+            default_handler=None, lines=False, compression=None):

     path_or_buf = _stringify_path(path_or_buf)
     if lines and orient != 'records':

@@ -54,8 +55,11 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
         s = _convert_to_line_delimits(s)

     if isinstance(path_or_buf, compat.string_types):
-        with open(path_or_buf, 'w') as fh:
+        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
+        try:
             fh.write(s)
+        finally:
+            fh.close()
     elif path_or_buf is None:
         return s
     else:
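
The write path now delegates file opening to pandas' internal _get_handle rather than a bare open(), so the compressed writers are selected in one place. A rough, standalone sketch of what that helper does for mode 'w' (the real implementation lives in pandas.io.common and also handles 'zip', encodings, and Python 2/3 differences; open_for_write is a hypothetical name):

import bz2
import gzip
import lzma


def open_for_write(path, compression=None):
    """Return a text-mode handle, wrapped in a compressor when requested."""
    if compression is None:
        return open(path, 'w')
    elif compression == 'gzip':
        return gzip.open(path, 'wt')
    elif compression == 'bz2':
        return bz2.open(path, 'wt')
    elif compression == 'xz':
        return lzma.open(path, 'wt')
    raise ValueError('Unrecognized compression type: {}'.format(compression))

The explicit try/finally in the hunk above replaces the with-block because _get_handle returns the handle directly rather than acting as a context manager here, and the handle must still be closed if the write fails.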
@@ -178,7 +182,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None):
+              lines=False, chunksize=None, compression='infer'):
     """
     Convert a JSON string to pandas object

@@ -277,6 +281,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

         .. versionadded:: 0.21.0

+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
+        otherwise. If using 'zip', the ZIP file must contain only one data
+        file to be read in. Set to None for no decompression.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
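
The 'infer' default means the compression is deduced from the file extension before the data is opened. A hypothetical sketch of that extension-based inference (the real helper is _infer_compression in pandas.io.common; infer_compression here is an illustrative stand-in):

# Map recognized file extensions to compression names.
_EXTENSION_MAP = {'.gz': 'gzip', '.bz2': 'bz2', '.zip': 'zip', '.xz': 'xz'}


def infer_compression(path_or_buf, compression):
    if compression != 'infer':
        return compression  # explicit choice (including None) wins
    if not isinstance(path_or_buf, str):
        return None  # buffers carry no filename to infer from
    for ext, name in _EXTENSION_MAP.items():
        if path_or_buf.endswith(ext):
            return name
    return None

For example, infer_compression('data.json.gz', 'infer') returns 'gzip', while infer_compression('data.json', 'infer') returns None.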
@@ -334,15 +347,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
     """

-    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
-                                                      encoding=encoding)
+    compression = _infer_compression(path_or_buf, compression)
+    filepath_or_buffer, _, compression = get_filepath_or_buffer(
+        path_or_buf, encoding=encoding, compression=compression,
+    )

     json_reader = JsonReader(
         filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
         convert_axes=convert_axes, convert_dates=convert_dates,
         keep_default_dates=keep_default_dates, numpy=numpy,
         precise_float=precise_float, date_unit=date_unit, encoding=encoding,
-        lines=lines, chunksize=chunksize
+        lines=lines, chunksize=chunksize, compression=compression,
     )

     if chunksize:

@@ -361,7 +376,7 @@ class JsonReader(BaseIterator):
     """
     def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
                  convert_dates, keep_default_dates, numpy, precise_float,
-                 date_unit, encoding, lines, chunksize):
+                 date_unit, encoding, lines, chunksize, compression):

         self.path_or_buf = filepath_or_buffer
         self.orient = orient

@@ -374,6 +389,7 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.compression = compression
         self.lines = lines
         self.chunksize = chunksize
         self.nrows_seen = 0

@@ -415,20 +431,20 @@ def _get_data_from_filepath(self, filepath_or_buffer):

         data = filepath_or_buffer

+        exists = False
         if isinstance(data, compat.string_types):
             try:
                 exists = os.path.exists(filepath_or_buffer)
-
             # gh-5874: if the filepath is too long will raise here
             except (TypeError, ValueError):
                 pass

-            else:
-                if exists:
-                    data, _ = _get_handle(filepath_or_buffer, 'r',
-                                          encoding=self.encoding)
-                    self.should_close = True
-                    self.open_stream = data
+        if exists or self.compression is not None:
+            data, _ = _get_handle(filepath_or_buffer, 'r',
+                                  encoding=self.encoding,
+                                  compression=self.compression)
+            self.should_close = True
+            self.open_stream = data

         return data
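
The key change in the last hunk is the condition: previously a handle was opened only when the string path existed on disk; now it is also opened whenever decompression was requested, and the reader records that it owns the handle so it can close it later. A hypothetical standalone sketch of that pattern, supporting gzip only for brevity (get_data_from_filepath here is an illustrative stand-in):

import gzip
import os


def get_data_from_filepath(path_or_buf, compression=None):
    """Open path_or_buf for reading, decompressing when asked."""
    data = path_or_buf
    should_close = False
    exists = False
    if isinstance(data, str):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            # gh-5874: overly long "paths" raise here; treat them as raw JSON
            pass
    if exists or compression is not None:
        # We opened this handle ourselves, so we are responsible for closing it.
        data = (gzip.open if compression == 'gzip' else open)(path_or_buf, 'rt')
        should_close = True
    return data, should_close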

pandas/tests/io/json/data/tsframe_v012.json.zip

436 Bytes
Binary file not shown.
pandas/tests/io/json/test_compression.py

+133
@@ -0,0 +1,133 @@
+import pytest
+import moto
+
+import pandas as pd
+from pandas import compat
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_raises_regex
+
+
+COMPRESSION_TYPES = [None, 'bz2', 'gzip', 'xz']
+
+
+def decompress_file(path, compression):
+    if compression is None:
+        f = open(path, 'rb')
+    elif compression == 'gzip':
+        import gzip
+        f = gzip.GzipFile(path, 'rb')
+    elif compression == 'bz2':
+        import bz2
+        f = bz2.BZ2File(path, 'rb')
+    elif compression == 'xz':
+        lzma = compat.import_lzma()
+        f = lzma.open(path, 'rb')
+    else:
+        msg = 'Unrecognized compression type: {}'.format(compression)
+        raise ValueError(msg)
+
+    result = f.read().decode('utf8')
+    f.close()
+    return result
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_compression_roundtrip(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        df.to_json(path, compression=compression)
+        assert_frame_equal(df, pd.read_json(path, compression=compression))
+
+        # explicitly ensure file was compressed.
+        uncompressed_content = decompress_file(path, compression)
+        assert_frame_equal(df, pd.read_json(uncompressed_content))
+
+
+def test_compress_zip_value_error():
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        import zipfile
+        pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
+
+
+def test_read_zipped_json():
+    uncompressed_path = tm.get_data_path("tsframe_v012.json")
+    uncompressed_df = pd.read_json(uncompressed_path)
+
+    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
+    compressed_df = pd.read_json(compressed_path, compression='zip')
+
+    assert_frame_equal(uncompressed_df, compressed_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_with_s3_url(compression):
+    boto3 = pytest.importorskip('boto3')
+    pytest.importorskip('s3fs')
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with moto.mock_s3():
+        conn = boto3.resource("s3", region_name="us-east-1")
+        bucket = conn.create_bucket(Bucket="pandas-test")
+
+        with tm.ensure_clean() as path:
+            df.to_json(path, compression=compression)
+            with open(path, 'rb') as f:
+                bucket.put_object(Key='test-1', Body=f)
+
+        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_lines_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+        roundtripped_df = pd.read_json(path, lines=True,
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_chunksize_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+
+        roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
+                                                 compression=compression))
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_write_unsupported_compression_type():
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, df.to_json,
+                            path, compression="unsupported")
+
+
+def test_read_unsupported_compression_type():
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, pd.read_json,
+                            path, compression="unsupported")
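
The new module covers round-trips for every supported codec, zip reading, S3 URLs (via moto), line-delimited JSON, chunked reads, and both error paths. Assuming the optional dependencies are installed (moto at import time; boto3 and s3fs for the S3 test), it can presumably be run in isolation with:

pytest pandas/tests/io/json/test_compression.py -v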

pandas/tests/io/json/test_readlines.py

+1-1
@@ -128,7 +128,7 @@ def test_readjson_chunks_closes(chunksize):
         path, orient=None, typ="frame", dtype=True, convert_axes=True,
         convert_dates=True, keep_default_dates=True, numpy=False,
         precise_float=False, date_unit=None, encoding=None,
-        lines=True, chunksize=chunksize)
+        lines=True, chunksize=chunksize, compression=None)
     reader.read()
     assert reader.open_stream.closed, "didn't close stream with \
         chunksize = %s" % chunksize
