diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 9334ca9e03cc1..cc3462dbf9ed0 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -4,4 +4,5 @@ google-api-python-client==1.2 python-gflags==2.0 oauth2client==1.5.0 pathlib +backports.lzma py diff --git a/doc/source/install.rst b/doc/source/install.rst index a7abf4ce54fb9..6bf707de5d925 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -271,6 +271,7 @@ Optional Dependencies `httplib2 `__ and `google-api-python-client `__ : Needed for :mod:`~pandas.io.gbq` +* `Backports.lzma `__: Only for Python 2, for reading from and/or writing to xz-compressed CSV files. * One of the following combinations of libraries is needed to use the top-level :func:`~pandas.io.html.read_html` function: diff --git a/doc/source/io.rst b/doc/source/io.rst index 4abc3d722465d..d606e919e4292 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -217,14 +217,14 @@ chunksize : int, default ``None`` Quoting, Compression, and File Format +++++++++++++++++++++++++++++++++++++ -compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``None``}, default ``'infer'`` +compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or - '.zip', respectively, and no decompression otherwise. If using 'zip', + bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. - .. versionadded:: 0.18.0 support for 'zip' compression. + .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. thousands : str, default ``None`` Thousands separator. 
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index e664020946baf..7cf5a0780bb7c 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -57,6 +57,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV, via extension inference or explict ``compression='zip'`` (:issue:`12175`) +- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'``; ``xz`` compression is also supported by ``DataFrame.to_csv`` via ``compression='xz'`` (:issue:`11852`) - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`). .. _whatsnew_0181.api: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2c90bd1c38b8..a504f91705733 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1301,7 +1301,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', + allowed values are 'gzip', 'bz2', 'xz', only used when the first argument is a filename line_terminator : string, default '\\n' The newline character or character sequence to use in the output diff --git a/pandas/io/common.py b/pandas/io/common.py index d44057178d27e..485f52f4274ff 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -375,6 +375,12 @@ def _get_handle(path, mode, encoding=None, compression=None): raise ValueError('Multiple files found in ZIP file.' 
' Only one file per ZIP :{}' .format(zip_names)) + elif compression == 'xz': + if compat.PY2: + from backports import lzma + else: + import lzma + f = lzma.LZMAFile(path, mode) else: raise ValueError('Unrecognized compression type: %s' % compression) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 301b9c889f5ff..b475e316ab04e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -158,14 +158,14 @@ class ParserWarning(Warning): information `_ on ``iterator`` and ``chunksize``. -compression : {'infer', 'gzip', 'bz2', 'zip', None}, default 'infer' +compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or - '.zip', respectively, and no decompression otherwise. If using 'zip', - the ZIP file must contain only one data file to be read in. + bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + '.zip', or '.xz', respectively, and no decompression otherwise. If using + 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - .. versionadded:: 0.18.0 support for 'zip' compression. + .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. 
thousands : str, default None Thousands separator @@ -279,6 +279,8 @@ def _read(filepath_or_buffer, kwds): inferred_compression = 'bz2' elif filepath_or_buffer.endswith('.zip'): inferred_compression = 'zip' + elif filepath_or_buffer.endswith('.xz'): + inferred_compression = 'xz' else: inferred_compression = None else: @@ -1421,6 +1423,21 @@ def _wrap_compressed(f, compression, encoding=None): raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) + elif compression == 'xz': + + if compat.PY3: + import lzma + else: + from backports import lzma + f = lzma.LZMAFile(f) + + if compat.PY3: + from io import TextIOWrapper + + f = TextIOWrapper(f) + + return f + else: raise ValueError('do not recognize compression method %s' % compression) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7c7b40d77e821..ea156b95ed3f7 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2808,6 +2808,38 @@ def test_bz2(self): result = self.read_csv(path, compression='infer') tm.assert_frame_equal(result, expected) + def test_xz(self): + try: + if compat.PY3: + import lzma + else: + from backports import lzma + except ImportError: + raise nose.SkipTest('need lzma to run') + + with open(self.csv1, 'rb') as data_file: + data = data_file.read() + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = lzma.LZMAFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='xz') + tm.assert_frame_equal(result, expected) + + with open(path, 'rb') as f: + result = self.read_csv(f, compression='xz') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean('test.xz') as path: + tmp = lzma.LZMAFile(path, mode='wb') + tmp.write(data) + tmp.close() + result = self.read_csv(path, compression='infer') + tm.assert_frame_equal(result, expected) + def test_decompression_regex_sep(self): try: import gzip diff --git a/pandas/parser.pyx 
b/pandas/parser.pyx index 8bfc0ab8d6c56..a146b57cf5319 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -582,6 +582,16 @@ cdef class TextReader: else: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) + elif self.compression == 'xz': + if PY3: + import lzma + else: + from backports import lzma + + if isinstance(source, basestring): + source = lzma.LZMAFile(source, 'rb') + else: + source = lzma.LZMAFile(filename=source) else: raise ValueError('Unrecognized compression type: %s' % self.compression) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4faf67eda6c78..5713a246e2930 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -985,6 +985,30 @@ def test_to_csv_compression_bz2(self): for col in df.columns: self.assertIn(col, text) + def test_to_csv_compression_xz(self): + # GH11852 + # use the compression kw in to_csv + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean() as filename: + + df.to_csv(filename, compression="xz") + + # test the round trip - to_csv -> read_csv + rs = read_csv(filename, compression="xz", index_col=0) + assert_frame_equal(df, rs) + + # explicitly make sure file is xzipped + if compat.PY2: + from backports import lzma + else: + import lzma + f = lzma.open(filename, 'rb') + assert_frame_equal(df, read_csv(f, index_col=0)) + f.close() + def test_to_csv_compression_value_error(self): # GH7615 # use the compression kw in to_csv