ENH: xz compression in to_csv()

terfilip · jreback · commit 247fe0718c3b · 2016-03-23T16:37:55.000-04:00
closes #11852 closes #12668
diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
@@ -4,4 +4,5 @@ google-api-python-client==1.2
 python-gflags==2.0
 oauth2client==1.5.0
 pathlib
+backports.lzma
 py
diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -271,6 +271,7 @@ Optional Dependencies
   `httplib2 <http://pypi.python.org/pypi/httplib2>`__
   and `google-api-python-client <http://github.com/google/google-api-python-client>`__
   : Needed for :mod:`~pandas.io.gbq`
+* `Backports.lzma <https://pypi.python.org/pypi/backports.lzma/>`__: Only for Python 2, for reading from and/or writing to xz-compressed CSV files; on Python 3, ``lzma`` support is built into the standard library.
 * One of the following combinations of libraries is needed to use the
   top-level :func:`~pandas.io.html.read_html` function:
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -217,14 +217,14 @@ chunksize : int, default ``None``
 Quoting, Compression, and File Format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``None``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
   For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
-  bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
-  '.zip', respectively, and no decompression otherwise. If using 'zip',
+  bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
+  '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
   the ZIP file must contain only one data file to be read in.
   Set to ``None`` for no decompression.
 
-  .. versionadded:: 0.18.0 support for 'zip' compression.
+  .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
 
 thousands : str, default ``None``
   Thousands separator.
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -57,6 +57,7 @@ Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV, via extension inference or explict ``compression='zip'`` (:issue:`12175`)
+- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'``; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
 - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
 
 .. _whatsnew_0181.api:
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -237,6 +237,11 @@ def east_asian_len(data, encoding=None, ambiguous_width=1):
         else:
             return len(data)
 
+    def import_lzma():
+        """ import lzma from the std library """
+        import lzma
+        return lzma
+
 else:
     string_types = basestring,
     integer_types = (int, long)
@@ -273,6 +278,12 @@ def east_asian_len(data, encoding=None, ambiguous_width=1):
         else:
             return len(data)
 
+    def import_lzma():
+        """ import the backported lzma library
+        or raise ImportError if not available """
+        from backports import lzma
+        return lzma
+
 string_and_binary_types = string_types + (binary_type,)
 
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1301,7 +1301,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
         compression : string, optional
             a string representing the compression to use in the output file,
-            allowed values are 'gzip', 'bz2',
+            allowed values are 'gzip', 'bz2', 'xz',
             only used when the first argument is a filename
         line_terminator : string, default '\\n'
             The newline character or character sequence to use in the output
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -375,6 +375,9 @@ def _get_handle(path, mode, encoding=None, compression=None):
                 raise ValueError('Multiple files found in ZIP file.'
                                  ' Only one file per ZIP :{}'
                                  .format(zip_names))
+        elif compression == 'xz':
+            lzma = compat.import_lzma()
+            f = lzma.LZMAFile(path, mode)
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -158,14 +158,14 @@ class ParserWarning(Warning):
     information
     <http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ on
     ``iterator`` and ``chunksize``.
-compression : {'infer', 'gzip', 'bz2', 'zip', None}, default 'infer'
+compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
     For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
-    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
-    '.zip', respectively, and no decompression otherwise. If using 'zip',
-    the ZIP file must contain only one data file to be read in.
+    bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
+    '.zip', or '.xz', respectively, and no decompression otherwise. If using
+    'zip', the ZIP file must contain only one data file to be read in.
     Set to None for no decompression.
 
-    .. versionadded:: 0.18.0 support for 'zip' compression.
+    .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
 
 thousands : str, default None
     Thousands separator
@@ -279,6 +279,8 @@ def _read(filepath_or_buffer, kwds):
                 inferred_compression = 'bz2'
             elif filepath_or_buffer.endswith('.zip'):
                 inferred_compression = 'zip'
+            elif filepath_or_buffer.endswith('.xz'):
+                inferred_compression = 'xz'
             else:
                 inferred_compression = None
         else:
@@ -1421,6 +1423,18 @@ def _wrap_compressed(f, compression, encoding=None):
             raise ValueError('Multiple files found in compressed '
                              'zip file %s', str(zip_names))
 
+    elif compression == 'xz':
+
+        lzma = compat.import_lzma()
+        f = lzma.LZMAFile(f)
+
+        if compat.PY3:
+            from io import TextIOWrapper
+
+            f = TextIOWrapper(f)
+
+        return f
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2808,6 +2808,32 @@ def test_bz2(self):
             result = self.read_csv(path, compression='infer')
             tm.assert_frame_equal(result, expected)
 
+    def test_xz(self):
+        lzma = tm._skip_if_no_lzma()
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = lzma.LZMAFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='xz')
+            tm.assert_frame_equal(result, expected)
+
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression='xz')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.xz') as path:
+            tmp = lzma.LZMAFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
     def test_decompression_regex_sep(self):
         try:
             import gzip
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -582,6 +582,14 @@ cdef class TextReader:
                 else:
                     raise ValueError('Multiple files found in compressed '
                                      'zip file %s', str(zip_names))
+            elif self.compression == 'xz':
+                from pandas import compat
+                lzma = compat.import_lzma()
+
+                if isinstance(source, basestring):
+                    source = lzma.LZMAFile(source, 'rb')
+                else:
+                    source = lzma.LZMAFile(filename=source)
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -985,6 +985,28 @@ def test_to_csv_compression_bz2(self):
             for col in df.columns:
                 self.assertIn(col, text)
 
+    def test_to_csv_compression_xz(self):
+        # GH11852
+        # use the compression kw in to_csv
+        tm._skip_if_no_lzma()
+        df = DataFrame([[0.123456, 0.234567, 0.567567],
+                        [12.32112, 123123.2, 321321.2]],
+                       index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+        with ensure_clean() as filename:
+
+            df.to_csv(filename, compression="xz")
+
+            # test the round trip - to_csv -> read_csv
+            rs = read_csv(filename, compression="xz", index_col=0)
+            assert_frame_equal(df, rs)
+
+            # explicitly make sure file is xzipped
+            lzma = compat.import_lzma()
+            f = lzma.open(filename, 'rb')
+            assert_frame_equal(df, read_csv(f, index_col=0))
+            f.close()
+
     def test_to_csv_compression_value_error(self):
         # GH7615
         # use the compression kw in to_csv
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -246,6 +246,14 @@ def _skip_if_scipy_0_17():
         raise nose.SkipTest("scipy 0.17")
 
 
+def _skip_if_no_lzma():
+    try:
+        return compat.import_lzma()
+    except ImportError:
+        import nose
+        raise nose.SkipTest('need backports.lzma to run')
+
+
 def _skip_if_no_xarray():
     try:
         import xarray