Skip to content

Commit 247fe07

Browse files
terfilip authored and jreback committed
ENH: xz compression in to_csv()
closes #11852 closes #12668
1 parent 85f8cf7 commit 247fe07

File tree

12 files changed

+105
-10
lines changed

12 files changed

+105
-10
lines changed

ci/requirements-2.7.pip

+1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ google-api-python-client==1.2
44
python-gflags==2.0
55
oauth2client==1.5.0
66
pathlib
7+
backports.lzma
78
py

doc/source/install.rst

+1
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ Optional Dependencies
271271
`httplib2 <http://pypi.python.org/pypi/httplib2>`__
272272
and `google-api-python-client <http://github.com/google/google-api-python-client>`__
273273
: Needed for :mod:`~pandas.io.gbq`
274+
* `Backports.lzma <https://pypi.python.org/pypi/backports.lzma/>`__: Only needed on Python 2, for reading from and/or writing to xz-compressed CSV files; on Python 3, lzma support is built into the standard library.
274275
* One of the following combinations of libraries is needed to use the
275276
top-level :func:`~pandas.io.html.read_html` function:
276277

doc/source/io.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -217,14 +217,14 @@ chunksize : int, default ``None``
217217
Quoting, Compression, and File Format
218218
+++++++++++++++++++++++++++++++++++++
219219

220-
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``None``}, default ``'infer'``
220+
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
221221
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
222-
bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
223-
'.zip', respectively, and no decompression otherwise. If using 'zip',
222+
bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
223+
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
224224
the ZIP file must contain only one data file to be read in.
225225
Set to ``None`` for no decompression.
226226

227-
.. versionadded:: 0.18.0 support for 'zip' compression.
227+
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
228228

229229
thousands : str, default ``None``
230230
Thousands separator.

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ Other Enhancements
5757
^^^^^^^^^^^^^^^^^^
5858

5959
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV, via extension inference or explicit ``compression='zip'`` (:issue:`12175`)
60+
- ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or when explicit ``compression='xz'`` is specified; ``xz`` compression is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`)
6061
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
6162

6263
.. _whatsnew_0181.api:

pandas/compat/__init__.py

+11
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,11 @@ def east_asian_len(data, encoding=None, ambiguous_width=1):
237237
else:
238238
return len(data)
239239

240+
def import_lzma():
241+
""" import lzma from the std library """
242+
import lzma
243+
return lzma
244+
240245
else:
241246
string_types = basestring,
242247
integer_types = (int, long)
@@ -273,6 +278,12 @@ def east_asian_len(data, encoding=None, ambiguous_width=1):
273278
else:
274279
return len(data)
275280

281+
def import_lzma():
282+
""" import the backported lzma library
283+
or raise ImportError if not available """
284+
from backports import lzma
285+
return lzma
286+
276287
string_and_binary_types = string_types + (binary_type,)
277288

278289

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1301,7 +1301,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
13011301
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
13021302
compression : string, optional
13031303
a string representing the compression to use in the output file,
1304-
allowed values are 'gzip', 'bz2',
1304+
allowed values are 'gzip', 'bz2', 'xz',
13051305
only used when the first argument is a filename
13061306
line_terminator : string, default '\\n'
13071307
The newline character or character sequence to use in the output

pandas/io/common.py

+3
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,9 @@ def _get_handle(path, mode, encoding=None, compression=None):
375375
raise ValueError('Multiple files found in ZIP file.'
376376
' Only one file per ZIP :{}'
377377
.format(zip_names))
378+
elif compression == 'xz':
379+
lzma = compat.import_lzma()
380+
f = lzma.LZMAFile(path, mode)
378381
else:
379382
raise ValueError('Unrecognized compression type: %s' %
380383
compression)

pandas/io/parsers.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,14 @@ class ParserWarning(Warning):
158158
information
159159
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ on
160160
``iterator`` and ``chunksize``.
161-
compression : {'infer', 'gzip', 'bz2', 'zip', None}, default 'infer'
161+
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
162162
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
163-
bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
164-
'.zip', respectively, and no decompression otherwise. If using 'zip',
165-
the ZIP file must contain only one data file to be read in.
163+
bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
164+
'.zip', or '.xz', respectively, and no decompression otherwise. If using
165+
'zip', the ZIP file must contain only one data file to be read in.
166166
Set to None for no decompression.
167167
168-
.. versionadded:: 0.18.0 support for 'zip' compression.
168+
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
169169
170170
thousands : str, default None
171171
Thousands separator
@@ -279,6 +279,8 @@ def _read(filepath_or_buffer, kwds):
279279
inferred_compression = 'bz2'
280280
elif filepath_or_buffer.endswith('.zip'):
281281
inferred_compression = 'zip'
282+
elif filepath_or_buffer.endswith('.xz'):
283+
inferred_compression = 'xz'
282284
else:
283285
inferred_compression = None
284286
else:
@@ -1421,6 +1423,18 @@ def _wrap_compressed(f, compression, encoding=None):
14211423
raise ValueError('Multiple files found in compressed '
14221424
'zip file %s', str(zip_names))
14231425

1426+
elif compression == 'xz':
1427+
1428+
lzma = compat.import_lzma()
1429+
f = lzma.LZMAFile(f)
1430+
1431+
if compat.PY3:
1432+
from io import TextIOWrapper
1433+
1434+
f = TextIOWrapper(f)
1435+
1436+
return f
1437+
14241438
else:
14251439
raise ValueError('do not recognize compression method %s'
14261440
% compression)

pandas/io/tests/test_parsers.py

+26
Original file line numberDiff line numberDiff line change
@@ -2808,6 +2808,32 @@ def test_bz2(self):
28082808
result = self.read_csv(path, compression='infer')
28092809
tm.assert_frame_equal(result, expected)
28102810

2811+
def test_xz(self):
2812+
lzma = tm._skip_if_no_lzma()
2813+
2814+
with open(self.csv1, 'rb') as data_file:
2815+
data = data_file.read()
2816+
expected = self.read_csv(self.csv1)
2817+
2818+
with tm.ensure_clean() as path:
2819+
tmp = lzma.LZMAFile(path, mode='wb')
2820+
tmp.write(data)
2821+
tmp.close()
2822+
2823+
result = self.read_csv(path, compression='xz')
2824+
tm.assert_frame_equal(result, expected)
2825+
2826+
with open(path, 'rb') as f:
2827+
result = self.read_csv(f, compression='xz')
2828+
tm.assert_frame_equal(result, expected)
2829+
2830+
with tm.ensure_clean('test.xz') as path:
2831+
tmp = lzma.LZMAFile(path, mode='wb')
2832+
tmp.write(data)
2833+
tmp.close()
2834+
result = self.read_csv(path, compression='infer')
2835+
tm.assert_frame_equal(result, expected)
2836+
28112837
def test_decompression_regex_sep(self):
28122838
try:
28132839
import gzip

pandas/parser.pyx

+8
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,14 @@ cdef class TextReader:
582582
else:
583583
raise ValueError('Multiple files found in compressed '
584584
'zip file %s', str(zip_names))
585+
elif self.compression == 'xz':
586+
from pandas import compat
587+
lzma = compat.import_lzma()
588+
589+
if isinstance(source, basestring):
590+
source = lzma.LZMAFile(source, 'rb')
591+
else:
592+
source = lzma.LZMAFile(filename=source)
585593
else:
586594
raise ValueError('Unrecognized compression type: %s' %
587595
self.compression)

pandas/tests/frame/test_to_csv.py

+22
Original file line numberDiff line numberDiff line change
@@ -985,6 +985,28 @@ def test_to_csv_compression_bz2(self):
985985
for col in df.columns:
986986
self.assertIn(col, text)
987987

988+
def test_to_csv_compression_xz(self):
989+
# GH11852
990+
# use the compression kw in to_csv
991+
tm._skip_if_no_lzma()
992+
df = DataFrame([[0.123456, 0.234567, 0.567567],
993+
[12.32112, 123123.2, 321321.2]],
994+
index=['A', 'B'], columns=['X', 'Y', 'Z'])
995+
996+
with ensure_clean() as filename:
997+
998+
df.to_csv(filename, compression="xz")
999+
1000+
# test the round trip - to_csv -> read_csv
1001+
rs = read_csv(filename, compression="xz", index_col=0)
1002+
assert_frame_equal(df, rs)
1003+
1004+
# explicitly make sure file is xzipped
1005+
lzma = compat.import_lzma()
1006+
f = lzma.open(filename, 'rb')
1007+
assert_frame_equal(df, read_csv(f, index_col=0))
1008+
f.close()
1009+
9881010
def test_to_csv_compression_value_error(self):
9891011
# GH7615
9901012
# use the compression kw in to_csv

pandas/util/testing.py

+8
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,14 @@ def _skip_if_scipy_0_17():
246246
raise nose.SkipTest("scipy 0.17")
247247

248248

249+
def _skip_if_no_lzma():
250+
try:
251+
return compat.import_lzma()
252+
except ImportError:
253+
import nose
254+
raise nose.SkipTest('need backports.lzma to run')
255+
256+
249257
def _skip_if_no_xarray():
250258
try:
251259
import xarray

0 commit comments

Comments
 (0)