Skip to content

Commit 3ad16c9

Browse files
segatradetopper-123
authored and committed
ENH: xz compression_args df.to_pickle(path, compression={'method':'xz','pr… (pandas-dev#53443)
* xz compression_args df.to_pickle(path, compression={'method':'xz','preset': 9}) * fix Line too long pandas/io/common.py:829:89: E501 Line too long (103 > 88 characters) * fix black * xz compression_args docs * ruff 458:89: E501 Line too long * fix mypy Failed pandas/io/common.py:830: error: Argument 1 to "LZMAFile" has incompatible type "Union[str, BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], PathLike[bytes]], IO[bytes]]]" [arg-type] pandas/io/common.py:831: error: Unused "type: ignore" comment * xz compression tests docs ench update * Sphinx lint & black fix * additional assert in test sort whatsnew entries alphabetically & black fix * fix test obj * fix path test * removed test_xz_compression_invalid_args , fix read_csv(compression --------- Co-authored-by: Se <--global>
1 parent 00a6aea commit 3ad16c9

File tree

4 files changed

+33
-9
lines changed

4 files changed

+33
-9
lines changed

doc/source/whatsnew/v2.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ Other enhancements
101101
- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
102102
- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
103103
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
104+
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`)
104105
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
105106
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
106-
-
107107

108108
.. ---------------------------------------------------------------------------
109109
.. _whatsnew_210.notable_bug_fixes:

pandas/core/shared_docs.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -455,10 +455,10 @@
455455
(otherwise no compression).
456456
Set to ``None`` for no compression.
457457
Can also be a dict with key ``'method'`` set
458-
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
459-
key-value pairs are forwarded to
458+
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
459+
other key-value pairs are forwarded to
460460
``zipfile.ZipFile``, ``gzip.GzipFile``,
461-
``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or
461+
``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
462462
``tarfile.TarFile``, respectively.
463463
As an example, the following could be passed for faster compression and to create
464464
a reproducible gzip archive:
@@ -477,10 +477,10 @@
477477
If using 'zip' or 'tar', the archive must contain only one data file to be read in.
478478
Set to ``None`` for no decompression.
479479
Can also be a dict with key ``'method'`` set
480-
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
481-
key-value pairs are forwarded to
480+
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
481+
other key-value pairs are forwarded to
482482
``zipfile.ZipFile``, ``gzip.GzipFile``,
483-
``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
483+
``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
484484
``tarfile.TarFile``, respectively.
485485
As an example, the following could be passed for Zstandard decompression using a
486486
custom compression dictionary:

pandas/io/common.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -825,8 +825,10 @@ def get_handle(
825825
elif compression == "xz":
826826
# error: Argument 1 to "LZMAFile" has incompatible type "Union[str,
827827
# BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str],
828-
# PathLike[bytes]], IO[bytes]]]"
829-
handle = get_lzma_file()(handle, ioargs.mode) # type: ignore[arg-type]
828+
# PathLike[bytes]], IO[bytes]], None]"
829+
handle = get_lzma_file()(
830+
handle, ioargs.mode, **compression_args # type: ignore[arg-type]
831+
)
830832

831833
# Zstd Compression
832834
elif compression == "zstd":

pandas/tests/io/test_compression.py

+22
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,28 @@ def test_gzip_compression_level(obj, method):
253253
assert compressed_size_default < compressed_size_fast
254254

255255

256+
@pytest.mark.parametrize(
    "obj",
    [
        pd.DataFrame(
            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
            columns=["X", "Y", "Z"],
        ),
        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
    ],
)
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
def test_xz_compression_level_read(obj, method):
    """Check that extra xz arguments passed via a compression dict take effect.

    The object is written twice to the same temporary path: once with
    ``compression="xz"`` and once with ``{"method": "xz", "preset": 1}``.
    The test asserts that the first file is strictly smaller than the
    second. For ``to_csv`` the second file is additionally read back with
    ``pd.read_csv`` (only checking that the read does not raise).
    """
    writer = getattr(obj, method)

    with tm.ensure_clean() as path:
        # First pass: plain string form, no extra arguments forwarded.
        writer(path, compression="xz")
        size_plain_xz = os.path.getsize(path)

        # Second pass: dict form, overwriting the same file with preset=1.
        writer(path, compression={"method": "xz", "preset": 1})
        size_preset_one = os.path.getsize(path)

        assert size_plain_xz < size_preset_one

        # Only the CSV output is read back; the result is not inspected.
        if method == "to_csv":
            pd.read_csv(path, compression="xz")
276+
277+
256278
@pytest.mark.parametrize(
257279
"obj",
258280
[

0 commit comments

Comments
 (0)