ENH: xz compression_args df.to_pickle(path, compression={'method':'xz','pr… (pandas-dev#53443)

segatrade · topper-123 · commit 3ad16c9c5772 · 2023-06-05T23:01:58.000+01:00
* xz compression_args df.to_pickle(path, compression={'method':'xz','preset': 9})

* fix Line too long

pandas/io/common.py:829:89: E501 Line too long (103 &gt; 88 characters)

* fix black

* xz compression_args docs

* ruff 458:89: E501 Line too long

* fix mypy Failed
pandas/io/common.py:830: error: Argument 1 to "LZMAFile" has incompatible type "Union[str, BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], PathLike[bytes]], IO[bytes]]]"  [arg-type]
pandas/io/common.py:831: error: Unused "type: ignore" comment

* xz compression tests
docs ench update

* Sphinx lint &amp; black fix

* additional assert in test
sort whatsnew entries alphabetically &amp; black fix

* fix test obj

* fix path test

* removed test_xz_compression_invalid_args , fix read_csv(compression

---------

Co-authored-by: Se &lt;--global&gt;
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -101,9 +101,9 @@ Other enhancements
 - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
 - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
 - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
+- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.notable_bug_fixes:
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -455,10 +455,10 @@
     (otherwise no compression).
     Set to ``None`` for no compression.
     Can also be a dict with key ``'method'`` set
-    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
-    key-value pairs are forwarded to
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+    other key-value pairs are forwarded to
     ``zipfile.ZipFile``, ``gzip.GzipFile``,
-    ``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or
+    ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
     ``tarfile.TarFile``, respectively.
     As an example, the following could be passed for faster compression and to create
     a reproducible gzip archive:
@@ -477,10 +477,10 @@
     If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
     Set to ``None`` for no decompression.
     Can also be a dict with key ``'method'`` set
-    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
-    key-value pairs are forwarded to
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+    other key-value pairs are forwarded to
     ``zipfile.ZipFile``, ``gzip.GzipFile``,
-    ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
+    ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
     ``tarfile.TarFile``, respectively.
     As an example, the following could be passed for Zstandard decompression using a
     custom compression dictionary:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -825,8 +825,10 @@ def get_handle(
         elif compression == "xz":
             # error: Argument 1 to "LZMAFile" has incompatible type "Union[str,
             # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str],
-            # PathLike[bytes]], IO[bytes]]]"
-            handle = get_lzma_file()(handle, ioargs.mode)  # type: ignore[arg-type]
+            # PathLike[bytes]], IO[bytes]], None]"
+            handle = get_lzma_file()(
+                handle, ioargs.mode, **compression_args  # type: ignore[arg-type]
+            )
 
         # Zstd Compression
         elif compression == "zstd":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
@@ -253,6 +253,28 @@ def test_gzip_compression_level(obj, method):
         assert compressed_size_default < compressed_size_fast
 
 
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_xz_compression_level_read(obj, method):
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression="xz")
+        compressed_size_default = os.path.getsize(path)
+        getattr(obj, method)(path, compression={"method": "xz", "preset": 1})
+        compressed_size_fast = os.path.getsize(path)
+        assert compressed_size_default < compressed_size_fast
+        if method == "to_csv":
+            pd.read_csv(path, compression="xz")
+
+
 @pytest.mark.parametrize(
     "obj",
     [