ENH: Allow compression in NDFrame.to_csv to be a dict with optional arguments (#26023) #26024


Merged: 41 commits, Aug 26, 2019

Changes from 22 commits

Commits
4e73dc4
ENH/BUG: Add arcname to to_csv for ZIP compressed csv filename (#26023)
drew-heenan Apr 8, 2019
ab7620d
DOC: Updated docs for arcname in NDFrame.to_csv (#26023)
drew-heenan Apr 8, 2019
2e782f9
conform to line length limit
drew-heenan Apr 8, 2019
83e8834
Fixed test_to_csv_zip_arcname for Windows paths
drew-heenan Apr 8, 2019
d238878
Merge remote-tracking branch 'upstream/master' into issue-26023
drew-heenan Apr 8, 2019
b41be54
to_csv compression may now be dict with possible keys 'method' and 'a…
drew-heenan Apr 9, 2019
60ea58c
test_to_csv_compression_dict uses compression_only fixture
drew-heenan Apr 9, 2019
8ba9082
delegate dict handling to _get_compression_method, type annotations
drew-heenan Apr 10, 2019
0a3a9fd
fix import order, None type annotations
drew-heenan Apr 10, 2019
a1cb3f7
compression args passed as kwargs, update relevant docs
drew-heenan Apr 14, 2019
af2a96c
style/doc improvements, change arcname to archive_name
drew-heenan Apr 15, 2019
5853a28
Merge branch 'master' into issue-26023
drew-heenan Apr 15, 2019
789751f
Merge branch 'master' into issue-26023
drew-heenan Apr 22, 2019
5b09e6f
add to_csv example, no method test, Optional types, tweaks; update wh…
drew-heenan Apr 22, 2019
68a2b4d
remove Index import type ignore
drew-heenan Apr 22, 2019
c856f50
Revert "remove Index import type ignore"
drew-heenan Apr 22, 2019
8df6c81
Merge remote-tracking branch 'upstream/master' into issue-26023
drew-heenan Apr 23, 2019
40d0252
Merge branch 'master' into issue-26023
drew-heenan Apr 26, 2019
18a735d
Improve docs/examples
drew-heenan May 5, 2019
103c877
Merge branch 'master' into issue-26023
drew-heenan May 5, 2019
b6c34bc
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jun 8, 2019
969d387
Added back missed Callable import in generic
WillAyd Jun 8, 2019
abfbc0f
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jun 9, 2019
04ae25d
Address comments
WillAyd Jun 9, 2019
9c22652
Typing cleanup
WillAyd Jun 9, 2019
56a75c2
Cleaned up docstring
WillAyd Jun 9, 2019
bbfea34
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jun 23, 2019
7717f16
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jul 15, 2019
779511e
blackify
WillAyd Jul 15, 2019
780eb04
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jul 16, 2019
6c4e679
Added annotations where feasible
WillAyd Jul 16, 2019
1b567c9
Black and lint
WillAyd Jul 16, 2019
9324b63
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Jul 17, 2019
7cf65ee
isort fixup
WillAyd Jul 17, 2019
29374f3
Docstring fixup and more annotations
WillAyd Jul 17, 2019
6701aa4
Merge remote-tracking branch 'upstream/master' into issue-26023
WillAyd Aug 24, 2019
0f5489d
lint fixup
WillAyd Aug 24, 2019
e04138e
mypy fixup
WillAyd Aug 24, 2019
6f2bf00
whatsnew fixup
WillAyd Aug 25, 2019
865aa81
Annotation and doc fixups
WillAyd Aug 25, 2019
8d1deee
mypy typeshed bug fix
WillAyd Aug 25, 2019
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -80,6 +80,7 @@ Other Enhancements
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
- :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`)
-
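
To make the new whatsnew entry concrete, here is a minimal usage sketch (file names are illustrative, and it assumes a pandas build that already contains this change):

import pandas as pd

df = pd.DataFrame({"name": ["Raphael", "Donatello"],
                   "mask": ["red", "purple"]})

# A plain string still selects the compression method, as before.
df.to_csv("out.csv.gz", compression="gzip")

# New: a dict whose 'method' key is the compression method; any other keys
# are forwarded as extra options (currently only used for 'zip').
df.to_csv("out.zip", index=False,
          compression={"method": "zip", "archive_name": "out.csv"})
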
40 changes: 25 additions & 15 deletions pandas/core/generic.py
@@ -6,7 +6,7 @@
import operator
import pickle
from textwrap import dedent
from typing import Callable, FrozenSet, List, Set
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Union
import warnings
import weakref

@@ -2942,10 +2942,11 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,

def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
columns=None, header=True, index=True, index_label=None,
mode='w', encoding=None, compression='infer', quoting=None,
quotechar='"', line_terminator=None, chunksize=None,
tupleize_cols=None, date_format=None, doublequote=True,
escapechar=None, decimal='.'):
mode='w', encoding=None,
compression: Optional[Union[str, Dict[str, Any]]] = 'infer',
quoting=None, quotechar='"', line_terminator=None,
chunksize=None, tupleize_cols=None, date_format=None,
doublequote=True, escapechar=None, decimal='.'):
r"""
Write object to a comma-separated values (csv) file.

@@ -2992,16 +2993,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'.
compression : str, default 'infer'
Compression mode among the following possible values: {'infer',
'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
is path-like, then detect compression from the following
extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
compression).

.. versionchanged:: 0.24.0

'infer' option added and set to default.
compression : str or dict, default 'infer'
If str, represents compression mode. If dict, value at 'method' is
the compression mode. Compression mode may be any of the following
possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
compression mode is 'infer' and `path_or_buf` is path-like, then
detect compression mode from the following extensions: '.gz',
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
and mode is 'zip' or inferred as 'zip', other entries passed as
additional compression options.

.. versionchanged:: 0.25.0

May now be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.

quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
@@ -3054,6 +3060,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
... 'weapon': ['sai', 'bo staff']})
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

# create 'out.zip' containing 'out.csv'
>>> compression_opts = dict(method='zip', archive_name='out.csv')
Comment (Contributor):
can you put the comment before the example (and put a blank line between cases); also might need to have a DOCTEST: SKIP here

Comment (Contributor):
can you do this

>>> df.to_csv('out.zip', index=False, compression=compression_opts)
"""

df = self if isinstance(self, ABCDataFrame) else self.to_frame()
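
Following up on the two review comments in the docstring example above, a possible reworked doctest might look like this (the "# doctest: +SKIP" directive is the standard way to skip a doctest example; whether that exact spelling was adopted here is an assumption):

Create 'out.zip' containing 'out.csv':

>>> compression_opts = dict(method='zip',
...                         archive_name='out.csv')  # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
...           compression=compression_opts)  # doctest: +SKIP
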
75 changes: 63 additions & 12 deletions pandas/io/common.py
@@ -10,6 +10,7 @@
import mmap
import os
import pathlib
from typing import Any, Dict, Optional, Tuple, Union
from urllib.error import URLError # noqa
from urllib.parse import ( # noqa
urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params,
@@ -219,13 +220,46 @@ def file_path_to_url(path):
}


def _get_compression_method(compression: Optional[Union[str, Dict[str, Any]]]):
Comment (Member):
I think the Dict can be typed as Dict[str, str] here. Would also be nice to type the return, though that may change depending on the comment below.

Comment (Contributor):
can you do this

"""
Simplifies a compression argument to a compression method string and
a dict containing additional arguments.

Parameters
----------
compression : str or dict
If string, specifies the compression method. If dict, value at key
'method' specifies compression method.

Returns
-------
tuple of ({compression method}, any
{compression arguments}, dict)

Comment (Contributor):
any -> Optional[str]
dict -> Dict[str, str]

Raises
------
ValueError on dict missing 'method' key
"""
# Handle dict
if isinstance(compression, dict):
compression_args = compression.copy()
try:
compression = compression['method']
compression_args.pop('method')
except KeyError:
raise ValueError("If dict, compression "
"must have key 'method'")
else:
compression_args = {}
return compression, compression_args
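
For readers skimming the diff, a short sketch of what this helper returns for the two accepted argument shapes (behaviour read off the code above; the import path assumes this private helper stays in pandas.io.common):

from pandas.io.common import _get_compression_method

_get_compression_method("gzip")
# -> ('gzip', {})

_get_compression_method({"method": "zip", "archive_name": "out.csv"})
# -> ('zip', {'archive_name': 'out.csv'})

_get_compression_method({"archive_name": "out.csv"})
# raises ValueError: If dict, compression must have key 'method'
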


def _infer_compression(filepath_or_buffer, compression):
"""
Get the compression method for filepath_or_buffer. If compression='infer',
the inferred compression method is returned. Otherwise, the input
compression method is returned unchanged, unless it's invalid, in which
case an error is raised.

Parameters
----------
filepath_or_buffer :
@@ -234,12 +268,10 @@ def _infer_compression(filepath_or_buffer, compression):
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).

Returns
-------
string or None :
compression method

Raises
------
ValueError on invalid compression specified
@@ -273,7 +305,8 @@ def _infer_compression(filepath_or_buffer, compression):
raise ValueError(msg)


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
def _get_handle(path_or_buf, mode, encoding=None,
compression: Optional[Union[str, Dict[str, Any]]] = None,
memory_map=False, is_text=True):
"""
Get file handle for given path/buffer and mode.
@@ -285,10 +318,21 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
mode : str
mode to open path_or_buf with
encoding : str or None
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
If 'infer' and `filepath_or_buffer` is path-like, then detect
compression from the following extensions: '.gz', '.bz2', '.zip',
or '.xz' (otherwise no compression).
compression : str or dict, default None
If string, specifies compression mode. If dict, value at key 'method'
specifies compression mode. Compression mode must be one of {'infer',
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
and `filepath_or_buffer` is path-like, then detect compression from
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
no compression). If dict and compression mode is 'zip' or inferred as
'zip', other entries passed as additional compression options.

.. versionchanged:: 0.25.0

May now be a dict with key 'method' as compression mode
and other keys as compression options if compression
mode is 'zip'.

memory_map : boolean, default False
See parsers._parser_params for more information.
is_text : boolean, default True
@@ -304,7 +348,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
"""
try:
from s3fs import S3File
need_text_wrapping = (BytesIO, S3File)
need_text_wrapping = (BytesIO, S3File) # type: Tuple
Comment (Member):
Was this throwing a typing error? Think MyPy should be able to infer here.

Comment (Contributor Author, drew-heenan, Apr 15, 2019):
@WillAyd Yeah, it was. Couldn't tell why, but MyPy couldn't infer when I added types to the function definition. It also occurred on from pandas import Index in csvs.py.

Comment (Member):
What error was it giving you?

Comment (Contributor Author):
It was giving pandas/io/common.py:367: error: Incompatible types in assignment (expression has type "Tuple[Type[BytesIO]]", variable has type "Tuple[Type[BytesIO], Any]"), but only if any of the function's arguments have type annotations.

except ImportError:
need_text_wrapping = (BytesIO,)
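
For context on the typing thread above: by default mypy only checks function bodies that carry annotations, and it pins a variable to the type of its first assignment, so the one-element fallback tuple conflicts with the two-element tuple inferred in the try branch; a bare Tuple type comment widens the type. A hypothetical minimal reproduction (not part of the PR):

from typing import Tuple

def demo(flag: bool) -> None:   # the annotation makes mypy check this body
    if flag:
        pair = (1, "a")         # inferred as Tuple[int, str]
    else:
        pair = (1,)             # mypy: Incompatible types in assignment
    print(pair)

def fixed(flag: bool) -> None:
    if flag:
        # widening with a bare Tuple type comment, as the diff does
        pair = (1, "a")  # type: Tuple
    else:
        pair = (1,)
    print(pair)
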

@@ -315,6 +359,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
path_or_buf = _stringify_path(path_or_buf)
is_path = isinstance(path_or_buf, str)

compression, compression_args = _get_compression_method(compression)
if is_path:
compression = _infer_compression(path_or_buf, compression)

@@ -336,7 +381,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,

# ZIP Compression
elif compression == 'zip':
zf = BytesZipFile(path_or_buf, mode)
zf = BytesZipFile(path_or_buf, mode, **compression_args)
# Ensure the container is closed as well.
handles.append(zf)
if zf.mode == 'w':
@@ -406,13 +451,19 @@ class BytesZipFile(zipfile.ZipFile, BytesIO):  # type: ignore
bytes strings into a member of the archive.
"""
# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED,
archive_name: Optional[str] = None,
**kwargs):
if mode in ['wb', 'rb']:
mode = mode.replace('b', '')
self.archive_name = archive_name
super().__init__(file, mode, compression, **kwargs)

def write(self, data):
super().writestr(self.filename, data)
archive_name = self.filename
if self.archive_name is not None:
archive_name = self.archive_name
super().writestr(archive_name, data)

@property
def closed(self):
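
A small end-to-end check of the archive_name plumbing added above, mirroring the new test further down; this is a sketch with illustrative file names, assuming a pandas build that contains this PR:

import os
import zipfile

import pandas as pd

df = pd.DataFrame({"ABC": [1]})
df.to_csv("archive.zip",
          compression={"method": "zip", "archive_name": "data.csv"})

with zipfile.ZipFile("archive.zip") as zf:
    # the single member is named by archive_name, not by the zip file's name
    assert zf.namelist() == ["data.csv"]

os.remove("archive.zip")
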
28 changes: 19 additions & 9 deletions pandas/io/formats/csvs.py
@@ -5,6 +5,7 @@
import csv as csvlib
from io import StringIO
import os
from typing import Any, Dict, Optional, Union
import warnings
from zipfile import ZipFile

@@ -17,26 +18,32 @@
from pandas.core.dtypes.missing import notna

from pandas.io.common import (
UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)
UnicodeWriter, _get_compression_method, _get_handle, _infer_compression,
get_filepath_or_buffer)


class CSVFormatter:

def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
float_format=None, cols=None, header=True, index=True,
index_label=None, mode='w', nanRep=None, encoding=None,
compression='infer', quoting=None, line_terminator='\n',
chunksize=None, tupleize_cols=False, quotechar='"',
date_format=None, doublequote=True, escapechar=None,
decimal='.'):
compression: Optional[Union[str, Dict[str, Any]]] = 'infer',
quoting=None, line_terminator='\n', chunksize=None,
tupleize_cols=False, quotechar='"', date_format=None,
doublequote=True, escapechar=None, decimal='.'):

self.obj = obj

if path_or_buf is None:
path_or_buf = StringIO()

# Extract compression mode as given, if dict
compression, self.compression_args = _get_compression_method(
compression)

self.path_or_buf, _, _, _ = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression, mode=mode
path_or_buf, encoding=encoding,
compression=compression, mode=mode
)
self.sep = sep
self.na_rep = na_rep
@@ -114,7 +121,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.data_index = obj.index
if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
date_format is not None):
from pandas import Index
from pandas import Index # type: ignore
Comment (Member):
What error was this giving you?

Comment (Contributor Author, drew-heenan, Apr 17, 2019):
MyPy was giving pandas/io/formats/csvs.py:125: error: Module 'pandas' has no attribute 'Index'. It appears to only occur if there are type annotations on any CSVFormatter parameters.

Comment (Member):
Make sure you try again on master - I think a separate PR should have resolved this already.

Comment (Contributor Author):
The same error appears on master if any type annotations are added.

Comment (Member):
@drew-heenan can you remove this and push as a new commit? Again, thought we resolved this in a separate PR, so would like to validate it's not a mypy versioning thing between your local environment and what we have on CI.

Comment (Contributor Author):
@WillAyd Just did that - the error still appears.

Comment (Member):
Hmm OK, thanks for confirming. @ryankarlos not sure if you have any insight - thought this would be resolved by #26019.

@drew-heenan this isn't a blocker, so OK to add back in the ignore I think; can review separately from this.

Comment (Contributor Author):
@WillAyd Got it, thanks for checking!

self.data_index = Index([x.strftime(date_format) if notna(x) else
'' for x in self.data_index])

@@ -149,7 +156,8 @@ def save(self):
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
compression=dict(self.compression_args,
method=self.compression))
close = True

try:
@@ -173,9 +181,11 @@ def save(self):
if hasattr(self.path_or_buf, 'write'):
self.path_or_buf.write(buf)
else:
compression = dict(self.compression_args,
method=self.compression)
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=self.encoding,
compression=self.compression)
compression=compression)
f.write(buf)
close = True
if close:
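
The save() changes above rely on dict(mapping, key=value) producing a merged copy, so the extracted method string and the remaining options travel back to _get_handle as a single dict. A tiny, pandas-independent illustration:

compression = "zip"
compression_args = {"archive_name": "out.csv"}

# same pattern as dict(self.compression_args, method=self.compression)
repacked = dict(compression_args, method=compression)
print(repacked)
# {'archive_name': 'out.csv', 'method': 'zip'}
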
36 changes: 36 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
@@ -534,3 +534,39 @@ def test_to_csv_compression(self, compression_only,
result = pd.read_csv(path, index_col=0,
compression=read_compression)
tm.assert_frame_equal(result, df)

def test_to_csv_compression_dict(self, compression_only):
# GH 26023
method = compression_only
df = DataFrame({"ABC": [1]})
filename = "to_csv_compress_as_dict."
filename += "gz" if method == "gzip" else method
with tm.ensure_clean(filename) as path:
df.to_csv(path, compression={"method": method})
read_df = pd.read_csv(path, index_col=0)
tm.assert_frame_equal(read_df, df)

def test_to_csv_compression_dict_no_method(self):
Comment (Member):
Can you append _raises to this test name?

# GH 26023
df = DataFrame({"ABC": [1]})
compression = {"some_option": True}
with tm.ensure_clean("out.zip") as path, pytest.raises(ValueError):
Comment (Member):
Can you match on the expected message with pytest.raises here?

Comment (Contributor):
do the raises as an inner context manager, e.g.

with tm.ensure_clean(...):
    with pytest.raises(..., msg=msg):

df.to_csv(path, compression=compression)

@pytest.mark.parametrize("compression", ["zip", "infer"])
@pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv",
"test_to_csv.zip"])
def test_to_csv_zip_arguments(self, compression, archive_name):
# GH 26023
from zipfile import ZipFile

df = DataFrame({"ABC": [1]})
with tm.ensure_clean("to_csv_archive_name.zip") as path:
df.to_csv(path, compression={"method": compression,
"archive_name": archive_name})
zp = ZipFile(path)
expected_arcname = path if archive_name is None else archive_name
expected_arcname = os.path.basename(expected_arcname)
assert len(zp.filelist) == 1
archived_file = os.path.basename(zp.filelist[0].filename)
assert archived_file == expected_arcname
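
A sketch of the test shape the reviewers asked for above: tm.ensure_clean as the outer context manager and pytest.raises as the inner one, matching on the error message (pytest spells the message check as a match= regular expression, and the _raises suffix follows the naming request). It assumes the imports already present in test_to_csv.py:

def test_to_csv_compression_dict_no_method_raises(self):
    # GH 26023
    df = DataFrame({"ABC": [1]})
    compression = {"some_option": True}
    msg = "must have key 'method'"
    with tm.ensure_clean("out.zip") as path:
        with pytest.raises(ValueError, match=msg):
            df.to_csv(path, compression=compression)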