Skip to content

Commit 892233e

Browse files
drew-heenan authored and proost committed
ENH: Allow compression in NDFrame.to_csv to be a dict with optional arguments (pandas-dev#26023) (pandas-dev#26024)
1 parent e43996f commit 892233e

File tree

5 files changed

+200
-58
lines changed

5 files changed

+200
-58
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@ ExtensionArray
206206
Other
207207
^^^^^
208208
- Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`)
209+
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept a dict as the ``compression`` argument: the value at key ``'method'`` is the compression method, and the remaining entries are passed as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
209210

210211

211212
.. _whatsnew_1000.contributors:

pandas/core/generic.py

+59-32
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,17 @@
77
import pickle
88
import re
99
from textwrap import dedent
10-
from typing import Callable, Dict, FrozenSet, List, Optional, Set
10+
from typing import (
11+
Callable,
12+
Dict,
13+
FrozenSet,
14+
Hashable,
15+
List,
16+
Optional,
17+
Sequence,
18+
Set,
19+
Union,
20+
)
1121
import warnings
1222
import weakref
1323

@@ -50,7 +60,7 @@
5060
from pandas.core.dtypes.missing import isna, notna
5161

5262
import pandas as pd
53-
from pandas._typing import Dtype
63+
from pandas._typing import Dtype, FilePathOrBuffer
5464
from pandas.core import missing, nanops
5565
import pandas.core.algorithms as algos
5666
from pandas.core.base import PandasObject, SelectionMixin
@@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit):
122132
return result
123133

124134

135+
bool_t = bool # Need alias because NDFrame has def bool:
136+
137+
125138
class NDFrame(PandasObject, SelectionMixin):
126139
"""
127140
N-dimensional analogue of DataFrame. Store multi-dimensional in a
@@ -3051,26 +3064,26 @@ def to_latex(
30513064

30523065
def to_csv(
30533066
self,
3054-
path_or_buf=None,
3055-
sep=",",
3056-
na_rep="",
3057-
float_format=None,
3058-
columns=None,
3059-
header=True,
3060-
index=True,
3061-
index_label=None,
3062-
mode="w",
3063-
encoding=None,
3064-
compression="infer",
3065-
quoting=None,
3066-
quotechar='"',
3067-
line_terminator=None,
3068-
chunksize=None,
3069-
date_format=None,
3070-
doublequote=True,
3071-
escapechar=None,
3072-
decimal=".",
3073-
):
3067+
path_or_buf: Optional[FilePathOrBuffer] = None,
3068+
sep: str = ",",
3069+
na_rep: str = "",
3070+
float_format: Optional[str] = None,
3071+
columns: Optional[Sequence[Hashable]] = None,
3072+
header: Union[bool_t, List[str]] = True,
3073+
index: bool_t = True,
3074+
index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None,
3075+
mode: str = "w",
3076+
encoding: Optional[str] = None,
3077+
compression: Optional[Union[str, Dict[str, str]]] = "infer",
3078+
quoting: Optional[int] = None,
3079+
quotechar: str = '"',
3080+
line_terminator: Optional[str] = None,
3081+
chunksize: Optional[int] = None,
3082+
date_format: Optional[str] = None,
3083+
doublequote: bool_t = True,
3084+
escapechar: Optional[str] = None,
3085+
decimal: Optional[str] = ".",
3086+
) -> Optional[str]:
30743087
r"""
30753088
Write object to a comma-separated values (csv) file.
30763089
@@ -3117,16 +3130,21 @@ def to_csv(
31173130
encoding : str, optional
31183131
A string representing the encoding to use in the output file,
31193132
defaults to 'utf-8'.
3120-
compression : str, default 'infer'
3121-
Compression mode among the following possible values: {'infer',
3122-
'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
3123-
is path-like, then detect compression from the following
3124-
extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
3125-
compression).
3126-
3127-
.. versionchanged:: 0.24.0
3128-
3129-
'infer' option added and set to default.
3133+
compression : str or dict, default 'infer'
3134+
If str, represents compression mode. If dict, value at 'method' is
3135+
the compression mode. Compression mode may be any of the following
3136+
possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
3137+
compression mode is 'infer' and `path_or_buf` is path-like, then
3138+
detect compression mode from the following extensions: '.gz',
3139+
'.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
3140+
and mode is 'zip' or inferred as 'zip', other entries passed as
3141+
additional compression options.
3142+
3143+
.. versionchanged:: 1.0.0
3144+
3145+
May now be a dict with key 'method' as compression mode
3146+
and other entries as additional compression options if
3147+
compression mode is 'zip'.
31303148
31313149
quoting : optional constant from csv module
31323150
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
@@ -3171,6 +3189,13 @@ def to_csv(
31713189
... 'weapon': ['sai', 'bo staff']})
31723190
>>> df.to_csv(index=False)
31733191
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
3192+
3193+
# create 'out.zip' containing 'out.csv'
3194+
>>> compression_opts = dict(method='zip',
3195+
... archive_name='out.csv') # doctest: +SKIP
3196+
3197+
>>> df.to_csv('out.zip', index=False,
3198+
... compression=compression_opts) # doctest: +SKIP
31743199
"""
31753200

31763201
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
@@ -3204,6 +3229,8 @@ def to_csv(
32043229
if path_or_buf is None:
32053230
return formatter.path_or_buf.getvalue()
32063231

3232+
return None
3233+
32073234
# ----------------------------------------------------------------------
32083235
# Fancy Indexing
32093236

pandas/io/common.py

+91-24
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,19 @@
99
import mmap
1010
import os
1111
import pathlib
12-
from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type
12+
from typing import (
13+
IO,
14+
Any,
15+
AnyStr,
16+
BinaryIO,
17+
Dict,
18+
List,
19+
Optional,
20+
TextIO,
21+
Tuple,
22+
Type,
23+
Union,
24+
)
1325
from urllib.error import URLError # noqa
1426
from urllib.parse import ( # noqa
1527
urlencode,
@@ -255,6 +267,40 @@ def file_path_to_url(path: str) -> str:
255267
_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
256268

257269

270+
def _get_compression_method(
271+
compression: Optional[Union[str, Dict[str, str]]]
272+
) -> Tuple[Optional[str], Dict[str, str]]:
273+
"""
274+
Simplifies a compression argument to a compression method string and
275+
a dict containing additional arguments.
276+
277+
Parameters
278+
----------
279+
compression : str or dict
280+
If string, specifies the compression method. If dict, value at key
281+
'method' specifies compression method.
282+
283+
Returns
284+
-------
285+
tuple of ({compression method}, Optional[str]
286+
{compression arguments}, Dict[str, str])
287+
288+
Raises
289+
------
290+
ValueError on dict missing 'method' key
291+
"""
292+
# Handle dict
293+
if isinstance(compression, dict):
294+
compression_args = compression.copy()
295+
try:
296+
compression = compression_args.pop("method")
297+
except KeyError:
298+
raise ValueError("If dict, compression must have key 'method'")
299+
else:
300+
compression_args = {}
301+
return compression, compression_args
302+
303+
258304
def _infer_compression(
259305
filepath_or_buffer: FilePathOrBuffer, compression: Optional[str]
260306
) -> Optional[str]:
@@ -266,21 +312,20 @@ def _infer_compression(
266312
267313
Parameters
268314
----------
269-
filepath_or_buffer :
270-
a path (str) or buffer
315+
filepath_or_buffer : str or file handle
316+
File path or object.
271317
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
272318
If 'infer' and `filepath_or_buffer` is path-like, then detect
273319
compression from the following extensions: '.gz', '.bz2', '.zip',
274320
or '.xz' (otherwise no compression).
275321
276322
Returns
277323
-------
278-
string or None :
279-
compression method
324+
string or None
280325
281326
Raises
282327
------
283-
ValueError on invalid compression specified
328+
ValueError on invalid compression specified.
284329
"""
285330

286331
# No compression has been explicitly specified
@@ -312,32 +357,49 @@ def _infer_compression(
312357

313358

314359
def _get_handle(
315-
path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
360+
path_or_buf,
361+
mode: str,
362+
encoding=None,
363+
compression: Optional[Union[str, Dict[str, Any]]] = None,
364+
memory_map: bool = False,
365+
is_text: bool = True,
316366
):
317367
"""
318368
Get file handle for given path/buffer and mode.
319369
320370
Parameters
321371
----------
322-
path_or_buf :
323-
a path (str) or buffer
372+
path_or_buf : str or file handle
373+
File path or object.
324374
mode : str
325-
mode to open path_or_buf with
375+
Mode to open path_or_buf with.
326376
encoding : str or None
327-
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
328-
If 'infer' and `filepath_or_buffer` is path-like, then detect
329-
compression from the following extensions: '.gz', '.bz2', '.zip',
330-
or '.xz' (otherwise no compression).
377+
Encoding to use.
378+
compression : str or dict, default None
379+
If string, specifies compression mode. If dict, value at key 'method'
380+
specifies compression mode. Compression mode must be one of {'infer',
381+
'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
382+
and `path_or_buf` is path-like, then detect compression from
383+
the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
384+
no compression). If dict and compression mode is 'zip' or inferred as
385+
'zip', other entries passed as additional compression options.
386+
387+
.. versionchanged:: 1.0.0
388+
389+
May now be a dict with key 'method' as compression mode
390+
and other keys as compression options if compression
391+
mode is 'zip'.
392+
331393
memory_map : boolean, default False
332394
See parsers._parser_params for more information.
333395
is_text : boolean, default True
334396
Whether file/buffer is in text format (csv, json, etc.), or in binary
335-
mode (pickle, etc.)
397+
mode (pickle, etc.).
336398
337399
Returns
338400
-------
339401
f : file-like
340-
A file-like object
402+
A file-like object.
341403
handles : list of file-like objects
342404
A list of file-like objects that were opened in this function.
343405
"""
@@ -346,15 +408,16 @@ def _get_handle(
346408

347409
need_text_wrapping = (BufferedIOBase, S3File)
348410
except ImportError:
349-
need_text_wrapping = BufferedIOBase
411+
need_text_wrapping = BufferedIOBase # type: ignore
350412

351-
handles = list()
413+
handles = list() # type: List[IO]
352414
f = path_or_buf
353415

354416
# Convert pathlib.Path/py.path.local or string
355417
path_or_buf = _stringify_path(path_or_buf)
356418
is_path = isinstance(path_or_buf, str)
357419

420+
compression, compression_args = _get_compression_method(compression)
358421
if is_path:
359422
compression = _infer_compression(path_or_buf, compression)
360423

@@ -376,7 +439,7 @@ def _get_handle(
376439

377440
# ZIP Compression
378441
elif compression == "zip":
379-
zf = BytesZipFile(path_or_buf, mode)
442+
zf = BytesZipFile(path_or_buf, mode, **compression_args)
380443
# Ensure the container is closed as well.
381444
handles.append(zf)
382445
if zf.mode == "w":
@@ -429,9 +492,9 @@ def _get_handle(
429492

430493
if memory_map and hasattr(f, "fileno"):
431494
try:
432-
g = MMapWrapper(f)
495+
wrapped = MMapWrapper(f)
433496
f.close()
434-
f = g
497+
f = wrapped
435498
except Exception:
436499
# we catch any errors that may have occurred
437500
# because that is consistent with the lower-level
@@ -456,15 +519,19 @@ def __init__(
456519
self,
457520
file: FilePathOrBuffer,
458521
mode: str,
459-
compression: int = zipfile.ZIP_DEFLATED,
522+
archive_name: Optional[str] = None,
460523
**kwargs
461524
):
462525
if mode in ["wb", "rb"]:
463526
mode = mode.replace("b", "")
464-
super().__init__(file, mode, compression, **kwargs)
527+
self.archive_name = archive_name
528+
super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs)
465529

466530
def write(self, data):
467-
super().writestr(self.filename, data)
531+
archive_name = self.filename
532+
if self.archive_name is not None:
533+
archive_name = self.archive_name
534+
super().writestr(archive_name, data)
468535

469536
@property
470537
def closed(self):

0 commit comments

Comments
 (0)