From 4e73dc453c94843f9c448e83d19257309285bc29 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 7 Apr 2019 21:08:05 -0400 Subject: [PATCH 01/28] ENH/BUG: Add arcname to to_csv for ZIP compressed csv filename (#26023) --- pandas/core/generic.py | 8 ++++++-- pandas/io/common.py | 10 ++++++---- pandas/io/formats/csvs.py | 7 +++++-- pandas/tests/io/formats/test_to_csv.py | 16 ++++++++++++++++ 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c12e9e7e04af6..79f23acaffac1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2921,7 +2921,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal='.'): + escapechar=None, decimal='.', arcname=None): r""" Write object to a comma-separated values (csv) file. @@ -3011,6 +3011,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, decimal : str, default '.' Character recognized as decimal separator. E.g. use ',' for European data. + arcname : str, default None + Name of file within a ZIP archive. Only used when `path_or_buf` is + a path and `compression` is set to or inferred as 'zip'. Returns ------- @@ -3053,7 +3056,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, - escapechar=escapechar, decimal=decimal) + escapechar=escapechar, decimal=decimal, + arcname=arcname) formatter.save() if path_or_buf is None: diff --git a/pandas/io/common.py b/pandas/io/common.py index bcbfa7930311e..5cedec4db9763 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -288,7 +288,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False, is_text=True): + memory_map=False, is_text=True, arcname=None): """ Get file handle for given path/buffer and mode. @@ -350,7 +350,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': - zf = BytesZipFile(path_or_buf, mode) + zf = BytesZipFile(path_or_buf, mode, arcname=arcname) # Ensure the container is closed as well. handles.append(zf) if zf.mode == 'w': @@ -420,13 +420,15 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore bytes strings into a member of the archive. """ # GH 17778 - def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, arcname=None, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') + self.arcname = arcname super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) def write(self, data): - super(BytesZipFile, self).writestr(self.filename, data) + arcname = self.filename if self.arcname is None else self.arcname + super(BytesZipFile, self).writestr(arcname, data) @property def closed(self): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 37f1372366545..bfcdf16275817 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,7 +29,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', compression='infer', quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, - decimal='.'): + decimal='.', arcname=None): self.obj = obj @@ -123,6 +123,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if not index: self.nlevels = 0 + self.arcname = arcname + def save(self): """ Create the writer & save @@ -176,7 +178,8 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression) + compression=self.compression, + arcname=self.arcname) f.write(buf) close = True if close: diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index fbd71dfa8262b..23e32c0f40083 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -537,3 +537,19 @@ def test_to_csv_compression(self, compression_only, result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + @pytest.mark.parametrize("arcname", [None, "test_to_csv.csv", + "test_to_csv.zip"]) + def test_to_csv_zip_arcname(self, compression, arcname): + # GH 26023 + from zipfile import ZipFile + + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("to_csv_arcname.zip") as path: + df.to_csv(path, compression=compression, + arcname=arcname) + zp = ZipFile(path) + expected_arcname = path if arcname is None else arcname + assert len(zp.filelist) == 1 + assert zp.filelist[0].filename == expected_arcname From ab7620dfd27505b579e855b9e4a1dfb30ac4d66f Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 7 Apr 2019 21:33:25 -0400 Subject: [PATCH 02/28] DOC: Updated docs for arcname in NDFrame.to_csv (#26023) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fe047b4a141ef..186faa1690bdb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,6 +36,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) +- :meth:`NDFrame.to_csv` now supports the ``arcname`` argument to specify the written CSV file name when inside a ZIP archive. Default ``arcname=None`` maintains previous behavior where the CSV name matches given ZIP path ``path_or_buf`` (:issue:`26023`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 79f23acaffac1..c27f091538d0d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3012,8 +3012,11 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, Character recognized as decimal separator. E.g. use ',' for European data. arcname : str, default None - Name of file within a ZIP archive. Only used when `path_or_buf` is - a path and `compression` is set to or inferred as 'zip'. + Name of CSV-formatted file within a ZIP archive. Only used when + `path_or_buf` is a path and `compression` is set to or inferred + as 'zip'. Uses `path_or_buf` if None. + + .. versionadded:: 0.25.0 Returns ------- From 2e782f9ac6c60af97dff037d6c569902ce351451 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 7 Apr 2019 21:45:54 -0400 Subject: [PATCH 03/28] conform to line length limit --- pandas/io/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 5cedec4db9763..3e62d4d1c924c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -420,7 +420,8 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore bytes strings into a member of the archive. """ # GH 17778 - def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, arcname=None, **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, + arcname=None, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') self.arcname = arcname From 83e88341f8685932973d6ca9174d91677edbfdb5 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 7 Apr 2019 22:29:57 -0400 Subject: [PATCH 04/28] Fixed test_to_csv_zip_arcname for Windows paths --- pandas/tests/io/formats/test_to_csv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 23e32c0f40083..b18cef7b4dfb9 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -551,5 +551,7 @@ def test_to_csv_zip_arcname(self, compression, arcname): arcname=arcname) zp = ZipFile(path) expected_arcname = path if arcname is None else arcname + expected_arcname = os.path.basename(expected_arcname) assert len(zp.filelist) == 1 - assert zp.filelist[0].filename == expected_arcname + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname From b41be549e0935e62f127ba698ab309af57acce2c Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Tue, 9 Apr 2019 03:07:03 -0400 Subject: [PATCH 05/28] to_csv compression may now be dict with possible keys 'method' and 'arcname' --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/generic.py | 36 ++++++------- pandas/io/common.py | 71 +++++++++++++++++--------- pandas/io/formats/csvs.py | 23 ++++++--- pandas/tests/io/formats/test_to_csv.py | 15 +++++- 5 files changed, 95 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 186faa1690bdb..e58d3c3d36721 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,7 +36,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :meth:`NDFrame.to_csv` now supports the ``arcname`` argument to specify the written CSV file name when inside a ZIP archive. Default ``arcname=None`` maintains previous behavior where the CSV name matches given ZIP path ``path_or_buf`` (:issue:`26023`) +- :meth:`NDFrame.to_csv` now supports dicts as ``compression`` argument with key ``'method'`` being the compression method and optional key ``'arcname'`` specifying the archived CSV file name when the compression method is ``'zip'``. If key ``'arcname'`` unspecified or ``compression='zip'``, maintains previous behavior. (:issue:`26023`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d45240505144f..a3192ee877c83 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2921,7 +2921,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal='.', arcname=None): + escapechar=None, decimal='.'): r""" Write object to a comma-separated values (csv) file. @@ -2968,16 +2968,21 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, encoding : str, optional A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. - compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', optional value at 'arcname' + specifies name of file within ZIP archive, assuming equal to + `path_or_buf` if not specified or None. + + .. versionchanged:: 0.25.0 + + May now be a dict with key 'method' as compression mode + and 'arcname' as CSV file name if mode is 'zip' quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` @@ -3011,12 +3016,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, decimal : str, default '.' Character recognized as decimal separator. E.g. use ',' for European data. - arcname : str, default None - Name of CSV-formatted file within a ZIP archive. Only used when - `path_or_buf` is a path and `compression` is set to or inferred - as 'zip'. Uses `path_or_buf` if None. - - .. versionadded:: 0.25.0 Returns ------- @@ -3059,8 +3058,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, - escapechar=escapechar, decimal=decimal, - arcname=arcname) + escapechar=escapechar, decimal=decimal) formatter.save() if path_or_buf is None: diff --git a/pandas/io/common.py b/pandas/io/common.py index 3e62d4d1c924c..f5268dbc34f6f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -235,8 +235,8 @@ def file_path_to_url(path): def _infer_compression(filepath_or_buffer, compression): """ - Get the compression method for filepath_or_buffer. If compression='infer', - the inferred compression method is returned. Otherwise, the input + Get the compression method for filepath_or_buffer. If compression mode is + 'infer', the inferred compression method is returned. Otherwise, the input compression method is returned unchanged, unless it's invalid, in which case an error is raised. @@ -244,10 +244,17 @@ def _infer_compression(filepath_or_buffer, compression): ---------- filepath_or_buffer : a path (str) or buffer - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} or dict + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. If compression mode is 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression). + + .. versionchanged 0.25.0 + + May now be a dict with required key 'method' specifying compression + mode Returns ------- @@ -259,6 +266,14 @@ def _infer_compression(filepath_or_buffer, compression): ValueError on invalid compression specified """ + # Handle compression method as dict + if isinstance(compression, dict): + try: + compression = compression['method'] + except KeyError: + raise ValueError("Compression dict must have key " + "'method'") + # No compression has been explicitly specified if compression is None: return None @@ -288,7 +303,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False, is_text=True, arcname=None): + memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -299,10 +314,21 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, mode : str mode to open path_or_buf with encoding : str or None - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', optional value at key 'arcname' specifies the name of the file + within ZIP archive at `path_or_buf`. + + .. versionchanged:: 0.25.0 + + May now be a dict with key 'method' as compression mode + and 'arcname' as CSV file name if mode is 'zip' + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True @@ -329,27 +355,31 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + compression_method = None if is_path: - compression = _infer_compression(path_or_buf, compression) + compression_method = _infer_compression(path_or_buf, compression) - if compression: + if compression_method: # GZ Compression - if compression == 'gzip': + if compression_method == 'gzip': if is_path: f = gzip.open(path_or_buf, mode) else: f = gzip.GzipFile(fileobj=path_or_buf) # BZ Compression - elif compression == 'bz2': + elif compression_method == 'bz2': if is_path: f = bz2.BZ2File(path_or_buf, mode) else: f = bz2.BZ2File(path_or_buf) # ZIP Compression - elif compression == 'zip': + elif compression_method == 'zip': + arcname = None + if isinstance(compression, dict) and 'arcname' in compression: + arcname = compression['arcname'] zf = BytesZipFile(path_or_buf, mode, arcname=arcname) # Ensure the container is closed as well. handles.append(zf) @@ -368,14 +398,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, .format(zip_names)) # XZ Compression - elif compression == 'xz': + elif compression_method == 'xz': f = lzma.LZMAFile(path_or_buf, mode) - # Unrecognized Compression - else: - msg = 'Unrecognized compression type: {}'.format(compression) - raise ValueError(msg) - handles.append(f) elif is_path: @@ -391,7 +416,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles.append(f) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression or isinstance(f, need_text_wrapping)): + if is_text and (compression_method or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding, newline='') handles.append(f) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index bfcdf16275817..300c64aaadfb1 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,15 +29,27 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', compression='infer', quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, - decimal='.', arcname=None): + decimal='.'): self.obj = obj if path_or_buf is None: path_or_buf = StringIO() + self._compression_arg = compression + compression_mode = compression + + # Extract compression mode as given, if dict + if isinstance(compression, dict): + try: + compression_mode = compression['method'] + except KeyError: + raise ValueError("If dict, compression must have key " + "'method'") + self.path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode + path_or_buf, encoding=encoding, + compression=compression_mode, mode=mode ) self.sep = sep self.na_rep = na_rep @@ -123,8 +135,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if not index: self.nlevels = 0 - self.arcname = arcname - def save(self): """ Create the writer & save @@ -152,7 +162,7 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression) + compression=self._compression_arg) close = True try: @@ -178,8 +188,7 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, - arcname=self.arcname) + compression=self._compression_arg) f.write(buf) close = True if close: diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index b18cef7b4dfb9..6781ad6ae8d40 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -538,6 +538,17 @@ def test_to_csv_compression(self, compression_only, compression=read_compression) tm.assert_frame_equal(result, df) + @pytest.mark.parametrize("method", ["gzip", "bz2", "zip", "xz"]) + def test_to_csv_compression_dict(self, method): + # GH 26023 + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." + filename += "gz" if method == "gzip" else method + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("arcname", [None, "test_to_csv.csv", "test_to_csv.zip"]) @@ -547,8 +558,8 @@ def test_to_csv_zip_arcname(self, compression, arcname): df = DataFrame({"ABC": [1]}) with tm.ensure_clean("to_csv_arcname.zip") as path: - df.to_csv(path, compression=compression, - arcname=arcname) + df.to_csv(path, compression={"method": compression, + "arcname": arcname}) zp = ZipFile(path) expected_arcname = path if arcname is None else arcname expected_arcname = os.path.basename(expected_arcname) From 60ea58c7e77f6d76e3034a4a5fdb9e0bdd223732 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Tue, 9 Apr 2019 18:55:20 -0400 Subject: [PATCH 06/28] test_to_csv_compression_dict uses compression_only fixture --- pandas/tests/io/formats/test_to_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 6781ad6ae8d40..1592363f34190 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -538,9 +538,9 @@ def test_to_csv_compression(self, compression_only, compression=read_compression) tm.assert_frame_equal(result, df) - @pytest.mark.parametrize("method", ["gzip", "bz2", "zip", "xz"]) - def test_to_csv_compression_dict(self, method): + def test_to_csv_compression_dict(self, compression_only): # GH 26023 + method = compression_only df = DataFrame({"ABC": [1]}) filename = "to_csv_compress_as_dict." filename += "gz" if method == "gzip" else method From 8ba90821163a5251083ee8c1c27ab25deaab53e7 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Wed, 10 Apr 2019 14:43:06 -0400 Subject: [PATCH 07/28] delegate dict handling to _get_compression_method, type annotations --- pandas/io/common.py | 74 ++++++++++++++++++++++++++++----------- pandas/io/formats/csvs.py | 32 ++++++++--------- 2 files changed, 69 insertions(+), 37 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f5268dbc34f6f..4de1beafb1913 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -15,6 +15,7 @@ uses_relative) from urllib.request import pathname2url, urlopen import zipfile +from typing import Dict import pandas.compat as compat from pandas.errors import ( # noqa @@ -233,6 +234,39 @@ def file_path_to_url(path): } +def _get_compression_method(compression: (str, Dict)): + """ + Simplifies a compression argument to a compression method string and + a dict containing additional arguments. + + Parameters + ---------- + compression : str or dict + If string, specifies the compression method. If dict, value at key + 'method' specifies compression method. + + Returns + ------- + tuple of ({compression method}, str + {compression arguments}, dict) + + Raises + ------ + ValueError on dict missing 'method' key + """ + compression_args = {} + # Handle dict + if isinstance(compression, dict): + compression_args = compression.copy() + try: + compression = compression['method'] + compression_args.pop('method') + except KeyError: + raise ValueError("If dict, compression " + "must have key 'method'") + return compression, compression_args + + def _infer_compression(filepath_or_buffer, compression): """ Get the compression method for filepath_or_buffer. If compression mode is @@ -266,13 +300,8 @@ def _infer_compression(filepath_or_buffer, compression): ValueError on invalid compression specified """ - # Handle compression method as dict - if isinstance(compression, dict): - try: - compression = compression['method'] - except KeyError: - raise ValueError("Compression dict must have key " - "'method'") + # Handle compression as dict + compression, _ = _get_compression_method(compression) # No compression has been explicitly specified if compression is None: @@ -355,31 +384,31 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) - compression_method = None + compression, compression_args = _get_compression_method(compression) if is_path: - compression_method = _infer_compression(path_or_buf, compression) + compression = _infer_compression(path_or_buf, compression) - if compression_method: + if compression: # GZ Compression - if compression_method == 'gzip': + if compression == 'gzip': if is_path: f = gzip.open(path_or_buf, mode) else: f = gzip.GzipFile(fileobj=path_or_buf) # BZ Compression - elif compression_method == 'bz2': + elif compression == 'bz2': if is_path: f = bz2.BZ2File(path_or_buf, mode) else: f = bz2.BZ2File(path_or_buf) # ZIP Compression - elif compression_method == 'zip': + elif compression == 'zip': arcname = None - if isinstance(compression, dict) and 'arcname' in compression: - arcname = compression['arcname'] + if 'arcname' in compression_args: + arcname = compression_args['arcname'] zf = BytesZipFile(path_or_buf, mode, arcname=arcname) # Ensure the container is closed as well. handles.append(zf) @@ -398,9 +427,14 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, .format(zip_names)) # XZ Compression - elif compression_method == 'xz': + elif compression == 'xz': f = lzma.LZMAFile(path_or_buf, mode) + # Unrecognized Compression + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + handles.append(f) elif is_path: @@ -416,7 +450,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles.append(f) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression_method or isinstance(f, need_text_wrapping)): + if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding, newline='') handles.append(f) @@ -446,15 +480,15 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, - arcname=None, **kwargs): + arcname: (str, zipfile.ZipInfo) = None, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') self.arcname = arcname - super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) + super().__init__(file, mode, compression, **kwargs) def write(self, data): arcname = self.filename if self.arcname is None else self.arcname - super(BytesZipFile, self).writestr(arcname, data) + super().writestr(arcname, data) @property def closed(self): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 300c64aaadfb1..ac0b73cf4b844 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -8,6 +8,7 @@ import os import warnings from zipfile import ZipFile +from typing import Dict import numpy as np @@ -18,7 +19,8 @@ from pandas.core.dtypes.missing import notna from pandas.io.common import ( - UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer) + UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer, + _get_compression_method) class CSVFormatter(object): @@ -26,30 +28,23 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, - compression='infer', quoting=None, line_terminator='\n', - chunksize=None, tupleize_cols=False, quotechar='"', - date_format=None, doublequote=True, escapechar=None, - decimal='.'): + compression: (str, Dict) = 'infer', quoting=None, + line_terminator='\n', chunksize=None, tupleize_cols=False, + quotechar='"', date_format=None, doublequote=True, + escapechar=None, decimal='.'): self.obj = obj if path_or_buf is None: path_or_buf = StringIO() - self._compression_arg = compression - compression_mode = compression - # Extract compression mode as given, if dict - if isinstance(compression, dict): - try: - compression_mode = compression['method'] - except KeyError: - raise ValueError("If dict, compression must have key " - "'method'") + compression, self.compression_args \ + = _get_compression_method(compression) self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, - compression=compression_mode, mode=mode + compression=compression, mode=mode ) self.sep = sep self.na_rep = na_rep @@ -162,7 +157,8 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=self.encoding, - compression=self._compression_arg) + compression=dict(self.compression_args, + method=self.compression)) close = True try: @@ -186,9 +182,11 @@ def save(self): if hasattr(self.path_or_buf, 'write'): self.path_or_buf.write(buf) else: + compression = dict(self.compression_args, + method=self.compression) f, handles = _get_handle(self.path_or_buf, self.mode, encoding=self.encoding, - compression=self._compression_arg) + compression=compression) f.write(buf) close = True if close: From 0a3a9fd5b7a504ac7ab8ae59aed0cf824e112c45 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Wed, 10 Apr 2019 15:18:32 -0400 Subject: [PATCH 08/28] fix import order, None type annotations --- pandas/core/generic.py | 4 +++- pandas/io/common.py | 15 ++++++++------- pandas/io/formats/csvs.py | 8 ++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a3192ee877c83..15984b2ed07c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7,6 +7,7 @@ import operator import pickle from textwrap import dedent +from typing import Dict, Union import warnings import weakref @@ -2918,7 +2919,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression='infer', quoting=None, + mode='w', encoding=None, + compression: Union[str, Dict, None] = 'infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.'): diff --git a/pandas/io/common.py b/pandas/io/common.py index 4de1beafb1913..99bf83fa68e5d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,13 +9,13 @@ import lzma import mmap import os +from typing import Dict, Union from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, uses_relative) from urllib.request import pathname2url, urlopen import zipfile -from typing import Dict import pandas.compat as compat from pandas.errors import ( # noqa @@ -234,7 +234,7 @@ def file_path_to_url(path): } -def _get_compression_method(compression: (str, Dict)): +def _get_compression_method(compression: Union[str, Dict, None]): """ Simplifies a compression argument to a compression method string and a dict containing additional arguments. @@ -247,14 +247,14 @@ def _get_compression_method(compression: (str, Dict)): Returns ------- - tuple of ({compression method}, str + tuple of ({compression method}, any {compression arguments}, dict) Raises ------ ValueError on dict missing 'method' key """ - compression_args = {} + compression_args = {} # type: Dict # Handle dict if isinstance(compression, dict): compression_args = compression.copy() @@ -331,8 +331,9 @@ def _infer_compression(filepath_or_buffer, compression): raise ValueError(msg) -def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False, is_text=True): +def _get_handle(path_or_buf, mode, encoding=None, + compression: Union[str, Dict, None] = None, memory_map=False, + is_text=True): """ Get file handle for given path/buffer and mode. @@ -480,7 +481,7 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, - arcname: (str, zipfile.ZipInfo) = None, **kwargs): + arcname: Union[str, zipfile.ZipInfo, None] = None, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') self.arcname = arcname diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index ac0b73cf4b844..62213477ab936 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -6,9 +6,9 @@ import csv as csvlib from io import StringIO import os +from typing import Dict, Union import warnings from zipfile import ZipFile -from typing import Dict import numpy as np @@ -19,8 +19,8 @@ from pandas.core.dtypes.missing import notna from pandas.io.common import ( - UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer, - _get_compression_method) + UnicodeWriter, _get_compression_method, _get_handle, _infer_compression, + get_filepath_or_buffer) class CSVFormatter(object): @@ -28,7 +28,7 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, - compression: (str, Dict) = 'infer', quoting=None, + compression: Union[str, Dict, None] = 'infer', quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): From a1cb3f7917efedc52d5d98c4e162a66c9db8a06b Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sat, 13 Apr 2019 20:18:54 -0400 Subject: [PATCH 09/28] compression args passed as kwargs, update relevant docs --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/generic.py | 3 ++- pandas/io/common.py | 37 ++++++++++----------------------- pandas/io/formats/csvs.py | 2 +- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e58d3c3d36721..2b50ffeac7859 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,7 +36,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :meth:`NDFrame.to_csv` now supports dicts as ``compression`` argument with key ``'method'`` being the compression method and optional key ``'arcname'`` specifying the archived CSV file name when the compression method is ``'zip'``. If key ``'arcname'`` unspecified or ``compression='zip'``, maintains previous behavior. (:issue:`26023`) +- :meth:`NDFrame.to_csv` now supports dicts as ``compression`` argument with key ``'method'`` being the compression method and others as kwargs of ``ByteZipFile`` when the compression method is ``'zip'``. (:issue:`26023`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 15984b2ed07c1..6bd6809689f49 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2984,7 +2984,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, .. versionchanged:: 0.25.0 May now be a dict with key 'method' as compression mode - and 'arcname' as CSV file name if mode is 'zip' + and other entries as ByteZipFile kwargs if compression mode + is 'zip' quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` diff --git a/pandas/io/common.py b/pandas/io/common.py index 99bf83fa68e5d..8ee8122f017b6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,7 @@ import lzma import mmap import os -from typing import Dict, Union +from typing import Dict, Tuple, Union from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -269,40 +269,27 @@ def _get_compression_method(compression: Union[str, Dict, None]): def _infer_compression(filepath_or_buffer, compression): """ - Get the compression method for filepath_or_buffer. If compression mode is - 'infer', the inferred compression method is returned. Otherwise, the input + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input compression method is returned unchanged, unless it's invalid, in which case an error is raised. - Parameters ---------- filepath_or_buffer : a path (str) or buffer - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} or dict - If string, specifies compression mode. If dict, value at key 'method' - specifies compression mode. If compression mode is 'infer' and - `filepath_or_buffer` is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). - - .. versionchanged 0.25.0 - - May now be a dict with required key 'method' specifying compression - mode - + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + If 'infer' and `filepath_or_buffer` is path-like, then detect + compression from the following extensions: '.gz', '.bz2', '.zip', + or '.xz' (otherwise no compression). Returns ------- string or None : compression method - Raises ------ ValueError on invalid compression specified """ - # Handle compression as dict - compression, _ = _get_compression_method(compression) - # No compression has been explicitly specified if compression is None: return None @@ -357,7 +344,8 @@ def _get_handle(path_or_buf, mode, encoding=None, .. versionchanged:: 0.25.0 May now be a dict with key 'method' as compression mode - and 'arcname' as CSV file name if mode is 'zip' + and other keys as kwargs for ByteZipFile if compression + mode is 'zip'. memory_map : boolean, default False See parsers._parser_params for more information. @@ -374,7 +362,7 @@ def _get_handle(path_or_buf, mode, encoding=None, """ try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) + need_text_wrapping = (BytesIO, S3File) # type: Tuple except ImportError: need_text_wrapping = (BytesIO,) @@ -407,10 +395,7 @@ def _get_handle(path_or_buf, mode, encoding=None, # ZIP Compression elif compression == 'zip': - arcname = None - if 'arcname' in compression_args: - arcname = compression_args['arcname'] - zf = BytesZipFile(path_or_buf, mode, arcname=arcname) + zf = BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. handles.append(zf) if zf.mode == 'w': diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 62213477ab936..2fceac352faed 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -122,7 +122,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.data_index = obj.index if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and date_format is not None): - from pandas import Index + from pandas import Index # type: ignore self.data_index = Index([x.strftime(date_format) if notna(x) else '' for x in self.data_index]) From af2a96cd40bbc8ad797d13128fa5e7447189ff51 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Mon, 15 Apr 2019 12:59:36 -0400 Subject: [PATCH 10/28] style/doc improvements, change arcname to archive_name --- pandas/core/generic.py | 13 ++++++------- pandas/io/common.py | 22 ++++++++++++---------- pandas/io/formats/csvs.py | 12 ++++++------ pandas/tests/io/formats/test_to_csv.py | 12 ++++++------ 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6bd6809689f49..67abbc2375217 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2920,10 +2920,10 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, - compression: Union[str, Dict, None] = 'infer', quoting=None, - quotechar='"', line_terminator=None, chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal='.'): + compression: Union[str, Dict, None] = 'infer', + quoting=None, quotechar='"', line_terminator=None, + chunksize=None, tupleize_cols=None, date_format=None, + doublequote=True, escapechar=None, decimal='.'): r""" Write object to a comma-separated values (csv) file. @@ -2977,9 +2977,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is 'zip' or inferred as 'zip', optional value at 'arcname' - specifies name of file within ZIP archive, assuming equal to - `path_or_buf` if not specified or None. + and mode is 'zip' or inferred as 'zip', other entries passed as + kwargs to ByteZipFile. .. versionchanged:: 0.25.0 diff --git a/pandas/io/common.py b/pandas/io/common.py index 8ee8122f017b6..aa265b2c40447 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,7 @@ import lzma import mmap import os -from typing import Dict, Tuple, Union +from typing import Any, Dict, Tuple, Union from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -254,7 +254,7 @@ def _get_compression_method(compression: Union[str, Dict, None]): ------ ValueError on dict missing 'method' key """ - compression_args = {} # type: Dict + compression_args = {} # type: Dict[str, Any] # Handle dict if isinstance(compression, dict): compression_args = compression.copy() @@ -319,8 +319,8 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, - compression: Union[str, Dict, None] = None, memory_map=False, - is_text=True): + compression: Union[str, Dict, None] = None, + memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -338,8 +338,7 @@ def _get_handle(path_or_buf, mode, encoding=None, and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no compression). If dict and compression mode is 'zip' or inferred as - 'zip', optional value at key 'arcname' specifies the name of the file - within ZIP archive at `path_or_buf`. + 'zip', other entries passed as kwargs to ByteZipFile. .. versionchanged:: 0.25.0 @@ -466,15 +465,18 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, - arcname: Union[str, zipfile.ZipInfo, None] = None, **kwargs): + archive_name: Union[str, zipfile.ZipInfo, None] = None, + **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - self.arcname = arcname + self.archive_name = archive_name super().__init__(file, mode, compression, **kwargs) def write(self, data): - arcname = self.filename if self.arcname is None else self.arcname - super().writestr(arcname, data) + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) @property def closed(self): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 2fceac352faed..1593a243fa77c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -28,10 +28,10 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, - compression: Union[str, Dict, None] = 'infer', quoting=None, - line_terminator='\n', chunksize=None, tupleize_cols=False, - quotechar='"', date_format=None, doublequote=True, - escapechar=None, decimal='.'): + compression: Union[str, Dict, None] = 'infer', + quoting=None, line_terminator='\n', chunksize=None, + tupleize_cols=False, quotechar='"', date_format=None, + doublequote=True, escapechar=None, decimal='.'): self.obj = obj @@ -39,8 +39,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', path_or_buf = StringIO() # Extract compression mode as given, if dict - compression, self.compression_args \ - = _get_compression_method(compression) + compression, self.compression_args = _get_compression_method( + compression) self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 1592363f34190..0ab367b72a9e6 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -550,18 +550,18 @@ def test_to_csv_compression_dict(self, compression_only): tm.assert_frame_equal(read_df, df) @pytest.mark.parametrize("compression", ["zip", "infer"]) - @pytest.mark.parametrize("arcname", [None, "test_to_csv.csv", - "test_to_csv.zip"]) - def test_to_csv_zip_arcname(self, compression, arcname): + @pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv", + "test_to_csv.zip"]) + def test_to_csv_zip_arguments(self, compression, archive_name): # GH 26023 from zipfile import ZipFile df = DataFrame({"ABC": [1]}) - with tm.ensure_clean("to_csv_arcname.zip") as path: + with tm.ensure_clean("to_csv_archive_name.zip") as path: df.to_csv(path, compression={"method": compression, - "arcname": arcname}) + "archive_name": archive_name}) zp = ZipFile(path) - expected_arcname = path if arcname is None else arcname + expected_arcname = path if archive_name is None else archive_name expected_arcname = os.path.basename(expected_arcname) assert len(zp.filelist) == 1 archived_file = os.path.basename(zp.filelist[0].filename) From 5b09e6fead48ee6757e4cf55d027a3c57154c003 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 21 Apr 2019 21:35:14 -0400 Subject: [PATCH 11/28] add to_csv example, no method test, Optional types, tweaks; update whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/generic.py | 8 ++++++-- pandas/io/common.py | 11 ++++++----- pandas/io/formats/csvs.py | 4 ++-- pandas/tests/io/formats/test_to_csv.py | 7 +++++++ 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 73de1ec56e4c2..eddcae144ba06 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,7 +36,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :meth:`NDFrame.to_csv` now supports dicts as ``compression`` argument with key ``'method'`` being the compression method and others as kwargs of ``ByteZipFile`` when the compression method is ``'zip'``. (:issue:`26023`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as kwargs of ``ByteZipFile`` when the compression method is ``'zip'``. (:issue:`26023`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 64dfbd049c73c..7fd11abe56338 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Dict, FrozenSet, List, Set, Union +from typing import Any, Dict, FrozenSet, List, Optional, Set, Union import warnings import weakref @@ -2912,7 +2912,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, - compression: Union[str, Dict, None] = 'infer', + compression: Optional[Union[str, Dict[str, Any]]] = 'infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -3028,6 +3028,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + >>> compression_opts = dict(method='zip', archive_name='out.csv') + >>> df.to_csv('out.zip', index=False, compression=compression_opts) + + # creates 'out.zip' containing 'out.csv' """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/io/common.py b/pandas/io/common.py index ecf5ba7517d3d..af1d58636ab67 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,7 @@ import lzma import mmap import os -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -233,7 +233,7 @@ def file_path_to_url(path): } -def _get_compression_method(compression: Union[str, Dict, None]): +def _get_compression_method(compression: Optional[Union[str, Dict[str, Any]]]): """ Simplifies a compression argument to a compression method string and a dict containing additional arguments. @@ -253,7 +253,6 @@ def _get_compression_method(compression: Union[str, Dict, None]): ------ ValueError on dict missing 'method' key """ - compression_args = {} # type: Dict[str, Any] # Handle dict if isinstance(compression, dict): compression_args = compression.copy() @@ -263,6 +262,8 @@ def _get_compression_method(compression: Union[str, Dict, None]): except KeyError: raise ValueError("If dict, compression " "must have key 'method'") + else: + compression_args = {} return compression, compression_args @@ -318,7 +319,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, - compression: Union[str, Dict, None] = None, + compression: Optional[Union[str, Dict[str, Any]]] = None, memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -464,7 +465,7 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, - archive_name: Union[str, zipfile.ZipInfo, None] = None, + archive_name: Optional[str] = None, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index a3733a8c4d23d..e47e01595c10c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -6,7 +6,7 @@ import csv as csvlib from io import StringIO import os -from typing import Dict, Union +from typing import Any, Dict, Optional, Union import warnings from zipfile import ZipFile @@ -28,7 +28,7 @@ class CSVFormatter: def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, - compression: Union[str, Dict, None] = 'infer', + compression: Optional[Union[str, Dict[str, Any]]] = 'infer', quoting=None, line_terminator='\n', chunksize=None, tupleize_cols=False, quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 65ec9b20d4167..8a67b8140d573 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -548,6 +548,13 @@ def test_to_csv_compression_dict(self, compression_only): read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) + def test_to_csv_compression_dict_no_method(self): + # GH 26023 + df = DataFrame({"ABC": [1]}) + compression = {"some_option": True} + with tm.ensure_clean("out.zip") as path, pytest.raises(ValueError): + df.to_csv(path, compression=compression) + @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]) From 68a2b4dad39bc60cb1bcf750ec18ae379d33bb29 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Mon, 22 Apr 2019 12:13:00 -0400 Subject: [PATCH 12/28] remove Index import type ignore --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e47e01595c10c..dd330faa01989 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -122,7 +122,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.data_index = obj.index if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and date_format is not None): - from pandas import Index # type: ignore + from pandas import Index self.data_index = Index([x.strftime(date_format) if notna(x) else '' for x in self.data_index]) From c856f500f4a5bc324812881c7ac27e17d073444a Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Mon, 22 Apr 2019 19:30:07 -0400 Subject: [PATCH 13/28] Revert "remove Index import type ignore" This reverts commit 68a2b4dad39bc60cb1bcf750ec18ae379d33bb29. --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index dd330faa01989..e47e01595c10c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -122,7 +122,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.data_index = obj.index if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and date_format is not None): - from pandas import Index + from pandas import Index # type: ignore self.data_index = Index([x.strftime(date_format) if notna(x) else '' for x in self.data_index]) From 18a735dbf16a62c802cfd6afad4fcd1c9857c91d Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Sun, 5 May 2019 16:59:05 -0400 Subject: [PATCH 14/28] Improve docs/examples --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/generic.py | 11 ++++++----- pandas/io/common.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index eed62aa9129a0..c57b6bc711da3 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,7 +37,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as kwargs of ``ByteZipFile`` when the compression method is ``'zip'``. (:issue:`26023`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aaefcebdd8208..8680f5b296d89 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2970,13 +2970,14 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given and mode is 'zip' or inferred as 'zip', other entries passed as - kwargs to ByteZipFile. + additional compression options. .. versionchanged:: 0.25.0 May now be a dict with key 'method' as compression mode - and other entries as ByteZipFile kwargs if compression mode - is 'zip' + and other entries as additional compression options if + compression mode is 'zip'. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3028,10 +3029,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' >>> compression_opts = dict(method='zip', archive_name='out.csv') >>> df.to_csv('out.zip', index=False, compression=compression_opts) - - # creates 'out.zip' containing 'out.csv' """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/io/common.py b/pandas/io/common.py index af1d58636ab67..6b831e8a4e2e1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -338,12 +338,12 @@ def _get_handle(path_or_buf, mode, encoding=None, and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no compression). If dict and compression mode is 'zip' or inferred as - 'zip', other entries passed as kwargs to ByteZipFile. + 'zip', other entries passed as additional compression options. .. versionchanged:: 0.25.0 May now be a dict with key 'method' as compression mode - and other keys as kwargs for ByteZipFile if compression + and other keys as compression options if compression mode is 'zip'. memory_map : boolean, default False From 969d387422cfa95869f3e19d77716cc9fdd3d97a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 8 Jun 2019 16:55:33 -0400 Subject: [PATCH 15/28] Added back missed Callable import in generic --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f1d09dc104fbb..4d93838cc56fe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Any, Dict, FrozenSet, List, Optional, Set, Union +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Union import warnings import weakref From 04ae25da79ac8bd6e5bd32f96e6df9f921f02a61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 9 Jun 2019 10:17:56 -0400 Subject: [PATCH 16/28] Address comments --- pandas/core/generic.py | 11 +++++++---- pandas/io/common.py | 4 +++- pandas/tests/io/formats/test_to_csv.py | 9 ++++++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4d93838cc56fe..c343c3245dd53 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Union +from typing import Callable, Dict, FrozenSet, List, Optional, Set, Union import warnings import weakref @@ -2943,7 +2943,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, - compression: Optional[Union[str, Dict[str, Any]]] = 'infer', + compression: Optional[Union[str, Dict[str, str]]] = 'infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -3062,8 +3062,11 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' # create 'out.zip' containing 'out.csv' - >>> compression_opts = dict(method='zip', archive_name='out.csv') - >>> df.to_csv('out.zip', index=False, compression=compression_opts) + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... compression=compression_opts) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/io/common.py b/pandas/io/common.py index 3efcf00b382fe..73d1b5c554fe4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -220,7 +220,9 @@ def file_path_to_url(path): } -def _get_compression_method(compression: Optional[Union[str, Dict[str, Any]]]): +def _get_compression_method( + compression: Optional[Union[Dict[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: """ Simplifies a compression argument to a compression method string and a dict containing additional arguments. diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 7b57748119d27..1abe7d722986a 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -546,12 +546,15 @@ def test_to_csv_compression_dict(self, compression_only): read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) - def test_to_csv_compression_dict_no_method(self): + def test_to_csv_compression_dict_no_method_raises(self): # GH 26023 df = DataFrame({"ABC": [1]}) compression = {"some_option": True} - with tm.ensure_clean("out.zip") as path, pytest.raises(ValueError): - df.to_csv(path, compression=compression) + msg = "must have key 'method'" + + with tm.ensure_clean("out.zip") as path: + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv", From 9c22652117b1a6a0bfd1da5387174844611e9264 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 9 Jun 2019 10:41:49 -0400 Subject: [PATCH 17/28] Typing cleanup --- pandas/io/common.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 73d1b5c554fe4..7aa5d5b966bd5 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,7 +10,7 @@ import mmap import os import pathlib -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Type, Union from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, @@ -221,7 +221,7 @@ def file_path_to_url(path): def _get_compression_method( - compression: Optional[Union[Dict[str, str]]] + compression: Optional[Union[str, Dict[str, str]]] ) -> Tuple[Optional[str], Dict[str, str]]: """ Simplifies a compression argument to a compression method string and @@ -246,8 +246,7 @@ def _get_compression_method( if isinstance(compression, dict): compression_args = compression.copy() try: - compression = compression['method'] - compression_args.pop('method') + compression = compression_args.pop('method') except KeyError: raise ValueError("If dict, compression " "must have key 'method'") @@ -348,11 +347,12 @@ def _get_handle(path_or_buf, mode, encoding=None, handles : list of file-like objects A list of file-like object that were opened in this function. """ + need_text_wrapping = (BytesIO,) # type: Tuple[Type[BytesIO], ...] try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) # type: Tuple + need_text_wrapping = need_text_wrapping + (S3File,) except ImportError: - need_text_wrapping = (BytesIO,) + pass handles = list() f = path_or_buf From 56a75c2e34590c908249f5b56e5bce81e175dada Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 9 Jun 2019 10:45:33 -0400 Subject: [PATCH 18/28] Cleaned up docstring --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 7aa5d5b966bd5..d9fefd5ea8276 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -235,8 +235,8 @@ def _get_compression_method( Returns ------- - tuple of ({compression method}, any - {compression arguments}, dict) + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) Raises ------ From 779511e05cb6a14eccc5c7f7475032a01a511d06 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 09:04:13 -0700 Subject: [PATCH 19/28] blackify --- pandas/io/common.py | 30 ++++++++++++++++++++---------- pandas/io/formats/csvs.py | 12 ++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index b33b3d2b37e6b..e51609aed25dd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -244,7 +244,7 @@ def file_path_to_url(path): def _get_compression_method( - compression: Optional[Union[str, Dict[str, str]]] + compression: Optional[Union[str, Dict[str, str]]] ) -> Tuple[Optional[str], Dict[str, str]]: """ Simplifies a compression argument to a compression method string and @@ -269,10 +269,9 @@ def _get_compression_method( if isinstance(compression, dict): compression_args = compression.copy() try: - compression = compression_args.pop('method') + compression = compression_args.pop("method") except KeyError: - raise ValueError("If dict, compression " - "must have key 'method'") + raise ValueError("If dict, compression must have key 'method'") else: compression_args = {} return compression, compression_args @@ -330,7 +329,12 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle( - path_or_buf, mode, encoding=None, compression: Optional[Union[str, Dict[str, Any]]] = None, memory_map=False, is_text=True + path_or_buf, + mode, + encoding=None, + compression: Optional[Union[str, Dict[str, Any]]] = None, + memory_map=False, + is_text=True, ): """ Get file handle for given path/buffer and mode. @@ -373,6 +377,7 @@ def _get_handle( need_text_wrapping = (BytesIO,) # type: Tuple[Type[BytesIO], ...] try: from s3fs import S3File + need_text_wrapping = need_text_wrapping + (S3File,) except ImportError: pass @@ -480,11 +485,16 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ # GH 17778 - def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, - archive_name: Optional[str] = None, - **kwargs): - if mode in ['wb', 'rb']: - mode = mode.replace('b', '') + def __init__( + self, + file, + mode, + compression=zipfile.ZIP_DEFLATED, + archive_name: Optional[str] = None, + **kwargs + ): + if mode in ["wb", "rb"]: + mode = mode.replace("b", "") self.archive_name = archive_name super().__init__(file, mode, compression, **kwargs) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 3e474ab06cb95..b677de7eba67b 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -61,12 +61,10 @@ def __init__( path_or_buf = StringIO() # Extract compression mode as given, if dict - compression, self.compression_args = _get_compression_method( - compression) + compression, self.compression_args = _get_compression_method(compression) self.path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, encoding=encoding, - compression=compression, mode=mode + path_or_buf, encoding=encoding, compression=compression, mode=mode ) self.sep = sep self.na_rep = na_rep @@ -187,8 +185,7 @@ def save(self): self.path_or_buf, self.mode, encoding=self.encoding, - compression=dict(self.compression_args, - method=self.compression), + compression=dict(self.compression_args, method=self.compression), ) close = True @@ -216,8 +213,7 @@ def save(self): if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: - compression = dict(self.compression_args, - method=self.compression) + compression = dict(self.compression_args, method=self.compression) f, handles = _get_handle( self.path_or_buf, From 6c4e6797d0990ff0cdcc5cbbafd0861cd16b5751 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 16 Jul 2019 14:36:08 -0700 Subject: [PATCH 20/28] Added annotations where feasible --- pandas/core/generic.py | 46 ++++++++++++++++++++++++------------------ pandas/io/common.py | 14 ++++++------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 52b28123230d4..f641f99467e9d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Callable, Dict, FrozenSet, List, Optional, Set, Union +from typing import Callable, Dict, FrozenSet, Hashable, List, Optional, Set, Sequence, Union import warnings import weakref @@ -15,6 +15,7 @@ from pandas._config import config from pandas._libs import Timestamp, iNaT, properties +from pandas._typing import FilePathOrBuffer from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -121,6 +122,9 @@ def _single_replace(self, to_replace, method, inplace, limit): return result +bool_t = bool # Need alias because NDFrame has def bool: + + class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -3078,26 +3082,26 @@ def to_latex( def to_csv( self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, + path_or_buf: Optional[FilePathOrBuffer] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Hashable]] = None, + header: Union[bool_t, List[str]] = True, + index: bool_t = True, + index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, + mode: str="w", + encoding: Optional[str] = None, compression: Optional[Union[str, Dict[str, str]]] = "infer", - quoting=None, - quotechar='"', - line_terminator=None, - chunksize=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - ): + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool_t = True, + escapechar: Optional[str] = None, + decimal: Optional[str] = ".", + ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3243,6 +3247,8 @@ def to_csv( if path_or_buf is None: return formatter.path_or_buf.getvalue() + return None + # ---------------------------------------------------------------------- # Fancy Indexing diff --git a/pandas/io/common.py b/pandas/io/common.py index e51609aed25dd..d4234614a3f9d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -23,6 +23,7 @@ from urllib.request import pathname2url, urlopen import zipfile +from pandas._typing import FilePathOrBuffer from pandas.errors import ( # noqa AbstractMethodError, DtypeWarning, @@ -330,11 +331,11 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle( path_or_buf, - mode, + mode: str, encoding=None, compression: Optional[Union[str, Dict[str, Any]]] = None, - memory_map=False, - is_text=True, + memory_map: bool = False, + is_text: bool = True, ): """ Get file handle for given path/buffer and mode. @@ -487,16 +488,15 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore # GH 17778 def __init__( self, - file, - mode, - compression=zipfile.ZIP_DEFLATED, + file: FilePathOrBuffer, + mode: str, archive_name: Optional[str] = None, **kwargs ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - super().__init__(file, mode, compression, **kwargs) + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): archive_name = self.filename From 1b567c9a79e43f383ad1b066146ebcac64a40bd6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 16 Jul 2019 14:38:10 -0700 Subject: [PATCH 21/28] Black and lint --- pandas/core/generic.py | 14 ++++++++++++-- pandas/io/formats/csvs.py | 1 - pandas/tests/io/formats/test_to_csv.py | 10 ++++++---- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f641f99467e9d..f359755e02c7c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,17 @@ import operator import pickle from textwrap import dedent -from typing import Callable, Dict, FrozenSet, Hashable, List, Optional, Set, Sequence, Union +from typing import ( + Callable, + Dict, + FrozenSet, + Hashable, + List, + Optional, + Set, + Sequence, + Union, +) import warnings import weakref @@ -3090,7 +3100,7 @@ def to_csv( header: Union[bool_t, List[str]] = True, index: bool_t = True, index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, - mode: str="w", + mode: str = "w", encoding: Optional[str] = None, compression: Optional[Union[str, Dict[str, str]]] = "infer", quoting: Optional[int] = None, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b677de7eba67b..70a914335319b 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,6 @@ import csv as csvlib from io import StringIO import os -from typing import Any, Dict, Optional, Union import warnings from zipfile import ZipFile diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index e24e8185cf74e..1ea114a99bfc9 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -538,16 +538,18 @@ def test_to_csv_compression_dict_no_method_raises(self): df.to_csv(path, compression=compression) @pytest.mark.parametrize("compression", ["zip", "infer"]) - @pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv", - "test_to_csv.zip"]) + @pytest.mark.parametrize( + "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"] + ) def test_to_csv_zip_arguments(self, compression, archive_name): # GH 26023 from zipfile import ZipFile df = DataFrame({"ABC": [1]}) with tm.ensure_clean("to_csv_archive_name.zip") as path: - df.to_csv(path, compression={"method": compression, - "archive_name": archive_name}) + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) zp = ZipFile(path) expected_arcname = path if archive_name is None else archive_name expected_arcname = os.path.basename(expected_arcname) From 7cf65ee853af68224a31bd694fcd2152e8ee2de8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 08:10:39 -0700 Subject: [PATCH 22/28] isort fixup --- pandas/core/generic.py | 5 ++--- pandas/io/common.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f359755e02c7c..94a34cb8d76db 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,8 +13,8 @@ Hashable, List, Optional, - Set, Sequence, + Set, Union, ) import warnings @@ -25,7 +25,6 @@ from pandas._config import config from pandas._libs import Timestamp, iNaT, properties -from pandas._typing import FilePathOrBuffer from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -61,7 +60,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas._typing import Dtype +from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin diff --git a/pandas/io/common.py b/pandas/io/common.py index d4234614a3f9d..21cdc68e2abc5 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -23,7 +23,6 @@ from urllib.request import pathname2url, urlopen import zipfile -from pandas._typing import FilePathOrBuffer from pandas.errors import ( # noqa AbstractMethodError, DtypeWarning, @@ -34,6 +33,8 @@ from pandas.core.dtypes.common import is_file_like +from pandas._typing import FilePathOrBuffer + # gh-12665: Alias for now and remove later. CParserError = ParserError From 29374f34ef573cb9715ff6008ededc27551362bc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 17 Jul 2019 08:20:21 -0700 Subject: [PATCH 23/28] Docstring fixup and more annotations --- pandas/io/common.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 21cdc68e2abc5..a5082b0c93376 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -279,27 +279,29 @@ def _get_compression_method( return compression, compression_args -def _infer_compression(filepath_or_buffer, compression): +def _infer_compression(filepath_or_buffer, compression) -> Optional[str]: """ Get the compression method for filepath_or_buffer. If compression='infer', the inferred compression method is returned. Otherwise, the input compression method is returned unchanged, unless it's invalid, in which case an error is raised. + Parameters ---------- - filepath_or_buffer : - a path (str) or buffer + filepath_or_buffer : str or file handle + File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no compression). + Returns ------- - string or None : - compression method + string or None + Raises ------ - ValueError on invalid compression specified + ValueError on invalid compression specified. """ # No compression has been explicitly specified @@ -343,11 +345,12 @@ def _get_handle( Parameters ---------- - path_or_buf : - a path (str) or buffer + path_or_buf : str or file handle + File path or object. mode : str - mode to open path_or_buf with + Mode to open path_or_buf with. encoding : str or None + Encoding to use. compression : str or dict, default None If string, specifies compression mode. If dict, value at key 'method' specifies compression mode. Compression mode must be one of {'infer', @@ -367,12 +370,12 @@ def _get_handle( See parsers._parser_params for more information. is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.) + mode (pickle, etc.). Returns ------- f : file-like - A file-like object + A file-like object. handles : list of file-like objects A list of file-like object that were opened in this function. """ From 0f5489d5f5dd632b4804a38b5af7ff5c19ad208a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 24 Aug 2019 16:44:17 -0700 Subject: [PATCH 24/28] lint fixup --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb768f5f74670..f785caa392936 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,7 +18,6 @@ Set, Union, ) -from typing import Callable, Dict, FrozenSet, List, Optional, Set import warnings import weakref From e04138e7c008fd02f7436d4129484b08642f4f92 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 24 Aug 2019 16:57:33 -0700 Subject: [PATCH 25/28] mypy fixup --- pandas/io/common.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 6713e5a197b62..c5e2ad84041d2 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,19 @@ import mmap import os import pathlib -from typing import IO, Any, AnyStr, BinaryIO, Dict, Optional, TextIO, Tuple, Type, Union +from typing import ( + IO, + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + TextIO, + Tuple, + Type, + Union, +) from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, @@ -396,9 +408,9 @@ def _get_handle( need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = BufferedIOBase + need_text_wrapping = BufferedIOBase # type: ignore - handles = list() + handles = list() # type: List[IO] f = path_or_buf # Convert pathlib.Path/py.path.local or string @@ -480,9 +492,9 @@ def _get_handle( if memory_map and hasattr(f, "fileno"): try: - g = MMapWrapper(f) + wrapped = MMapWrapper(f) f.close() - f = g + f = wrapped except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level From 6f2bf00aa514ce76bb19d8c0be280acabc7dcfc2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 24 Aug 2019 17:00:43 -0700 Subject: [PATCH 26/28] whatsnew fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ff82f43c1fc7b..2bfc09e52c68b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -206,7 +206,7 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) -- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) .. _whatsnew_1000.contributors: From 865aa8194923fb1e26bd407c4afd4f6e8e8b297f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 24 Aug 2019 17:03:37 -0700 Subject: [PATCH 27/28] Annotation and doc fixups --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c5e2ad84041d2..caff586480329 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -359,7 +359,7 @@ def _infer_compression( def _get_handle( path_or_buf, mode: str, - encoding=None, + encoding: str = None, compression: Optional[Union[str, Dict[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, @@ -384,7 +384,7 @@ def _get_handle( no compression). If dict and compression mode is 'zip' or inferred as 'zip', other entries passed as additional compression options. - .. versionchanged:: 0.25.0 + .. versionchanged:: 1.0.0 May now be a dict with key 'method' as compression mode and other keys as compression options if compression From 8d1deeee79e5520279dfb169acaeac371b0afbd9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 25 Aug 2019 13:17:42 -0700 Subject: [PATCH 28/28] mypy typeshed bug fix --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index caff586480329..290022167e520 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -359,7 +359,7 @@ def _infer_compression( def _get_handle( path_or_buf, mode: str, - encoding: str = None, + encoding=None, compression: Optional[Union[str, Dict[str, Any]]] = None, memory_map: bool = False, is_text: bool = True,