From e73ede7f0065c3f6a415aa5f396c943402ee8d2e Mon Sep 17 00:00:00 2001 From: Marcel Gerber Date: Sun, 14 Nov 2021 13:30:25 +0100 Subject: [PATCH 1/4] ENH: Infer inner file name of zip archive (GH39465) relevant for `DataFrame.to_csv` and `Series.to_csv` with `compression='zip'` --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/common.py | 15 ++++++++++++++- pandas/tests/io/formats/test_to_csv.py | 24 ++++++++++++++++-------- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a593a03de5c25..dcdd36d4ef8ff 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -210,6 +210,7 @@ Other enhancements - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file that contains another zip file. Instead, they try to infer the inner file name more smartly. (:issue:`39465`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 12c7afc8ee2e4..baf81f84668df 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -805,6 +805,19 @@ def __init__( # _PathLike[str]], IO[bytes]]" super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] + # GH39465 + # If an explicit archive_name is not given, we still want the file _inside_ the zip + # file _not_ to be named something.zip, because that causes massive confusion. + def infer_filename(self): + if isinstance(self.filename, (os.PathLike, str)): + filename = Path(self.filename) + if filename.suffix == ".zip": + return filename.with_suffix("").name + else: + return filename.name + else: + return None + def write(self, data): # buffer multiple write calls, write on flush if self.multiple_write_buffer is None: @@ -819,7 +832,7 @@ def flush(self) -> None: return # ZipFile needs a non-empty string - archive_name = self.archive_name or self.filename or "zip" + archive_name = self.archive_name or self.infer_filename() or "zip" with self.multiple_write_buffer: super().writestr(archive_name, self.multiple_write_buffer.getvalue()) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 4c482bafa6c9c..119b8e1f7ccb2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,6 +1,8 @@ import io import os +from pathlib import Path import sys +from zipfile import ZipFile import numpy as np import pytest @@ -541,23 +543,29 @@ def test_to_csv_compression_dict_no_method_raises(self): df.to_csv(path, compression=compression) @pytest.mark.parametrize("compression", ["zip", "infer"]) - @pytest.mark.parametrize( - "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"] - ) + @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"]) def test_to_csv_zip_arguments(self, compression, archive_name): # GH 26023 - from zipfile import ZipFile - df = DataFrame({"ABC": [1]}) with tm.ensure_clean("to_csv_archive_name.zip") as path: df.to_csv( path, compression={"method": compression, "archive_name": archive_name} ) with ZipFile(path) as zp: - expected_arcname = path if archive_name is None else archive_name - expected_arcname = os.path.basename(expected_arcname) assert len(zp.filelist) == 1 - archived_file = os.path.basename(zp.filelist[0].filename) + archived_file = zp.filelist[0].filename + assert archived_file == archive_name + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + def test_to_csv_zip_infer_name(self, compression): + # GH 39465 + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("archive.csv.zip") as path: + df.to_csv(path, compression=compression) + expected_arcname = Path(path).with_suffix("").name + with ZipFile(path) as zp: + assert len(zp.filelist) == 1 + archived_file = zp.filelist[0].filename assert archived_file == expected_arcname @pytest.mark.parametrize("df_new_type", ["Int64"]) From c1c89719d1097ae3603f6b969fd0da483bfa8dc0 Mon Sep 17 00:00:00 2001 From: Marcel Gerber Date: Sun, 14 Nov 2021 18:21:11 +0100 Subject: [PATCH 2/4] TST: test covers more scenarios --- pandas/tests/io/formats/test_to_csv.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 119b8e1f7ccb2..059fd96db43ad 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -556,13 +556,22 @@ def test_to_csv_zip_arguments(self, compression, archive_name): archived_file = zp.filelist[0].filename assert archived_file == archive_name - @pytest.mark.parametrize("compression", ["zip", "infer"]) - def test_to_csv_zip_infer_name(self, compression): + @pytest.mark.parametrize( + "filename,expected_arcname", + [ + ("archive.csv", "archive.csv"), + ("archive.tsv", "archive.tsv"), + ("archive.csv.zip", "archive.csv"), + ("archive.tsv.zip", "archive.tsv"), + ("archive.zip", "archive"), + ], + ) + def test_to_csv_zip_infer_name(self, filename, expected_arcname): # GH 39465 df = DataFrame({"ABC": [1]}) - with tm.ensure_clean("archive.csv.zip") as path: - df.to_csv(path, compression=compression) - expected_arcname = Path(path).with_suffix("").name + with tm.ensure_clean_dir() as dir: + path = Path(dir, filename) + df.to_csv(path, compression="zip") with ZipFile(path) as zp: assert len(zp.filelist) == 1 archived_file = zp.filelist[0].filename From 3e6d45cbd3c491a36bdaeffc1e0302b6abbfae6d Mon Sep 17 00:00:00 2001 From: Marcel Gerber Date: Sun, 14 Nov 2021 18:21:50 +0100 Subject: [PATCH 3/4] CLN: code style --- pandas/io/common.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index baf81f84668df..071c09dd48672 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -805,9 +805,12 @@ def __init__( # _PathLike[str]], IO[bytes]]" super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] - # GH39465 - # If an explicit archive_name is not given, we still want the file _inside_ the zip - # file _not_ to be named something.zip, because that causes massive confusion. + """ + GH39465 + If an explicit archive_name is not given, we still want the file _inside_ the zip + file _not_ to be named something.zip, because that causes massive confusion. + """ + def infer_filename(self): if isinstance(self.filename, (os.PathLike, str)): filename = Path(self.filename) From 5d09b0b090f6273441027b16ac4e3f328e2e0366 Mon Sep 17 00:00:00 2001 From: Marcel Gerber Date: Sun, 14 Nov 2021 21:30:14 +0100 Subject: [PATCH 4/4] ENH: address review comments --- doc/source/whatsnew/v1.4.0.rst | 3 +-- pandas/io/common.py | 16 ++++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index dcdd36d4ef8ff..609f085ca7144 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -210,7 +210,6 @@ Other enhancements - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) -- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file that contains another zip file. Instead, they try to infer the inner file name more smartly. (:issue:`39465`) .. --------------------------------------------------------------------------- @@ -640,7 +639,7 @@ I/O - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) -- +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) Period ^^^^^^ diff --git a/pandas/io/common.py b/pandas/io/common.py index 071c09dd48672..990b584cb9533 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -805,21 +805,17 @@ def __init__( # _PathLike[str]], IO[bytes]]" super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] - """ - GH39465 - If an explicit archive_name is not given, we still want the file _inside_ the zip - file _not_ to be named something.zip, because that causes massive confusion. - """ - def infer_filename(self): + """ + If an explicit archive_name is not given, we still want the file inside the zip + file not to be named something.zip, because that causes confusion (GH39465). + """ if isinstance(self.filename, (os.PathLike, str)): filename = Path(self.filename) if filename.suffix == ".zip": return filename.with_suffix("").name - else: - return filename.name - else: - return None + return filename.name + return None def write(self, data): # buffer multiple write calls, write on flush