Infer the name of the file stored inside a zip archive written by to_csv (GH39465)

When no explicit archive_name is given, the zip write handle in
pandas/io/common.py names the archived member after the output path with a
trailing ".zip" suffix removed (matched case-insensitively), instead of reusing
the full path of the archive itself.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. The indentation of context lines is
reconstructed and should be confirmed against the target tree before applying.
The "index" blob ids are those of the original change and do not reflect the
edits made to the added lines here.

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index a593a03de5c25..609f085ca7144 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -639,7 +639,7 @@ I/O
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
 - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
--
+- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, the inner file name is inferred from the output path with any trailing ``.zip`` suffix removed (:issue:`39465`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 12c7afc8ee2e4..990b584cb9533 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -805,6 +805,29 @@ def __init__(
         # _PathLike[str]], IO[bytes]]"
         super().__init__(file, mode, **kwargs_zip)  # type: ignore[arg-type]
 
+    def infer_filename(self):
+        """
+        Infer the name of the file stored inside the zip archive.
+
+        If an explicit archive_name is not given, we still want the file inside the
+        zip file not to be named something.zip, because that causes confusion
+        (GH39465).
+
+        Returns
+        -------
+        str or None
+            The final component of ``self.filename`` with a trailing ``.zip``
+            suffix (matched case-insensitively) removed, or ``None`` when
+            ``self.filename`` is not a str or os.PathLike.
+        """
+        if isinstance(self.filename, (os.PathLike, str)):
+            filename = Path(self.filename)
+            # compare case-insensitively so that "out.ZIP" is handled like "out.zip"
+            if filename.suffix.lower() == ".zip":
+                return filename.with_suffix("").name
+            return filename.name
+        return None
+
     def write(self, data):
         # buffer multiple write calls, write on flush
         if self.multiple_write_buffer is None:
@@ -819,7 +842,7 @@ def flush(self) -> None:
             return
 
         # ZipFile needs a non-empty string
-        archive_name = self.archive_name or self.filename or "zip"
+        archive_name = self.archive_name or self.infer_filename() or "zip"
         with self.multiple_write_buffer:
             super().writestr(archive_name, self.multiple_write_buffer.getvalue())
 
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 4c482bafa6c9c..059fd96db43ad 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -1,6 +1,8 @@
 import io
 import os
+from pathlib import Path
 import sys
+from zipfile import ZipFile
 
 import numpy as np
 import pytest
@@ -541,23 +543,39 @@ def test_to_csv_compression_dict_no_method_raises(self):
                 df.to_csv(path, compression=compression)
 
     @pytest.mark.parametrize("compression", ["zip", "infer"])
-    @pytest.mark.parametrize(
-        "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]
-    )
+    @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
     def test_to_csv_zip_arguments(self, compression, archive_name):
         # GH 26023
-        from zipfile import ZipFile
-
         df = DataFrame({"ABC": [1]})
         with tm.ensure_clean("to_csv_archive_name.zip") as path:
             df.to_csv(
                 path, compression={"method": compression, "archive_name": archive_name}
             )
             with ZipFile(path) as zp:
-                expected_arcname = path if archive_name is None else archive_name
-                expected_arcname = os.path.basename(expected_arcname)
                 assert len(zp.filelist) == 1
-                archived_file = os.path.basename(zp.filelist[0].filename)
+                archived_file = zp.filelist[0].filename
+                assert archived_file == archive_name
+
+    @pytest.mark.parametrize(
+        "filename,expected_arcname",
+        [
+            ("archive.csv", "archive.csv"),
+            ("archive.tsv", "archive.tsv"),
+            ("archive.csv.zip", "archive.csv"),
+            ("archive.tsv.zip", "archive.tsv"),
+            ("archive.zip", "archive"),
+            ("archive.ZIP", "archive"),
+        ],
+    )
+    def test_to_csv_zip_infer_name(self, filename, expected_arcname):
+        # GH 39465
+        df = DataFrame({"ABC": [1]})
+        with tm.ensure_clean_dir() as tmp_dir:
+            path = Path(tmp_dir, filename)
+            df.to_csv(path, compression="zip")
+            with ZipFile(path) as zp:
+                assert len(zp.filelist) == 1
+                archived_file = zp.filelist[0].filename
                 assert archived_file == expected_arcname
 
     @pytest.mark.parametrize("df_new_type", ["Int64"])