Skip to content

Commit b3f33a1

Browse files
authored
ENH: Infer inner file name of zip archive (GH39465) (#44445)
1 parent 9c421f0 commit b3f33a1

File tree

3 files changed

+39
-10
lines changed

3 files changed

+39
-10
lines changed

doc/source/whatsnew/v1.4.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -644,7 +644,7 @@ I/O
644644
- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
645645
- Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
646646
- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
647-
-
647+
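- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip archive whose inner file name ends with ``.zip``. When no ``archive_name`` is given, the inner file name is now inferred from the output path with its ``.zip`` suffix removed (e.g. ``archive.csv.zip`` contains ``archive.csv``). (:issue:`39465`)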
648648

649649
Period
650650
^^^^^^

pandas/io/common.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,18 @@ def __init__(
839839
# _PathLike[str]], IO[bytes]]"
840840
super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]
841841

842+
def infer_filename(self):
    """
    Infer the name of the file stored inside the zip archive.

    If an explicit archive_name is not given, we still want the file inside
    the zip file not to be named something.zip, because that causes confusion
    (GH39465).

    Returns
    -------
    str or None
        The final path component of ``self.filename`` with a trailing
        ``.zip`` suffix (matched case-insensitively) removed, or ``None``
        when ``self.filename`` is not a string or path-like object
        (for example when writing to an in-memory buffer).
    """
    if isinstance(self.filename, (os.PathLike, str)):
        filename = Path(self.filename)
        # Compare case-insensitively so that "data.csv.ZIP" is treated the
        # same as "data.csv.zip" and does not leak the archive suffix into
        # the inner file name.
        if filename.suffix.lower() == ".zip":
            return filename.with_suffix("").name
        return filename.name
    return None
853+
842854
def write(self, data):
843855
# buffer multiple write calls, write on flush
844856
if self.multiple_write_buffer is None:
@@ -853,7 +865,7 @@ def flush(self) -> None:
853865
return
854866

855867
# ZipFile needs a non-empty string
856-
archive_name = self.archive_name or self.filename or "zip"
868+
archive_name = self.archive_name or self.infer_filename() or "zip"
857869
with self.multiple_write_buffer:
858870
super().writestr(archive_name, self.multiple_write_buffer.getvalue())
859871

pandas/tests/io/formats/test_to_csv.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import io
22
import os
3+
from pathlib import Path
34
import sys
5+
from zipfile import ZipFile
46

57
import numpy as np
68
import pytest
@@ -541,23 +543,38 @@ def test_to_csv_compression_dict_no_method_raises(self):
541543
df.to_csv(path, compression=compression)
542544

543545
@pytest.mark.parametrize("compression", ["zip", "infer"])
544-
@pytest.mark.parametrize(
545-
"archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]
546-
)
546+
@pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
547547
def test_to_csv_zip_arguments(self, compression, archive_name):
548548
# GH 26023
549-
from zipfile import ZipFile
550-
551549
df = DataFrame({"ABC": [1]})
552550
with tm.ensure_clean("to_csv_archive_name.zip") as path:
553551
df.to_csv(
554552
path, compression={"method": compression, "archive_name": archive_name}
555553
)
556554
with ZipFile(path) as zp:
557-
expected_arcname = path if archive_name is None else archive_name
558-
expected_arcname = os.path.basename(expected_arcname)
559555
assert len(zp.filelist) == 1
560-
archived_file = os.path.basename(zp.filelist[0].filename)
556+
archived_file = zp.filelist[0].filename
557+
assert archived_file == archive_name
558+
559+
@pytest.mark.parametrize(
    "filename,expected_arcname",
    [
        ("archive.csv", "archive.csv"),
        ("archive.tsv", "archive.tsv"),
        ("archive.csv.zip", "archive.csv"),
        ("archive.tsv.zip", "archive.tsv"),
        ("archive.zip", "archive"),
    ],
)
def test_to_csv_zip_infer_name(self, filename, expected_arcname):
    """
    Check the inner file name chosen when no archive_name is supplied.

    Writes a one-cell frame to ``filename`` with zip compression and asserts
    that the archive holds exactly one member named ``expected_arcname``.
    """
    # GH 39465
    df = DataFrame({"ABC": [1]})
    # "tmp_dir" rather than "dir" so the builtin dir() is not shadowed
    with tm.ensure_clean_dir() as tmp_dir:
        path = Path(tmp_dir, filename)
        df.to_csv(path, compression="zip")
        with ZipFile(path) as zp:
            assert len(zp.filelist) == 1
            archived_file = zp.filelist[0].filename
            assert archived_file == expected_arcname
562579

563580
@pytest.mark.parametrize("df_new_type", ["Int64"])

0 commit comments

Comments
 (0)