Skip to content

Commit 79b8074

Browse files
Backport PR #38728: REGR: to_csv created corrupt ZIP files when chunksize<rows (#38767)
Co-authored-by: Torsten Wörtwein <[email protected]>
1 parent e852abc commit 79b8074

File tree

3 files changed

+47
-5
lines changed

3 files changed

+47
-5
lines changed

doc/source/whatsnew/v1.2.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
18+
- :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
1819
-
1920

2021
.. ---------------------------------------------------------------------------

pandas/io/common.py

+24-5
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
from collections import abc
55
import dataclasses
66
import gzip
7-
from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper
7+
from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper
88
import mmap
99
import os
10-
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast
10+
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union, cast
1111
from urllib.parse import (
1212
urljoin,
1313
urlparse as parse_url,
@@ -707,17 +707,36 @@ def __init__(
707707
archive_name: Optional[str] = None,
708708
**kwargs,
709709
):
710-
if mode in ["wb", "rb"]:
711-
mode = mode.replace("b", "")
710+
mode = mode.replace("b", "")
712711
self.archive_name = archive_name
712+
self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None
713+
713714
kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
714715
kwargs_zip.update(kwargs)
716+
715717
super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type]
716718

717719
def write(self, data):
720+
# buffer multiple write calls, write on flush
721+
if self.multiple_write_buffer is None:
722+
self.multiple_write_buffer = (
723+
BytesIO() if isinstance(data, bytes) else StringIO()
724+
)
725+
self.multiple_write_buffer.write(data)
726+
727+
def flush(self) -> None:
728+
# write to actual handle and close write buffer
729+
if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
730+
return
731+
718732
# ZipFile needs a non-empty string
719733
archive_name = self.archive_name or self.filename or "zip"
720-
super().writestr(archive_name, data)
734+
with self.multiple_write_buffer:
735+
super().writestr(archive_name, self.multiple_write_buffer.getvalue())
736+
737+
def close(self):
738+
self.flush()
739+
super().close()
721740

722741
@property
723742
def closed(self):

pandas/tests/io/formats/test_to_csv.py

+22
Original file line numberDiff line numberDiff line change
@@ -640,3 +640,25 @@ def test_to_csv_encoding_binary_handle(self, mode):
640640

641641
handle.seek(0)
642642
assert handle.read().startswith(b'\xef\xbb\xbf""')
643+
644+
645+
def test_to_csv_iterative_compression_name(compression):
646+
# GH 38714
647+
df = tm.makeDataFrame()
648+
with tm.ensure_clean() as path:
649+
df.to_csv(path, compression=compression, chunksize=1)
650+
tm.assert_frame_equal(
651+
pd.read_csv(path, compression=compression, index_col=0), df
652+
)
653+
654+
655+
def test_to_csv_iterative_compression_buffer(compression):
656+
# GH 38714
657+
df = tm.makeDataFrame()
658+
with io.BytesIO() as buffer:
659+
df.to_csv(buffer, compression=compression, chunksize=1)
660+
buffer.seek(0)
661+
tm.assert_frame_equal(
662+
pd.read_csv(buffer, compression=compression, index_col=0), df
663+
)
664+
assert not buffer.closed

0 commit comments

Comments
 (0)