Commit 6ff2e7c

REGR: to_stata tried to remove file before closing it (#39202)
Co-authored-by: Simon Hawkins <[email protected]>
1 parent: 358c614

6 files changed (+79 −95 lines)

doc/source/whatsnew/v1.2.1.rst

+1 −0 lines changed

@@ -17,6 +17,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_csv` that created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
 - Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
 - Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
+- Fixed regression in :meth:`DataFrame.to_stata` not removing the created file when an error occurred (:issue:`39202`)
 - Fixed regression in ``DataFrame.__setitem__`` raising ``ValueError`` when expanding :class:`DataFrame` and new column is from type ``"0 - name"`` (:issue:`39010`)
 - Fixed regression in setting with :meth:`DataFrame.loc` raising ``ValueError`` when :class:`DataFrame` has unsorted :class:`MultiIndex` columns and indexer is a scalar (:issue:`38601`)
 - Fixed regression in setting with :meth:`DataFrame.loc` raising ``KeyError`` with :class:`MultiIndex` and list-like columns indexer enlarging :class:`DataFrame` (:issue:`39147`)
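
A minimal sketch of the behaviour this entry covers, assuming a writable temp directory; the frame contents and file name are illustrative, and the invalid time_stamp mirrors the regression test added further down:

import os
import tempfile

import pandas as pd

df = pd.DataFrame({"variable": [1]})
path = os.path.join(tempfile.gettempdir(), "example.dta")  # hypothetical path

try:
    # time_stamp must be a datetime object; a string makes the writer raise mid-write.
    df.to_stata(path, time_stamp="01 Jan 2000, 00:00:00")
except ValueError:
    pass

# With the fix, the open handle is closed first and the partial file is then
# removed, so no invalid .dta file is left behind (including on Windows).
assert not os.path.isfile(path)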

pandas/_testing/contexts.py

+30 −44 lines changed

@@ -1,7 +1,11 @@
 from contextlib import contextmanager
 import os
+from pathlib import Path
+import random
 from shutil import rmtree
+import string
 import tempfile
+from typing import IO, Any, Union
 
 import numpy as np
 
@@ -73,66 +77,48 @@ def setTZ(tz):
 
 
 @contextmanager
-def ensure_clean(filename=None, return_filelike=False, **kwargs):
+def ensure_clean(filename=None, return_filelike: bool = False, **kwargs: Any):
     """
     Gets a temporary path and agrees to remove on close.
 
+    This implementation does not use tempfile.mkstemp to avoid having a file handle.
+    If the code using the returned path wants to delete the file itself, windows
+    requires that no program has a file handle to it.
+
     Parameters
     ----------
     filename : str (optional)
-        if None, creates a temporary file which is then removed when out of
-        scope. if passed, creates temporary file with filename as ending.
+        suffix of the created file.
     return_filelike : bool (default False)
         if True, returns a file-like which is *always* cleaned. Necessary for
         savefig and other functions which want to append extensions.
     **kwargs
-        Additional keywords passed in for creating a temporary file.
-        :meth:`tempFile.TemporaryFile` is used when `return_filelike` is ``True``.
-        :meth:`tempfile.mkstemp` is used when `return_filelike` is ``False``.
-        Note that the `filename` parameter will be passed in as the `suffix`
-        argument to either function.
+        Additional keywords are passed to open().
 
-    See Also
-    --------
-    tempfile.TemporaryFile
-    tempfile.mkstemp
     """
-    filename = filename or ""
-    fd = None
-
-    kwargs["suffix"] = filename
+    folder = Path(tempfile.gettempdir())
 
-    if return_filelike:
-        f = tempfile.TemporaryFile(**kwargs)
-
-        try:
-            yield f
-        finally:
-            f.close()
-    else:
-        # Don't generate tempfile if using a path with directory specified.
-        if len(os.path.dirname(filename)):
-            raise ValueError("Can't pass a qualified name to ensure_clean()")
+    if filename is None:
+        filename = ""
+    filename = (
+        "".join(random.choices(string.ascii_letters + string.digits, k=30)) + filename
+    )
+    path = folder / filename
 
-        try:
-            fd, filename = tempfile.mkstemp(**kwargs)
-        except UnicodeEncodeError:
-            import pytest
+    path.touch()
 
-            pytest.skip("no unicode file names on this system")
+    handle_or_str: Union[str, IO] = str(path)
+    if return_filelike:
+        kwargs.setdefault("mode", "w+b")
+        handle_or_str = open(path, **kwargs)
 
-        try:
-            yield filename
-        finally:
-            try:
-                os.close(fd)
-            except OSError:
-                print(f"Couldn't close file descriptor: {fd} (file: {filename})")
-            try:
-                if os.path.exists(filename):
-                    os.remove(filename)
-            except OSError as e:
-                print(f"Exception on removing file: {e}")
+    try:
+        yield handle_or_str
+    finally:
+        if not isinstance(handle_or_str, str):
+            handle_or_str.close()
+        if path.is_file():
+            path.unlink()
 
 
 @contextmanager
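
For context, the core of the rewritten helper is small enough to sketch on its own: build a random file name directly in the temp directory, hand out only the path (no open descriptor, so Windows can delete the file), and always unlink on exit. The helper name temp_path below is hypothetical, and the sketch omits the return_filelike branch and the extra open() kwargs:

import random
import string
import tempfile
from contextlib import contextmanager
from pathlib import Path


@contextmanager
def temp_path(suffix: str = ""):
    # Random name in the temp dir; no tempfile.mkstemp, so no lingering descriptor.
    name = "".join(random.choices(string.ascii_letters + string.digits, k=30)) + suffix
    path = Path(tempfile.gettempdir()) / name
    path.touch()
    try:
        yield str(path)
    finally:
        # The code under test may already have deleted the file itself.
        if path.is_file():
            path.unlink()


with temp_path(".dta") as p:
    Path(p).write_text("example")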

pandas/io/stata.py

+12 −13 lines changed

@@ -15,7 +15,6 @@
 import datetime
 from io import BytesIO
 import os
-from pathlib import Path
 import struct
 import sys
 from typing import (
@@ -2466,8 +2465,8 @@ def write_file(self) -> None:
         if self.handles.compression["method"] is not None:
             # ZipFile creates a file (with the same name) for each write call.
             # Write it first into a buffer and then write the buffer to the ZipFile.
-            self._output_file = self.handles.handle
-            self.handles.handle = BytesIO()
+            self._output_file, self.handles.handle = self.handles.handle, BytesIO()
+            self.handles.created_handles.append(self.handles.handle)
 
         try:
             self._write_header(
@@ -2488,20 +2487,23 @@ def write_file(self) -> None:
             self._write_value_labels()
             self._write_file_close_tag()
             self._write_map()
-        except Exception as exc:
             self._close()
-            if isinstance(self._fname, (str, Path)):
+        except Exception as exc:
+            self.handles.close()
+            # Only @runtime_checkable protocols can be used with instance and class
+            # checks
+            if isinstance(
+                self._fname, (str, os.PathLike)  # type: ignore[misc]
+            ) and os.path.isfile(self._fname):
                 try:
                     os.unlink(self._fname)
                 except OSError:
                     warnings.warn(
                         f"This save was not successful but {self._fname} could not "
-                        "be deleted. This file is not valid.",
+                        "be deleted. This file is not valid.",
                         ResourceWarning,
                     )
             raise exc
-        else:
-            self._close()
 
 
     def _close(self) -> None:
@@ -2513,11 +2515,8 @@ def _close(self) -> None:
         # write compression
         if self._output_file is not None:
             assert isinstance(self.handles.handle, BytesIO)
-            bio = self.handles.handle
-            bio.seek(0)
-            self.handles.handle = self._output_file
-            self.handles.handle.write(bio.read())  # type: ignore[arg-type]
-            bio.close()
+            bio, self.handles.handle = self.handles.handle, self._output_file
+            self.handles.handle.write(bio.getvalue())  # type: ignore[arg-type]
 
     def _write_map(self) -> None:
         """No-op, future compatibility"""

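The essential change in write_file is the close-then-unlink ordering: the previous code tried to remove the file while a handle to it was still open, which fails on Windows. A generic sketch of that ordering with hypothetical names, outside the StataWriter API:

import os
import warnings


def safe_write(path: str, payload: bytes) -> None:
    handle = open(path, "wb")
    try:
        handle.write(payload)  # a real writer may raise partway through
    except Exception:
        # Close first, then unlink; the reverse order fails on Windows
        # because the file still has an open handle.
        handle.close()
        if os.path.isfile(path):
            try:
                os.unlink(path)
            except OSError:
                warnings.warn(f"{path} could not be deleted and is not valid.")
        raise
    handle.close()
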
pandas/tests/io/excel/test_writers.py

+25 −28 lines changed

@@ -657,30 +657,27 @@ def test_excel_date_datetime_format(self, engine, ext, path):
         )
 
         with tm.ensure_clean(ext) as filename2:
-            writer1 = ExcelWriter(path)
-            writer2 = ExcelWriter(
+            with ExcelWriter(path) as writer1:
+                df.to_excel(writer1, "test1")
+
+            with ExcelWriter(
                 filename2,
                 date_format="DD.MM.YYYY",
                 datetime_format="DD.MM.YYYY HH-MM-SS",
-            )
-
-            df.to_excel(writer1, "test1")
-            df.to_excel(writer2, "test1")
-
-            writer1.close()
-            writer2.close()
+            ) as writer2:
+                df.to_excel(writer2, "test1")
 
-            reader1 = ExcelFile(path)
-            reader2 = ExcelFile(filename2)
+            with ExcelFile(path) as reader1:
+                rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0)
 
-            rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0)
-            rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0)
+            with ExcelFile(filename2) as reader2:
+                rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0)
 
-            tm.assert_frame_equal(rs1, rs2)
+            tm.assert_frame_equal(rs1, rs2)
 
-            # Since the reader returns a datetime object for dates,
-            # we need to use df_expected to check the result.
-            tm.assert_frame_equal(rs2, df_expected)
+            # Since the reader returns a datetime object for dates,
+            # we need to use df_expected to check the result.
+            tm.assert_frame_equal(rs2, df_expected)
 
     def test_to_excel_interval_no_labels(self, path):
         # see gh-19242
@@ -862,7 +859,7 @@ def test_to_excel_unicode_filename(self, ext, path):
                 f = open(filename, "wb")
             except UnicodeEncodeError:
                 pytest.skip("No unicode file names on this system")
-            else:
+            finally:
                 f.close()
 
             df = DataFrame(
@@ -872,15 +869,15 @@ def test_to_excel_unicode_filename(self, ext, path):
             )
             df.to_excel(filename, "test1", float_format="%.2f")
 
-            reader = ExcelFile(filename)
-            result = pd.read_excel(reader, sheet_name="test1", index_col=0)
+            with ExcelFile(filename) as reader:
+                result = pd.read_excel(reader, sheet_name="test1", index_col=0)
 
-            expected = DataFrame(
-                [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
-                index=["A", "B"],
-                columns=["X", "Y", "Z"],
-            )
-            tm.assert_frame_equal(result, expected)
+            expected = DataFrame(
+                [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]],
+                index=["A", "B"],
+                columns=["X", "Y", "Z"],
+            )
+            tm.assert_frame_equal(result, expected)
 
     # FIXME: dont leave commented-out
     # def test_to_excel_header_styling_xls(self, engine, ext):
@@ -1374,8 +1371,8 @@ def test_excelfile_fspath(self):
         with tm.ensure_clean("foo.xlsx") as path:
             df = DataFrame({"A": [1, 2]})
             df.to_excel(path)
-            xl = ExcelFile(path)
-            result = os.fspath(xl)
+            with ExcelFile(path) as xl:
+                result = os.fspath(xl)
             assert result == path
 
     def test_excelwriter_fspath(self):
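
The test changes above all follow the same rule: open ExcelWriter/ExcelFile in with blocks so their handles are released before tm.ensure_clean unlinks the file, which is what Windows requires. A condensed sketch of the pattern; the path, sheet name and frame are illustrative, and an Excel engine such as openpyxl is assumed to be installed:

import os

import pandas as pd
from pandas import ExcelFile, ExcelWriter

df = pd.DataFrame({"A": [1, 2]})
path = "example.xlsx"  # hypothetical path

# Handles are released at the end of each block, so the file can be
# deleted immediately afterwards.
with ExcelWriter(path) as writer:
    df.to_excel(writer, "test1")

with ExcelFile(path) as reader:
    result = pd.read_excel(reader, sheet_name="test1", index_col=0)

os.remove(path)  # succeeds because no handle is left open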

pandas/tests/io/formats/test_to_csv.py

+6 −6 lines changed

@@ -545,12 +545,12 @@ def test_to_csv_zip_arguments(self, compression, archive_name):
             df.to_csv(
                 path, compression={"method": compression, "archive_name": archive_name}
             )
-            zp = ZipFile(path)
-            expected_arcname = path if archive_name is None else archive_name
-            expected_arcname = os.path.basename(expected_arcname)
-            assert len(zp.filelist) == 1
-            archived_file = os.path.basename(zp.filelist[0].filename)
-            assert archived_file == expected_arcname
+            with ZipFile(path) as zp:
+                expected_arcname = path if archive_name is None else archive_name
+                expected_arcname = os.path.basename(expected_arcname)
+                assert len(zp.filelist) == 1
+                archived_file = os.path.basename(zp.filelist[0].filename)
+                assert archived_file == expected_arcname
 
     @pytest.mark.parametrize("df_new_type", ["Int64"])
     def test_to_csv_na_rep_long_string(self, df_new_type):
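
As the assertions above encode, when archive_name is None the single member inside the zip defaults to the basename of the target path. A quick sketch with a hypothetical path:

import os
from zipfile import ZipFile

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
path = "frame.zip"  # hypothetical path

df.to_csv(path, compression={"method": "zip"})

with ZipFile(path) as zp:
    # No archive_name was given, so the member is named after the zip itself.
    assert os.path.basename(zp.filelist[0].filename) == "frame.zip"

os.remove(path)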

pandas/tests/io/test_stata.py

+5 −4 lines changed

@@ -555,6 +555,7 @@ def test_invalid_timestamp(self, version):
             msg = "time_stamp should be datetime type"
             with pytest.raises(ValueError, match=msg):
                 original.to_stata(path, time_stamp=time_stamp, version=version)
+            assert not os.path.isfile(path)
 
     def test_numeric_column_names(self):
         original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
@@ -1921,10 +1922,10 @@ def test_compression_dict(method, file_ext):
         compression = {"method": method, "archive_name": archive_name}
         df.to_stata(path, compression=compression)
         if method == "zip" or file_ext == "zip":
-            zp = zipfile.ZipFile(path, "r")
-            assert len(zp.filelist) == 1
-            assert zp.filelist[0].filename == archive_name
-            fp = io.BytesIO(zp.read(zp.filelist[0]))
+            with zipfile.ZipFile(path, "r") as zp:
+                assert len(zp.filelist) == 1
+                assert zp.filelist[0].filename == archive_name
+                fp = io.BytesIO(zp.read(zp.filelist[0]))
         else:
             fp = path
         reread = read_stata(fp, index_col="index")
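
Outside the test harness, the compression-dict round trip exercised above looks roughly like the following; the output path and archive name are illustrative:

import io
import os
import zipfile

import pandas as pd
from pandas import read_stata

df = pd.DataFrame({"x": [1.0, 2.0]})
path = "frame.zip"  # hypothetical path

# Write the .dta into a zip archive with an explicit member name.
df.to_stata(path, compression={"method": "zip", "archive_name": "some_name.dta"})

# Read the single member back through a BytesIO so the archive is closed promptly.
with zipfile.ZipFile(path, "r") as zp:
    assert zp.filelist[0].filename == "some_name.dta"
    fp = io.BytesIO(zp.read(zp.filelist[0]))

reread = read_stata(fp, index_col="index")
os.remove(path)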
