From 519ae083b0cb6ebe22fbd194171b3e2fbf7e03f2 Mon Sep 17 00:00:00 2001
From: Mike Roberts <mike.roberts.2k10@googlemail.com>
Date: Sat, 15 Jan 2022 19:55:06 +0000
Subject: [PATCH] BUG: Fix file leaks in csv parsers (GH#45384)

---
 doc/source/whatsnew/v1.4.0.rst                |   1 +
 pandas/io/parsers/c_parser_wrapper.py         | 219 +++++++++---------
 pandas/io/parsers/python_parser.py            | 108 ++++-----
 pandas/tests/io/parser/test_parse_dates.py    |  18 ++
 .../io/parser/usecols/test_usecols_basic.py   |  18 ++
 5 files changed, 201 insertions(+), 163 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 64d8e36177051..5b587df47e635 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -874,6 +874,7 @@ I/O
 - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
 - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
 - Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
+- Bug in :func:`read_csv` allowing file handles to be leaked if an Exception is raised during parser initialisation (e.g. if the file does not pass ``usecols`` validation) (:issue:`45384`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 4088bf8d854bc..89ec847a93acc 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -65,135 +65,136 @@ def __init__(
         self._open_handles(src, kwds)
         assert self.handles is not None
 
-        # Have to pass int, would break tests using TextReader directly otherwise :(
-        kwds["on_bad_lines"] = self.on_bad_lines.value
-
-        for key in (
-            "storage_options",
-            "encoding",
-            "memory_map",
-            "compression",
-            "error_bad_lines",
-            "warn_bad_lines",
-        ):
-            kwds.pop(key, None)
-
-        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
         try:
-            self._reader = parsers.TextReader(self.handles.handle, **kwds)
-        except Exception:
-            self.handles.close()
-            raise
+            # Have to pass int, would break tests using TextReader directly otherwise :(
+            kwds["on_bad_lines"] = self.on_bad_lines.value
+
+            for key in (
+                "storage_options",
+                "encoding",
+                "memory_map",
+                "compression",
+                "error_bad_lines",
+                "warn_bad_lines",
+            ):
+                kwds.pop(key, None)
 
-        self.unnamed_cols = self._reader.unnamed_cols
+            kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
 
-        # error: Cannot determine type of 'names'
-        passed_names = self.names is None  # type: ignore[has-type]
+            self._reader = parsers.TextReader(self.handles.handle, **kwds)
+
+            self.unnamed_cols = self._reader.unnamed_cols
 
-        if self._reader.header is None:
-            self.names = None
-        else:
             # error: Cannot determine type of 'names'
-            # error: Cannot determine type of 'index_names'
-            (
-                self.names,  # type: ignore[has-type]
-                self.index_names,
-                self.col_names,
-                passed_names,
-            ) = self._extract_multi_indexer_columns(
-                self._reader.header,
-                self.index_names,  # type: ignore[has-type]
-                passed_names,
-            )
+            passed_names = self.names is None  # type: ignore[has-type]
 
-        # error: Cannot determine type of 'names'
-        if self.names is None:  # type: ignore[has-type]
-            if self.prefix:
-                # error: Cannot determine type of 'names'
-                self.names = [  # type: ignore[has-type]
-                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
-                ]
+            if self._reader.header is None:
+                self.names = None
             else:
                 # error: Cannot determine type of 'names'
-                self.names = list(  # type: ignore[has-type]
-                    range(self._reader.table_width)
+                # error: Cannot determine type of 'index_names'
+                (
+                    self.names,  # type: ignore[has-type]
+                    self.index_names,
+                    self.col_names,
+                    passed_names,
+                ) = self._extract_multi_indexer_columns(
+                    self._reader.header,
+                    self.index_names,  # type: ignore[has-type]
+                    passed_names,
                 )
 
-        # gh-9755
-        #
-        # need to set orig_names here first
-        # so that proper indexing can be done
-        # with _set_noconvert_columns
-        #
-        # once names has been filtered, we will
-        # then set orig_names again to names
-        # error: Cannot determine type of 'names'
-        self.orig_names = self.names[:]  # type: ignore[has-type]
-
-        if self.usecols:
-            usecols = self._evaluate_usecols(self.usecols, self.orig_names)
-
-            # GH 14671
-            # assert for mypy, orig_names is List or None, None would error in issubset
-            assert self.orig_names is not None
-            if self.usecols_dtype == "string" and not set(usecols).issubset(
-                self.orig_names
-            ):
-                self._validate_usecols_names(usecols, self.orig_names)
-
             # error: Cannot determine type of 'names'
-            if len(self.names) > len(usecols):  # type: ignore[has-type]
-                # error: Cannot determine type of 'names'
-                self.names = [  # type: ignore[has-type]
-                    n
+            if self.names is None:  # type: ignore[has-type]
+                if self.prefix:
                     # error: Cannot determine type of 'names'
-                    for i, n in enumerate(self.names)  # type: ignore[has-type]
-                    if (i in usecols or n in usecols)
-                ]
-
+                    self.names = [  # type: ignore[has-type]
+                        f"{self.prefix}{i}" for i in range(self._reader.table_width)
+                    ]
+                else:
+                    # error: Cannot determine type of 'names'
+                    self.names = list(  # type: ignore[has-type]
+                        range(self._reader.table_width)
+                    )
+
+            # gh-9755
+            #
+            # need to set orig_names here first
+            # so that proper indexing can be done
+            # with _set_noconvert_columns
+            #
+            # once names has been filtered, we will
+            # then set orig_names again to names
             # error: Cannot determine type of 'names'
-            if len(self.names) < len(usecols):  # type: ignore[has-type]
-                # error: Cannot determine type of 'names'
-                self._validate_usecols_names(
-                    usecols,
-                    self.names,  # type: ignore[has-type]
-                )
-
-        # error: Cannot determine type of 'names'
-        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
-        self._set_noconvert_columns()
+            self.orig_names = self.names[:]  # type: ignore[has-type]
 
-        # error: Cannot determine type of 'names'
-        self.orig_names = self.names  # type: ignore[has-type]
+            if self.usecols:
+                usecols = self._evaluate_usecols(self.usecols, self.orig_names)
 
-        if not self._has_complex_date_col:
-            # error: Cannot determine type of 'index_col'
-            if self._reader.leading_cols == 0 and is_index_col(
-                self.index_col  # type: ignore[has-type]
-            ):
+                # GH 14671
+                # assert for mypy, orig_names is List/None, None would error in issubset
+                assert self.orig_names is not None
+                if self.usecols_dtype == "string" and not set(usecols).issubset(
+                    self.orig_names
+                ):
+                    self._validate_usecols_names(usecols, self.orig_names)
 
-                self._name_processed = True
-                (
-                    index_names,
-                    # error: Cannot determine type of 'names'
-                    self.names,  # type: ignore[has-type]
-                    self.index_col,
-                ) = self._clean_index_names(
+                # error: Cannot determine type of 'names'
+                if len(self.names) > len(usecols):  # type: ignore[has-type]
                     # error: Cannot determine type of 'names'
-                    self.names,  # type: ignore[has-type]
-                    # error: Cannot determine type of 'index_col'
-                    self.index_col,  # type: ignore[has-type]
-                    self.unnamed_cols,
-                )
+                    self.names = [  # type: ignore[has-type]
+                        n
+                        # error: Cannot determine type of 'names'
+                        for i, n in enumerate(self.names)  # type: ignore[has-type]
+                        if (i in usecols or n in usecols)
+                    ]
 
-                if self.index_names is None:
-                    self.index_names = index_names
+                # error: Cannot determine type of 'names'
+                if len(self.names) < len(usecols):  # type: ignore[has-type]
+                    # error: Cannot determine type of 'names'
+                    self._validate_usecols_names(
+                        usecols,
+                        self.names,  # type: ignore[has-type]
+                    )
 
-            if self._reader.header is None and not passed_names:
-                assert self.index_names is not None
-                self.index_names = [None] * len(self.index_names)
+            # error: Cannot determine type of 'names'
+            self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
+            self._set_noconvert_columns()
 
-        self._implicit_index = self._reader.leading_cols > 0
+            # error: Cannot determine type of 'names'
+            self.orig_names = self.names  # type: ignore[has-type]
+
+            if not self._has_complex_date_col:
+                # error: Cannot determine type of 'index_col'
+                if self._reader.leading_cols == 0 and is_index_col(
+                    self.index_col  # type: ignore[has-type]
+                ):
+
+                    self._name_processed = True
+                    (
+                        index_names,
+                        # error: Cannot determine type of 'names'
+                        self.names,  # type: ignore[has-type]
+                        self.index_col,
+                    ) = self._clean_index_names(
+                        # error: Cannot determine type of 'names'
+                        self.names,  # type: ignore[has-type]
+                        # error: Cannot determine type of 'index_col'
+                        self.index_col,  # type: ignore[has-type]
+                        self.unnamed_cols,
+                    )
+
+                    if self.index_names is None:
+                        self.index_names = index_names
+
+                if self._reader.header is None and not passed_names:
+                    assert self.index_names is not None
+                    self.index_names = [None] * len(self.index_names)
+
+            self._implicit_index = self._reader.leading_cols > 0
+        except Exception:
+            self.handles.close()
+            raise
 
     def close(self) -> None:
         super().close()
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 55ad6be3100e7..19b0ccd44d11e 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -113,72 +113,72 @@ def __init__(
                 self.close()
                 raise
 
-        # Get columns in two steps: infer from data, then
-        # infer column indices from self.usecols if it is specified.
-        self._col_indices: list[int] | None = None
-        columns: list[list[Scalar | None]]
         try:
+            # Get columns in two steps: infer from data, then
+            # infer column indices from self.usecols if it is specified.
+            self._col_indices: list[int] | None = None
+            columns: list[list[Scalar | None]]
             (
                 columns,
                 self.num_original_columns,
                 self.unnamed_cols,
             ) = self._infer_columns()
-        except (TypeError, ValueError):
-            self.close()
-            raise
 
-        # Now self.columns has the set of columns that we will process.
-        # The original set is stored in self.original_columns.
-        # error: Cannot determine type of 'index_names'
-        self.columns: list[Hashable]
-        (
-            self.columns,
-            self.index_names,
-            self.col_names,
-            _,
-        ) = self._extract_multi_indexer_columns(
-            columns,
-            self.index_names,  # type: ignore[has-type]
-        )
+            # Now self.columns has the set of columns that we will process.
+            # The original set is stored in self.original_columns.
+            # error: Cannot determine type of 'index_names'
+            self.columns: list[Hashable]
+            (
+                self.columns,
+                self.index_names,
+                self.col_names,
+                _,
+            ) = self._extract_multi_indexer_columns(
+                columns,
+                self.index_names,  # type: ignore[has-type]
+            )
 
-        # get popped off for index
-        self.orig_names: list[Hashable] = list(self.columns)
+            # get popped off for index
+            self.orig_names: list[Hashable] = list(self.columns)
 
-        # needs to be cleaned/refactored
-        # multiple date column thing turning into a real spaghetti factory
+            # needs to be cleaned/refactored
+            # multiple date column thing turning into a real spaghetti factory
 
-        if not self._has_complex_date_col:
-            (index_names, self.orig_names, self.columns) = self._get_index_name(
-                self.columns
-            )
-            self._name_processed = True
-            if self.index_names is None:
-                self.index_names = index_names
-
-        if self._col_indices is None:
-            self._col_indices = list(range(len(self.columns)))
-
-        self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
-        no_thousands_columns: set[int] | None = None
-        if self.parse_dates:
-            no_thousands_columns = self._set_noconvert_dtype_columns(
-                self._col_indices, self.columns
-            )
-        self._no_thousands_columns = no_thousands_columns
+            if not self._has_complex_date_col:
+                (index_names, self.orig_names, self.columns) = self._get_index_name(
+                    self.columns
+                )
+                self._name_processed = True
+                if self.index_names is None:
+                    self.index_names = index_names
+
+            if self._col_indices is None:
+                self._col_indices = list(range(len(self.columns)))
+
+            self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
+            no_thousands_columns: set[int] | None = None
+            if self.parse_dates:
+                no_thousands_columns = self._set_noconvert_dtype_columns(
+                    self._col_indices, self.columns
+                )
+            self._no_thousands_columns = no_thousands_columns
 
-        if len(self.decimal) != 1:
-            raise ValueError("Only length-1 decimal markers supported")
+            if len(self.decimal) != 1:
+                raise ValueError("Only length-1 decimal markers supported")
 
-        decimal = re.escape(self.decimal)
-        if self.thousands is None:
-            regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
-        else:
-            thousands = re.escape(self.thousands)
-            regex = (
-                fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
-                fr"([0-9]?(E|e)\-?[0-9]+)?$"
-            )
-        self.num = re.compile(regex)
+            decimal = re.escape(self.decimal)
+            if self.thousands is None:
+                regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
+            else:
+                thousands = re.escape(self.thousands)
+                regex = (
+                    fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
+                    fr"([0-9]?(E|e)\-?[0-9]+)?$"
+                )
+            self.num = re.compile(regex)
+        except Exception:
+            self.close()
+            raise
 
     def _make_reader(self, f) -> None:
         sep = self.delimiter
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 1dfd81366de72..d8f0390134770 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -22,6 +22,7 @@
 from pandas._libs.tslibs import parsing
 from pandas._libs.tslibs.parsing import parse_datetime_string
 from pandas.compat.pyarrow import pa_version_under6p0
+import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -1985,3 +1986,20 @@ def test_parse_dates_and_string_dtype(all_parsers):
     expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]})
     expected["a"] = expected["a"].astype("string")
     tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_error_doesnt_leak_files(all_parsers, datapath):
+    # GH#45384
+    parser = all_parsers
+    test2csv = datapath("io", "parser", "data", "test2.csv")
+    psutil = td.safe_import("psutil")
+    if not psutil:
+        return
+
+    proc = psutil.Process()
+    files_before = [(f.path, f.fd) for f in proc.open_files()]
+    try:
+        parser.read_csv(test2csv, parse_dates=["F"])  # "F" not in file
+    except ValueError:
+        files_after = [(f.path, f.fd) for f in proc.open_files()]
+        assert files_before == files_after, (files_before, files_after)
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
index f35caf38c847f..544ab7c73c578 100644
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     DataFrame,
     Index,
@@ -416,3 +418,19 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
     if names is None and parser.engine == "python":
         expected = DataFrame({"a": [1]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_error_doesnt_leak_files(all_parsers, datapath):
+    # GH#45384
+    test2csv = datapath("io", "parser", "data", "test2.csv")
+    psutil = td.safe_import("psutil")
+    if not psutil:
+        return
+
+    proc = psutil.Process()
+    files_before = [(f.path, f.fd) for f in proc.open_files()]
+    try:
+        all_parsers.read_csv(test2csv, usecols=["F"])  # "F" not in file
+    except ValueError:
+        files_after = [(f.path, f.fd) for f in proc.open_files()]
+        assert files_before == files_after, (files_before, files_after)