From 519ae083b0cb6ebe22fbd194171b3e2fbf7e03f2 Mon Sep 17 00:00:00 2001 From: Mike Roberts Date: Sat, 15 Jan 2022 19:55:06 +0000 Subject: [PATCH] BUG: Fix file leaks in csv parsers (GH#45384) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/c_parser_wrapper.py | 219 +++++++++--------- pandas/io/parsers/python_parser.py | 108 ++++----- pandas/tests/io/parser/test_parse_dates.py | 18 ++ .../io/parser/usecols/test_usecols_basic.py | 18 ++ 5 files changed, 201 insertions(+), 163 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 64d8e36177051..5b587df47e635 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -874,6 +874,7 @@ I/O - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) - Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`) +- Bug in :func:`read_csv` allowing file handles to be leaked if an Exception is raised during parser initialisation (e.g. if the file does not pass ``usecols`` validation) (:issue:`45384`) Period ^^^^^^ diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 4088bf8d854bc..89ec847a93acc 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -65,135 +65,136 @@ def __init__( self._open_handles(src, kwds) assert self.handles is not None - # Have to pass int, would break tests using TextReader directly otherwise :( - kwds["on_bad_lines"] = self.on_bad_lines.value - - for key in ( - "storage_options", - "encoding", - "memory_map", - "compression", - "error_bad_lines", - "warn_bad_lines", - ): - kwds.pop(key, None) - - kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) try: - self._reader = parsers.TextReader(self.handles.handle, **kwds) - except Exception: - self.handles.close() - raise + # Have to pass int, would break tests using TextReader directly otherwise :( + kwds["on_bad_lines"] = self.on_bad_lines.value + + for key in ( + "storage_options", + "encoding", + "memory_map", + "compression", + "error_bad_lines", + "warn_bad_lines", + ): + kwds.pop(key, None) - self.unnamed_cols = self._reader.unnamed_cols + kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) - # error: Cannot determine type of 'names' - passed_names = self.names is None # type: ignore[has-type] + self._reader = parsers.TextReader(self.handles.handle, **kwds) + + self.unnamed_cols = self._reader.unnamed_cols - if self._reader.header is None: - self.names = None - else: # error: Cannot determine type of 'names' - # error: Cannot determine type of 'index_names' - ( - self.names, # type: ignore[has-type] - self.index_names, - self.col_names, - passed_names, - ) = self._extract_multi_indexer_columns( - self._reader.header, - self.index_names, # type: ignore[has-type] - passed_names, - ) + passed_names = self.names is None # type: ignore[has-type] - # error: Cannot determine type of 'names' - if self.names is None: # type: ignore[has-type] - if self.prefix: - # error: Cannot determine type of 'names' - self.names = [ # type: ignore[has-type] - f"{self.prefix}{i}" for i in range(self._reader.table_width) - ] + if self._reader.header is None: + self.names = None else: # error: Cannot determine type of 'names' - self.names = list( # type: ignore[has-type] - range(self._reader.table_width) + # error: Cannot determine type of 'index_names' + ( + self.names, # type: ignore[has-type] + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( + self._reader.header, + self.index_names, # type: ignore[has-type] + passed_names, ) - # gh-9755 - # - # need to set orig_names here first - # so that proper indexing can be done - # with _set_noconvert_columns - # - # once names has been filtered, we will - # then set orig_names again to names - # error: Cannot determine type of 'names' - self.orig_names = self.names[:] # type: ignore[has-type] - - if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) - - # GH 14671 - # assert for mypy, orig_names is List or None, None would error in issubset - assert self.orig_names is not None - if self.usecols_dtype == "string" and not set(usecols).issubset( - self.orig_names - ): - self._validate_usecols_names(usecols, self.orig_names) - # error: Cannot determine type of 'names' - if len(self.names) > len(usecols): # type: ignore[has-type] - # error: Cannot determine type of 'names' - self.names = [ # type: ignore[has-type] - n + if self.names is None: # type: ignore[has-type] + if self.prefix: # error: Cannot determine type of 'names' - for i, n in enumerate(self.names) # type: ignore[has-type] - if (i in usecols or n in usecols) - ] - + self.names = [ # type: ignore[has-type] + f"{self.prefix}{i}" for i in range(self._reader.table_width) + ] + else: + # error: Cannot determine type of 'names' + self.names = list( # type: ignore[has-type] + range(self._reader.table_width) + ) + + # gh-9755 + # + # need to set orig_names here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names # error: Cannot determine type of 'names' - if len(self.names) < len(usecols): # type: ignore[has-type] - # error: Cannot determine type of 'names' - self._validate_usecols_names( - usecols, - self.names, # type: ignore[has-type] - ) - - # error: Cannot determine type of 'names' - self._validate_parse_dates_presence(self.names) # type: ignore[has-type] - self._set_noconvert_columns() + self.orig_names = self.names[:] # type: ignore[has-type] - # error: Cannot determine type of 'names' - self.orig_names = self.names # type: ignore[has-type] + if self.usecols: + usecols = self._evaluate_usecols(self.usecols, self.orig_names) - if not self._has_complex_date_col: - # error: Cannot determine type of 'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): + # GH 14671 + # assert for mypy, orig_names is List/None, None would error in issubset + assert self.orig_names is not None + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): + self._validate_usecols_names(usecols, self.orig_names) - self._name_processed = True - ( - index_names, - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - self.index_col, - ) = self._clean_index_names( + # error: Cannot determine type of 'names' + if len(self.names) > len(usecols): # type: ignore[has-type] # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, # type: ignore[has-type] - self.unnamed_cols, - ) + self.names = [ # type: ignore[has-type] + n + # error: Cannot determine type of 'names' + for i, n in enumerate(self.names) # type: ignore[has-type] + if (i in usecols or n in usecols) + ] - if self.index_names is None: - self.index_names = index_names + # error: Cannot determine type of 'names' + if len(self.names) < len(usecols): # type: ignore[has-type] + # error: Cannot determine type of 'names' + self._validate_usecols_names( + usecols, + self.names, # type: ignore[has-type] + ) - if self._reader.header is None and not passed_names: - assert self.index_names is not None - self.index_names = [None] * len(self.index_names) + # error: Cannot determine type of 'names' + self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + self._set_noconvert_columns() - self._implicit_index = self._reader.leading_cols > 0 + # error: Cannot determine type of 'names' + self.orig_names = self.names # type: ignore[has-type] + + if not self._has_complex_date_col: + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + + self._name_processed = True + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + self.unnamed_cols, + ) + + if self.index_names is None: + self.index_names = index_names + + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) + + self._implicit_index = self._reader.leading_cols > 0 + except Exception: + self.handles.close() + raise def close(self) -> None: super().close() diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 55ad6be3100e7..19b0ccd44d11e 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -113,72 +113,72 @@ def __init__( self.close() raise - # Get columns in two steps: infer from data, then - # infer column indices from self.usecols if it is specified. - self._col_indices: list[int] | None = None - columns: list[list[Scalar | None]] try: + # Get columns in two steps: infer from data, then + # infer column indices from self.usecols if it is specified. + self._col_indices: list[int] | None = None + columns: list[list[Scalar | None]] ( columns, self.num_original_columns, self.unnamed_cols, ) = self._infer_columns() - except (TypeError, ValueError): - self.close() - raise - # Now self.columns has the set of columns that we will process. - # The original set is stored in self.original_columns. - # error: Cannot determine type of 'index_names' - self.columns: list[Hashable] - ( - self.columns, - self.index_names, - self.col_names, - _, - ) = self._extract_multi_indexer_columns( - columns, - self.index_names, # type: ignore[has-type] - ) + # Now self.columns has the set of columns that we will process. + # The original set is stored in self.original_columns. + # error: Cannot determine type of 'index_names' + self.columns: list[Hashable] + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( + columns, + self.index_names, # type: ignore[has-type] + ) - # get popped off for index - self.orig_names: list[Hashable] = list(self.columns) + # get popped off for index + self.orig_names: list[Hashable] = list(self.columns) - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory + # needs to be cleaned/refactored + # multiple date column thing turning into a real spaghetti factory - if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name( - self.columns - ) - self._name_processed = True - if self.index_names is None: - self.index_names = index_names - - if self._col_indices is None: - self._col_indices = list(range(len(self.columns))) - - self._parse_date_cols = self._validate_parse_dates_presence(self.columns) - no_thousands_columns: set[int] | None = None - if self.parse_dates: - no_thousands_columns = self._set_noconvert_dtype_columns( - self._col_indices, self.columns - ) - self._no_thousands_columns = no_thousands_columns + if not self._has_complex_date_col: + (index_names, self.orig_names, self.columns) = self._get_index_name( + self.columns + ) + self._name_processed = True + if self.index_names is None: + self.index_names = index_names + + if self._col_indices is None: + self._col_indices = list(range(len(self.columns))) + + self._parse_date_cols = self._validate_parse_dates_presence(self.columns) + no_thousands_columns: set[int] | None = None + if self.parse_dates: + no_thousands_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.columns + ) + self._no_thousands_columns = no_thousands_columns - if len(self.decimal) != 1: - raise ValueError("Only length-1 decimal markers supported") + if len(self.decimal) != 1: + raise ValueError("Only length-1 decimal markers supported") - decimal = re.escape(self.decimal) - if self.thousands is None: - regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" - else: - thousands = re.escape(self.thousands) - regex = ( - fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" - fr"([0-9]?(E|e)\-?[0-9]+)?$" - ) - self.num = re.compile(regex) + decimal = re.escape(self.decimal) + if self.thousands is None: + regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" + else: + thousands = re.escape(self.thousands) + regex = ( + fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" + fr"([0-9]?(E|e)\-?[0-9]+)?$" + ) + self.num = re.compile(regex) + except Exception: + self.close() + raise def _make_reader(self, f) -> None: sep = self.delimiter diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 1dfd81366de72..d8f0390134770 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -22,6 +22,7 @@ from pandas._libs.tslibs import parsing from pandas._libs.tslibs.parsing import parse_datetime_string from pandas.compat.pyarrow import pa_version_under6p0 +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -1985,3 +1986,20 @@ def test_parse_dates_and_string_dtype(all_parsers): expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") tm.assert_frame_equal(result, expected) + + +def test_parse_dates_error_doesnt_leak_files(all_parsers, datapath): + # GH#45384 + parser = all_parsers + test2csv = datapath("io", "parser", "data", "test2.csv") + psutil = td.safe_import("psutil") + if not psutil: + return + + proc = psutil.Process() + files_before = [(f.path, f.fd) for f in proc.open_files()] + try: + parser.read_csv(test2csv, parse_dates=["F"]) # "F" not in file + except ValueError: + files_after = [(f.path, f.fd) for f in proc.open_files()] + assert files_before == files_after, (files_before, files_after) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index f35caf38c847f..544ab7c73c578 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -416,3 +418,19 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): if names is None and parser.engine == "python": expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_error_doesnt_leak_files(all_parsers, datapath): + # GH#45384 + test2csv = datapath("io", "parser", "data", "test2.csv") + psutil = td.safe_import("psutil") + if not psutil: + return + + proc = psutil.Process() + files_before = [(f.path, f.fd) for f in proc.open_files()] + try: + all_parsers.read_csv(test2csv, usecols=["F"]) # "F" not in file + except ValueError: + files_after = [(f.path, f.fd) for f in proc.open_files()] + assert files_before == files_after, (files_before, files_after)