BUG: Fix file leaks in csv parsers (GH#45384) #45388

Status: Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
```diff
@@ -874,6 +874,7 @@ I/O
 - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
 - Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
 - Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
+- Bug in :func:`read_csv` allowing file handles to be leaked if an Exception is raised during parser initialisation (e.g. if the file does not pass ``usecols`` validation) (:issue:`45384`)
 
 Period
 ^^^^^^
```
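For context, a minimal sketch of the scenario the new entry describes. The file name and contents are illustrative, not taken from the PR; the point is that ``usecols`` validation fails only after the handle has already been opened.

```python
# Hypothetical repro: "c" is not a real column, so usecols validation
# raises ValueError during parser initialisation. Before this fix, the
# handle that read_csv had already opened could be left open.
import pandas as pd

with open("data.csv", "w") as f:
    f.write("a,b\n1,2\n")

try:
    pd.read_csv("data.csv", usecols=["a", "c"])
except ValueError:
    pass  # the parser should have closed its file handle by now
```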
219 changes: 110 additions & 109 deletions pandas/io/parsers/c_parser_wrapper.py
```diff
@@ -65,135 +65,136 @@ def __init__(
         self._open_handles(src, kwds)
         assert self.handles is not None
 
+        try:
             # Have to pass int, would break tests using TextReader directly otherwise :(
             kwds["on_bad_lines"] = self.on_bad_lines.value
 
             for key in (
                 "storage_options",
                 "encoding",
                 "memory_map",
                 "compression",
                 "error_bad_lines",
                 "warn_bad_lines",
             ):
                 kwds.pop(key, None)
 
             kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
-        try:
             self._reader = parsers.TextReader(self.handles.handle, **kwds)
-        except Exception:
-            self.handles.close()
-            raise
 
             self.unnamed_cols = self._reader.unnamed_cols
 
             # error: Cannot determine type of 'names'
             passed_names = self.names is None  # type: ignore[has-type]
 
             if self._reader.header is None:
                 self.names = None
             else:
                 # error: Cannot determine type of 'names'
                 # error: Cannot determine type of 'index_names'
                 (
                     self.names,  # type: ignore[has-type]
                     self.index_names,
                     self.col_names,
                     passed_names,
                 ) = self._extract_multi_indexer_columns(
                     self._reader.header,
                     self.index_names,  # type: ignore[has-type]
                     passed_names,
                 )
 
             # error: Cannot determine type of 'names'
             if self.names is None:  # type: ignore[has-type]
                 if self.prefix:
                     # error: Cannot determine type of 'names'
                     self.names = [  # type: ignore[has-type]
                         f"{self.prefix}{i}" for i in range(self._reader.table_width)
                     ]
                 else:
                     # error: Cannot determine type of 'names'
                     self.names = list(  # type: ignore[has-type]
                         range(self._reader.table_width)
                     )
 
             # gh-9755
             #
             # need to set orig_names here first
             # so that proper indexing can be done
             # with _set_noconvert_columns
             #
             # once names has been filtered, we will
             # then set orig_names again to names
             # error: Cannot determine type of 'names'
             self.orig_names = self.names[:]  # type: ignore[has-type]
 
             if self.usecols:
                 usecols = self._evaluate_usecols(self.usecols, self.orig_names)
 
                 # GH 14671
                 # assert for mypy, orig_names is List/None, None would error in issubset
                 assert self.orig_names is not None
                 if self.usecols_dtype == "string" and not set(usecols).issubset(
                     self.orig_names
                 ):
                     self._validate_usecols_names(usecols, self.orig_names)
 
                 # error: Cannot determine type of 'names'
                 if len(self.names) > len(usecols):  # type: ignore[has-type]
                     # error: Cannot determine type of 'names'
                     self.names = [  # type: ignore[has-type]
                         n
                         # error: Cannot determine type of 'names'
                         for i, n in enumerate(self.names)  # type: ignore[has-type]
                         if (i in usecols or n in usecols)
                     ]
 
                 # error: Cannot determine type of 'names'
                 if len(self.names) < len(usecols):  # type: ignore[has-type]
                     # error: Cannot determine type of 'names'
                     self._validate_usecols_names(
                         usecols,
                         self.names,  # type: ignore[has-type]
                     )
 
             # error: Cannot determine type of 'names'
             self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
             self._set_noconvert_columns()
 
             # error: Cannot determine type of 'names'
             self.orig_names = self.names  # type: ignore[has-type]
 
             if not self._has_complex_date_col:
                 # error: Cannot determine type of 'index_col'
                 if self._reader.leading_cols == 0 and is_index_col(
                     self.index_col  # type: ignore[has-type]
                 ):
 
                     self._name_processed = True
                     (
                         index_names,
                         # error: Cannot determine type of 'names'
                         self.names,  # type: ignore[has-type]
                         self.index_col,
                     ) = self._clean_index_names(
                         # error: Cannot determine type of 'names'
                         self.names,  # type: ignore[has-type]
                         # error: Cannot determine type of 'index_col'
                         self.index_col,  # type: ignore[has-type]
                         self.unnamed_cols,
                     )
 
                     if self.index_names is None:
                         self.index_names = index_names
 
             if self._reader.header is None and not passed_names:
                 assert self.index_names is not None
                 self.index_names = [None] * len(self.index_names)
 
             self._implicit_index = self._reader.leading_cols > 0
+        except Exception:
+            self.handles.close()
+            raise
 
     def close(self) -> None:
         super().close()
```
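The shape of the fix is the same in both engines: open the handle first, then guard the rest of ``__init__`` so the handle is released before any exception propagates. A minimal sketch of that pattern with illustrative names (only the close-then-re-raise structure mirrors the diffs):

```python
# Generic close-on-failure guard, as applied in the hunks above and below.
class GuardedParser:
    def __init__(self, path: str) -> None:
        self.handle = open(path)  # resource acquired first
        try:
            self._configure()     # any later init step may raise
        except Exception:
            self.handle.close()   # release before propagating
            raise                 # preserve the original traceback

    def _configure(self) -> None:
        # stand-in for header/usecols/index-name processing
        raise ValueError("simulated init failure")
```

A bare ``raise`` re-raises the original exception with its traceback intact, so callers still see the real error, just without a leaked handle.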
108 changes: 54 additions & 54 deletions pandas/io/parsers/python_parser.py
```diff
@@ -113,72 +113,72 @@ def __init__(
             self.close()
             raise
 
+        try:
             # Get columns in two steps: infer from data, then
             # infer column indices from self.usecols if it is specified.
             self._col_indices: list[int] | None = None
             columns: list[list[Scalar | None]]
-        try:
             (
                 columns,
                 self.num_original_columns,
                 self.unnamed_cols,
             ) = self._infer_columns()
-        except (TypeError, ValueError):
-            self.close()
-            raise
 
             # Now self.columns has the set of columns that we will process.
             # The original set is stored in self.original_columns.
             # error: Cannot determine type of 'index_names'
             self.columns: list[Hashable]
             (
                 self.columns,
                 self.index_names,
                 self.col_names,
                 _,
             ) = self._extract_multi_indexer_columns(
                 columns,
                 self.index_names,  # type: ignore[has-type]
             )
 
             # get popped off for index
             self.orig_names: list[Hashable] = list(self.columns)
 
             # needs to be cleaned/refactored
             # multiple date column thing turning into a real spaghetti factory
 
             if not self._has_complex_date_col:
                 (index_names, self.orig_names, self.columns) = self._get_index_name(
                     self.columns
                 )
                 self._name_processed = True
                 if self.index_names is None:
                     self.index_names = index_names
 
             if self._col_indices is None:
                 self._col_indices = list(range(len(self.columns)))
 
             self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
             no_thousands_columns: set[int] | None = None
             if self.parse_dates:
                 no_thousands_columns = self._set_noconvert_dtype_columns(
                     self._col_indices, self.columns
                 )
             self._no_thousands_columns = no_thousands_columns
 
             if len(self.decimal) != 1:
                 raise ValueError("Only length-1 decimal markers supported")
 
             decimal = re.escape(self.decimal)
             if self.thousands is None:
                 regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
             else:
                 thousands = re.escape(self.thousands)
                 regex = (
                     fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
                     fr"([0-9]?(E|e)\-?[0-9]+)?$"
                 )
             self.num = re.compile(regex)
+        except Exception:
+            self.close()
+            raise
 
     def _make_reader(self, f) -> None:
         sep = self.delimiter
```
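The tail of this hunk builds the regular expression the python engine uses to decide whether a field looks numeric. A standalone check of that construction, assuming ``decimal="."`` and ``thousands=","`` (the sample strings are illustrative):

```python
import re

decimal = re.escape(".")
thousands = re.escape(",")
regex = (
    fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
    fr"([0-9]?(E|e)\-?[0-9]+)?$"
)
num = re.compile(regex)

assert num.match("1,234.5")    # grouped thousands plus a decimal part
assert num.match("-3.2e-5")    # signed scientific notation
assert not num.match("1,23a")  # trailing letter: not numeric
```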