Skip to content

Commit 2b1af8d

Browse files
committed
BUG: Fix file leaks in csv parsers (GH#45384)
1 parent 96f2f2a commit 2b1af8d

File tree

5 files changed

+201
-163
lines changed

5 files changed

+201
-163
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,7 @@ I/O
874874
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
875875
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
876876
- Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
877+
- Bug in :func:`read_csv` allowing file handles to be leaked if an Exception is raised during parser initialisation (e.g. if the file does not pass ``usecols`` validation) (:issue:`45384`)
877878

878879
Period
879880
^^^^^^

pandas/io/parsers/c_parser_wrapper.py

+110-109
Original file line numberDiff line numberDiff line change
@@ -65,135 +65,136 @@ def __init__(
6565
self._open_handles(src, kwds)
6666
assert self.handles is not None
6767

68-
# Have to pass int, would break tests using TextReader directly otherwise :(
69-
kwds["on_bad_lines"] = self.on_bad_lines.value
70-
71-
for key in (
72-
"storage_options",
73-
"encoding",
74-
"memory_map",
75-
"compression",
76-
"error_bad_lines",
77-
"warn_bad_lines",
78-
):
79-
kwds.pop(key, None)
80-
81-
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
8268
try:
83-
self._reader = parsers.TextReader(self.handles.handle, **kwds)
84-
except Exception:
85-
self.handles.close()
86-
raise
69+
# Have to pass int, would break tests using TextReader directly otherwise :(
70+
kwds["on_bad_lines"] = self.on_bad_lines.value
71+
72+
for key in (
73+
"storage_options",
74+
"encoding",
75+
"memory_map",
76+
"compression",
77+
"error_bad_lines",
78+
"warn_bad_lines",
79+
):
80+
kwds.pop(key, None)
8781

88-
self.unnamed_cols = self._reader.unnamed_cols
82+
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
8983

90-
# error: Cannot determine type of 'names'
91-
passed_names = self.names is None # type: ignore[has-type]
84+
self._reader = parsers.TextReader(self.handles.handle, **kwds)
85+
86+
self.unnamed_cols = self._reader.unnamed_cols
9287

93-
if self._reader.header is None:
94-
self.names = None
95-
else:
9688
# error: Cannot determine type of 'names'
97-
# error: Cannot determine type of 'index_names'
98-
(
99-
self.names, # type: ignore[has-type]
100-
self.index_names,
101-
self.col_names,
102-
passed_names,
103-
) = self._extract_multi_indexer_columns(
104-
self._reader.header,
105-
self.index_names, # type: ignore[has-type]
106-
passed_names,
107-
)
89+
passed_names = self.names is None # type: ignore[has-type]
10890

109-
# error: Cannot determine type of 'names'
110-
if self.names is None: # type: ignore[has-type]
111-
if self.prefix:
112-
# error: Cannot determine type of 'names'
113-
self.names = [ # type: ignore[has-type]
114-
f"{self.prefix}{i}" for i in range(self._reader.table_width)
115-
]
91+
if self._reader.header is None:
92+
self.names = None
11693
else:
11794
# error: Cannot determine type of 'names'
118-
self.names = list( # type: ignore[has-type]
119-
range(self._reader.table_width)
95+
# error: Cannot determine type of 'index_names'
96+
(
97+
self.names, # type: ignore[has-type]
98+
self.index_names,
99+
self.col_names,
100+
passed_names,
101+
) = self._extract_multi_indexer_columns(
102+
self._reader.header,
103+
self.index_names, # type: ignore[has-type]
104+
passed_names,
120105
)
121106

122-
# gh-9755
123-
#
124-
# need to set orig_names here first
125-
# so that proper indexing can be done
126-
# with _set_noconvert_columns
127-
#
128-
# once names has been filtered, we will
129-
# then set orig_names again to names
130-
# error: Cannot determine type of 'names'
131-
self.orig_names = self.names[:] # type: ignore[has-type]
132-
133-
if self.usecols:
134-
usecols = self._evaluate_usecols(self.usecols, self.orig_names)
135-
136-
# GH 14671
137-
# assert for mypy, orig_names is List or None, None would error in issubset
138-
assert self.orig_names is not None
139-
if self.usecols_dtype == "string" and not set(usecols).issubset(
140-
self.orig_names
141-
):
142-
self._validate_usecols_names(usecols, self.orig_names)
143-
144107
# error: Cannot determine type of 'names'
145-
if len(self.names) > len(usecols): # type: ignore[has-type]
146-
# error: Cannot determine type of 'names'
147-
self.names = [ # type: ignore[has-type]
148-
n
108+
if self.names is None: # type: ignore[has-type]
109+
if self.prefix:
149110
# error: Cannot determine type of 'names'
150-
for i, n in enumerate(self.names) # type: ignore[has-type]
151-
if (i in usecols or n in usecols)
152-
]
153-
111+
self.names = [ # type: ignore[has-type]
112+
f"{self.prefix}{i}" for i in range(self._reader.table_width)
113+
]
114+
else:
115+
# error: Cannot determine type of 'names'
116+
self.names = list( # type: ignore[has-type]
117+
range(self._reader.table_width)
118+
)
119+
120+
# gh-9755
121+
#
122+
# need to set orig_names here first
123+
# so that proper indexing can be done
124+
# with _set_noconvert_columns
125+
#
126+
# once names has been filtered, we will
127+
# then set orig_names again to names
154128
# error: Cannot determine type of 'names'
155-
if len(self.names) < len(usecols): # type: ignore[has-type]
156-
# error: Cannot determine type of 'names'
157-
self._validate_usecols_names(
158-
usecols,
159-
self.names, # type: ignore[has-type]
160-
)
161-
162-
# error: Cannot determine type of 'names'
163-
self._validate_parse_dates_presence(self.names) # type: ignore[has-type]
164-
self._set_noconvert_columns()
129+
self.orig_names = self.names[:] # type: ignore[has-type]
165130

166-
# error: Cannot determine type of 'names'
167-
self.orig_names = self.names # type: ignore[has-type]
131+
if self.usecols:
132+
usecols = self._evaluate_usecols(self.usecols, self.orig_names)
168133

169-
if not self._has_complex_date_col:
170-
# error: Cannot determine type of 'index_col'
171-
if self._reader.leading_cols == 0 and is_index_col(
172-
self.index_col # type: ignore[has-type]
173-
):
134+
# GH 14671
135+
# assert for mypy, orig_names is List or None, None would error in issubset
136+
assert self.orig_names is not None
137+
if self.usecols_dtype == "string" and not set(usecols).issubset(
138+
self.orig_names
139+
):
140+
self._validate_usecols_names(usecols, self.orig_names)
174141

175-
self._name_processed = True
176-
(
177-
index_names,
178-
# error: Cannot determine type of 'names'
179-
self.names, # type: ignore[has-type]
180-
self.index_col,
181-
) = self._clean_index_names(
142+
# error: Cannot determine type of 'names'
143+
if len(self.names) > len(usecols): # type: ignore[has-type]
182144
# error: Cannot determine type of 'names'
183-
self.names, # type: ignore[has-type]
184-
# error: Cannot determine type of 'index_col'
185-
self.index_col, # type: ignore[has-type]
186-
self.unnamed_cols,
187-
)
145+
self.names = [ # type: ignore[has-type]
146+
n
147+
# error: Cannot determine type of 'names'
148+
for i, n in enumerate(self.names) # type: ignore[has-type]
149+
if (i in usecols or n in usecols)
150+
]
188151

189-
if self.index_names is None:
190-
self.index_names = index_names
152+
# error: Cannot determine type of 'names'
153+
if len(self.names) < len(usecols): # type: ignore[has-type]
154+
# error: Cannot determine type of 'names'
155+
self._validate_usecols_names(
156+
usecols,
157+
self.names, # type: ignore[has-type]
158+
)
191159

192-
if self._reader.header is None and not passed_names:
193-
assert self.index_names is not None
194-
self.index_names = [None] * len(self.index_names)
160+
# error: Cannot determine type of 'names'
161+
self._validate_parse_dates_presence(self.names) # type: ignore[has-type]
162+
self._set_noconvert_columns()
195163

196-
self._implicit_index = self._reader.leading_cols > 0
164+
# error: Cannot determine type of 'names'
165+
self.orig_names = self.names # type: ignore[has-type]
166+
167+
if not self._has_complex_date_col:
168+
# error: Cannot determine type of 'index_col'
169+
if self._reader.leading_cols == 0 and is_index_col(
170+
self.index_col # type: ignore[has-type]
171+
):
172+
173+
self._name_processed = True
174+
(
175+
index_names,
176+
# error: Cannot determine type of 'names'
177+
self.names, # type: ignore[has-type]
178+
self.index_col,
179+
) = self._clean_index_names(
180+
# error: Cannot determine type of 'names'
181+
self.names, # type: ignore[has-type]
182+
# error: Cannot determine type of 'index_col'
183+
self.index_col, # type: ignore[has-type]
184+
self.unnamed_cols,
185+
)
186+
187+
if self.index_names is None:
188+
self.index_names = index_names
189+
190+
if self._reader.header is None and not passed_names:
191+
assert self.index_names is not None
192+
self.index_names = [None] * len(self.index_names)
193+
194+
self._implicit_index = self._reader.leading_cols > 0
195+
except Exception:
196+
self.handles.close()
197+
raise
197198

198199
def close(self) -> None:
199200
super().close()

pandas/io/parsers/python_parser.py

+54-54
Original file line numberDiff line numberDiff line change
@@ -113,72 +113,72 @@ def __init__(
113113
self.close()
114114
raise
115115

116-
# Get columns in two steps: infer from data, then
117-
# infer column indices from self.usecols if it is specified.
118-
self._col_indices: list[int] | None = None
119-
columns: list[list[Scalar | None]]
120116
try:
117+
# Get columns in two steps: infer from data, then
118+
# infer column indices from self.usecols if it is specified.
119+
self._col_indices: list[int] | None = None
120+
columns: list[list[Scalar | None]]
121121
(
122122
columns,
123123
self.num_original_columns,
124124
self.unnamed_cols,
125125
) = self._infer_columns()
126-
except (TypeError, ValueError):
127-
self.close()
128-
raise
129126

130-
# Now self.columns has the set of columns that we will process.
131-
# The original set is stored in self.original_columns.
132-
# error: Cannot determine type of 'index_names'
133-
self.columns: list[Hashable]
134-
(
135-
self.columns,
136-
self.index_names,
137-
self.col_names,
138-
_,
139-
) = self._extract_multi_indexer_columns(
140-
columns,
141-
self.index_names, # type: ignore[has-type]
142-
)
127+
# Now self.columns has the set of columns that we will process.
128+
# The original set is stored in self.original_columns.
129+
# error: Cannot determine type of 'index_names'
130+
self.columns: list[Hashable]
131+
(
132+
self.columns,
133+
self.index_names,
134+
self.col_names,
135+
_,
136+
) = self._extract_multi_indexer_columns(
137+
columns,
138+
self.index_names, # type: ignore[has-type]
139+
)
143140

144-
# get popped off for index
145-
self.orig_names: list[Hashable] = list(self.columns)
141+
# get popped off for index
142+
self.orig_names: list[Hashable] = list(self.columns)
146143

147-
# needs to be cleaned/refactored
148-
# multiple date column thing turning into a real spaghetti factory
144+
# needs to be cleaned/refactored
145+
# multiple date column thing turning into a real spaghetti factory
149146

150-
if not self._has_complex_date_col:
151-
(index_names, self.orig_names, self.columns) = self._get_index_name(
152-
self.columns
153-
)
154-
self._name_processed = True
155-
if self.index_names is None:
156-
self.index_names = index_names
157-
158-
if self._col_indices is None:
159-
self._col_indices = list(range(len(self.columns)))
160-
161-
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
162-
no_thousands_columns: set[int] | None = None
163-
if self.parse_dates:
164-
no_thousands_columns = self._set_noconvert_dtype_columns(
165-
self._col_indices, self.columns
166-
)
167-
self._no_thousands_columns = no_thousands_columns
147+
if not self._has_complex_date_col:
148+
(index_names, self.orig_names, self.columns) = self._get_index_name(
149+
self.columns
150+
)
151+
self._name_processed = True
152+
if self.index_names is None:
153+
self.index_names = index_names
154+
155+
if self._col_indices is None:
156+
self._col_indices = list(range(len(self.columns)))
157+
158+
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
159+
no_thousands_columns: set[int] | None = None
160+
if self.parse_dates:
161+
no_thousands_columns = self._set_noconvert_dtype_columns(
162+
self._col_indices, self.columns
163+
)
164+
self._no_thousands_columns = no_thousands_columns
168165

169-
if len(self.decimal) != 1:
170-
raise ValueError("Only length-1 decimal markers supported")
166+
if len(self.decimal) != 1:
167+
raise ValueError("Only length-1 decimal markers supported")
171168

172-
decimal = re.escape(self.decimal)
173-
if self.thousands is None:
174-
regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
175-
else:
176-
thousands = re.escape(self.thousands)
177-
regex = (
178-
fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
179-
fr"([0-9]?(E|e)\-?[0-9]+)?$"
180-
)
181-
self.num = re.compile(regex)
169+
decimal = re.escape(self.decimal)
170+
if self.thousands is None:
171+
regex = fr"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
172+
else:
173+
thousands = re.escape(self.thousands)
174+
regex = (
175+
fr"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
176+
fr"([0-9]?(E|e)\-?[0-9]+)?$"
177+
)
178+
self.num = re.compile(regex)
179+
except Exception:
180+
self.close()
181+
raise
182182

183183
def _make_reader(self, f) -> None:
184184
sep = self.delimiter

0 commit comments

Comments (0)