Skip to content

Commit 6cc5584

Browse files
authored
REG/REF: close file handles engine-independently in read_csv (#45389)
1 parent 3743dbc commit 6cc5584

File tree

9 files changed

+118
-107
lines changed

9 files changed

+118
-107
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,7 @@ I/O
933933
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
934934
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
935935
- Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
936+
- Bug in :func:`read_csv` where an invalid value of ``usecols`` led to an un-closed file handle (:issue:`45384`)
936937

937938
Period
938939
^^^^^^

pandas/_typing.py

+3
Original file line numberDiff line numberDiff line change
@@ -292,3 +292,6 @@ def closed(self) -> bool:
292292

293293
# Windowing rank methods
294294
WindowingRankType = Literal["average", "min", "max"]
295+
296+
# read_csv engines
297+
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

pandas/io/parsers/arrow_parser_wrapper.py

+11-19
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
from __future__ import annotations
22

3-
from pandas._typing import (
4-
FilePath,
5-
ReadBuffer,
6-
)
3+
from pandas._typing import ReadBuffer
74
from pandas.compat._optional import import_optional_dependency
85

96
from pandas.core.dtypes.inference import is_integer
107

118
from pandas.core.frame import DataFrame
129

13-
from pandas.io.common import get_handle
1410
from pandas.io.parsers.base_parser import ParserBase
1511

1612

@@ -19,12 +15,11 @@ class ArrowParserWrapper(ParserBase):
1915
Wrapper for the pyarrow engine for read_csv()
2016
"""
2117

22-
def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
18+
def __init__(self, src: ReadBuffer[bytes], **kwds):
19+
super().__init__(kwds)
2320
self.kwds = kwds
2421
self.src = src
2522

26-
ParserBase.__init__(self, kwds)
27-
2823
self._parse_kwds()
2924

3025
def _parse_kwds(self):
@@ -151,15 +146,12 @@ def read(self) -> DataFrame:
151146
pyarrow_csv = import_optional_dependency("pyarrow.csv")
152147
self._get_pyarrow_options()
153148

154-
with get_handle(
155-
self.src, "rb", encoding=self.encoding, is_text=False
156-
) as handles:
157-
table = pyarrow_csv.read_csv(
158-
handles.handle,
159-
read_options=pyarrow_csv.ReadOptions(**self.read_options),
160-
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
161-
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
162-
)
149+
table = pyarrow_csv.read_csv(
150+
self.src,
151+
read_options=pyarrow_csv.ReadOptions(**self.read_options),
152+
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
153+
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
154+
)
163155

164-
frame = table.to_pandas()
165-
return self._finalize_output(frame)
156+
frame = table.to_pandas()
157+
return self._finalize_output(frame)

pandas/io/parsers/base_parser.py

+1-29
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from enum import Enum
88
import itertools
99
from typing import (
10-
Any,
1110
Callable,
1211
DefaultDict,
1312
Hashable,
@@ -32,8 +31,6 @@
3231
from pandas._typing import (
3332
ArrayLike,
3433
DtypeArg,
35-
FilePath,
36-
ReadCsvBuffer,
3734
)
3835
from pandas.errors import (
3936
ParserError,
@@ -71,10 +68,6 @@
7168
from pandas.core.series import Series
7269
from pandas.core.tools import datetimes as tools
7370

74-
from pandas.io.common import (
75-
IOHandles,
76-
get_handle,
77-
)
7871
from pandas.io.date_converters import generic_parser
7972

8073

@@ -176,30 +169,10 @@ def __init__(self, kwds):
176169

177170
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
178171

179-
self.handles: IOHandles[str] | None = None
180-
181172
# Fall back to ERROR so that a sketchy test (test_override_set_noconvert_columns) passes
182173
# Normally, this arg would get pre-processed earlier on
183174
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
184175

185-
def _open_handles(
186-
self,
187-
src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
188-
kwds: dict[str, Any],
189-
) -> None:
190-
"""
191-
Let the readers open IOHandles after they are done with their potential raises.
192-
"""
193-
self.handles = get_handle(
194-
src,
195-
"r",
196-
encoding=kwds.get("encoding", None),
197-
compression=kwds.get("compression", None),
198-
memory_map=kwds.get("memory_map", False),
199-
storage_options=kwds.get("storage_options", None),
200-
errors=kwds.get("encoding_errors", "strict"),
201-
)
202-
203176
def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
204177
"""
205178
Check if parse_dates are in columns.
@@ -262,8 +235,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
262235
]
263236

264237
def close(self):
265-
if self.handles is not None:
266-
self.handles.close()
238+
pass
267239

268240
@final
269241
@property

pandas/io/parsers/c_parser_wrapper.py

+4-17
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
ArrayLike,
1515
DtypeArg,
1616
DtypeObj,
17-
FilePath,
1817
ReadCsvBuffer,
1918
)
2019
from pandas.errors import DtypeWarning
@@ -43,12 +42,10 @@ class CParserWrapper(ParserBase):
4342
low_memory: bool
4443
_reader: parsers.TextReader
4544

46-
def __init__(
47-
self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
48-
):
45+
def __init__(self, src: ReadCsvBuffer[str], **kwds):
46+
super().__init__(kwds)
4947
self.kwds = kwds
5048
kwds = kwds.copy()
51-
ParserBase.__init__(self, kwds)
5249

5350
self.low_memory = kwds.pop("low_memory", False)
5451

@@ -61,10 +58,6 @@ def __init__(
6158
# GH20529, validate usecol arg before TextReader
6259
kwds["usecols"] = self.usecols
6360

64-
# open handles
65-
self._open_handles(src, kwds)
66-
assert self.handles is not None
67-
6861
# Have to pass int, would break tests using TextReader directly otherwise :(
6962
kwds["on_bad_lines"] = self.on_bad_lines.value
7063

@@ -79,11 +72,7 @@ def __init__(
7972
kwds.pop(key, None)
8073

8174
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
82-
try:
83-
self._reader = parsers.TextReader(self.handles.handle, **kwds)
84-
except Exception:
85-
self.handles.close()
86-
raise
75+
self._reader = parsers.TextReader(src, **kwds)
8776

8877
self.unnamed_cols = self._reader.unnamed_cols
8978

@@ -196,9 +185,7 @@ def __init__(
196185
self._implicit_index = self._reader.leading_cols > 0
197186

198187
def close(self) -> None:
199-
super().close()
200-
201-
# close additional handles opened by C parser
188+
# close handles opened by C parser
202189
try:
203190
self._reader.close()
204191
except ValueError:

pandas/io/parsers/python_parser.py

+10-25
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import pandas._libs.lib as lib
2626
from pandas._typing import (
2727
ArrayLike,
28-
FilePath,
2928
ReadCsvBuffer,
3029
Scalar,
3130
)
@@ -51,13 +50,11 @@
5150

5251

5352
class PythonParser(ParserBase):
54-
def __init__(
55-
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, **kwds
56-
):
53+
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
5754
"""
5855
Workhorse function for processing nested list into DataFrame
5956
"""
60-
ParserBase.__init__(self, kwds)
57+
super().__init__(kwds)
6158

6259
self.data: Iterator[str] | None = None
6360
self.buf: list = []
@@ -104,28 +101,18 @@ def __init__(
104101
# read_excel: f is a list
105102
self.data = cast(Iterator[str], f)
106103
else:
107-
self._open_handles(f, kwds)
108-
assert self.handles is not None
109-
assert hasattr(self.handles.handle, "readline")
110-
try:
111-
self._make_reader(self.handles.handle)
112-
except (csv.Error, UnicodeDecodeError):
113-
self.close()
114-
raise
104+
assert hasattr(f, "readline")
105+
self._make_reader(f)
115106

116107
# Get columns in two steps: infer from data, then
117108
# infer column indices from self.usecols if it is specified.
118109
self._col_indices: list[int] | None = None
119110
columns: list[list[Scalar | None]]
120-
try:
121-
(
122-
columns,
123-
self.num_original_columns,
124-
self.unnamed_cols,
125-
) = self._infer_columns()
126-
except (TypeError, ValueError):
127-
self.close()
128-
raise
111+
(
112+
columns,
113+
self.num_original_columns,
114+
self.unnamed_cols,
115+
) = self._infer_columns()
129116

130117
# Now self.columns has the set of columns that we will process.
131118
# The original set is stored in self.original_columns.
@@ -1259,9 +1246,7 @@ class FixedWidthFieldParser(PythonParser):
12591246
See PythonParser for details.
12601247
"""
12611248

1262-
def __init__(
1263-
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
1264-
) -> None:
1249+
def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
12651250
# Support iterators, convert to a list.
12661251
self.colspecs = kwds.pop("colspecs")
12671252
self.infer_nrows = kwds.pop("infer_nrows")

0 commit comments

Comments
 (0)