Skip to content

Backport PR #45389 on branch 1.4.x (REG/REF: close file handles engine-independently in read_csv) #45398

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -874,6 +874,7 @@ I/O
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
- Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
- Bug in :func:`read_csv` where an invalid value of ``usecols`` led to an unclosed file handle (:issue:`45384`)

Period
^^^^^^
Expand Down
3 changes: 3 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,6 @@ def closed(self) -> bool:

# Windowing rank methods
WindowingRankType = Literal["average", "min", "max"]

# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
30 changes: 11 additions & 19 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
from __future__ import annotations

from pandas._typing import (
FilePath,
ReadBuffer,
)
from pandas._typing import ReadBuffer
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.inference import is_integer

from pandas.core.frame import DataFrame

from pandas.io.common import get_handle
from pandas.io.parsers.base_parser import ParserBase


Expand All @@ -19,12 +15,11 @@ class ArrowParserWrapper(ParserBase):
Wrapper for the pyarrow engine for read_csv()
"""

def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
def __init__(self, src: ReadBuffer[bytes], **kwds):
super().__init__(kwds)
self.kwds = kwds
self.src = src

ParserBase.__init__(self, kwds)

self._parse_kwds()

def _parse_kwds(self):
Expand Down Expand Up @@ -151,15 +146,12 @@ def read(self) -> DataFrame:
pyarrow_csv = import_optional_dependency("pyarrow.csv")
self._get_pyarrow_options()

with get_handle(
self.src, "rb", encoding=self.encoding, is_text=False
) as handles:
table = pyarrow_csv.read_csv(
handles.handle,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
)
table = pyarrow_csv.read_csv(
self.src,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
)

frame = table.to_pandas()
return self._finalize_output(frame)
frame = table.to_pandas()
return self._finalize_output(frame)
30 changes: 1 addition & 29 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from enum import Enum
import itertools
from typing import (
Any,
Callable,
DefaultDict,
Hashable,
Expand All @@ -32,8 +31,6 @@
from pandas._typing import (
ArrayLike,
DtypeArg,
FilePath,
ReadCsvBuffer,
)
from pandas.errors import (
ParserError,
Expand Down Expand Up @@ -71,10 +68,6 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import (
IOHandles,
get_handle,
)
from pandas.io.date_converters import generic_parser


Expand Down Expand Up @@ -176,30 +169,10 @@ def __init__(self, kwds):

self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

self.handles: IOHandles[str] | None = None

# Fall back to error to pass a sketchy test (test_override_set_noconvert_columns)
# Normally, this arg would get pre-processed earlier on
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

def _open_handles(
self,
src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
kwds: dict[str, Any],
) -> None:
"""
Let the readers open IOHandles after they are done with their potential raises.
"""
self.handles = get_handle(
src,
"r",
encoding=kwds.get("encoding", None),
compression=kwds.get("compression", None),
memory_map=kwds.get("memory_map", False),
storage_options=kwds.get("storage_options", None),
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
"""
Check if parse_dates are in columns.
Expand Down Expand Up @@ -262,8 +235,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
]

def close(self):
if self.handles is not None:
self.handles.close()
pass

@final
@property
Expand Down
21 changes: 4 additions & 17 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
ArrayLike,
DtypeArg,
DtypeObj,
FilePath,
ReadCsvBuffer,
)
from pandas.errors import DtypeWarning
Expand Down Expand Up @@ -43,12 +42,10 @@ class CParserWrapper(ParserBase):
low_memory: bool
_reader: parsers.TextReader

def __init__(
self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
):
def __init__(self, src: ReadCsvBuffer[str], **kwds):
super().__init__(kwds)
self.kwds = kwds
kwds = kwds.copy()
ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)

Expand All @@ -61,10 +58,6 @@ def __init__(
# GH20529, validate usecol arg before TextReader
kwds["usecols"] = self.usecols

# open handles
self._open_handles(src, kwds)
assert self.handles is not None

# Have to pass int, would break tests using TextReader directly otherwise :(
kwds["on_bad_lines"] = self.on_bad_lines.value

Expand All @@ -79,11 +72,7 @@ def __init__(
kwds.pop(key, None)

kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
try:
self._reader = parsers.TextReader(self.handles.handle, **kwds)
except Exception:
self.handles.close()
raise
self._reader = parsers.TextReader(src, **kwds)

self.unnamed_cols = self._reader.unnamed_cols

Expand Down Expand Up @@ -196,9 +185,7 @@ def __init__(
self._implicit_index = self._reader.leading_cols > 0

def close(self) -> None:
super().close()

# close additional handles opened by C parser
# close handles opened by C parser
try:
self._reader.close()
except ValueError:
Expand Down
35 changes: 10 additions & 25 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import pandas._libs.lib as lib
from pandas._typing import (
ArrayLike,
FilePath,
ReadCsvBuffer,
Scalar,
)
Expand All @@ -51,13 +50,11 @@


class PythonParser(ParserBase):
def __init__(
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, **kwds
):
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
"""
Workhorse function for processing nested list into DataFrame
"""
ParserBase.__init__(self, kwds)
super().__init__(kwds)

self.data: Iterator[str] | None = None
self.buf: list = []
Expand Down Expand Up @@ -104,28 +101,18 @@ def __init__(
# read_excel: f is a list
self.data = cast(Iterator[str], f)
else:
self._open_handles(f, kwds)
assert self.handles is not None
assert hasattr(self.handles.handle, "readline")
try:
self._make_reader(self.handles.handle)
except (csv.Error, UnicodeDecodeError):
self.close()
raise
assert hasattr(f, "readline")
self._make_reader(f)

# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
self._col_indices: list[int] | None = None
columns: list[list[Scalar | None]]
try:
(
columns,
self.num_original_columns,
self.unnamed_cols,
) = self._infer_columns()
except (TypeError, ValueError):
self.close()
raise
(
columns,
self.num_original_columns,
self.unnamed_cols,
) = self._infer_columns()

# Now self.columns has the set of columns that we will process.
# The original set is stored in self.original_columns.
Expand Down Expand Up @@ -1259,9 +1246,7 @@ class FixedWidthFieldParser(PythonParser):
See PythonParser for details.
"""

def __init__(
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
) -> None:
def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
# Support iterators, convert to a list.
self.colspecs = kwds.pop("colspecs")
self.infer_nrows = kwds.pop("infer_nrows")
Expand Down
Loading