Skip to content

Commit a648fb2

Browse files
twoertweinjreback
andauthored
REF/BUG/TYP: read_csv shouldn't close user-provided file handles (#36997)
* BUG/REF: read_csv shouldn't close user-provided file handles * get_handle: typing, returns is_wrapped, use dataclass, and make sure that all created handlers are returned * remove unused imports * added IOHandleArgs.close * added IOArgs.close * mostly comments * move memory_map from TextReader to CParserWrapper * moved IOArgs and IOHandles * more comments Co-authored-by: Jeff Reback <[email protected]>
1 parent d378852 commit a648fb2

24 files changed

+480
-510
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,7 @@ I/O
498498
- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
499499
- Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty :class:`DataFrame` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
500500
- Bug in :class:`HDFStore` was dropping timezone information when exporting :class:`Series` with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`)
501+
- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
501502
- Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`)
502503

503504
Plotting

pandas/_libs/parsers.pyx

+14-108
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
11
# Copyright (c) 2012, Lambda Foundry, Inc.
22
# See LICENSE for the license
3-
import bz2
43
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
54
from errno import ENOENT
6-
import gzip
7-
import io
8-
import os
95
import sys
106
import time
117
import warnings
12-
import zipfile
138

149
from libc.stdlib cimport free
1510
from libc.string cimport strcasecmp, strlen, strncpy
1611

1712
import cython
1813
from cython import Py_ssize_t
1914

20-
from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
15+
from cpython.bytes cimport PyBytes_AsString
2116
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
2217
from cpython.object cimport PyObject
2318
from cpython.ref cimport Py_XDECREF
@@ -67,7 +62,6 @@ from pandas._libs.khash cimport (
6762
khiter_t,
6863
)
6964

70-
from pandas.compat import get_lzma_file, import_lzma
7165
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning
7266

7367
from pandas.core.dtypes.common import (
@@ -82,11 +76,10 @@ from pandas.core.dtypes.common import (
8276
)
8377
from pandas.core.dtypes.concat import union_categoricals
8478

85-
lzma = import_lzma()
86-
8779
cdef:
8880
float64_t INF = <float64_t>np.inf
8981
float64_t NEGINF = -INF
82+
int64_t DEFAULT_CHUNKSIZE = 256 * 1024
9083

9184

9285
cdef extern from "headers/portable.h":
@@ -275,14 +268,15 @@ cdef extern from "parser/io.h":
275268
size_t *bytes_read, int *status)
276269

277270

278-
DEFAULT_CHUNKSIZE = 256 * 1024
279-
280-
281271
cdef class TextReader:
282272
"""
283273
284274
# source: StringIO or file object
285275
276+
..versionchange:: 1.2.0
277+
removed 'compression', 'memory_map', and 'encoding' argument.
278+
These arguments are outsourced to CParserWrapper.
279+
'source' has to be a file handle.
286280
"""
287281

288282
cdef:
@@ -299,16 +293,14 @@ cdef class TextReader:
299293

300294
cdef public:
301295
int64_t leading_cols, table_width, skipfooter, buffer_lines
302-
bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory
296+
bint allow_leading_cols, mangle_dupe_cols, low_memory
303297
bint delim_whitespace
304298
object delimiter, converters
305299
object na_values
306300
object header, orig_header, names, header_start, header_end
307301
object index_col
308302
object skiprows
309303
object dtype
310-
object encoding
311-
object compression
312304
object usecols
313305
list dtype_cast_order
314306
set unnamed_cols
@@ -321,18 +313,15 @@ cdef class TextReader:
321313
header_end=0,
322314
index_col=None,
323315
names=None,
324-
bint memory_map=False,
325316
tokenize_chunksize=DEFAULT_CHUNKSIZE,
326317
bint delim_whitespace=False,
327-
compression=None,
328318
converters=None,
329319
bint skipinitialspace=False,
330320
escapechar=None,
331321
bint doublequote=True,
332322
quotechar=b'"',
333323
quoting=0,
334324
lineterminator=None,
335-
encoding=None,
336325
comment=None,
337326
decimal=b'.',
338327
thousands=None,
@@ -356,15 +345,7 @@ cdef class TextReader:
356345
bint skip_blank_lines=True):
357346

358347
# set encoding for native Python and C library
359-
if encoding is not None:
360-
if not isinstance(encoding, bytes):
361-
encoding = encoding.encode('utf-8')
362-
encoding = encoding.lower()
363-
self.c_encoding = <char*>encoding
364-
else:
365-
self.c_encoding = NULL
366-
367-
self.encoding = encoding
348+
self.c_encoding = NULL
368349

369350
self.parser = parser_new()
370351
self.parser.chunksize = tokenize_chunksize
@@ -374,9 +355,6 @@ cdef class TextReader:
374355
# For timekeeping
375356
self.clocks = []
376357

377-
self.compression = compression
378-
self.memory_map = memory_map
379-
380358
self.parser.usecols = (usecols is not None)
381359

382360
self._setup_parser_source(source)
@@ -562,11 +540,6 @@ cdef class TextReader:
562540
parser_del(self.parser)
563541

564542
def close(self):
565-
# we need to properly close an open derived
566-
# filehandle here, e.g. and UTFRecoder
567-
if self.handle is not None:
568-
self.handle.close()
569-
570543
# also preemptively free all allocated memory
571544
parser_free(self.parser)
572545
if self.true_set:
@@ -614,82 +587,15 @@ cdef class TextReader:
614587
cdef:
615588
void *ptr
616589

617-
self.parser.cb_io = NULL
618-
self.parser.cb_cleanup = NULL
619-
620-
if self.compression:
621-
if self.compression == 'gzip':
622-
if isinstance(source, str):
623-
source = gzip.GzipFile(source, 'rb')
624-
else:
625-
source = gzip.GzipFile(fileobj=source)
626-
elif self.compression == 'bz2':
627-
source = bz2.BZ2File(source, 'rb')
628-
elif self.compression == 'zip':
629-
zip_file = zipfile.ZipFile(source)
630-
zip_names = zip_file.namelist()
631-
632-
if len(zip_names) == 1:
633-
file_name = zip_names.pop()
634-
source = zip_file.open(file_name)
635-
636-
elif len(zip_names) == 0:
637-
raise ValueError(f'Zero files found in compressed '
638-
f'zip file {source}')
639-
else:
640-
raise ValueError(f'Multiple files found in compressed '
641-
f'zip file {zip_names}')
642-
elif self.compression == 'xz':
643-
if isinstance(source, str):
644-
source = get_lzma_file(lzma)(source, 'rb')
645-
else:
646-
source = get_lzma_file(lzma)(filename=source)
647-
else:
648-
raise ValueError(f'Unrecognized compression type: '
649-
f'{self.compression}')
650-
651-
if (self.encoding and hasattr(source, "read") and
652-
not hasattr(source, "encoding")):
653-
source = io.TextIOWrapper(
654-
source, self.encoding.decode('utf-8'), newline='')
655-
656-
self.encoding = b'utf-8'
657-
self.c_encoding = <char*>self.encoding
658-
659-
self.handle = source
660-
661-
if isinstance(source, str):
662-
encoding = sys.getfilesystemencoding() or "utf-8"
663-
usource = source
664-
source = source.encode(encoding)
665-
666-
if self.memory_map:
667-
ptr = new_mmap(source)
668-
if ptr == NULL:
669-
# fall back
670-
ptr = new_file_source(source, self.parser.chunksize)
671-
self.parser.cb_io = &buffer_file_bytes
672-
self.parser.cb_cleanup = &del_file_source
673-
else:
674-
self.parser.cb_io = &buffer_mmap_bytes
675-
self.parser.cb_cleanup = &del_mmap
676-
else:
677-
ptr = new_file_source(source, self.parser.chunksize)
678-
self.parser.cb_io = &buffer_file_bytes
679-
self.parser.cb_cleanup = &del_file_source
680-
self.parser.source = ptr
681-
682-
elif hasattr(source, 'read'):
683-
# e.g., StringIO
684-
685-
ptr = new_rd_source(source)
686-
self.parser.source = ptr
687-
self.parser.cb_io = &buffer_rd_bytes
688-
self.parser.cb_cleanup = &del_rd_source
689-
else:
590+
if not hasattr(source, "read"):
690591
raise IOError(f'Expected file path name or file-like object, '
691592
f'got {type(source)} type')
692593

594+
ptr = new_rd_source(source)
595+
self.parser.source = ptr
596+
self.parser.cb_io = &buffer_rd_bytes
597+
self.parser.cb_cleanup = &del_rd_source
598+
693599
cdef _get_header(self):
694600
# header is now a list of lists, so field_count should use header[0]
695601

pandas/_typing.py

+6-23
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
from dataclasses import dataclass
21
from datetime import datetime, timedelta, tzinfo
3-
from io import IOBase
2+
from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper
3+
from mmap import mmap
44
from pathlib import Path
55
from typing import (
66
IO,
@@ -10,7 +10,6 @@
1010
Callable,
1111
Collection,
1212
Dict,
13-
Generic,
1413
Hashable,
1514
List,
1615
Mapping,
@@ -77,8 +76,6 @@
7776
"ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]]
7877
]
7978
DtypeObj = Union[np.dtype, "ExtensionDtype"]
80-
FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase]
81-
FileOrBuffer = Union[str, IO[AnyStr], IOBase]
8279

8380
# FrameOrSeriesUnion means either a DataFrame or a Series. E.g.
8481
# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series
@@ -133,6 +130,10 @@
133130
"Resampler",
134131
]
135132

133+
# filenames and file-like-objects
134+
Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]
135+
FileOrBuffer = Union[str, Buffer[T]]
136+
FilePathOrBuffer = Union[Path, FileOrBuffer[T]]
136137

137138
# for arbitrary kwargs passed during reading/writing files
138139
StorageOptions = Optional[Dict[str, Any]]
@@ -150,21 +151,3 @@
150151

151152
# type of float formatter in DataFrameFormatter
152153
FloatFormatType = Union[str, Callable, "EngFormatter"]
153-
154-
155-
@dataclass
156-
class IOargs(Generic[ModeVar, EncodingVar]):
157-
"""
158-
Return value of io/common.py:get_filepath_or_buffer.
159-
160-
Note (copy&past from io/parsers):
161-
filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
162-
though mypy handling of conditional imports is difficult.
163-
See https://github.com/python/mypy/issues/1297
164-
"""
165-
166-
filepath_or_buffer: FileOrBuffer
167-
encoding: EncodingVar
168-
compression: CompressionDict
169-
should_close: bool
170-
mode: Union[ModeVar, str]

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import datetime
1616
from io import StringIO
1717
import itertools
18+
import mmap
1819
from textwrap import dedent
1920
from typing import (
2021
IO,
@@ -2286,10 +2287,9 @@ def to_markdown(
22862287
if buf is None:
22872288
return result
22882289
ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options)
2289-
assert not isinstance(ioargs.filepath_or_buffer, str)
2290+
assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap))
22902291
ioargs.filepath_or_buffer.writelines(result)
2291-
if ioargs.should_close:
2292-
ioargs.filepath_or_buffer.close()
2292+
ioargs.close()
22932293
return None
22942294

22952295
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")

0 commit comments

Comments
 (0)