diff --git a/pandas/io/common.py b/pandas/io/common.py index 15a8f2e114041..fdee1600c2a32 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -640,7 +640,7 @@ def get_handle( .. versionchanged:: 1.4.0 Zstandard support. memory_map : bool, default False - See parsers._parser_params for more information. + See parsers._parser_params for more information. Only used by read_csv. is_text : bool, default True Whether the type of the content passed to the file/buffer is string or bytes. This is not the same as `"b" not in mode`. If a string content is @@ -659,6 +659,8 @@ def get_handle( # Windows does not default to utf-8. Set to utf-8 for a consistent behavior encoding = encoding or "utf-8" + errors = errors or "strict" + # read_csv does not know whether the buffer is opened in binary/text mode if _is_binary_mode(path_or_buf, mode) and "b" not in mode: mode += "b" @@ -681,6 +683,7 @@ def get_handle( handles: list[BaseBuffer] # memory mapping needs to be the first step + # only used for read_csv handle, memory_map, handles = _maybe_memory_map( handle, memory_map, @@ -1064,7 +1067,7 @@ def closed(self): return self.fp is None -class _MMapWrapper(abc.Iterator): +class _CSVMMapWrapper(abc.Iterator): """ Wrapper for the Python's mmap class so that it can be properly read in by Python's csv.reader class. @@ -1079,7 +1082,7 @@ class _MMapWrapper(abc.Iterator): def __init__( self, - f: IO, + f: ReadBuffer[bytes], encoding: str = "utf-8", errors: str = "strict", decode: bool = True, @@ -1089,11 +1092,13 @@ def __init__( self.decoder = codecs.getincrementaldecoder(encoding)(errors=errors) self.decode = decode + # needed for compression libraries and TextIOWrapper self.attributes = {} for attribute in ("seekable", "readable"): if not hasattr(f, attribute): continue self.attributes[attribute] = getattr(f, attribute)() + self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) def __getattr__(self, name: str): @@ -1101,7 +1106,7 @@ def __getattr__(self, name: str): return lambda: self.attributes[name] return getattr(self.mmap, name) - def __iter__(self) -> _MMapWrapper: + def __iter__(self) -> _CSVMMapWrapper: return self def read(self, size: int = -1) -> str | bytes: @@ -1196,7 +1201,7 @@ def _maybe_memory_map( memory_map: bool, encoding: str, mode: str, - errors: str | None, + errors: str, decode: bool, ) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]: """Try to memory map file/buffer.""" @@ -1207,25 +1212,22 @@ def _maybe_memory_map( # need to open the file first if isinstance(handle, str): - if encoding and "b" not in mode: - # Encoding - handle = open(handle, mode, encoding=encoding, errors=errors, newline="") - else: - # Binary mode - handle = open(handle, mode) + handle = open(handle, "rb") handles.append(handle) # error: Argument 1 to "_MMapWrapper" has incompatible type "Union[IO[Any], # RawIOBase, BufferedIOBase, TextIOBase, mmap]"; expected "IO[Any]" try: + # open mmap, adds *-able, and convert to string wrapped = cast( BaseBuffer, - _MMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] + _CSVMMapWrapper(handle, encoding, errors, decode), # type: ignore[arg-type] ) finally: for handle in reversed(handles): # error: "BaseBuffer" has no attribute "close" handle.close() # type: ignore[attr-defined] + handles = [] handles.append(wrapped) return wrapped, memory_map, handles diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ca6809470b2b1..22399917f2bf7 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -413,18 +413,18 @@ def test_constructor_bad_file(self, mmap_file): err = mmap.error with pytest.raises(err, match=msg): - icom._MMapWrapper(non_file) + icom._CSVMMapWrapper(non_file) with open(mmap_file) as target: pass msg = "I/O operation on closed file" with pytest.raises(ValueError, match=msg): - icom._MMapWrapper(target) + icom._CSVMMapWrapper(target) def test_get_attr(self, mmap_file): with open(mmap_file) as target: - wrapper = icom._MMapWrapper(target) + wrapper = icom._CSVMMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs if not attr.startswith("__")] @@ -437,7 +437,7 @@ def test_get_attr(self, mmap_file): def test_next(self, mmap_file): with open(mmap_file) as target: - wrapper = icom._MMapWrapper(target) + wrapper = icom._CSVMMapWrapper(target) lines = target.readlines() for line in lines: