Skip to content

Commit e707814

Browse files
Backport PR #48651 on branch 1.5.x (REGR: TextIOWrapper raising an error in read_csv) (#48666)
Backport PR #48651: REGR: TextIOWrapper raising an error in read_csv Co-authored-by: Torsten Wörtwein <[email protected]>
1 parent 690f641 commit e707814

File tree

4 files changed

+26
-13
lines changed

4 files changed

+26
-13
lines changed

doc/source/whatsnew/v1.5.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ including other versions of pandas.
1414

1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
17-
-
17+
- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`)
1818
-
1919

2020
.. ---------------------------------------------------------------------------

pandas/io/parsers/c_parser_wrapper.py

-12
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
from collections import defaultdict
44
import inspect
5-
from io import TextIOWrapper
65
from typing import (
76
TYPE_CHECKING,
87
Hashable,
@@ -67,17 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
6766
# Have to pass int, would break tests using TextReader directly otherwise :(
6867
kwds["on_bad_lines"] = self.on_bad_lines.value
6968

70-
# c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
71-
# policy is the same as the one given to read_csv
72-
if (
73-
isinstance(src, TextIOWrapper)
74-
and src.encoding == "utf-8"
75-
and (src.errors or "strict") == kwds["encoding_errors"]
76-
):
77-
# error: Incompatible types in assignment (expression has type "BinaryIO",
78-
# variable has type "ReadCsvBuffer[str]")
79-
src = src.buffer # type: ignore[assignment]
80-
8169
for key in (
8270
"storage_options",
8371
"encoding",

pandas/io/parsers/readers.py

+11
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
from pandas.io.common import (
6060
IOHandles,
6161
get_handle,
62+
stringify_path,
6263
validate_header_arg,
6364
)
6465
from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
@@ -1726,6 +1727,16 @@ def _make_engine(
17261727
if engine == "pyarrow":
17271728
is_text = False
17281729
mode = "rb"
1730+
elif (
1731+
engine == "c"
1732+
and self.options.get("encoding", "utf-8") == "utf-8"
1733+
and isinstance(stringify_path(f), str)
1734+
):
1735+
# the c engine can decode utf-8 bytes itself; adding a TextIOWrapper makes
1736+
# the c engine far slower, especially with memory_map=True
1737+
is_text = False
1738+
if "b" not in mode:
1739+
mode += "b"
17291740
self.handles = get_handle(
17301741
f,
17311742
mode,

pandas/tests/io/parser/common/test_common_basic.py

+14
Original file line numberDiff line numberDiff line change
@@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers):
928928
"except for the argument 'filepath_or_buffer' will be keyword-only"
929929
)
930930
parser.read_table_check_warnings(FutureWarning, msg, data, " ")
931+
932+
933+
def test_read_seek(all_parsers):
934+
# GH48646
935+
parser = all_parsers
936+
prefix = "### DATA\n"
937+
content = "nkey,value\ntables,rectangular\n"
938+
with tm.ensure_clean() as path:
939+
Path(path).write_text(prefix + content)
940+
with open(path, encoding="utf-8") as file:
941+
file.readline()
942+
actual = parser.read_csv(file)
943+
expected = parser.read_csv(StringIO(content))
944+
tm.assert_frame_equal(actual, expected)

0 commit comments

Comments
 (0)