Skip to content

Commit 839c068

Browse files
committed
FIX: StataReader: don't buffer entire file into memory unless necessary
Refs #48922
1 parent aa99afe commit 839c068

File tree

3 files changed, with 54 additions and 5 deletions.

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -257,6 +257,7 @@ Performance improvements
257257
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
258258
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
259259
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
260+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
260261

261262
.. ---------------------------------------------------------------------------
262263
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+24 -5
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
TYPE_CHECKING,
2424
Any,
2525
AnyStr,
26+
Callable,
2627
Final,
2728
Hashable,
2829
Sequence,
@@ -1150,6 +1151,7 @@ def __init__(
11501151
self._encoding = ""
11511152
self._chunksize = chunksize
11521153
self._using_iterator = False
1154+
self._entered = False
11531155
if self._chunksize is None:
11541156
self._chunksize = 1
11551157
elif not isinstance(chunksize, int) or chunksize <= 0:
@@ -1159,6 +1161,7 @@ def __init__(
11591161
# NB: _path_or_buf is mistyped on purpose, since the alternative is to placate
11601162
# mypy by having an assert before every read.
11611163
self._path_or_buf: IO[bytes] = None # type: ignore[assignment]
1164+
self._close_file: Callable[[], None] | None = None
11621165
self._has_string_data = False
11631166
self._missing_values = False
11641167
self._can_read_value_labels = False
@@ -1181,21 +1184,36 @@ def _open_file(self) -> None:
11811184
"""
11821185
Open the file (with compression options, etc.), and read header information.
11831186
"""
1184-
with get_handle(
1187+
if not self._entered:
1188+
warnings.warn(
1189+
"Please use StataReader as a context manager to ensure that "
1190+
"the file is properly closed when reading is finished.",
1191+
ResourceWarning,
1192+
stacklevel=4,
1193+
)
1194+
handles = get_handle(
11851195
self._original_path_or_buf,
11861196
"rb",
11871197
storage_options=self._storage_options,
11881198
is_text=False,
11891199
compression=self._compression,
1190-
) as handles:
1191-
# Copy to BytesIO, and ensure no encoding
1192-
self._path_or_buf = BytesIO(handles.handle.read())
1200+
)
1201+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1202+
# If the handle is directly seekable, use it without an extra copy.
1203+
self._path_or_buf = handles.handle
1204+
self._close_file = handles.close
1205+
else:
1206+
# Copy to memory, and ensure no encoding.
1207+
with handles:
1208+
self._path_or_buf = BytesIO(handles.handle.read())
1209+
self._close_file = self._path_or_buf.close
11931210

11941211
self._read_header()
11951212
self._setup_dtype()
11961213

11971214
def __enter__(self) -> StataReader:
11981215
"""enter context manager"""
1216+
self._entered = True
11991217
return self
12001218

12011219
def __exit__(
@@ -1209,7 +1227,8 @@ def __exit__(
12091227

12101228
def close(self) -> None:
12111229
"""close the handle if its open"""
1212-
self._path_or_buf.close()
1230+
if self._close_file:
1231+
self._close_file()
12131232

12141233
def _set_encoding(self) -> None:
12151234
"""

pandas/tests/io/test_stata.py

+29
Original file line number | Diff line number | Diff line change
@@ -1841,6 +1841,35 @@ def test_backward_compat(version, datapath):
18411841
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18421842

18431843

1844+
def test_direct_read(datapath, monkeypatch):
1845+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1846+
1847+
# Test that opening a file path doesn't buffer the file.
1848+
with StataReader(file_path) as reader:
1849+
# Must not have been buffered to memory
1850+
assert not reader.read().empty
1851+
assert not isinstance(reader.path_or_buf, io.BytesIO)
1852+
1853+
# Test that we use a given fp exactly, if possible.
1854+
with open(file_path, "rb") as fp:
1855+
with StataReader(fp) as reader:
1856+
assert not reader.read().empty
1857+
assert reader.path_or_buf is fp
1858+
1859+
# Test that we use a given BytesIO exactly, if possible.
1860+
with open(file_path, "rb") as fp:
1861+
with io.BytesIO(fp.read()) as bio:
1862+
with StataReader(bio) as reader:
1863+
assert not reader.read().empty
1864+
assert reader.path_or_buf is bio
1865+
1866+
1867+
def test_statareader_warns_when_used_without_context(datapath):
1868+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1869+
with tm.assert_produces_warning(ResourceWarning):
1870+
StataReader(file_path).read()
1871+
1872+
18441873
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18451874
@pytest.mark.parametrize("use_dict", [True, False])
18461875
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)