Skip to content

Commit d72d5f9

Browse files
committed
FIX: StataReader: don't buffer entire file into memory unless necessary
Refs #48922
1 parent 4602aed commit d72d5f9

File tree

3 files changed

+79
-8
lines changed

3 files changed

+79
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,8 @@ Performance improvements
11631163
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
11641164
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
11651165
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
1166+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
1167+
11661168

11671169
.. ---------------------------------------------------------------------------
11681170
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+39-8
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
TYPE_CHECKING,
2424
Any,
2525
AnyStr,
26+
Callable,
2627
Final,
2728
Hashable,
2829
Sequence,
@@ -1148,6 +1149,7 @@ def __init__(
11481149
self._encoding = ""
11491150
self._chunksize = chunksize
11501151
self._using_iterator = False
1152+
self._entered = False
11511153
if self._chunksize is None:
11521154
self._chunksize = 1
11531155
elif not isinstance(chunksize, int) or chunksize <= 0:
@@ -1177,21 +1179,36 @@ def _open_file(self) -> None:
11771179
"""
11781180
Open the file (with compression options, etc.), and read header information.
11791181
"""
1180-
with get_handle(
1182+
if not self._entered:
1183+
warnings.warn(
1184+
"StataReader is being used without using a context manager. "
1185+
"Using StataReader as a context manager is the only supported method.",
1186+
ResourceWarning,
1187+
stacklevel=find_stack_level(),
1188+
)
1189+
handles = get_handle(
11811190
self._original_path_or_buf,
11821191
"rb",
11831192
storage_options=self._storage_options,
11841193
is_text=False,
11851194
compression=self._compression,
1186-
) as handles:
1187-
# Copy to BytesIO, and ensure no encoding
1188-
self._path_or_buf = BytesIO(handles.handle.read())
1195+
)
1196+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1197+
# If the handle is directly seekable, use it without an extra copy.
1198+
self._path_or_buf = handles.handle
1199+
self._close_file = handles.close
1200+
else:
1201+
# Copy to memory, and ensure no encoding.
1202+
with handles:
1203+
self._path_or_buf = BytesIO(handles.handle.read())
1204+
self._close_file = self._path_or_buf.close
11891205

11901206
self._read_header()
11911207
self._setup_dtype()
11921208

11931209
def __enter__(self) -> StataReader:
11941210
"""enter context manager"""
1211+
self._entered = True
11951212
return self
11961213

11971214
def __exit__(
@@ -1200,12 +1217,26 @@ def __exit__(
12001217
exc_value: BaseException | None,
12011218
traceback: TracebackType | None,
12021219
) -> None:
1203-
"""exit context manager"""
1204-
self.close()
1220+
if self._close_file:
1221+
self._close_file()
12051222

12061223
def close(self) -> None:
1207-
"""close the handle if its open"""
1208-
self._path_or_buf.close()
1224+
"""Close the handle if it's open.
1225+
1226+
.. deprecated:: 2.0.0
1227+
1228+
The close method is not part of the public API.
1229+
The only supported way to use StataReader is to use it as a context manager.
1230+
"""
1231+
warnings.warn(
1232+
"The StataReader.close() method is not part of the public API and "
1233+
"will be removed in a future version without notice. "
1234+
"Using StataReader as a context manager is the only supported method.",
1235+
FutureWarning,
1236+
stacklevel=2,
1237+
)
1238+
if self._close_file:
1239+
self._close_file()
12091240

12101241
def _set_encoding(self) -> None:
12111242
"""

pandas/tests/io/test_stata.py

+38
Original file line numberDiff line numberDiff line change
@@ -1889,6 +1889,44 @@ def test_backward_compat(version, datapath):
18891889
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18901890

18911891

1892+
def test_direct_read(datapath, monkeypatch):
1893+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1894+
1895+
# Test that opening a file path doesn't buffer the file.
1896+
with StataReader(file_path) as reader:
1897+
# Must not have been buffered to memory
1898+
assert not reader.read().empty
1899+
assert not isinstance(reader._path_or_buf, io.BytesIO)
1900+
1901+
# Test that we use a given fp exactly, if possible.
1902+
with open(file_path, "rb") as fp:
1903+
with StataReader(fp) as reader:
1904+
assert not reader.read().empty
1905+
assert reader._path_or_buf is fp
1906+
1907+
# Test that we use a given BytesIO exactly, if possible.
1908+
with open(file_path, "rb") as fp:
1909+
with io.BytesIO(fp.read()) as bio:
1910+
with StataReader(bio) as reader:
1911+
assert not reader.read().empty
1912+
assert reader._path_or_buf is bio
1913+
1914+
1915+
def test_statareader_warns_when_used_without_context(datapath):
1916+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1917+
with tm.assert_produces_warning(
1918+
ResourceWarning,
1919+
match="without using a context manager",
1920+
):
1921+
sr = StataReader(file_path)
1922+
sr.read()
1923+
with tm.assert_produces_warning(
1924+
FutureWarning,
1925+
match="is not part of the public API",
1926+
):
1927+
sr.close()
1928+
1929+
18921930
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18931931
@pytest.mark.parametrize("use_dict", [True, False])
18941932
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)