Skip to content

Commit 8775e24

Browse files
committed
FIX: StataReader: don't buffer entire file into memory unless necessary
Refs #48922
1 parent bfa332d commit 8775e24

File tree

3 files changed

+73
-8
lines changed

3 files changed

+73
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,7 @@ Performance improvements
487487
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
488488
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
489489
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
490+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
490491

491492
.. ---------------------------------------------------------------------------
492493
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+40-8
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
TYPE_CHECKING,
2424
Any,
2525
AnyStr,
26+
Callable,
2627
Final,
2728
Hashable,
2829
Sequence,
@@ -1149,6 +1150,7 @@ def __init__(
11491150
self._encoding = ""
11501151
self._chunksize = chunksize
11511152
self._using_iterator = False
1153+
self._entered = False
11521154
if self._chunksize is None:
11531155
self._chunksize = 1
11541156
elif not isinstance(chunksize, int) or chunksize <= 0:
@@ -1158,6 +1160,7 @@ def __init__(
11581160
# NB: _path_or_buf is mistyped on purpose, since the alternative is to placate
11591161
# mypy by having an assert before every read.
11601162
self._path_or_buf: IO[bytes] = None # type: ignore[assignment]
1163+
self._close_file: Callable[[], None] | None = None
11611164
self._has_string_data = False
11621165
self._missing_values = False
11631166
self._can_read_value_labels = False
@@ -1180,21 +1183,36 @@ def _open_file(self) -> None:
11801183
"""
11811184
Open the file (with compression options, etc.), and read header information.
11821185
"""
1183-
with get_handle(
1186+
if not self._entered:
1187+
warnings.warn(
1188+
"StataReader is being used without using a context manager. "
1189+
"Using StataReader as a context manager is the only supported method.",
1190+
ResourceWarning,
1191+
stacklevel=4,
1192+
)
1193+
handles = get_handle(
11841194
self._original_path_or_buf,
11851195
"rb",
11861196
storage_options=self._storage_options,
11871197
is_text=False,
11881198
compression=self._compression,
1189-
) as handles:
1190-
# Copy to BytesIO, and ensure no encoding
1191-
self._path_or_buf = BytesIO(handles.handle.read())
1199+
)
1200+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1201+
# If the handle is directly seekable, use it without an extra copy.
1202+
self._path_or_buf = handles.handle
1203+
self._close_file = handles.close
1204+
else:
1205+
# Copy to memory, and ensure no encoding.
1206+
with handles:
1207+
self._path_or_buf = BytesIO(handles.handle.read())
1208+
self._close_file = self._path_or_buf.close
11921209

11931210
self._read_header()
11941211
self._setup_dtype()
11951212

11961213
def __enter__(self) -> StataReader:
11971214
"""enter context manager"""
1215+
self._entered = True
11981216
return self
11991217

12001218
def __exit__(
@@ -1203,12 +1221,26 @@ def __exit__(
12031221
exc_value: BaseException | None,
12041222
traceback: TracebackType | None,
12051223
) -> None:
1206-
"""exit context manager"""
1207-
self.close()
1224+
if self._close_file:
1225+
self._close_file()
12081226

12091227
def close(self) -> None:
1210-
"""close the handle if its open"""
1211-
self._path_or_buf.close()
1228+
"""Close the handle if it's open.
1229+
1230+
.. deprecated:: 1.6.0
1231+
1232+
The close method is not part of the public API.
1233+
The only supported way to use StataReader is to use it as a context manager.
1234+
"""
1235+
warnings.warn(
1236+
"The StataReader.close() method is not part of the public API and "
1237+
"may be removed in a future version without notice. "
1238+
"Using StataReader as a context manager is the only supported method.",
1239+
FutureWarning,
1240+
stacklevel=2,
1241+
)
1242+
if self._close_file:
1243+
self._close_file()
12121244

12131245
def _set_encoding(self) -> None:
12141246
"""

pandas/tests/io/test_stata.py

+32
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,38 @@ def test_backward_compat(version, datapath):
18391839
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18401840

18411841

1842+
def test_direct_read(datapath, monkeypatch):
1843+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1844+
1845+
# Test that opening a file path doesn't buffer the file.
1846+
with StataReader(file_path) as reader:
1847+
# Must not have been buffered to memory
1848+
assert not reader.read().empty
1849+
assert not isinstance(reader._path_or_buf, io.BytesIO)
1850+
1851+
# Test that we use a given fp exactly, if possible.
1852+
with open(file_path, "rb") as fp:
1853+
with StataReader(fp) as reader:
1854+
assert not reader.read().empty
1855+
assert reader._path_or_buf is fp
1856+
1857+
# Test that we use a given BytesIO exactly, if possible.
1858+
with open(file_path, "rb") as fp:
1859+
with io.BytesIO(fp.read()) as bio:
1860+
with StataReader(bio) as reader:
1861+
assert not reader.read().empty
1862+
assert reader._path_or_buf is bio
1863+
1864+
1865+
def test_statareader_warns_when_used_without_context(datapath):
1866+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1867+
with tm.assert_produces_warning(ResourceWarning):
1868+
sr = StataReader(file_path)
1869+
sr.read()
1870+
with tm.assert_produces_warning(FutureWarning):
1871+
sr.close()
1872+
1873+
18421874
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18431875
@pytest.mark.parametrize("use_dict", [True, False])
18441876
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)