Skip to content

Commit 1cd8cad

Browse files
committed
FIX: StataReader: don't buffer entire file into memory unless necessary
Refs pandas-dev#48922
1 parent c370a80 commit 1cd8cad

File tree

3 files changed

+72
-8
lines changed

3 files changed

+72
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -613,6 +613,7 @@ Performance improvements
613613
- Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`)
614614
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
615615
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
616+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
616617

617618
.. ---------------------------------------------------------------------------
618619
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+39-8
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
TYPE_CHECKING,
2424
Any,
2525
AnyStr,
26+
Callable,
2627
Final,
2728
Hashable,
2829
Sequence,
@@ -1151,6 +1152,7 @@ def __init__(
11511152
self._encoding = ""
11521153
self._chunksize = chunksize
11531154
self._using_iterator = False
1155+
self._entered = False
11541156
if self._chunksize is None:
11551157
self._chunksize = 1
11561158
elif not isinstance(chunksize, int) or chunksize <= 0:
@@ -1180,21 +1182,36 @@ def _open_file(self) -> None:
11801182
"""
11811183
Open the file (with compression options, etc.), and read header information.
11821184
"""
1183-
with get_handle(
1185+
if not self._entered:
1186+
warnings.warn(
1187+
"StataReader is being used without using a context manager. "
1188+
"Using StataReader as a context manager is the only supported method.",
1189+
ResourceWarning,
1190+
stacklevel=4,
1191+
)
1192+
handles = get_handle(
11841193
self._original_path_or_buf,
11851194
"rb",
11861195
storage_options=self._storage_options,
11871196
is_text=False,
11881197
compression=self._compression,
1189-
) as handles:
1190-
# Copy to BytesIO, and ensure no encoding
1191-
self._path_or_buf = BytesIO(handles.handle.read())
1198+
)
1199+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1200+
# If the handle is directly seekable, use it without an extra copy.
1201+
self._path_or_buf = handles.handle
1202+
self._close_file = handles.close
1203+
else:
1204+
# Copy to memory, and ensure no encoding.
1205+
with handles:
1206+
self._path_or_buf = BytesIO(handles.handle.read())
1207+
self._close_file = self._path_or_buf.close
11921208

11931209
self._read_header()
11941210
self._setup_dtype()
11951211

11961212
def __enter__(self) -> StataReader:
11971213
"""enter context manager"""
1214+
self._entered = True
11981215
return self
11991216

12001217
def __exit__(
@@ -1203,12 +1220,26 @@ def __exit__(
12031220
exc_value: BaseException | None,
12041221
traceback: TracebackType | None,
12051222
) -> None:
1206-
"""exit context manager"""
1207-
self.close()
1223+
if self._close_file:
1224+
self._close_file()
12081225

12091226
def close(self) -> None:
1210-
"""close the handle if its open"""
1211-
self._path_or_buf.close()
1227+
"""Close the handle if it's open.
1228+
1229+
.. deprecated:: 2.0.0
1230+
1231+
The close method is not part of the public API.
1232+
The only supported way to use StataReader is to use it as a context manager.
1233+
"""
1234+
warnings.warn(
1235+
"The StataReader.close() method is not part of the public API and "
1236+
"will be removed in a future version without notice. "
1237+
"Using StataReader as a context manager is the only supported method.",
1238+
FutureWarning,
1239+
stacklevel=2,
1240+
)
1241+
if self._close_file:
1242+
self._close_file()
12121243

12131244
def _set_encoding(self) -> None:
12141245
"""

pandas/tests/io/test_stata.py

+32
Original file line number | Diff line number | Diff line change
@@ -1846,6 +1846,38 @@ def test_backward_compat(version, datapath):
18461846
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18471847

18481848

1849+
def test_direct_read(datapath, monkeypatch):
1850+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1851+
1852+
# Test that opening a file path doesn't buffer the file.
1853+
with StataReader(file_path) as reader:
1854+
# Must not have been buffered to memory
1855+
assert not reader.read().empty
1856+
assert not isinstance(reader._path_or_buf, io.BytesIO)
1857+
1858+
# Test that we use a given fp exactly, if possible.
1859+
with open(file_path, "rb") as fp:
1860+
with StataReader(fp) as reader:
1861+
assert not reader.read().empty
1862+
assert reader._path_or_buf is fp
1863+
1864+
# Test that we use a given BytesIO exactly, if possible.
1865+
with open(file_path, "rb") as fp:
1866+
with io.BytesIO(fp.read()) as bio:
1867+
with StataReader(bio) as reader:
1868+
assert not reader.read().empty
1869+
assert reader._path_or_buf is bio
1870+
1871+
1872+
def test_statareader_warns_when_used_without_context(datapath):
1873+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1874+
with tm.assert_produces_warning(ResourceWarning):
1875+
sr = StataReader(file_path)
1876+
sr.read()
1877+
with tm.assert_produces_warning(FutureWarning):
1878+
sr.close()
1879+
1880+
18491881
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18501882
@pytest.mark.parametrize("use_dict", [True, False])
18511883
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)