Skip to content

Commit d1b9e41

Browse files
committed
FIX: StataReader: don't buffer entire file into memory unless necessary
Refs #48922
1 parent 2f5d192 commit d1b9e41

File tree

3 files changed

+52
-5
lines changed

3 files changed

+52
-5
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,7 @@ Performance improvements
197197
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
198198
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
199199
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
200+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
200201

201202
.. ---------------------------------------------------------------------------
202203
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+22-5
Original file line number | Diff line number | Diff line change
@@ -1156,7 +1156,9 @@ def __init__(
11561156
raise ValueError("chunksize must be a positive integer when set.")
11571157

11581158
# State variables for the file
1159+
self._entered = False
11591160
self._path_or_buf = None
1161+
self._close_file = None
11601162
self._has_string_data = False
11611163
self._missing_values = False
11621164
self._can_read_value_labels = False
@@ -1179,21 +1181,35 @@ def _open_file(self) -> None:
11791181
"""
11801182
Open the file (with compression options, etc.), and read header information.
11811183
"""
1182-
with get_handle(
1184+
if not self._entered:
1185+
warnings.warn(
1186+
"Please use StataReader as a context manager to ensure that "
1187+
"the file is properly closed when reading is finished.",
1188+
ResourceWarning,
1189+
)
1190+
handles = get_handle(
11831191
self._original_path_or_buf,
11841192
"rb",
11851193
storage_options=self._storage_options,
11861194
is_text=False,
11871195
compression=self._compression,
1188-
) as handles:
1189-
# Copy to BytesIO, and ensure no encoding
1190-
self._path_or_buf = BytesIO(handles.handle.read())
1196+
)
1197+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1198+
# If the handle is directly seekable, use it without an extra copy.
1199+
self._path_or_buf = handles.handle
1200+
self._close_file = handles.close
1201+
else:
1202+
# Copy to memory, and ensure no encoding.
1203+
with handles:
1204+
self._path_or_buf = BytesIO(handles.handle.read())
1205+
self._close_file = self.path_or_buf.close
11911206

11921207
self._read_header()
11931208
self._setup_dtype()
11941209

11951210
def __enter__(self) -> StataReader:
11961211
"""enter context manager"""
1212+
self._entered = True
11971213
return self
11981214

11991215
def __exit__(
@@ -1207,7 +1223,8 @@ def __exit__(
12071223

12081224
def close(self) -> None:
12091225
"""close the handle if its open"""
1210-
self._path_or_buf.close()
1226+
if self._close_file:
1227+
self._close_file()
12111228

12121229
def _set_encoding(self) -> None:
12131230
"""

pandas/tests/io/test_stata.py

+29
Original file line number | Diff line number | Diff line change
@@ -1842,6 +1842,35 @@ def test_backward_compat(version, datapath):
18421842
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18431843

18441844

1845+
def test_direct_read(datapath, monkeypatch):
1846+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1847+
1848+
# Test that opening a file path doesn't buffer the file.
1849+
with StataReader(file_path) as reader:
1850+
# Must not have been buffered to memory
1851+
assert not reader.read().empty
1852+
assert not isinstance(reader.path_or_buf, io.BytesIO)
1853+
1854+
# Test that we use a given fp exactly, if possible.
1855+
with open(file_path, "rb") as fp:
1856+
with StataReader(fp) as reader:
1857+
assert not reader.read().empty
1858+
assert reader.path_or_buf is fp
1859+
1860+
# Test that we use a given BytesIO exactly, if possible.
1861+
with open(file_path, "rb") as fp:
1862+
with io.BytesIO(fp.read()) as bio:
1863+
with StataReader(bio) as reader:
1864+
assert not reader.read().empty
1865+
assert reader.path_or_buf is bio
1866+
1867+
1868+
def test_statareader_warns_when_used_without_context(datapath):
1869+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1870+
with tm.assert_produces_warning(ResourceWarning):
1871+
StataReader(file_path).read()
1872+
1873+
18451874
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18461875
@pytest.mark.parametrize("use_dict", [True, False])
18471876
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)