Skip to content

Commit f6b4d52

Browse files
committed
REGR: be able to read Stata files without reading them fully into memory
Fixes pandas-dev#48700. Regressed in pandas-dev#9245. Regressed in 2f0ada3.
1 parent f47d82b commit f6b4d52

File tree

3 files changed

+25
-6
lines changed

3 files changed

+25
-6
lines changed

doc/source/whatsnew/v1.6.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ Performance improvements
151151
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
152152
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
153153
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
154+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
155+
154156

155157
.. ---------------------------------------------------------------------------
156158
.. _whatsnew_160.bug_fixes:

pandas/io/stata.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -1164,15 +1164,25 @@ def __init__(
11641164
self._lines_read = 0
11651165

11661166
self._native_byteorder = _set_endianness(sys.byteorder)
1167-
with get_handle(
1167+
1168+
handles = get_handle(
11681169
path_or_buf,
11691170
"rb",
11701171
storage_options=storage_options,
11711172
is_text=False,
11721173
compression=compression,
1173-
) as handles:
1174-
# Copy to BytesIO, and ensure no encoding
1175-
self.path_or_buf = BytesIO(handles.handle.read())
1174+
)
1175+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1176+
# If the handle is directly seekable, use it without an extra copy.
1177+
self.path_or_buf = handles.handle
1178+
self._close_file = handles.close
1179+
self._copied_to_memory = False
1180+
else:
1181+
# Copy to memory, and ensure no encoding.
1182+
with handles:
1183+
self.path_or_buf = BytesIO(handles.handle.read())
1184+
self._close_file = self.path_or_buf.close
1185+
self._copied_to_memory = True
11761186

11771187
self._read_header()
11781188
self._setup_dtype()
@@ -1192,7 +1202,7 @@ def __exit__(
11921202

11931203
def close(self) -> None:
11941204
"""close the handle if it's open"""
1195-
self.path_or_buf.close()
1205+
self._close_file()
11961206

11971207
def _set_encoding(self) -> None:
11981208
"""

pandas/tests/io/test_stata.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -2027,7 +2027,14 @@ def test_stata_compression(compression_only, read_infer, to_infer):
20272027

20282028
with tm.ensure_clean(filename) as path:
20292029
df.to_stata(path, compression=to_compression)
2030-
result = read_stata(path, compression=read_compression, index_col="index")
2030+
with StataReader(
2031+
path, compression=read_compression, index_col="index"
2032+
) as reader:
2033+
# A zstd stream is not seekable, so the reader should have decompressed it.
2034+
# This assumption could change in the future.
2035+
assert reader._copied_to_memory == (compression == "zstd")
2036+
2037+
result = reader.read()
20312038
tm.assert_frame_equal(result, df)
20322039

20332040

0 commit comments

Comments
 (0)