Skip to content

Commit 0c51920

Browse files
committed
REGR: be able to read Stata files without reading them fully into memory
Fixes pandas-dev#48700. Refs pandas-dev#9245. Refs pandas-dev#37639. Regressed in 6d1541e.
1 parent 55dc324 commit 0c51920

File tree

3 files changed

+41
-6
lines changed

3 files changed

+41
-6
lines changed

doc/source/whatsnew/v1.6.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,7 @@ Performance improvements
155155
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
156156
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
157157
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
158+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
158159

159160
.. ---------------------------------------------------------------------------
160161
.. _whatsnew_160.bug_fixes:

pandas/io/stata.py

+13-5
Original file line number | Diff line number | Diff line change
@@ -1164,15 +1164,23 @@ def __init__(
11641164
self._lines_read = 0
11651165

11661166
self._native_byteorder = _set_endianness(sys.byteorder)
1167-
with get_handle(
1167+
1168+
handles = get_handle(
11681169
path_or_buf,
11691170
"rb",
11701171
storage_options=storage_options,
11711172
is_text=False,
11721173
compression=compression,
1173-
) as handles:
1174-
# Copy to BytesIO, and ensure no encoding
1175-
self.path_or_buf = BytesIO(handles.handle.read())
1174+
)
1175+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1176+
# If the handle is directly seekable, use it without an extra copy.
1177+
self.path_or_buf = handles.handle
1178+
self._close_file = handles.close
1179+
else:
1180+
# Copy to memory, and ensure no encoding.
1181+
with handles:
1182+
self.path_or_buf = BytesIO(handles.handle.read())
1183+
self._close_file = self.path_or_buf.close
11761184

11771185
self._read_header()
11781186
self._setup_dtype()
@@ -1192,7 +1200,7 @@ def __exit__(
11921200

11931201
def close(self) -> None:
11941202
"""close the handle if its open"""
1195-
self.path_or_buf.close()
1203+
self._close_file()
11961204

11971205
def _set_encoding(self) -> None:
11981206
"""

pandas/tests/io/test_stata.py

+27-1
Original file line number | Diff line number | Diff line change
@@ -1848,6 +1848,29 @@ def test_backward_compat(version, datapath):
18481848
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18491849

18501850

1851+
def test_direct_read(datapath, monkeypatch):
1852+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1853+
1854+
# Test that opening a file path doesn't buffer the file.
1855+
with StataReader(file_path) as reader:
1856+
# Must not have been buffered to memory
1857+
assert not isinstance(reader.path_or_buf, io.BytesIO)
1858+
assert not reader.read().empty
1859+
1860+
# Test that we use a given fp exactly, if possible.
1861+
with open(file_path, "rb") as fp:
1862+
with StataReader(fp) as reader:
1863+
assert reader.path_or_buf is fp
1864+
assert not reader.read().empty
1865+
1866+
# Test that we use a given BytesIO exactly, if possible.
1867+
with open(file_path, "rb") as fp:
1868+
with io.BytesIO(fp.read()) as bio:
1869+
with StataReader(bio) as reader:
1870+
assert reader.path_or_buf is bio
1871+
assert not reader.read().empty
1872+
1873+
18511874
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18521875
@pytest.mark.parametrize("use_dict", [True, False])
18531876
@pytest.mark.parametrize("infer", [True, False])
@@ -2027,7 +2050,10 @@ def test_stata_compression(compression_only, read_infer, to_infer):
20272050

20282051
with tm.ensure_clean(filename) as path:
20292052
df.to_stata(path, compression=to_compression)
2030-
result = read_stata(path, compression=read_compression, index_col="index")
2053+
with StataReader(
2054+
path, compression=read_compression, index_col="index"
2055+
) as reader:
2056+
result = reader.read()
20312057
tm.assert_frame_equal(result, df)
20322058

20332059

0 commit comments

Comments
 (0)