Skip to content

Commit 169a3aa

Browse files
committed
REGR: be able to read Stata files without reading them fully into memory
Fixes pandas-dev#48700. Refs pandas-dev#9245 and pandas-dev#37639. Regressed in 6d1541e.
1 parent fb19ddb commit 169a3aa

File tree

4 files changed

+44
-5
lines changed

4 files changed

+44
-5
lines changed

doc/source/user_guide/io.rst

+7
Original file line number | Diff line number | Diff line change
@@ -6183,6 +6183,13 @@ values will have ``object`` data type.
61836183
``int64`` for all integer types and ``float64`` for floating point data. By default,
61846184
the Stata data types are preserved when importing.
61856185

6186+
.. note::
6187+
6188+
All :class:`~pandas.io.stata.StataReader` objects, whether created by :func:`~pandas.read_stata`
6189+
(when using ``iterator=True`` or ``chunksize``) or instantiated by hand, must be closed by
6190+
calling :meth:`~pandas.io.stata.StataReader.close` (or by using the ``with`` statement, as
6191+
in the examples above) to avoid leaking file handles.
6192+
61866193
.. ipython:: python
61876194
:suppress:
61886195

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,7 @@ Performance improvements
160160
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
161161
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
162162
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
163+
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
163164

164165
.. ---------------------------------------------------------------------------
165166
.. _whatsnew_200.bug_fixes:

pandas/io/stata.py

+13 -5
Original file line number | Diff line number | Diff line change
@@ -1163,15 +1163,23 @@ def __init__(
11631163
self._lines_read = 0
11641164

11651165
self._native_byteorder = _set_endianness(sys.byteorder)
1166-
with get_handle(
1166+
1167+
handles = get_handle(
11671168
path_or_buf,
11681169
"rb",
11691170
storage_options=storage_options,
11701171
is_text=False,
11711172
compression=compression,
1172-
) as handles:
1173-
# Copy to BytesIO, and ensure no encoding
1174-
self.path_or_buf = BytesIO(handles.handle.read())
1173+
)
1174+
if hasattr(handles.handle, "seekable") and handles.handle.seekable():
1175+
# If the handle is directly seekable, use it without an extra copy.
1176+
self.path_or_buf = handles.handle
1177+
self._close_file = handles.close
1178+
else:
1179+
# Copy to memory, and ensure no encoding.
1180+
with handles:
1181+
self.path_or_buf = BytesIO(handles.handle.read())
1182+
self._close_file = self.path_or_buf.close
11751183

11761184
self._read_header()
11771185
self._setup_dtype()
@@ -1191,7 +1199,7 @@ def __exit__(
11911199

11921200
def close(self) -> None:
11931201
"""close the handle if its open"""
1194-
self.path_or_buf.close()
1202+
self._close_file()
11951203

11961204
def _set_encoding(self) -> None:
11971205
"""

pandas/tests/io/test_stata.py

+23
Original file line number | Diff line number | Diff line change
@@ -1842,6 +1842,29 @@ def test_backward_compat(version, datapath):
18421842
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
18431843

18441844

1845+
def test_direct_read(datapath, monkeypatch):
1846+
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
1847+
1848+
# Test that opening a file path doesn't buffer the file.
1849+
with StataReader(file_path) as reader:
1850+
# Must not have been buffered to memory
1851+
assert not isinstance(reader.path_or_buf, io.BytesIO)
1852+
assert not reader.read().empty
1853+
1854+
# Test that we use a given fp exactly, if possible.
1855+
with open(file_path, "rb") as fp:
1856+
with StataReader(fp) as reader:
1857+
assert reader.path_or_buf is fp
1858+
assert not reader.read().empty
1859+
1860+
# Test that we use a given BytesIO exactly, if possible.
1861+
with open(file_path, "rb") as fp:
1862+
with io.BytesIO(fp.read()) as bio:
1863+
with StataReader(bio) as reader:
1864+
assert reader.path_or_buf is bio
1865+
assert not reader.read().empty
1866+
1867+
18451868
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
18461869
@pytest.mark.parametrize("use_dict", [True, False])
18471870
@pytest.mark.parametrize("infer", [True, False])

0 commit comments

Comments
 (0)