pandas-dev · akx · Oct 3, 2022
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -6183,6 +6183,13 @@ values will have ``object`` data type.
    ``int64`` for all integer types and ``float64`` for floating point data.  By default,
    the Stata data types are preserved when importing.
 
+.. note::
+
+   All :class:`~pandas.io.stata.StataReader` objects, whether created by :func:`~pandas.read_stata`
+   (when using ``iterator=True`` or ``chunksize``) or instantiated by hand, must be closed by
+   calling :meth:`~pandas.io.stata.StataReader.close` (or by using the ``with`` statement, as
+   in the examples above) to avoid leaking file handles.
+
 .. ipython:: python
    :suppress:
 

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -160,6 +160,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1163,15 +1163,23 @@ def __init__(
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
-        with get_handle(
+
+        handles = get_handle(
             path_or_buf,
             "rb",
             storage_options=storage_options,
             is_text=False,
             compression=compression,
-        ) as handles:
-            # Copy to BytesIO, and ensure no encoding
-            self.path_or_buf = BytesIO(handles.handle.read())
+        )
+        if hasattr(handles.handle, "seekable") and handles.handle.seekable():
+            # If the handle is directly seekable, use it without an extra copy.
+            self.path_or_buf = handles.handle
+            self._close_file = handles.close
+        else:
+            # Copy to memory, and ensure no encoding.
+            with handles:
+                self.path_or_buf = BytesIO(handles.handle.read())
+            self._close_file = self.path_or_buf.close
 
         self._read_header()
         self._setup_dtype()
@@ -1191,7 +1199,7 @@ def __exit__(
 
     def close(self) -> None:
-        """close the handle if its open"""
-        self.path_or_buf.close()
+        """Close the handle if it's open."""
+        self._close_file()
 
     def _set_encoding(self) -> None:
         """

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1842,6 +1842,29 @@ def test_backward_compat(version, datapath):
     tm.assert_frame_equal(old_dta, expected, check_dtype=False)
 
 
+def test_direct_read(datapath):
+    """Seekable inputs are read in place instead of being copied to memory."""
+    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
+
+    # Test that opening a file path doesn't buffer the file to memory.
+    with StataReader(file_path) as reader:
+        assert not isinstance(reader.path_or_buf, io.BytesIO)
+        assert not reader.read().empty
+
+    # Test that we use a given fp exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with StataReader(fp) as reader:
+            assert reader.path_or_buf is fp
+            assert not reader.read().empty
+
+    # Test that we use a given BytesIO exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with io.BytesIO(fp.read()) as bio:
+            with StataReader(bio) as reader:
+                assert reader.path_or_buf is bio
+                assert not reader.read().empty
+
+
 @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
 @pytest.mark.parametrize("use_dict", [True, False])
 @pytest.mark.parametrize("infer", [True, False])