Skip to content

Commit c1470a9

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent 3351ce7 commit c1470a9

File tree

2 files changed

+28
-42
lines changed

2 files changed

+28
-42
lines changed

pandas/io/stata.py

+26 -38
Original file line number | Diff line number | Diff line change
@@ -1118,6 +1118,8 @@ def __init__(self) -> None:
11181118
class StataReader(StataParser, abc.Iterator):
11191119
__doc__ = _stata_reader_doc
11201120

1121+
_path_or_buf: IO[bytes] # Will be assigned by `_open_file`.
1122+
11211123
def __init__(
11221124
self,
11231125
path_or_buf: FilePath | ReadBuffer[bytes],
@@ -1144,6 +1146,9 @@ def __init__(
11441146
self._preserve_dtypes = preserve_dtypes
11451147
self._columns = columns
11461148
self._order_categoricals = order_categoricals
1149+
self._original_path_or_buf = path_or_buf
1150+
self._compression = compression
1151+
self._storage_options = storage_options
11471152
self._encoding = ""
11481153
self._chunksize = chunksize
11491154
self._using_iterator = False
@@ -1153,6 +1158,7 @@ def __init__(
11531158
raise ValueError("chunksize must be a positive integer when set.")
11541159

11551160
# State variables for the file
1161+
self._close_file: Callable[[], None] | None = None
11561162
self._has_string_data = False
11571163
self._missing_values = False
11581164
self._can_read_value_labels = False
@@ -1163,12 +1169,24 @@ def __init__(
11631169
self._lines_read = 0
11641170

11651171
self._native_byteorder = _set_endianness(sys.byteorder)
1172+
1173+
def _ensure_open(self) -> None:
1174+
"""
1175+
Ensure the file has been opened and its header data read.
1176+
"""
1177+
if not hasattr(self, "_path_or_buf"):
1178+
self._open_file()
1179+
1180+
def _open_file(self) -> None:
1181+
"""
1182+
Open the file (with compression options, etc.), and read header information.
1183+
"""
11661184
with get_handle(
1167-
path_or_buf,
1185+
self._original_path_or_buf,
11681186
"rb",
1169-
storage_options=storage_options,
1187+
storage_options=self._storage_options,
11701188
is_text=False,
1171-
compression=compression,
1189+
compression=self._compression,
11721190
) as handles:
11731191
# Copy to BytesIO, and ensure no encoding
11741192
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1535,6 +1553,7 @@ def _decode(self, s: bytes) -> str:
15351553
return s.decode("latin-1")
15361554

15371555
def _read_value_labels(self) -> None:
1556+
self._ensure_open()
15381557
if self._value_labels_read:
15391558
# Don't read twice
15401559
return
@@ -1654,6 +1673,7 @@ def read(
16541673
columns: Sequence[str] | None = None,
16551674
order_categoricals: bool | None = None,
16561675
) -> DataFrame:
1676+
self._ensure_open()
16571677
# Handle empty file or chunk. If reading incrementally raise
16581678
# StopIteration. If reading the whole thing return an empty
16591679
# data frame.
@@ -1982,48 +2002,15 @@ def data_label(self) -> str:
19822002
"""
19832003
Return data label of Stata file.
19842004
"""
2005+
self._ensure_open()
19852006
return self._data_label
19862007

1987-
@property
1988-
def typlist(self) -> list[int | str]:
1989-
"""
1990-
Return list of variable types.
1991-
"""
1992-
return self._typlist
1993-
1994-
@property
1995-
def dtyplist(self) -> list[str | np.dtype]:
1996-
"""
1997-
Return list of variable types.
1998-
"""
1999-
return self._dtyplist
2000-
2001-
@property
2002-
def lbllist(self) -> list[str]:
2003-
"""
2004-
Return list of variable labels.
2005-
"""
2006-
return self._lbllist
2007-
2008-
@property
2009-
def varlist(self) -> list[str]:
2010-
"""
2011-
Return list of variable names.
2012-
"""
2013-
return self._varlist
2014-
2015-
@property
2016-
def fmtlist(self) -> list[str]:
2017-
"""
2018-
Return list of variable formats.
2019-
"""
2020-
return self._fmtlist
2021-
20222008
@property
20232009
def time_stamp(self) -> str:
20242010
"""
20252011
Return time stamp of Stata file.
20262012
"""
2013+
self._ensure_open()
20272014
return self._time_stamp
20282015

20292016
def variable_labels(self) -> dict[str, str]:
@@ -2034,6 +2021,7 @@ def variable_labels(self) -> dict[str, str]:
20342021
-------
20352022
dict
20362023
"""
2024+
self._ensure_open()
20372025
return dict(zip(self._varlist, self._variable_labels))
20382026

20392027
def value_labels(self) -> dict[str, dict[float, str]]:

pandas/tests/io/test_stata.py

+2 -4
Original file line number | Diff line number | Diff line change
@@ -742,10 +742,8 @@ def test_minimal_size_col(self):
742742
original.to_stata(path, write_index=False)
743743

744744
with StataReader(path) as sr:
745-
typlist = sr.typlist
746-
variables = sr.varlist
747-
formats = sr.fmtlist
748-
for variable, fmt, typ in zip(variables, formats, typlist):
745+
sr._ensure_open() # The `_*list` variables are initialized here
746+
for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
749747
assert int(variable[1:]) == int(fmt[1:-1])
750748
assert int(variable[1:]) == typ
751749

0 commit comments

Comments
 (0)