Skip to content

Commit e8bf7b7

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent 0fe113c commit e8bf7b7

File tree

2 files changed

+28
-42
lines changed

2 files changed

+28
-42
lines changed

pandas/io/stata.py

+26-38
Original file line number | Diff line number | Diff line change
@@ -1117,6 +1117,8 @@ def __init__(self) -> None:
11171117
class StataReader(StataParser, abc.Iterator):
11181118
__doc__ = _stata_reader_doc
11191119

1120+
_path_or_buf: IO[bytes] # Will be assigned by `_open_file`.
1121+
11201122
def __init__(
11211123
self,
11221124
path_or_buf: FilePath | ReadBuffer[bytes],
@@ -1143,6 +1145,9 @@ def __init__(
11431145
self._preserve_dtypes = preserve_dtypes
11441146
self._columns = columns
11451147
self._order_categoricals = order_categoricals
1148+
self._original_path_or_buf = path_or_buf
1149+
self._compression = compression
1150+
self._storage_options = storage_options
11461151
self._encoding = ""
11471152
self._chunksize = chunksize
11481153
self._using_iterator = False
@@ -1152,6 +1157,7 @@ def __init__(
11521157
raise ValueError("chunksize must be a positive integer when set.")
11531158

11541159
# State variables for the file
1160+
self._close_file: Callable[[], None] | None = None
11551161
self._has_string_data = False
11561162
self._missing_values = False
11571163
self._can_read_value_labels = False
@@ -1162,12 +1168,24 @@ def __init__(
11621168
self._lines_read = 0
11631169

11641170
self._native_byteorder = _set_endianness(sys.byteorder)
1171+
1172+
def _ensure_open(self) -> None:
1173+
"""
1174+
Ensure the file has been opened and its header data read.
1175+
"""
1176+
if not hasattr(self, "_path_or_buf"):
1177+
self._open_file()
1178+
1179+
def _open_file(self) -> None:
1180+
"""
1181+
Open the file (with compression options, etc.), and read header information.
1182+
"""
11651183
with get_handle(
1166-
path_or_buf,
1184+
self._original_path_or_buf,
11671185
"rb",
1168-
storage_options=storage_options,
1186+
storage_options=self._storage_options,
11691187
is_text=False,
1170-
compression=compression,
1188+
compression=self._compression,
11711189
) as handles:
11721190
# Copy to BytesIO, and ensure no encoding
11731191
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1534,6 +1552,7 @@ def _decode(self, s: bytes) -> str:
15341552
return s.decode("latin-1")
15351553

15361554
def _read_value_labels(self) -> None:
1555+
self._ensure_open()
15371556
if self._value_labels_read:
15381557
# Don't read twice
15391558
return
@@ -1653,6 +1672,7 @@ def read(
16531672
columns: Sequence[str] | None = None,
16541673
order_categoricals: bool | None = None,
16551674
) -> DataFrame:
1675+
self._ensure_open()
16561676
# Handle empty file or chunk. If reading incrementally raise
16571677
# StopIteration. If reading the whole thing return an empty
16581678
# data frame.
@@ -1981,48 +2001,15 @@ def data_label(self) -> str:
19812001
"""
19822002
Return data label of Stata file.
19832003
"""
2004+
self._ensure_open()
19842005
return self._data_label
19852006

1986-
@property
1987-
def typlist(self) -> list[int | str]:
1988-
"""
1989-
Return list of variable types.
1990-
"""
1991-
return self._typlist
1992-
1993-
@property
1994-
def dtyplist(self) -> list[str | np.dtype]:
1995-
"""
1996-
Return list of variable types.
1997-
"""
1998-
return self._dtyplist
1999-
2000-
@property
2001-
def lbllist(self) -> list[str]:
2002-
"""
2003-
Return list of variable labels.
2004-
"""
2005-
return self._lbllist
2006-
2007-
@property
2008-
def varlist(self) -> list[str]:
2009-
"""
2010-
Return list of variable names.
2011-
"""
2012-
return self._varlist
2013-
2014-
@property
2015-
def fmtlist(self) -> list[str]:
2016-
"""
2017-
Return list of variable formats.
2018-
"""
2019-
return self._fmtlist
2020-
20212007
@property
20222008
def time_stamp(self) -> str:
20232009
"""
20242010
Return time stamp of Stata file.
20252011
"""
2012+
self._ensure_open()
20262013
return self._time_stamp
20272014

20282015
def variable_labels(self) -> dict[str, str]:
@@ -2033,6 +2020,7 @@ def variable_labels(self) -> dict[str, str]:
20332020
-------
20342021
dict
20352022
"""
2023+
self._ensure_open()
20362024
return dict(zip(self._varlist, self._variable_labels))
20372025

20382026
def value_labels(self) -> dict[str, dict[float, str]]:

pandas/tests/io/test_stata.py

+2-4
Original file line number | Diff line number | Diff line change
@@ -694,10 +694,8 @@ def test_minimal_size_col(self):
694694
original.to_stata(path, write_index=False)
695695

696696
with StataReader(path) as sr:
697-
typlist = sr.typlist
698-
variables = sr.varlist
699-
formats = sr.fmtlist
700-
for variable, fmt, typ in zip(variables, formats, typlist):
697+
sr._ensure_open() # The `_*list` variables are initialized here
698+
for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
701699
assert int(variable[1:]) == int(fmt[1:-1])
702700
assert int(variable[1:]) == typ
703701

0 commit comments

Comments
 (0)