Skip to content

Commit dba4e40

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent 8e94f6f commit dba4e40

File tree

2 files changed

+29
-42
lines changed

2 files changed

+29
-42
lines changed

pandas/io/stata.py

+27 -38
Original file line number | Diff line number | Diff line change
@@ -1143,6 +1143,9 @@ def __init__(
11431143
self._preserve_dtypes = preserve_dtypes
11441144
self._columns = columns
11451145
self._order_categoricals = order_categoricals
1146+
self._original_path_or_buf = path_or_buf
1147+
self._compression = compression
1148+
self._storage_options = storage_options
11461149
self._encoding = ""
11471150
self._chunksize = chunksize
11481151
self._using_iterator = False
@@ -1152,6 +1155,9 @@ def __init__(
11521155
raise ValueError("chunksize must be a positive integer when set.")
11531156

11541157
# State variables for the file
1158+
# NB: _path_or_buf is mistyped on purpose, since the alternative is to placate
1159+
# mypy by having an assert before every read.
1160+
self._path_or_buf: IO[bytes] = None # type: ignore[assignment]
11551161
self._has_string_data = False
11561162
self._missing_values = False
11571163
self._can_read_value_labels = False
@@ -1162,12 +1168,24 @@ def __init__(
11621168
self._lines_read = 0
11631169

11641170
self._native_byteorder = _set_endianness(sys.byteorder)
1171+
1172+
def _ensure_open(self) -> None:
1173+
"""
1174+
Ensure the file has been opened and its header data read.
1175+
"""
1176+
if self._path_or_buf is None:
1177+
self._open_file()
1178+
1179+
def _open_file(self) -> None:
1180+
"""
1181+
Open the file (with compression options, etc.), and read header information.
1182+
"""
11651183
with get_handle(
1166-
path_or_buf,
1184+
self._original_path_or_buf,
11671185
"rb",
1168-
storage_options=storage_options,
1186+
storage_options=self._storage_options,
11691187
is_text=False,
1170-
compression=compression,
1188+
compression=self._compression,
11711189
) as handles:
11721190
# Copy to BytesIO, and ensure no encoding
11731191
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1534,6 +1552,7 @@ def _decode(self, s: bytes) -> str:
15341552
return s.decode("latin-1")
15351553

15361554
def _read_value_labels(self) -> None:
1555+
self._ensure_open()
15371556
if self._value_labels_read:
15381557
# Don't read twice
15391558
return
@@ -1653,6 +1672,7 @@ def read(
16531672
columns: Sequence[str] | None = None,
16541673
order_categoricals: bool | None = None,
16551674
) -> DataFrame:
1675+
self._ensure_open()
16561676
# Handle empty file or chunk. If reading incrementally raise
16571677
# StopIteration. If reading the whole thing return an empty
16581678
# data frame.
@@ -1981,55 +2001,23 @@ def data_label(self) -> str:
19812001
"""
19822002
Return data label of Stata file.
19832003
"""
2004+
self._ensure_open()
19842005
return self._data_label
19852006

1986-
@property
1987-
def typlist(self) -> list[int | str]:
1988-
"""
1989-
Return list of variable types.
1990-
"""
1991-
return self._typlist
1992-
1993-
@property
1994-
def dtyplist(self) -> list[str | np.dtype]:
1995-
"""
1996-
Return list of variable types.
1997-
"""
1998-
return self._dtyplist
1999-
2000-
@property
2001-
def lbllist(self) -> list[str]:
2002-
"""
2003-
Return list of variable labels.
2004-
"""
2005-
return self._lbllist
2006-
2007-
@property
2008-
def varlist(self) -> list[str]:
2009-
"""
2010-
Return list of variable names.
2011-
"""
2012-
return self._varlist
2013-
2014-
@property
2015-
def fmtlist(self) -> list[str]:
2016-
"""
2017-
Return list of variable formats.
2018-
"""
2019-
return self._fmtlist
2020-
20212007
@property
20222008
def time_stamp(self) -> str:
20232009
"""
20242010
Return time stamp of Stata file.
20252011
"""
2012+
self._ensure_open()
20262013
return self._time_stamp
20272014

20282015
@property
20292016
def format_version(self) -> int:
20302017
"""
20312018
Return format version of Stata file.
20322019
"""
2020+
self._ensure_open()
20332021
return self._format_version
20342022

20352023
def variable_labels(self) -> dict[str, str]:
@@ -2040,6 +2028,7 @@ def variable_labels(self) -> dict[str, str]:
20402028
-------
20412029
dict
20422030
"""
2031+
self._ensure_open()
20432032
return dict(zip(self._varlist, self._variable_labels))
20442033

20452034
def value_labels(self) -> dict[str, dict[float, str]]:

pandas/tests/io/test_stata.py

+2 -4
Original file line number | Diff line number | Diff line change
@@ -694,10 +694,8 @@ def test_minimal_size_col(self):
694694
original.to_stata(path, write_index=False)
695695

696696
with StataReader(path) as sr:
697-
typlist = sr.typlist
698-
variables = sr.varlist
699-
formats = sr.fmtlist
700-
for variable, fmt, typ in zip(variables, formats, typlist):
697+
sr._ensure_open() # The `_*list` variables are initialized here
698+
for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
701699
assert int(variable[1:]) == int(fmt[1:-1])
702700
assert int(variable[1:]) == typ
703701

0 commit comments

Comments
 (0)