Skip to content

Commit c476b52

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent bb17e67 commit c476b52

File tree

1 file changed

+39
-3
lines changed

1 file changed

+39
-3
lines changed

pandas/io/stata.py

+39-3
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,9 @@ def __init__(
11431143
self._preserve_dtypes = preserve_dtypes
11441144
self._columns = columns
11451145
self._order_categoricals = order_categoricals
1146+
self._original_path_or_buf = path_or_buf
1147+
self._compression = compression
1148+
self._storage_options = storage_options
11461149
self._encoding = ""
11471150
self._chunksize = chunksize
11481151
self._using_iterator = False
@@ -1152,6 +1155,9 @@ def __init__(
11521155
raise ValueError("chunksize must be a positive integer when set.")
11531156

11541157
# State variables for the file
1158+
# NB: _path_or_buf is mistyped on purpose, since the alternative is to placate
1159+
# mypy by having an assert before every read.
1160+
self._path_or_buf: IO[bytes] = None # type: ignore[assignment]
11551161
self._has_string_data = False
11561162
self._missing_values = False
11571163
self._can_read_value_labels = False
@@ -1162,12 +1168,24 @@ def __init__(
11621168
self._lines_read = 0
11631169

11641170
self._native_byteorder = _set_endianness(sys.byteorder)
1171+
1172+
def _ensure_open(self) -> None:
    """
    Open the file on first use so that its header data is available.

    Does nothing when ``self._path_or_buf`` has already been set.
    """
    if self._path_or_buf is not None:
        # Already opened; nothing to do.
        return
    self._open_file()
1178+
1179+
def _open_file(self) -> None:
1180+
"""
1181+
Open the file (with compression options, etc.), and read header information.
1182+
"""
11651183
with get_handle(
1166-
path_or_buf,
1184+
self._original_path_or_buf,
11671185
"rb",
1168-
storage_options=storage_options,
1186+
storage_options=self._storage_options,
11691187
is_text=False,
1170-
compression=compression,
1188+
compression=self._compression,
11711189
) as handles:
11721190
# Copy to BytesIO, and ensure no encoding
11731191
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1534,6 +1552,7 @@ def _decode(self, s: bytes) -> str:
15341552
return s.decode("latin-1")
15351553

15361554
def _read_value_labels(self) -> None:
1555+
self._ensure_open()
15371556
if self._value_labels_read:
15381557
# Don't read twice
15391558
return
@@ -1653,6 +1672,7 @@ def read(
16531672
columns: Sequence[str] | None = None,
16541673
order_categoricals: bool | None = None,
16551674
) -> DataFrame:
1675+
self._ensure_open()
16561676
# Handle empty file or chunk. If reading incrementally raise
16571677
# StopIteration. If reading the whole thing return an empty
16581678
# data frame.
@@ -1981,57 +2001,72 @@ def data_label(self) -> str:
19812001
"""
19822002
Return data label of Stata file.
19832003
"""
2004+
self._ensure_open()
19842005
return self._data_label
19852006

19862007
@property
def typlist(self) -> list[int | str]:
    """
    Return the variable types of the Stata file as a list.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._typlist
19922014

19932015
@property
def dtyplist(self) -> list[str | np.dtype]:
    """
    Return the list of variable dtypes (strings or NumPy dtypes).

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._dtyplist
19992022

20002023
@property
def lbllist(self) -> list[str]:
    """
    Return the variable labels of the Stata file as a list.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._lbllist
20062030

20072031
@property
def varlist(self) -> list[str]:
    """
    Return the variable names of the Stata file as a list.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._varlist
20132038

20142039
@property
def fmtlist(self) -> list[str]:
    """
    Return the variable formats of the Stata file as a list.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._fmtlist
20202046

20212047
@property
def time_stamp(self) -> str:
    """
    Return the time stamp stored for the Stata file.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._time_stamp
20272054

20282055
@property
def format_version(self) -> int:
    """
    Return the format version of the Stata file.

    The file is opened first if that has not happened yet.
    """
    self._ensure_open()
    return self._format_version
20342062

2063+
@property
def path_or_buf(self) -> IO[bytes]:
    """
    Return the file handle of the Stata file being read.

    Opening of the file is deferred, so ``self._path_or_buf`` stays ``None``
    until something triggers the open. Like the other accessors, this
    property opens the file on demand so callers always receive a usable
    handle rather than ``None``.
    """
    # Without this call the property would hand back None when accessed
    # before any read, contradicting the IO[bytes] annotation.
    self._ensure_open()
    return self._path_or_buf
2069+
20352070
def variable_labels(self) -> dict[str, str]:
20362071
"""
20372072
Return a dict associating each variable name with corresponding label.
@@ -2040,6 +2075,7 @@ def variable_labels(self) -> dict[str, str]:
20402075
-------
20412076
dict
20422077
"""
2078+
self._ensure_open()
20432079
return dict(zip(self._varlist, self._variable_labels))
20442080

20452081
def value_labels(self) -> dict[str, dict[float, str]]:

0 commit comments

Comments
 (0)