Skip to content

Commit 2f5d192

Browse files
committed
FIX: StataReader: defer opening file to when data is required
1 parent f9c19cc commit 2f5d192

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

pandas/io/stata.py

+37 -3
Original file line number | Diff line number | Diff line change
@@ -1144,6 +1144,9 @@ def __init__(
11441144
self._preserve_dtypes = preserve_dtypes
11451145
self._columns = columns
11461146
self._order_categoricals = order_categoricals
1147+
self._original_path_or_buf = path_or_buf
1148+
self._compression = compression
1149+
self._storage_options = storage_options
11471150
self._encoding = ""
11481151
self._chunksize = chunksize
11491152
self._using_iterator = False
@@ -1153,6 +1156,7 @@ def __init__(
11531156
raise ValueError("chunksize must be a positive integer when set.")
11541157

11551158
# State variables for the file
1159+
self._path_or_buf = None
11561160
self._has_string_data = False
11571161
self._missing_values = False
11581162
self._can_read_value_labels = False
@@ -1163,12 +1167,24 @@ def __init__(
11631167
self._lines_read = 0
11641168

11651169
self._native_byteorder = _set_endianness(sys.byteorder)
1170+
1171+
def _ensure_open(self) -> None:
1172+
"""
1173+
Ensure the file has been opened and its header data read.
1174+
"""
1175+
if self._path_or_buf is None:
1176+
self._open_file()
1177+
1178+
def _open_file(self) -> None:
1179+
"""
1180+
Open the file (with compression options, etc.), and read header information.
1181+
"""
11661182
with get_handle(
1167-
path_or_buf,
1183+
self._original_path_or_buf,
11681184
"rb",
1169-
storage_options=storage_options,
1185+
storage_options=self._storage_options,
11701186
is_text=False,
1171-
compression=compression,
1187+
compression=self._compression,
11721188
) as handles:
11731189
# Copy to BytesIO, and ensure no encoding
11741190
self._path_or_buf = BytesIO(handles.handle.read())
@@ -1536,6 +1552,7 @@ def _decode(self, s: bytes) -> str:
15361552
return s.decode("latin-1")
15371553

15381554
def _read_value_labels(self) -> None:
1555+
self._ensure_open()
15391556
if self._value_labels_read:
15401557
# Don't read twice
15411558
return
@@ -1655,6 +1672,7 @@ def read(
16551672
columns: Sequence[str] | None = None,
16561673
order_categoricals: bool | None = None,
16571674
) -> DataFrame:
1675+
self._ensure_open()
16581676
# Handle empty file or chunk. If reading incrementally raise
16591677
# StopIteration. If reading the whole thing return an empty
16601678
# data frame.
@@ -1983,57 +2001,72 @@ def data_label(self) -> str:
19832001
"""
19842002
Return data label of Stata file.
19852003
"""
2004+
self._ensure_open()
19862005
return self._data_label
19872006

19882007
@property
19892008
def typlist(self) -> list[int | str]:
19902009
"""
19912010
Return list of variable types.
19922011
"""
2012+
self._ensure_open()
19932013
return self._typlist
19942014

19952015
@property
19962016
def dtyplist(self) -> list[str | np.dtype]:
19972017
"""
19982018
Return list of variable types.
19992019
"""
2020+
self._ensure_open()
20002021
return self._dtyplist
20012022

20022023
@property
20032024
def lbllist(self) -> list[str]:
20042025
"""
20052026
Return list of variable labels.
20062027
"""
2028+
self._ensure_open()
20072029
return self._lbllist
20082030

20092031
@property
20102032
def varlist(self) -> list[str]:
20112033
"""
20122034
Return list of variable names.
20132035
"""
2036+
self._ensure_open()
20142037
return self._varlist
20152038

20162039
@property
20172040
def fmtlist(self) -> list[str]:
20182041
"""
20192042
Return list of variable formats.
20202043
"""
2044+
self._ensure_open()
20212045
return self._fmtlist
20222046

20232047
@property
20242048
def time_stamp(self) -> str:
20252049
"""
20262050
Return time stamp of Stata file.
20272051
"""
2052+
self._ensure_open()
20282053
return self._time_stamp
20292054

20302055
@property
20312056
def format_version(self) -> int:
20322057
"""
20332058
Return format version of Stata file.
20342059
"""
2060+
self._ensure_open()
20352061
return self._format_version
20362062

2063+
@property
2064+
def path_or_buf(self) -> IO[bytes]:
2065+
"""
2066+
Return the file handle of the Stata file being read.
2067+
"""
2068+
return self._path_or_buf
2069+
20372070
def variable_labels(self) -> dict[str, str]:
20382071
"""
20392072
Return a dict associating each variable name with corresponding label.
@@ -2042,6 +2075,7 @@ def variable_labels(self) -> dict[str, str]:
20422075
-------
20432076
dict
20442077
"""
2078+
self._ensure_open()
20452079
return dict(zip(self._varlist, self._variable_labels))
20462080

20472081
def value_labels(self) -> dict[str, dict[float, str]]:

0 commit comments

Comments
 (0)