Skip to content

Commit 3b0f25f

Browse files
committed
Properly close opened files in StataReader
1 parent 1e39a5e commit 3b0f25f

File tree

1 file changed

+141
-141
lines changed

1 file changed

+141
-141
lines changed

pandas/io/stata.py

+141 -141
Original file line number | Diff line number | Diff line change
@@ -167,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True,
167167
chunksize=chunksize, encoding=encoding)
168168

169169
if iterator or chunksize:
170-
try:
171-
return reader
172-
except StopIteration:
173-
reader.close()
174-
175-
try:
176-
return reader.read()
177-
finally:
170+
data = reader
171+
else:
172+
data = reader.read()
178173
reader.close()
174+
return data
179175

180176
_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
181177

@@ -1411,150 +1407,154 @@ def read(self, nrows=None, convert_dates=None,
14111407
convert_categoricals=None, index=None,
14121408
convert_missing=None, preserve_dtypes=None,
14131409
columns=None, order_categoricals=None):
1414-
14151410
# Handle empty file or chunk. If reading incrementally raise
14161411
# StopIteration. If reading the whole thing return an empty
14171412
# data frame.
14181413
if (self.nobs == 0) and (nrows is None):
14191414
self._can_read_value_labels = True
14201415
self._data_read = True
1416+
self.close()
14211417
return DataFrame(columns=self.varlist)
14221418

1423-
# Handle options
1424-
if convert_dates is None:
1425-
convert_dates = self._convert_dates
1426-
if convert_categoricals is None:
1427-
convert_categoricals = self._convert_categoricals
1428-
if convert_missing is None:
1429-
convert_missing = self._convert_missing
1430-
if preserve_dtypes is None:
1431-
preserve_dtypes = self._preserve_dtypes
1432-
if columns is None:
1433-
columns = self._columns
1434-
if order_categoricals is None:
1435-
order_categoricals = self._order_categoricals
1436-
1437-
if nrows is None:
1438-
nrows = self.nobs
1439-
1440-
if (self.format_version >= 117) and (self._dtype is None):
1441-
self._can_read_value_labels = True
1442-
self._read_strls()
1443-
1444-
# Setup the dtype.
1445-
if self._dtype is None:
1446-
dtype = [] # Convert struct data types to numpy data type
1447-
for i, typ in enumerate(self.typlist):
1448-
if typ in self.NUMPY_TYPE_MAP:
1449-
dtype.append(('s' + str(i), self.byteorder +
1450-
self.NUMPY_TYPE_MAP[typ]))
1451-
else:
1452-
dtype.append(('s' + str(i), 'S' + str(typ)))
1453-
dtype = np.dtype(dtype)
1454-
self._dtype = dtype
1455-
1456-
# Read data
1457-
dtype = self._dtype
1458-
max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
1459-
read_len = nrows * dtype.itemsize
1460-
read_len = min(read_len, max_read_len)
1461-
if read_len <= 0:
1462-
# Iterator has finished, should never be here unless
1463-
# we are reading the file incrementally
1419+
try:
1420+
# Handle options
1421+
if convert_dates is None:
1422+
convert_dates = self._convert_dates
1423+
if convert_categoricals is None:
1424+
convert_categoricals = self._convert_categoricals
1425+
if convert_missing is None:
1426+
convert_missing = self._convert_missing
1427+
if preserve_dtypes is None:
1428+
preserve_dtypes = self._preserve_dtypes
1429+
if columns is None:
1430+
columns = self._columns
1431+
if order_categoricals is None:
1432+
order_categoricals = self._order_categoricals
1433+
1434+
if nrows is None:
1435+
nrows = self.nobs
1436+
1437+
if (self.format_version >= 117) and (self._dtype is None):
1438+
self._can_read_value_labels = True
1439+
self._read_strls()
1440+
1441+
# Setup the dtype.
1442+
if self._dtype is None:
1443+
dtype = [] # Convert struct data types to numpy data type
1444+
for i, typ in enumerate(self.typlist):
1445+
if typ in self.NUMPY_TYPE_MAP:
1446+
dtype.append(('s' + str(i), self.byteorder +
1447+
self.NUMPY_TYPE_MAP[typ]))
1448+
else:
1449+
dtype.append(('s' + str(i), 'S' + str(typ)))
1450+
dtype = np.dtype(dtype)
1451+
self._dtype = dtype
1452+
1453+
# Read data
1454+
dtype = self._dtype
1455+
max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
1456+
read_len = nrows * dtype.itemsize
1457+
read_len = min(read_len, max_read_len)
1458+
if read_len <= 0:
1459+
# Iterator has finished, should never be here unless
1460+
# we are reading the file incrementally
1461+
if convert_categoricals:
1462+
self._read_value_labels()
1463+
raise StopIteration
1464+
offset = self._lines_read * dtype.itemsize
1465+
self.path_or_buf.seek(self.data_location + offset)
1466+
read_lines = min(nrows, self.nobs - self._lines_read)
1467+
data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
1468+
count=read_lines)
1469+
1470+
self._lines_read += read_lines
1471+
if self._lines_read == self.nobs:
1472+
self._can_read_value_labels = True
1473+
self._data_read = True
1474+
# if necessary, swap the byte order to native here
1475+
if self.byteorder != self._native_byteorder:
1476+
data = data.byteswap().newbyteorder()
1477+
14641478
if convert_categoricals:
14651479
self._read_value_labels()
1466-
raise StopIteration
1467-
offset = self._lines_read * dtype.itemsize
1468-
self.path_or_buf.seek(self.data_location + offset)
1469-
read_lines = min(nrows, self.nobs - self._lines_read)
1470-
data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
1471-
count=read_lines)
1472-
1473-
self._lines_read += read_lines
1474-
if self._lines_read == self.nobs:
1475-
self._can_read_value_labels = True
1476-
self._data_read = True
1477-
# if necessary, swap the byte order to native here
1478-
if self.byteorder != self._native_byteorder:
1479-
data = data.byteswap().newbyteorder()
1480-
1481-
if convert_categoricals:
1482-
self._read_value_labels()
14831480

1484-
if len(data) == 0:
1485-
data = DataFrame(columns=self.varlist, index=index)
1486-
else:
1487-
data = DataFrame.from_records(data, index=index)
1488-
data.columns = self.varlist
1489-
1490-
# If index is not specified, use actual row number rather than
1491-
# restarting at 0 for each chunk.
1492-
if index is None:
1493-
ix = np.arange(self._lines_read - read_lines, self._lines_read)
1494-
data = data.set_index(ix)
1495-
1496-
if columns is not None:
1497-
data = self._do_select_columns(data, columns)
1498-
1499-
# Decode strings
1500-
for col, typ in zip(data, self.typlist):
1501-
if type(typ) is int:
1502-
data[col] = data[col].apply(
1503-
self._null_terminate, convert_dtype=True)
1504-
1505-
data = self._insert_strls(data)
1506-
1507-
cols_ = np.where(self.dtyplist)[0]
1508-
1509-
# Convert columns (if needed) to match input type
1510-
index = data.index
1511-
requires_type_conversion = False
1512-
data_formatted = []
1513-
for i in cols_:
1514-
if self.dtyplist[i] is not None:
1515-
col = data.columns[i]
1516-
dtype = data[col].dtype
1517-
if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
1518-
requires_type_conversion = True
1519-
data_formatted.append(
1520-
(col, Series(data[col], index, self.dtyplist[i])))
1521-
else:
1522-
data_formatted.append((col, data[col]))
1523-
if requires_type_conversion:
1524-
data = DataFrame.from_items(data_formatted)
1525-
del data_formatted
1526-
1527-
self._do_convert_missing(data, convert_missing)
1528-
1529-
if convert_dates:
1530-
cols = np.where(lmap(lambda x: x in _date_formats,
1531-
self.fmtlist))[0]
1532-
for i in cols:
1533-
col = data.columns[i]
1534-
data[col] = _stata_elapsed_date_to_datetime_vec(
1535-
data[col],
1536-
self.fmtlist[i])
1537-
1538-
if convert_categoricals and self.format_version > 108:
1539-
data = self._do_convert_categoricals(data,
1540-
self.value_label_dict,
1541-
self.lbllist,
1542-
order_categoricals)
1543-
1544-
if not preserve_dtypes:
1545-
retyped_data = []
1546-
convert = False
1547-
for col in data:
1548-
dtype = data[col].dtype
1549-
if dtype in (np.float16, np.float32):
1550-
dtype = np.float64
1551-
convert = True
1552-
elif dtype in (np.int8, np.int16, np.int32):
1553-
dtype = np.int64
1554-
convert = True
1555-
retyped_data.append((col, data[col].astype(dtype)))
1556-
if convert:
1557-
data = DataFrame.from_items(retyped_data)
1481+
if len(data) == 0:
1482+
data = DataFrame(columns=self.varlist, index=index)
1483+
else:
1484+
data = DataFrame.from_records(data, index=index)
1485+
data.columns = self.varlist
1486+
1487+
# If index is not specified, use actual row number rather than
1488+
# restarting at 0 for each chunk.
1489+
if index is None:
1490+
ix = np.arange(self._lines_read - read_lines, self._lines_read)
1491+
data = data.set_index(ix)
1492+
1493+
if columns is not None:
1494+
data = self._do_select_columns(data, columns)
1495+
1496+
# Decode strings
1497+
for col, typ in zip(data, self.typlist):
1498+
if type(typ) is int:
1499+
data[col] = data[col].apply(
1500+
self._null_terminate, convert_dtype=True)
1501+
1502+
data = self._insert_strls(data)
1503+
1504+
cols_ = np.where(self.dtyplist)[0]
1505+
1506+
# Convert columns (if needed) to match input type
1507+
index = data.index
1508+
requires_type_conversion = False
1509+
data_formatted = []
1510+
for i in cols_:
1511+
if self.dtyplist[i] is not None:
1512+
col = data.columns[i]
1513+
dtype = data[col].dtype
1514+
if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]):
1515+
requires_type_conversion = True
1516+
data_formatted.append(
1517+
(col, Series(data[col], index, self.dtyplist[i])))
1518+
else:
1519+
data_formatted.append((col, data[col]))
1520+
if requires_type_conversion:
1521+
data = DataFrame.from_items(data_formatted)
1522+
del data_formatted
1523+
1524+
self._do_convert_missing(data, convert_missing)
1525+
1526+
if convert_dates:
1527+
cols = np.where(lmap(lambda x: x in _date_formats,
1528+
self.fmtlist))[0]
1529+
for i in cols:
1530+
col = data.columns[i]
1531+
data[col] = _stata_elapsed_date_to_datetime_vec(
1532+
data[col],
1533+
self.fmtlist[i])
1534+
1535+
if convert_categoricals and self.format_version > 108:
1536+
data = self._do_convert_categoricals(data,
1537+
self.value_label_dict,
1538+
self.lbllist,
1539+
order_categoricals)
1540+
1541+
if not preserve_dtypes:
1542+
retyped_data = []
1543+
convert = False
1544+
for col in data:
1545+
dtype = data[col].dtype
1546+
if dtype in (np.float16, np.float32):
1547+
dtype = np.float64
1548+
convert = True
1549+
elif dtype in (np.int8, np.int16, np.int32):
1550+
dtype = np.int64
1551+
convert = True
1552+
retyped_data.append((col, data[col].astype(dtype)))
1553+
if convert:
1554+
data = DataFrame.from_items(retyped_data)
1555+
except:
1556+
self.close()
1557+
raise
15581558

15591559
return data
15601560

0 commit comments

Comments
 (0)