Skip to content

Commit 75fc34d

Browse files
committed
Make try/except blocks in StataReader.read as small as possible
1 parent 812e6ec commit 75fc34d

File tree

1 file changed

+135
-130
lines changed

1 file changed

+135
-130
lines changed

pandas/io/stata.py

+135-130
Original file line number | Diff line number | Diff line change
@@ -1416,146 +1416,151 @@ def read(self, nrows=None, convert_dates=None,
14161416
self.close()
14171417
return DataFrame(columns=self.varlist)
14181418

1419-
try:
1420-
# Handle options
1421-
if convert_dates is None:
1422-
convert_dates = self._convert_dates
1423-
if convert_categoricals is None:
1424-
convert_categoricals = self._convert_categoricals
1425-
if convert_missing is None:
1426-
convert_missing = self._convert_missing
1427-
if preserve_dtypes is None:
1428-
preserve_dtypes = self._preserve_dtypes
1429-
if columns is None:
1430-
columns = self._columns
1431-
if order_categoricals is None:
1432-
order_categoricals = self._order_categoricals
1433-
1434-
if nrows is None:
1435-
nrows = self.nobs
1436-
1437-
if (self.format_version >= 117) and (self._dtype is None):
1438-
self._can_read_value_labels = True
1439-
self._read_strls()
1440-
1441-
# Setup the dtype.
1442-
if self._dtype is None:
1443-
dtype = [] # Convert struct data types to numpy data type
1444-
for i, typ in enumerate(self.typlist):
1445-
if typ in self.NUMPY_TYPE_MAP:
1446-
dtype.append(('s' + str(i), self.byteorder +
1447-
self.NUMPY_TYPE_MAP[typ]))
1448-
else:
1449-
dtype.append(('s' + str(i), 'S' + str(typ)))
1450-
dtype = np.dtype(dtype)
1451-
self._dtype = dtype
1452-
1453-
# Read data
1454-
dtype = self._dtype
1455-
max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
1456-
read_len = nrows * dtype.itemsize
1457-
read_len = min(read_len, max_read_len)
1458-
if read_len <= 0:
1459-
# Iterator has finished, should never be here unless
1460-
# we are reading the file incrementally
1461-
if convert_categoricals:
1462-
self._read_value_labels()
1463-
raise StopIteration
1464-
offset = self._lines_read * dtype.itemsize
1465-
self.path_or_buf.seek(self.data_location + offset)
1466-
read_lines = min(nrows, self.nobs - self._lines_read)
1467-
data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
1468-
count=read_lines)
1469-
1470-
self._lines_read += read_lines
1471-
if self._lines_read == self.nobs:
1472-
self._can_read_value_labels = True
1473-
self._data_read = True
1474-
# if necessary, swap the byte order to native here
1475-
if self.byteorder != self._native_byteorder:
1476-
data = data.byteswap().newbyteorder()
1477-
1419+
# Handle options
1420+
if convert_dates is None:
1421+
convert_dates = self._convert_dates
1422+
if convert_categoricals is None:
1423+
convert_categoricals = self._convert_categoricals
1424+
if convert_missing is None:
1425+
convert_missing = self._convert_missing
1426+
if preserve_dtypes is None:
1427+
preserve_dtypes = self._preserve_dtypes
1428+
if columns is None:
1429+
columns = self._columns
1430+
if order_categoricals is None:
1431+
order_categoricals = self._order_categoricals
1432+
1433+
if nrows is None:
1434+
nrows = self.nobs
1435+
1436+
if (self.format_version >= 117) and (self._dtype is None):
1437+
self._can_read_value_labels = True
1438+
self._read_strls()
1439+
1440+
# Setup the dtype.
1441+
if self._dtype is None:
1442+
dtype = [] # Convert struct data types to numpy data type
1443+
for i, typ in enumerate(self.typlist):
1444+
if typ in self.NUMPY_TYPE_MAP:
1445+
dtype.append(('s' + str(i), self.byteorder +
1446+
self.NUMPY_TYPE_MAP[typ]))
1447+
else:
1448+
dtype.append(('s' + str(i), 'S' + str(typ)))
1449+
dtype = np.dtype(dtype)
1450+
self._dtype = dtype
1451+
1452+
# Read data
1453+
dtype = self._dtype
1454+
max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
1455+
read_len = nrows * dtype.itemsize
1456+
read_len = min(read_len, max_read_len)
1457+
if read_len <= 0:
1458+
# Iterator has finished, should never be here unless
1459+
# we are reading the file incrementally
14781460
if convert_categoricals:
14791461
self._read_value_labels()
1462+
self.close()
1463+
raise StopIteration
1464+
offset = self._lines_read * dtype.itemsize
1465+
self.path_or_buf.seek(self.data_location + offset)
1466+
read_lines = min(nrows, self.nobs - self._lines_read)
1467+
data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
1468+
count=read_lines)
1469+
1470+
self._lines_read += read_lines
1471+
if self._lines_read == self.nobs:
1472+
self._can_read_value_labels = True
1473+
self._data_read = True
1474+
# if necessary, swap the byte order to native here
1475+
if self.byteorder != self._native_byteorder:
1476+
data = data.byteswap().newbyteorder()
14801477

1481-
if len(data) == 0:
1482-
data = DataFrame(columns=self.varlist, index=index)
1483-
else:
1484-
data = DataFrame.from_records(data, index=index)
1485-
data.columns = self.varlist
1478+
if convert_categoricals:
1479+
self._read_value_labels()
14861480

1487-
# If index is not specified, use actual row number rather than
1488-
# restarting at 0 for each chunk.
1489-
if index is None:
1490-
ix = np.arange(self._lines_read - read_lines, self._lines_read)
1491-
data = data.set_index(ix)
1481+
if len(data) == 0:
1482+
data = DataFrame(columns=self.varlist, index=index)
1483+
else:
1484+
data = DataFrame.from_records(data, index=index)
1485+
data.columns = self.varlist
1486+
1487+
# If index is not specified, use actual row number rather than
1488+
# restarting at 0 for each chunk.
1489+
if index is None:
1490+
ix = np.arange(self._lines_read - read_lines, self._lines_read)
1491+
data = data.set_index(ix)
14921492

1493-
if columns is not None:
1493+
if columns is not None:
1494+
try:
14941495
data = self._do_select_columns(data, columns)
1496+
except ValueError:
1497+
self.close()
1498+
raise
14951499

1496-
# Decode strings
1497-
for col, typ in zip(data, self.typlist):
1498-
if type(typ) is int:
1499-
data[col] = data[col].apply(
1500-
self._null_terminate, convert_dtype=True)
1501-
1502-
data = self._insert_strls(data)
1503-
1504-
cols_ = np.where(self.dtyplist)[0]
1505-
1506-
# Convert columns (if needed) to match input type
1507-
index = data.index
1508-
requires_type_conversion = False
1509-
data_formatted = []
1510-
for i in cols_:
1511-
if self.dtyplist[i] is not None:
1512-
col = data.columns[i]
1513-
dtype = data[col].dtype
1514-
if ((dtype != np.dtype(object)) and
1515-
(dtype != self.dtyplist[i])):
1516-
requires_type_conversion = True
1517-
data_formatted.append(
1518-
(col, Series(data[col], index, self.dtyplist[i])))
1519-
else:
1520-
data_formatted.append((col, data[col]))
1521-
if requires_type_conversion:
1522-
data = DataFrame.from_items(data_formatted)
1523-
del data_formatted
1524-
1525-
self._do_convert_missing(data, convert_missing)
1526-
1527-
if convert_dates:
1528-
cols = np.where(lmap(lambda x: x in _date_formats,
1529-
self.fmtlist))[0]
1530-
for i in cols:
1531-
col = data.columns[i]
1500+
# Decode strings
1501+
for col, typ in zip(data, self.typlist):
1502+
if type(typ) is int:
1503+
data[col] = data[col].apply(
1504+
self._null_terminate, convert_dtype=True)
1505+
1506+
data = self._insert_strls(data)
1507+
1508+
cols_ = np.where(self.dtyplist)[0]
1509+
1510+
# Convert columns (if needed) to match input type
1511+
index = data.index
1512+
requires_type_conversion = False
1513+
data_formatted = []
1514+
for i in cols_:
1515+
if self.dtyplist[i] is not None:
1516+
col = data.columns[i]
1517+
dtype = data[col].dtype
1518+
if ((dtype != np.dtype(object)) and
1519+
(dtype != self.dtyplist[i])):
1520+
requires_type_conversion = True
1521+
data_formatted.append(
1522+
(col, Series(data[col], index, self.dtyplist[i])))
1523+
else:
1524+
data_formatted.append((col, data[col]))
1525+
if requires_type_conversion:
1526+
data = DataFrame.from_items(data_formatted)
1527+
del data_formatted
1528+
1529+
self._do_convert_missing(data, convert_missing)
1530+
1531+
if convert_dates:
1532+
cols = np.where(lmap(lambda x: x in _date_formats,
1533+
self.fmtlist))[0]
1534+
for i in cols:
1535+
col = data.columns[i]
1536+
try:
15321537
data[col] = _stata_elapsed_date_to_datetime_vec(
15331538
data[col],
15341539
self.fmtlist[i])
1535-
1536-
if convert_categoricals and self.format_version > 108:
1537-
data = self._do_convert_categoricals(data,
1538-
self.value_label_dict,
1539-
self.lbllist,
1540-
order_categoricals)
1541-
1542-
if not preserve_dtypes:
1543-
retyped_data = []
1544-
convert = False
1545-
for col in data:
1546-
dtype = data[col].dtype
1547-
if dtype in (np.float16, np.float32):
1548-
dtype = np.float64
1549-
convert = True
1550-
elif dtype in (np.int8, np.int16, np.int32):
1551-
dtype = np.int64
1552-
convert = True
1553-
retyped_data.append((col, data[col].astype(dtype)))
1554-
if convert:
1555-
data = DataFrame.from_items(retyped_data)
1556-
except:
1557-
self.close()
1558-
raise
1540+
except ValueError:
1541+
self.close()
1542+
raise
1543+
1544+
if convert_categoricals and self.format_version > 108:
1545+
data = self._do_convert_categoricals(data,
1546+
self.value_label_dict,
1547+
self.lbllist,
1548+
order_categoricals)
1549+
1550+
if not preserve_dtypes:
1551+
retyped_data = []
1552+
convert = False
1553+
for col in data:
1554+
dtype = data[col].dtype
1555+
if dtype in (np.float16, np.float32):
1556+
dtype = np.float64
1557+
convert = True
1558+
elif dtype in (np.int8, np.int16, np.int32):
1559+
dtype = np.int64
1560+
convert = True
1561+
retyped_data.append((col, data[col].astype(dtype)))
1562+
if convert:
1563+
data = DataFrame.from_items(retyped_data)
15591564

15601565
return data
15611566

0 commit comments

Comments
 (0)