@@ -167,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True,
167
167
chunksize = chunksize , encoding = encoding )
168
168
169
169
if iterator or chunksize :
170
- try :
171
- return reader
172
- except StopIteration :
173
- reader .close ()
174
-
175
- try :
176
- return reader .read ()
177
- finally :
170
+ data = reader
171
+ else :
172
+ data = reader .read ()
178
173
reader .close ()
174
+ return data
179
175
180
176
# Stata display-format codes treated as dates. When convert_dates is set,
# read() converts every column whose fmtlist entry appears in this list via
# _stata_elapsed_date_to_datetime_vec.
_date_formats = ["%tc" , "%tC" , "%td" , "%d" , "%tw" , "%tm" , "%tq" , "%th" , "%ty" ]
181
177
@@ -1411,150 +1407,154 @@ def read(self, nrows=None, convert_dates=None,
1411
1407
convert_categoricals = None , index = None ,
1412
1408
convert_missing = None , preserve_dtypes = None ,
1413
1409
columns = None , order_categoricals = None ):
1414
-
1415
1410
# Handle empty file or chunk. If reading incrementally raise
1416
1411
# StopIteration. If reading the whole thing return an empty
1417
1412
# data frame.
1418
1413
if (self .nobs == 0 ) and (nrows is None ):
1419
1414
self ._can_read_value_labels = True
1420
1415
self ._data_read = True
1416
+ self .close ()
1421
1417
return DataFrame (columns = self .varlist )
1422
1418
1423
- # Handle options
1424
- if convert_dates is None :
1425
- convert_dates = self ._convert_dates
1426
- if convert_categoricals is None :
1427
- convert_categoricals = self ._convert_categoricals
1428
- if convert_missing is None :
1429
- convert_missing = self ._convert_missing
1430
- if preserve_dtypes is None :
1431
- preserve_dtypes = self ._preserve_dtypes
1432
- if columns is None :
1433
- columns = self ._columns
1434
- if order_categoricals is None :
1435
- order_categoricals = self ._order_categoricals
1436
-
1437
- if nrows is None :
1438
- nrows = self .nobs
1439
-
1440
- if (self .format_version >= 117 ) and (self ._dtype is None ):
1441
- self ._can_read_value_labels = True
1442
- self ._read_strls ()
1443
-
1444
- # Setup the dtype.
1445
- if self ._dtype is None :
1446
- dtype = [] # Convert struct data types to numpy data type
1447
- for i , typ in enumerate (self .typlist ):
1448
- if typ in self .NUMPY_TYPE_MAP :
1449
- dtype .append (('s' + str (i ), self .byteorder +
1450
- self .NUMPY_TYPE_MAP [typ ]))
1451
- else :
1452
- dtype .append (('s' + str (i ), 'S' + str (typ )))
1453
- dtype = np .dtype (dtype )
1454
- self ._dtype = dtype
1455
-
1456
- # Read data
1457
- dtype = self ._dtype
1458
- max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
1459
- read_len = nrows * dtype .itemsize
1460
- read_len = min (read_len , max_read_len )
1461
- if read_len <= 0 :
1462
- # Iterator has finished, should never be here unless
1463
- # we are reading the file incrementally
1419
+ try :
1420
+ # Handle options
1421
+ if convert_dates is None :
1422
+ convert_dates = self ._convert_dates
1423
+ if convert_categoricals is None :
1424
+ convert_categoricals = self ._convert_categoricals
1425
+ if convert_missing is None :
1426
+ convert_missing = self ._convert_missing
1427
+ if preserve_dtypes is None :
1428
+ preserve_dtypes = self ._preserve_dtypes
1429
+ if columns is None :
1430
+ columns = self ._columns
1431
+ if order_categoricals is None :
1432
+ order_categoricals = self ._order_categoricals
1433
+
1434
+ if nrows is None :
1435
+ nrows = self .nobs
1436
+
1437
+ if (self .format_version >= 117 ) and (self ._dtype is None ):
1438
+ self ._can_read_value_labels = True
1439
+ self ._read_strls ()
1440
+
1441
+ # Setup the dtype.
1442
+ if self ._dtype is None :
1443
+ dtype = [] # Convert struct data types to numpy data type
1444
+ for i , typ in enumerate (self .typlist ):
1445
+ if typ in self .NUMPY_TYPE_MAP :
1446
+ dtype .append (('s' + str (i ), self .byteorder +
1447
+ self .NUMPY_TYPE_MAP [typ ]))
1448
+ else :
1449
+ dtype .append (('s' + str (i ), 'S' + str (typ )))
1450
+ dtype = np .dtype (dtype )
1451
+ self ._dtype = dtype
1452
+
1453
+ # Read data
1454
+ dtype = self ._dtype
1455
+ max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
1456
+ read_len = nrows * dtype .itemsize
1457
+ read_len = min (read_len , max_read_len )
1458
+ if read_len <= 0 :
1459
+ # Iterator has finished, should never be here unless
1460
+ # we are reading the file incrementally
1461
+ if convert_categoricals :
1462
+ self ._read_value_labels ()
1463
+ raise StopIteration
1464
+ offset = self ._lines_read * dtype .itemsize
1465
+ self .path_or_buf .seek (self .data_location + offset )
1466
+ read_lines = min (nrows , self .nobs - self ._lines_read )
1467
+ data = np .frombuffer (self .path_or_buf .read (read_len ), dtype = dtype ,
1468
+ count = read_lines )
1469
+
1470
+ self ._lines_read += read_lines
1471
+ if self ._lines_read == self .nobs :
1472
+ self ._can_read_value_labels = True
1473
+ self ._data_read = True
1474
+ # if necessary, swap the byte order to native here
1475
+ if self .byteorder != self ._native_byteorder :
1476
+ data = data .byteswap ().newbyteorder ()
1477
+
1464
1478
if convert_categoricals :
1465
1479
self ._read_value_labels ()
1466
- raise StopIteration
1467
- offset = self ._lines_read * dtype .itemsize
1468
- self .path_or_buf .seek (self .data_location + offset )
1469
- read_lines = min (nrows , self .nobs - self ._lines_read )
1470
- data = np .frombuffer (self .path_or_buf .read (read_len ), dtype = dtype ,
1471
- count = read_lines )
1472
-
1473
- self ._lines_read += read_lines
1474
- if self ._lines_read == self .nobs :
1475
- self ._can_read_value_labels = True
1476
- self ._data_read = True
1477
- # if necessary, swap the byte order to native here
1478
- if self .byteorder != self ._native_byteorder :
1479
- data = data .byteswap ().newbyteorder ()
1480
-
1481
- if convert_categoricals :
1482
- self ._read_value_labels ()
1483
1480
1484
- if len (data ) == 0 :
1485
- data = DataFrame (columns = self .varlist , index = index )
1486
- else :
1487
- data = DataFrame .from_records (data , index = index )
1488
- data .columns = self .varlist
1489
-
1490
- # If index is not specified, use actual row number rather than
1491
- # restarting at 0 for each chunk.
1492
- if index is None :
1493
- ix = np .arange (self ._lines_read - read_lines , self ._lines_read )
1494
- data = data .set_index (ix )
1495
-
1496
- if columns is not None :
1497
- data = self ._do_select_columns (data , columns )
1498
-
1499
- # Decode strings
1500
- for col , typ in zip (data , self .typlist ):
1501
- if type (typ ) is int :
1502
- data [col ] = data [col ].apply (
1503
- self ._null_terminate , convert_dtype = True )
1504
-
1505
- data = self ._insert_strls (data )
1506
-
1507
- cols_ = np .where (self .dtyplist )[0 ]
1508
-
1509
- # Convert columns (if needed) to match input type
1510
- index = data .index
1511
- requires_type_conversion = False
1512
- data_formatted = []
1513
- for i in cols_ :
1514
- if self .dtyplist [i ] is not None :
1515
- col = data .columns [i ]
1516
- dtype = data [col ].dtype
1517
- if (dtype != np .dtype (object )) and (dtype != self .dtyplist [i ]):
1518
- requires_type_conversion = True
1519
- data_formatted .append (
1520
- (col , Series (data [col ], index , self .dtyplist [i ])))
1521
- else :
1522
- data_formatted .append ((col , data [col ]))
1523
- if requires_type_conversion :
1524
- data = DataFrame .from_items (data_formatted )
1525
- del data_formatted
1526
-
1527
- self ._do_convert_missing (data , convert_missing )
1528
-
1529
- if convert_dates :
1530
- cols = np .where (lmap (lambda x : x in _date_formats ,
1531
- self .fmtlist ))[0 ]
1532
- for i in cols :
1533
- col = data .columns [i ]
1534
- data [col ] = _stata_elapsed_date_to_datetime_vec (
1535
- data [col ],
1536
- self .fmtlist [i ])
1537
-
1538
- if convert_categoricals and self .format_version > 108 :
1539
- data = self ._do_convert_categoricals (data ,
1540
- self .value_label_dict ,
1541
- self .lbllist ,
1542
- order_categoricals )
1543
-
1544
- if not preserve_dtypes :
1545
- retyped_data = []
1546
- convert = False
1547
- for col in data :
1548
- dtype = data [col ].dtype
1549
- if dtype in (np .float16 , np .float32 ):
1550
- dtype = np .float64
1551
- convert = True
1552
- elif dtype in (np .int8 , np .int16 , np .int32 ):
1553
- dtype = np .int64
1554
- convert = True
1555
- retyped_data .append ((col , data [col ].astype (dtype )))
1556
- if convert :
1557
- data = DataFrame .from_items (retyped_data )
1481
+ if len (data ) == 0 :
1482
+ data = DataFrame (columns = self .varlist , index = index )
1483
+ else :
1484
+ data = DataFrame .from_records (data , index = index )
1485
+ data .columns = self .varlist
1486
+
1487
+ # If index is not specified, use actual row number rather than
1488
+ # restarting at 0 for each chunk.
1489
+ if index is None :
1490
+ ix = np .arange (self ._lines_read - read_lines , self ._lines_read )
1491
+ data = data .set_index (ix )
1492
+
1493
+ if columns is not None :
1494
+ data = self ._do_select_columns (data , columns )
1495
+
1496
+ # Decode strings
1497
+ for col , typ in zip (data , self .typlist ):
1498
+ if type (typ ) is int :
1499
+ data [col ] = data [col ].apply (
1500
+ self ._null_terminate , convert_dtype = True )
1501
+
1502
+ data = self ._insert_strls (data )
1503
+
1504
+ cols_ = np .where (self .dtyplist )[0 ]
1505
+
1506
+ # Convert columns (if needed) to match input type
1507
+ index = data .index
1508
+ requires_type_conversion = False
1509
+ data_formatted = []
1510
+ for i in cols_ :
1511
+ if self .dtyplist [i ] is not None :
1512
+ col = data .columns [i ]
1513
+ dtype = data [col ].dtype
1514
+ if (dtype != np .dtype (object )) and (dtype != self .dtyplist [i ]):
1515
+ requires_type_conversion = True
1516
+ data_formatted .append (
1517
+ (col , Series (data [col ], index , self .dtyplist [i ])))
1518
+ else :
1519
+ data_formatted .append ((col , data [col ]))
1520
+ if requires_type_conversion :
1521
+ data = DataFrame .from_items (data_formatted )
1522
+ del data_formatted
1523
+
1524
+ self ._do_convert_missing (data , convert_missing )
1525
+
1526
+ if convert_dates :
1527
+ cols = np .where (lmap (lambda x : x in _date_formats ,
1528
+ self .fmtlist ))[0 ]
1529
+ for i in cols :
1530
+ col = data .columns [i ]
1531
+ data [col ] = _stata_elapsed_date_to_datetime_vec (
1532
+ data [col ],
1533
+ self .fmtlist [i ])
1534
+
1535
+ if convert_categoricals and self .format_version > 108 :
1536
+ data = self ._do_convert_categoricals (data ,
1537
+ self .value_label_dict ,
1538
+ self .lbllist ,
1539
+ order_categoricals )
1540
+
1541
+ if not preserve_dtypes :
1542
+ retyped_data = []
1543
+ convert = False
1544
+ for col in data :
1545
+ dtype = data [col ].dtype
1546
+ if dtype in (np .float16 , np .float32 ):
1547
+ dtype = np .float64
1548
+ convert = True
1549
+ elif dtype in (np .int8 , np .int16 , np .int32 ):
1550
+ dtype = np .int64
1551
+ convert = True
1552
+ retyped_data .append ((col , data [col ].astype (dtype )))
1553
+ if convert :
1554
+ data = DataFrame .from_items (retyped_data )
1555
+ except :
1556
+ self .close ()
1557
+ raise
1558
1558
1559
1559
return data
1560
1560
0 commit comments