@@ -1416,146 +1416,151 @@ def read(self, nrows=None, convert_dates=None,
1416
1416
self .close ()
1417
1417
return DataFrame (columns = self .varlist )
1418
1418
1419
- try :
1420
- # Handle options
1421
- if convert_dates is None :
1422
- convert_dates = self ._convert_dates
1423
- if convert_categoricals is None :
1424
- convert_categoricals = self ._convert_categoricals
1425
- if convert_missing is None :
1426
- convert_missing = self ._convert_missing
1427
- if preserve_dtypes is None :
1428
- preserve_dtypes = self ._preserve_dtypes
1429
- if columns is None :
1430
- columns = self ._columns
1431
- if order_categoricals is None :
1432
- order_categoricals = self ._order_categoricals
1433
-
1434
- if nrows is None :
1435
- nrows = self .nobs
1436
-
1437
- if (self .format_version >= 117 ) and (self ._dtype is None ):
1438
- self ._can_read_value_labels = True
1439
- self ._read_strls ()
1440
-
1441
- # Setup the dtype.
1442
- if self ._dtype is None :
1443
- dtype = [] # Convert struct data types to numpy data type
1444
- for i , typ in enumerate (self .typlist ):
1445
- if typ in self .NUMPY_TYPE_MAP :
1446
- dtype .append (('s' + str (i ), self .byteorder +
1447
- self .NUMPY_TYPE_MAP [typ ]))
1448
- else :
1449
- dtype .append (('s' + str (i ), 'S' + str (typ )))
1450
- dtype = np .dtype (dtype )
1451
- self ._dtype = dtype
1452
-
1453
- # Read data
1454
- dtype = self ._dtype
1455
- max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
1456
- read_len = nrows * dtype .itemsize
1457
- read_len = min (read_len , max_read_len )
1458
- if read_len <= 0 :
1459
- # Iterator has finished, should never be here unless
1460
- # we are reading the file incrementally
1461
- if convert_categoricals :
1462
- self ._read_value_labels ()
1463
- raise StopIteration
1464
- offset = self ._lines_read * dtype .itemsize
1465
- self .path_or_buf .seek (self .data_location + offset )
1466
- read_lines = min (nrows , self .nobs - self ._lines_read )
1467
- data = np .frombuffer (self .path_or_buf .read (read_len ), dtype = dtype ,
1468
- count = read_lines )
1469
-
1470
- self ._lines_read += read_lines
1471
- if self ._lines_read == self .nobs :
1472
- self ._can_read_value_labels = True
1473
- self ._data_read = True
1474
- # if necessary, swap the byte order to native here
1475
- if self .byteorder != self ._native_byteorder :
1476
- data = data .byteswap ().newbyteorder ()
1477
-
1419
+ # Handle options
1420
+ if convert_dates is None :
1421
+ convert_dates = self ._convert_dates
1422
+ if convert_categoricals is None :
1423
+ convert_categoricals = self ._convert_categoricals
1424
+ if convert_missing is None :
1425
+ convert_missing = self ._convert_missing
1426
+ if preserve_dtypes is None :
1427
+ preserve_dtypes = self ._preserve_dtypes
1428
+ if columns is None :
1429
+ columns = self ._columns
1430
+ if order_categoricals is None :
1431
+ order_categoricals = self ._order_categoricals
1432
+
1433
+ if nrows is None :
1434
+ nrows = self .nobs
1435
+
1436
+ if (self .format_version >= 117 ) and (self ._dtype is None ):
1437
+ self ._can_read_value_labels = True
1438
+ self ._read_strls ()
1439
+
1440
+ # Setup the dtype.
1441
+ if self ._dtype is None :
1442
+ dtype = [] # Convert struct data types to numpy data type
1443
+ for i , typ in enumerate (self .typlist ):
1444
+ if typ in self .NUMPY_TYPE_MAP :
1445
+ dtype .append (('s' + str (i ), self .byteorder +
1446
+ self .NUMPY_TYPE_MAP [typ ]))
1447
+ else :
1448
+ dtype .append (('s' + str (i ), 'S' + str (typ )))
1449
+ dtype = np .dtype (dtype )
1450
+ self ._dtype = dtype
1451
+
1452
+ # Read data
1453
+ dtype = self ._dtype
1454
+ max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
1455
+ read_len = nrows * dtype .itemsize
1456
+ read_len = min (read_len , max_read_len )
1457
+ if read_len <= 0 :
1458
+ # Iterator has finished, should never be here unless
1459
+ # we are reading the file incrementally
1478
1460
if convert_categoricals :
1479
1461
self ._read_value_labels ()
1462
+ self .close ()
1463
+ raise StopIteration
1464
+ offset = self ._lines_read * dtype .itemsize
1465
+ self .path_or_buf .seek (self .data_location + offset )
1466
+ read_lines = min (nrows , self .nobs - self ._lines_read )
1467
+ data = np .frombuffer (self .path_or_buf .read (read_len ), dtype = dtype ,
1468
+ count = read_lines )
1469
+
1470
+ self ._lines_read += read_lines
1471
+ if self ._lines_read == self .nobs :
1472
+ self ._can_read_value_labels = True
1473
+ self ._data_read = True
1474
+ # if necessary, swap the byte order to native here
1475
+ if self .byteorder != self ._native_byteorder :
1476
+ data = data .byteswap ().newbyteorder ()
1480
1477
1481
- if len (data ) == 0 :
1482
- data = DataFrame (columns = self .varlist , index = index )
1483
- else :
1484
- data = DataFrame .from_records (data , index = index )
1485
- data .columns = self .varlist
1478
+ if convert_categoricals :
1479
+ self ._read_value_labels ()
1486
1480
1487
- # If index is not specified, use actual row number rather than
1488
- # restarting at 0 for each chunk.
1489
- if index is None :
1490
- ix = np .arange (self ._lines_read - read_lines , self ._lines_read )
1491
- data = data .set_index (ix )
1481
+ if len (data ) == 0 :
1482
+ data = DataFrame (columns = self .varlist , index = index )
1483
+ else :
1484
+ data = DataFrame .from_records (data , index = index )
1485
+ data .columns = self .varlist
1486
+
1487
+ # If index is not specified, use actual row number rather than
1488
+ # restarting at 0 for each chunk.
1489
+ if index is None :
1490
+ ix = np .arange (self ._lines_read - read_lines , self ._lines_read )
1491
+ data = data .set_index (ix )
1492
1492
1493
- if columns is not None :
1493
+ if columns is not None :
1494
+ try :
1494
1495
data = self ._do_select_columns (data , columns )
1496
+ except ValueError :
1497
+ self .close ()
1498
+ raise
1495
1499
1496
- # Decode strings
1497
- for col , typ in zip (data , self .typlist ):
1498
- if type (typ ) is int :
1499
- data [col ] = data [col ].apply (
1500
- self ._null_terminate , convert_dtype = True )
1501
-
1502
- data = self ._insert_strls (data )
1503
-
1504
- cols_ = np .where (self .dtyplist )[0 ]
1505
-
1506
- # Convert columns (if needed) to match input type
1507
- index = data .index
1508
- requires_type_conversion = False
1509
- data_formatted = []
1510
- for i in cols_ :
1511
- if self .dtyplist [i ] is not None :
1512
- col = data .columns [i ]
1513
- dtype = data [col ].dtype
1514
- if ((dtype != np .dtype (object )) and
1515
- (dtype != self .dtyplist [i ])):
1516
- requires_type_conversion = True
1517
- data_formatted .append (
1518
- (col , Series (data [col ], index , self .dtyplist [i ])))
1519
- else :
1520
- data_formatted .append ((col , data [col ]))
1521
- if requires_type_conversion :
1522
- data = DataFrame .from_items (data_formatted )
1523
- del data_formatted
1524
-
1525
- self ._do_convert_missing (data , convert_missing )
1526
-
1527
- if convert_dates :
1528
- cols = np .where (lmap (lambda x : x in _date_formats ,
1529
- self .fmtlist ))[0 ]
1530
- for i in cols :
1531
- col = data .columns [i ]
1500
+ # Decode strings
1501
+ for col , typ in zip (data , self .typlist ):
1502
+ if type (typ ) is int :
1503
+ data [col ] = data [col ].apply (
1504
+ self ._null_terminate , convert_dtype = True )
1505
+
1506
+ data = self ._insert_strls (data )
1507
+
1508
+ cols_ = np .where (self .dtyplist )[0 ]
1509
+
1510
+ # Convert columns (if needed) to match input type
1511
+ index = data .index
1512
+ requires_type_conversion = False
1513
+ data_formatted = []
1514
+ for i in cols_ :
1515
+ if self .dtyplist [i ] is not None :
1516
+ col = data .columns [i ]
1517
+ dtype = data [col ].dtype
1518
+ if ((dtype != np .dtype (object )) and
1519
+ (dtype != self .dtyplist [i ])):
1520
+ requires_type_conversion = True
1521
+ data_formatted .append (
1522
+ (col , Series (data [col ], index , self .dtyplist [i ])))
1523
+ else :
1524
+ data_formatted .append ((col , data [col ]))
1525
+ if requires_type_conversion :
1526
+ data = DataFrame .from_items (data_formatted )
1527
+ del data_formatted
1528
+
1529
+ self ._do_convert_missing (data , convert_missing )
1530
+
1531
+ if convert_dates :
1532
+ cols = np .where (lmap (lambda x : x in _date_formats ,
1533
+ self .fmtlist ))[0 ]
1534
+ for i in cols :
1535
+ col = data .columns [i ]
1536
+ try :
1532
1537
data [col ] = _stata_elapsed_date_to_datetime_vec (
1533
1538
data [col ],
1534
1539
self .fmtlist [i ])
1535
-
1536
- if convert_categoricals and self . format_version > 108 :
1537
- data = self . _do_convert_categoricals ( data ,
1538
- self . value_label_dict ,
1539
- self .lbllist ,
1540
- order_categoricals )
1541
-
1542
- if not preserve_dtypes :
1543
- retyped_data = []
1544
- convert = False
1545
- for col in data :
1546
- dtype = data [ col ]. dtype
1547
- if dtype in ( np . float16 , np . float32 ):
1548
- dtype = np . float64
1549
- convert = True
1550
- elif dtype in (np .int8 , np .int16 , np . int32 ):
1551
- dtype = np .int64
1552
- convert = True
1553
- retyped_data . append (( col , data [ col ]. astype ( dtype )))
1554
- if convert :
1555
- data = DataFrame . from_items ( retyped_data )
1556
- except :
1557
- self . close ()
1558
- raise
1540
+ except ValueError :
1541
+ self . close ()
1542
+ raise
1543
+
1544
+ if convert_categoricals and self .format_version > 108 :
1545
+ data = self . _do_convert_categoricals ( data ,
1546
+ self . value_label_dict ,
1547
+ self . lbllist ,
1548
+ order_categoricals )
1549
+
1550
+ if not preserve_dtypes :
1551
+ retyped_data = []
1552
+ convert = False
1553
+ for col in data :
1554
+ dtype = data [ col ]. dtype
1555
+ if dtype in (np .float16 , np .float32 ):
1556
+ dtype = np .float64
1557
+ convert = True
1558
+ elif dtype in ( np . int8 , np . int16 , np . int32 ):
1559
+ dtype = np . int64
1560
+ convert = True
1561
+ retyped_data . append (( col , data [ col ]. astype ( dtype )))
1562
+ if convert :
1563
+ data = DataFrame . from_items ( retyped_data )
1559
1564
1560
1565
return data
1561
1566
0 commit comments