@@ -1122,6 +1122,7 @@ def __init__(
1122
1122
# State variables for the file
1123
1123
self ._close_file : Callable [[], None ] | None = None
1124
1124
self ._column_selector_set = False
1125
+ self ._value_label_dict : dict [str , dict [int , str ]] = {}
1125
1126
self ._value_labels_read = False
1126
1127
self ._dtype : np .dtype | None = None
1127
1128
self ._lines_read = 0
@@ -1502,36 +1503,26 @@ def _decode(self, s: bytes) -> str:
1502
1503
)
1503
1504
return s .decode ("latin-1" )
1504
1505
1505
- def _read_value_labels (self ) -> None :
1506
- self ._ensure_open ()
1507
- if self ._value_labels_read :
1508
- # Don't read twice
1509
- return
1510
- if self ._format_version <= 108 :
1511
- # Value labels are not supported in version 108 and earlier.
1512
- self ._value_labels_read = True
1513
- self ._value_label_dict : dict [str , dict [float , str ]] = {}
1514
- return
1515
-
1506
+ def _read_new_value_labels (self ) -> None :
1507
+ """Reads value labels with variable length strings (108 and later format)"""
1516
1508
if self ._format_version >= 117 :
1517
1509
self ._path_or_buf .seek (self ._seek_value_labels )
1518
1510
else :
1519
1511
assert self ._dtype is not None
1520
1512
offset = self ._nobs * self ._dtype .itemsize
1521
1513
self ._path_or_buf .seek (self ._data_location + offset )
1522
1514
1523
- self ._value_labels_read = True
1524
- self ._value_label_dict = {}
1525
-
1526
1515
while True :
1527
1516
if self ._format_version >= 117 :
1528
1517
if self ._path_or_buf .read (5 ) == b"</val" : # <lbl>
1529
1518
break # end of value label table
1530
1519
1531
1520
slength = self ._path_or_buf .read (4 )
1532
1521
if not slength :
1533
- break # end of value label table (format < 117)
1534
- if self ._format_version <= 117 :
1522
+ break # end of value label table (format < 117), or end-of-file
1523
+ if self ._format_version == 108 :
1524
+ labname = self ._decode (self ._path_or_buf .read (9 ))
1525
+ elif self ._format_version <= 117 :
1535
1526
labname = self ._decode (self ._path_or_buf .read (33 ))
1536
1527
else :
1537
1528
labname = self ._decode (self ._path_or_buf .read (129 ))
@@ -1555,8 +1546,45 @@ def _read_value_labels(self) -> None:
1555
1546
self ._value_label_dict [labname ][val [i ]] = self ._decode (
1556
1547
txt [off [i ] : end ]
1557
1548
)
1549
+
1558
1550
if self ._format_version >= 117 :
1559
1551
self ._path_or_buf .read (6 ) # </lbl>
1552
+
1553
+ def _read_old_value_labels (self ) -> None :
1554
+ """Reads value labels with fixed-length strings (105 and earlier format)"""
1555
+ assert self ._dtype is not None # narrows the Optional dtype before .itemsize is used
1556
+ offset = self ._nobs * self ._dtype .itemsize # size in bytes of _nobs records of _dtype
1557
+ self ._path_or_buf .seek (self ._data_location + offset ) # position just past the data; label tables are read from here
1558
+
1559
+ while True : # one iteration per value label table
1560
+ if not self ._path_or_buf .read (2 ): # probe 2 bytes to see whether another table follows
1561
+ # an empty read means end-of-file has been reached, so stop here
1562
+ break
1563
+
1564
+ # otherwise back up over the 2 probed bytes and read them again, taking byteorder into account
1565
+ self ._path_or_buf .seek (- 2 , os .SEEK_CUR )
1566
+ n = self ._read_uint16 () # n: number of (code, label) entries in this table
1567
+ labname = self ._decode (self ._path_or_buf .read (9 )) # table name is a fixed 9-byte field
1568
+ self ._path_or_buf .read (1 ) # skip 1 byte of padding after the name
1569
+ codes = np .frombuffer (
1570
+ self ._path_or_buf .read (2 * n ), dtype = f"{ self ._byteorder } i2" , count = n
1571
+ ) # the n codes, each a 2-byte signed integer in the file's byte order
1572
+ self ._value_label_dict [labname ] = {} # fresh mapping; replaces any earlier table stored under the same name
1573
+ for i in range (n ): # labels follow the codes in the same order, 8 bytes each
1574
+ self ._value_label_dict [labname ][codes [i ]] = self ._decode ( # NOTE(review): codes[i] is a NumPy int16 scalar, not a builtin int - confirm this fits the dict[int, str] annotation
1575
+ self ._path_or_buf .read (8 )
1576
+ )
1577
+
1578
+ def _read_value_labels (self ) -> None : # fills self._value_label_dict at most once, dispatching on the file's format version
1579
+ self ._ensure_open ()
1580
+ if self ._value_labels_read :
1581
+ # Labels were already loaded by an earlier call; don't read twice
1582
+ return
1583
+
1584
+ if self ._format_version >= 108 : # format 108 and later
1585
+ self ._read_new_value_labels ()
1586
+ else : # formats before 108
1587
+ self ._read_old_value_labels ()
1560
1588
self ._value_labels_read = True # set only after a reader returns, so an exception while reading leaves the flag False
1561
1589
1562
1590
def _read_strls (self ) -> None :
@@ -1729,7 +1757,7 @@ def read(
1729
1757
i , _stata_elapsed_date_to_datetime_vec (data .iloc [:, i ], fmt )
1730
1758
)
1731
1759
1732
- if convert_categoricals and self . _format_version > 108 :
1760
+ if convert_categoricals :
1733
1761
data = self ._do_convert_categoricals (
1734
1762
data , self ._value_label_dict , self ._lbllist , order_categoricals
1735
1763
)
@@ -1845,7 +1873,7 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
1845
1873
def _do_convert_categoricals (
1846
1874
self ,
1847
1875
data : DataFrame ,
1848
- value_label_dict : dict [str , dict [float , str ]],
1876
+ value_label_dict : dict [str , dict [int , str ]],
1849
1877
lbllist : Sequence [str ],
1850
1878
order_categoricals : bool ,
1851
1879
) -> DataFrame :
@@ -1983,7 +2011,7 @@ def variable_labels(self) -> dict[str, str]:
1983
2011
self ._ensure_open ()
1984
2012
return dict (zip (self ._varlist , self ._variable_labels ))
1985
2013
1986
- def value_labels (self ) -> dict [str , dict [float , str ]]:
2014
+ def value_labels (self ) -> dict [str , dict [int , str ]]:
1987
2015
"""
1988
2016
Return a nested dict associating each variable name to its value and label.
1989
2017
0 commit comments