66
66
from pandas .core .arrays .integer import IntegerDtype
67
67
from pandas .core .frame import DataFrame
68
68
from pandas .core .indexes .base import Index
69
+ from pandas .core .indexes .range import RangeIndex
69
70
from pandas .core .series import Series
70
71
from pandas .core .shared_docs import _shared_docs
71
72
@@ -690,10 +691,7 @@ def __init__(
690
691
self .labname = catarray .name
691
692
self ._encoding = encoding
692
693
categories = catarray .cat .categories
693
- self .value_labels : list [tuple [float , str ]] = list (
694
- zip (np .arange (len (categories )), categories )
695
- )
696
- self .value_labels .sort (key = lambda x : x [0 ])
694
+ self .value_labels = enumerate (categories )
697
695
698
696
self ._prepare_value_labels ()
699
697
@@ -819,7 +817,7 @@ def __init__(
819
817
820
818
self .labname = labname
821
819
self ._encoding = encoding
822
- self .value_labels : list [ tuple [ float , str ]] = sorted (
820
+ self .value_labels = sorted ( # type: ignore[assignment]
823
821
value_labels .items (), key = lambda x : x [0 ]
824
822
)
825
823
self ._prepare_value_labels ()
@@ -1054,7 +1052,7 @@ def __init__(self) -> None:
1054
1052
}
1055
1053
1056
1054
# Reserved words cannot be used as variable names
1057
- self .RESERVED_WORDS = (
1055
+ self .RESERVED_WORDS = {
1058
1056
"aggregate" ,
1059
1057
"array" ,
1060
1058
"boolean" ,
@@ -1115,7 +1113,7 @@ def __init__(self) -> None:
1115
1113
"_se" ,
1116
1114
"with" ,
1117
1115
"_n" ,
1118
- )
1116
+ }
1119
1117
1120
1118
1121
1119
class StataReader (StataParser , abc .Iterator ):
@@ -1138,7 +1136,6 @@ def __init__(
1138
1136
storage_options : StorageOptions | None = None ,
1139
1137
) -> None :
1140
1138
super ().__init__ ()
1141
- self ._col_sizes : list [int ] = []
1142
1139
1143
1140
# Arguments to the reader (can be temporarily overridden in
1144
1141
# calls to read).
@@ -1163,7 +1160,6 @@ def __init__(
1163
1160
1164
1161
# State variables for the file
1165
1162
self ._close_file : Callable [[], None ] | None = None
1166
- self ._has_string_data = False
1167
1163
self ._missing_values = False
1168
1164
self ._can_read_value_labels = False
1169
1165
self ._column_selector_set = False
@@ -1293,13 +1289,6 @@ def _read_header(self) -> None:
1293
1289
else :
1294
1290
self ._read_old_header (first_char )
1295
1291
1296
- self ._has_string_data = (
1297
- len ([x for x in self ._typlist if isinstance (x , int )]) > 0
1298
- )
1299
-
1300
- # calculate size of a data record
1301
- self ._col_sizes = [self ._calcsize (typ ) for typ in self ._typlist ]
1302
-
1303
1292
def _read_new_header (self ) -> None :
1304
1293
# The first part of the header is common to 117 - 119.
1305
1294
self ._path_or_buf .read (27 ) # stata_dta><header><release>
@@ -1360,29 +1349,21 @@ def _get_dtypes(
1360
1349
self , seek_vartypes : int
1361
1350
) -> tuple [list [int | str ], list [str | np .dtype ]]:
1362
1351
self ._path_or_buf .seek (seek_vartypes )
1363
- raw_typlist = [self ._read_uint16 () for _ in range (self ._nvar )]
1364
-
1365
- def f (typ : int ) -> int | str :
1352
+ typlist = []
1353
+ dtyplist = []
1354
+ for _ in range (self ._nvar ):
1355
+ typ = self ._read_uint16 ()
1366
1356
if typ <= 2045 :
1367
- return typ
1368
- try :
1369
- return self .TYPE_MAP_XML [typ ]
1370
- except KeyError as err :
1371
- raise ValueError (f"cannot convert stata types [{ typ } ]" ) from err
1372
-
1373
- typlist = [f (x ) for x in raw_typlist ]
1374
-
1375
- def g (typ : int ) -> str | np .dtype :
1376
- if typ <= 2045 :
1377
- return str (typ )
1378
- try :
1379
- return self .DTYPE_MAP_XML [typ ]
1380
- except KeyError as err :
1381
- raise ValueError (f"cannot convert stata dtype [{ typ } ]" ) from err
1382
-
1383
- dtyplist = [g (x ) for x in raw_typlist ]
1357
+ typlist .append (typ )
1358
+ dtyplist .append (str (typ ))
1359
+ else :
1360
+ try :
1361
+ typlist .append (self .TYPE_MAP_XML [typ ]) # type: ignore[arg-type]
1362
+ dtyplist .append (self .DTYPE_MAP_XML [typ ]) # type: ignore[arg-type]
1363
+ except KeyError as err :
1364
+ raise ValueError (f"cannot convert stata types [{ typ } ]" ) from err
1384
1365
1385
- return typlist , dtyplist
1366
+ return typlist , dtyplist # type: ignore[return-value]
1386
1367
1387
1368
def _get_varlist (self ) -> list [str ]:
1388
1369
# 33 in older formats, 129 in formats 118 and 119
@@ -1560,11 +1541,6 @@ def _setup_dtype(self) -> np.dtype:
1560
1541
1561
1542
return self ._dtype
1562
1543
1563
- def _calcsize (self , fmt : int | str ) -> int :
1564
- if isinstance (fmt , int ):
1565
- return fmt
1566
- return struct .calcsize (self ._byteorder + fmt )
1567
-
1568
1544
def _decode (self , s : bytes ) -> str :
1569
1545
# have bytes not strings, so must decode
1570
1546
s = s .partition (b"\0 " )[0 ]
@@ -1787,8 +1763,9 @@ def read(
1787
1763
# If index is not specified, use actual row number rather than
1788
1764
# restarting at 0 for each chunk.
1789
1765
if index_col is None :
1790
- rng = range (self ._lines_read - read_lines , self ._lines_read )
1791
- data .index = Index (rng ) # set attr instead of set_index to avoid copy
1766
+ data .index = RangeIndex (
1767
+ self ._lines_read - read_lines , self ._lines_read
1768
+ ) # set attr instead of set_index to avoid copy
1792
1769
1793
1770
if columns is not None :
1794
1771
data = self ._do_select_columns (data , columns )
@@ -1800,39 +1777,22 @@ def read(
1800
1777
1801
1778
data = self ._insert_strls (data )
1802
1779
1803
- cols_ = np .where ([dtyp is not None for dtyp in self ._dtyplist ])[0 ]
1804
1780
# Convert columns (if needed) to match input type
1805
- ix = data .index
1806
- requires_type_conversion = False
1807
- data_formatted = []
1808
- for i in cols_ :
1809
- if self ._dtyplist [i ] is not None :
1810
- col = data .columns [i ]
1811
- dtype = data [col ].dtype
1812
- if dtype != np .dtype (object ) and dtype != self ._dtyplist [i ]:
1813
- requires_type_conversion = True
1814
- data_formatted .append (
1815
- (col , Series (data [col ], ix , self ._dtyplist [i ]))
1816
- )
1817
- else :
1818
- data_formatted .append ((col , data [col ]))
1819
- if requires_type_conversion :
1820
- data = DataFrame .from_dict (dict (data_formatted ))
1821
- del data_formatted
1781
+ valid_dtypes = [i for i , dtyp in enumerate (self ._dtyplist ) if dtyp is not None ]
1782
+ object_type = np .dtype (object )
1783
+ for idx in valid_dtypes :
1784
+ dtype = data .iloc [:, idx ].dtype
1785
+ if dtype not in (object_type , self ._dtyplist [idx ]):
1786
+ data .iloc [:, idx ] = data .iloc [:, idx ].astype (dtype )
1822
1787
1823
1788
data = self ._do_convert_missing (data , convert_missing )
1824
1789
1825
1790
if convert_dates :
1826
-
1827
- def any_startswith (x : str ) -> bool :
1828
- return any (x .startswith (fmt ) for fmt in _date_formats )
1829
-
1830
- cols = np .where ([any_startswith (x ) for x in self ._fmtlist ])[0 ]
1831
- for i in cols :
1832
- col = data .columns [i ]
1833
- data [col ] = _stata_elapsed_date_to_datetime_vec (
1834
- data [col ], self ._fmtlist [i ]
1835
- )
1791
+ for i , fmt in enumerate (self ._fmtlist ):
1792
+ if any (fmt .startswith (date_fmt ) for date_fmt in _date_formats ):
1793
+ data .iloc [:, i ] = _stata_elapsed_date_to_datetime_vec (
1794
+ data .iloc [:, i ], fmt
1795
+ )
1836
1796
1837
1797
if convert_categoricals and self ._format_version > 108 :
1838
1798
data = self ._do_convert_categoricals (
@@ -1866,14 +1826,14 @@ def any_startswith(x: str) -> bool:
1866
1826
def _do_convert_missing (self , data : DataFrame , convert_missing : bool ) -> DataFrame :
1867
1827
# Check for missing values, and replace if found
1868
1828
replacements = {}
1869
- for i , colname in enumerate ( data ):
1829
+ for i in range ( len ( data . columns ) ):
1870
1830
fmt = self ._typlist [i ]
1871
1831
if fmt not in self .VALID_RANGE :
1872
1832
continue
1873
1833
1874
1834
fmt = cast (str , fmt ) # only strs in VALID_RANGE
1875
1835
nmin , nmax = self .VALID_RANGE [fmt ]
1876
- series = data [ colname ]
1836
+ series = data . iloc [:, i ]
1877
1837
1878
1838
# appreciably faster to do this with ndarray instead of Series
1879
1839
svals = series ._values
@@ -1903,11 +1863,10 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
1903
1863
# Note: operating on ._values is much faster than directly
1904
1864
# TODO: can we fix that?
1905
1865
replacement ._values [missing ] = np .nan
1906
- replacements [colname ] = replacement
1907
-
1866
+ replacements [i ] = replacement
1908
1867
if replacements :
1909
- for col , value in replacements .items ():
1910
- data [ col ] = value
1868
+ for idx , value in replacements .items ():
1869
+ data . iloc [:, idx ] = value
1911
1870
return data
1912
1871
1913
1872
def _insert_strls (self , data : DataFrame ) -> DataFrame :
@@ -1962,10 +1921,11 @@ def _do_convert_categoricals(
1962
1921
"""
1963
1922
Converts categorical columns to Categorical type.
1964
1923
"""
1965
- value_labels = list (value_label_dict .keys ())
1924
+ if not value_label_dict :
1925
+ return data
1966
1926
cat_converted_data = []
1967
1927
for col , label in zip (data , lbllist ):
1968
- if label in value_labels :
1928
+ if label in value_label_dict :
1969
1929
# Explicit call with ordered=True
1970
1930
vl = value_label_dict [label ]
1971
1931
keys = np .array (list (vl .keys ()))
@@ -2466,7 +2426,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
2466
2426
Check for categorical columns, retain categorical information for
2467
2427
Stata file and convert categorical data to int
2468
2428
"""
2469
- is_cat = [isinstance (data [ col ]. dtype , CategoricalDtype ) for col in data ]
2429
+ is_cat = [isinstance (dtype , CategoricalDtype ) for dtype in data . dtypes ]
2470
2430
if not any (is_cat ):
2471
2431
return data
2472
2432
0 commit comments