@@ -469,7 +469,7 @@ class PossiblePrecisionLoss(Warning):


 precision_loss_doc = """
-Column converted from %s to %s, and some data are outside of the lossless
+Column converted from {0} to {1}, and some data are outside of the lossless
 conversion range. This may result in a loss of precision in the saved data.
 """

@@ -543,7 +543,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     object in a DataFrame.
     """
     ws = ""
-    # original, if small, if large
+    # original, if small, if large
     conversion_data = (
         (np.bool_, np.int8, np.int8),
         (np.uint8, np.int8, np.int16),
@@ -563,7 +563,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                     dtype = c_data[1]
                 else:
                     dtype = c_data[2]
-                if c_data[2] == np.float64:  # Warn if necessary
+                if c_data[2] == np.int64:  # Warn if necessary
                     if data[col].max() >= 2 ** 53:
                         ws = precision_loss_doc.format("uint64", "float64")
@@ -627,12 +627,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
         self.value_labels = list(zip(np.arange(len(categories)), categories))
         self.value_labels.sort(key=lambda x: x[0])
         self.text_len = 0
-        self.off: List[int] = []
-        self.val: List[int] = []
         self.txt: List[bytes] = []
         self.n = 0

         # Compute lengths and setup lists of offsets and labels
+        offsets: List[int] = []
+        values: List[int] = []
         for vl in self.value_labels:
             category = vl[1]
             if not isinstance(category, str):
@@ -642,9 +642,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
                     ValueLabelTypeMismatch,
                 )
             category = category.encode(encoding)
-            self.off.append(self.text_len)
+            offsets.append(self.text_len)
             self.text_len += len(category) + 1  # +1 for the padding
-            self.val.append(vl[0])
+            values.append(vl[0])
             self.txt.append(category)
             self.n += 1
@@ -655,8 +655,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
             )

         # Ensure int32
-        self.off = np.array(self.off, dtype=np.int32)
-        self.val = np.array(self.val, dtype=np.int32)
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)

         # Total length
         self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
@@ -868,23 +868,23 @@ def __init__(self):
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
+            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
             + [
-                (251, np.int8),
-                (252, np.int16),
-                (253, np.int32),
-                (254, np.float32),
-                (255, np.float64),
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
             ]
         )
         self.DTYPE_MAP_XML = dict(
             [
-                (32768, np.uint8),  # Keys to GSO
-                (65526, np.float64),
-                (65527, np.float32),
-                (65528, np.int32),
-                (65529, np.int16),
-                (65530, np.int8),
+                (32768, np.dtype(np.uint8)),  # Keys to GSO
+                (65526, np.dtype(np.float64)),
+                (65527, np.dtype(np.float32)),
+                (65528, np.dtype(np.int32)),
+                (65529, np.dtype(np.int16)),
+                (65530, np.dtype(np.int8)),
             ]
         )
         # error: Argument 1 to "list" has incompatible type "str";
@@ -1045,10 +1045,12 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
-        if self._chunksize is not None and (
-            not isinstance(chunksize, int) or chunksize <= 0
-        ):
-            raise ValueError("chunksize must be a positive integer when set.")
+        self._using_iterator = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        else:
+            if not isinstance(chunksize, int) or chunksize <= 0:
+                raise ValueError("chunksize must be a positive integer when set.")

         # State variables for the file
         self._has_string_data = False
@@ -1057,7 +1059,7 @@ def __init__(
         self._column_selector_set = False
         self._value_labels_read = False
         self._data_read = False
-        self._dtype = None
+        self._dtype: Optional[np.dtype] = None
         self._lines_read = 0

         self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1193,7 +1195,7 @@ def _read_new_header(self) -> None:
     # Get data type information, works for versions 117-119.
     def _get_dtypes(
         self, seek_vartypes: int
-    ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
+    ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:

         self.path_or_buf.seek(seek_vartypes)
         raw_typlist = [
@@ -1518,11 +1520,8 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va

     def __next__(self) -> DataFrame:
-        if self._chunksize is None:
-            raise ValueError(
-                "chunksize must be set to a positive integer to use as an iterator."
-            )
-        return self.read(nrows=self._chunksize or 1)
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)

     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
         """
@@ -1690,11 +1689,15 @@ def any_startswith(x: str) -> bool:
         convert = False
         for col in data:
             dtype = data[col].dtype
-            if dtype in (np.float16, np.float32):
-                dtype = np.float64
+            if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                dtype = np.dtype(np.float64)
                 convert = True
-            elif dtype in (np.int8, np.int16, np.int32):
-                dtype = np.int64
+            elif dtype in (
+                np.dtype(np.int8),
+                np.dtype(np.int16),
+                np.dtype(np.int32),
+            ):
+                dtype = np.dtype(np.int64)
                 convert = True
             retyped_data.append((col, data[col].astype(dtype)))
         if convert:
@@ -1806,14 +1809,14 @@ def _do_convert_categoricals(
             keys = np.array(list(vl.keys()))
             column = data[col]
             key_matches = column.isin(keys)
-            if self._chunksize is not None and key_matches.all():
-                initial_categories = keys
+            if self._using_iterator and key_matches.all():
+                initial_categories: Optional[np.ndarray] = keys
                 # If all categories are in the keys and we are iterating,
                 # use the same keys for all chunks. If some are missing
                 # value labels, then we will fall back to the categories
                 # varying across chunks.
             else:
-                if self._chunksize is not None:
+                if self._using_iterator:
                     # warn is using an iterator
                     warnings.warn(
                         categorical_conversion_warning, CategoricalConversionWarning
@@ -2024,7 +2027,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
    else:
         raise NotImplementedError(f"Format {fmt} not implemented")

0 commit comments