@@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning):


 precision_loss_doc = """
-Column converted from %s to %s, and some data are outside of the lossless
+Column converted from {0} to {1}, and some data are outside of the lossless
 conversion range. This may result in a loss of precision in the saved data.
 """

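A quick sanity check of the new template (a sketch, not part of the commit): the {0}/{1} placeholders match the str.format call used in _cast_to_stata_types below, whereas str.format leaves a %s-style template untouched.

>>> doc = "Column converted from {0} to {1}, ..."
>>> doc.format("uint64", "float64")
'Column converted from uint64 to float64, ...'
>>> "Column converted from %s to %s, ...".format("uint64", "float64")
'Column converted from %s to %s, ...'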
@@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     object in a DataFrame.
     """
     ws = ""
-    # original, if small, if large
+    # original, if small, if large
     conversion_data = (
         (np.bool_, np.int8, np.int8),
         (np.uint8, np.int8, np.int16),
@@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                     dtype = c_data[1]
                 else:
                     dtype = c_data[2]
-                if c_data[2] == np.float64:  # Warn if necessary
+                if c_data[2] == np.int64:  # Warn if necessary
                     if data[col].max() >= 2 ** 53:
                         ws = precision_loss_doc.format("uint64", "float64")

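The 2 ** 53 threshold is the largest integer magnitude a float64 can hold without loss, hence the warning when data above it has to be stored as a double. A quick illustration (not part of the commit):

>>> float(2 ** 53 + 1) == float(2 ** 53)  # 2**53 + 1 has no exact float64 representation
True
>>> float(2 ** 53 - 1) == 2 ** 53 - 1     # everything below 2**53 converts losslessly
True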
@@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
         self.value_labels = list(zip(np.arange(len(categories)), categories))
         self.value_labels.sort(key=lambda x: x[0])
         self.text_len = 0
-        self.off: List[int] = []
-        self.val: List[int] = []
         self.txt: List[bytes] = []
         self.n = 0

         # Compute lengths and setup lists of offsets and labels
+        offsets: List[int] = []
+        values: List[int] = []
         for vl in self.value_labels:
             category = vl[1]
             if not isinstance(category, str):
@@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
                     ValueLabelTypeMismatch,
                 )
             category = category.encode(encoding)
-            self.off.append(self.text_len)
+            offsets.append(self.text_len)
             self.text_len += len(category) + 1  # +1 for the padding
-            self.val.append(vl[0])
+            values.append(vl[0])
             self.txt.append(category)
             self.n += 1
@@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
             )

         # Ensure int32
-        self.off = np.array(self.off, dtype=np.int32)
-        self.val = np.array(self.val, dtype=np.int32)
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)

         # Total length
         self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
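The pattern above — accumulate into plain Python lists, then assign the finished int32 arrays to self.off and self.val — keeps each attribute at a single type instead of switching from List[int] to ndarray mid-method. A minimal sketch of the same idiom (names are illustrative, not taken from the diff):

from typing import List

import numpy as np

offsets: List[int] = []
text_len = 0
for label in [b"low", b"medium", b"high"]:
    offsets.append(text_len)       # offset of this label in the text block
    text_len += len(label) + 1     # +1 for the trailing padding byte
off = np.array(offsets, dtype=np.int32)  # assigned once, always int32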
@@ -876,23 +876,23 @@ def __init__(self):
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
+            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
             + [
-                (251, np.int8),
-                (252, np.int16),
-                (253, np.int32),
-                (254, np.float32),
-                (255, np.float64),
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
             ]
         )
         self.DTYPE_MAP_XML = dict(
             [
-                (32768, np.uint8),  # Keys to GSO
-                (65526, np.float64),
-                (65527, np.float32),
-                (65528, np.int32),
-                (65529, np.int16),
-                (65530, np.int8),
+                (32768, np.dtype(np.uint8)),  # Keys to GSO
+                (65526, np.dtype(np.float64)),
+                (65527, np.dtype(np.float32)),
+                (65528, np.dtype(np.int32)),
+                (65529, np.dtype(np.int16)),
+                (65530, np.dtype(np.int8)),
             ]
         )
         self.TYPE_MAP = list(range(251)) + list("bhlfd")
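Wrapping the mapped values in np.dtype() keeps lookups and comparisons working exactly as before — NumPy scalar types compare equal to their dtype instances — while making every value in DTYPE_MAP a concrete np.dtype. A small check (a sketch, not from the commit):

>>> import numpy as np
>>> np.dtype(np.int8) == np.int8    # dtype instance still compares equal to the scalar type
True
>>> np.dtype("a5")                  # "a" + str(i) produces a fixed-width bytes dtype
dtype('S5')
>>> isinstance(np.float64, np.dtype)            # the bare scalar type is not a dtype instance
False
>>> isinstance(np.dtype(np.float64), np.dtype)  # the wrapped value is
True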
@@ -1050,9 +1050,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
-        if self._chunksize is not None and (
-            not isinstance(chunksize, int) or chunksize <= 0
-        ):
+        self._using_iterator = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        elif not isinstance(chunksize, int) or chunksize <= 0:
             raise ValueError("chunksize must be a positive integer when set.")

         # State variables for the file
@@ -1062,7 +1063,7 @@ def __init__(
         self._column_selector_set = False
         self._value_labels_read = False
         self._data_read = False
-        self._dtype = None
+        self._dtype: Optional[np.dtype] = None
         self._lines_read = 0

         self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None:
     # Get data type information, works for versions 117-119.
     def _get_dtypes(
         self, seek_vartypes: int
-    ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
+    ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:

         self.path_or_buf.seek(seek_vartypes)
         raw_typlist = [
@@ -1519,11 +1520,8 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va

     def __next__(self) -> DataFrame:
-        if self._chunksize is None:
-            raise ValueError(
-                "chunksize must be set to a positive integer to use as an iterator."
-            )
-        return self.read(nrows=self._chunksize or 1)
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)

     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
         """
@@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool:
         convert = False
         for col in data:
             dtype = data[col].dtype
-            if dtype in (np.float16, np.float32):
-                dtype = np.float64
+            if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                dtype = np.dtype(np.float64)
                 convert = True
-            elif dtype in (np.int8, np.int16, np.int32):
-                dtype = np.int64
+            elif dtype in (
+                np.dtype(np.int8),
+                np.dtype(np.int16),
+                np.dtype(np.int32),
+            ):
+                dtype = np.dtype(np.int64)
                 convert = True
             retyped_data.append((col, data[col].astype(dtype)))
         if convert:
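The upcasting itself is unchanged — float16/float32 become float64 and the small signed ints become int64 — only the literals are now np.dtype instances, so the membership test reads uniformly in dtype terms. Equality, and therefore `in`, gives the same answer either way (a quick check, not from the commit):

>>> import numpy as np
>>> s = np.array([1, 2, 3], dtype=np.int16)
>>> s.dtype in (np.int8, np.int16, np.int32)                                 # old spelling
True
>>> s.dtype in (np.dtype(np.int8), np.dtype(np.int16), np.dtype(np.int32))   # new spelling
True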
@@ -1807,14 +1809,14 @@ def _do_convert_categoricals(
                 keys = np.array(list(vl.keys()))
                 column = data[col]
                 key_matches = column.isin(keys)
-                if self._chunksize is not None and key_matches.all():
-                    initial_categories = keys
+                if self._using_iterator and key_matches.all():
+                    initial_categories: Optional[np.ndarray] = keys
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
                     # varying across chunks.
                 else:
-                    if self._chunksize is not None:
+                    if self._using_iterator:
                         # warn is using an iterator
                         warnings.warn(
                             categorical_conversion_warning, CategoricalConversionWarning
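Keeping the label keys as initial_categories while iterating means every chunk can be categorized against the same category set, so the resulting categorical dtype is identical across chunks. A minimal sketch of that idea (the names are illustrative, not the reader's internals):

import numpy as np
import pandas as pd

keys = np.array([1, 2, 3])                    # all values that carry a label
chunk1 = pd.Series([1, 1, 2])
chunk2 = pd.Series([3, 3])
c1 = pd.Categorical(chunk1, categories=keys)  # same categories for every chunk...
c2 = pd.Categorical(chunk2, categories=keys)
print(c1.dtype == c2.dtype)                   # ...so the dtypes match: True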
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
     else:
         raise NotImplementedError(f"Format {fmt} not implemented")
