@@ -319,19 +319,21 @@ cdef class TextReader:
         int64_t leading_cols, table_width, skipfooter, buffer_lines
         bint allow_leading_cols, mangle_dupe_cols, low_memory
         bint delim_whitespace
-        object delimiter, converters
+        object delimiter  # bytes or str
+        object converters
         object na_values
-        object header, orig_header, names, header_start, header_end
+        object orig_header, names, header_start, header_end
+        list header  # list[list[non-negative integers]]
         object index_col
         object skiprows
         object dtype
         object usecols
         list dtype_cast_order  # list[np.dtype]
-        set unnamed_cols
-        set noconvert
+        set unnamed_cols  # set[str]
+        set noconvert  # set[int]

     def __cinit__(self, source,
-                  delimiter=b',',
+                  delimiter=b',',  # bytes | str
                   header=0,
                   header_start=0,
                   header_end=0,
@@ -341,14 +343,14 @@ cdef class TextReader:
                   bint delim_whitespace=False,
                   converters=None,
                   bint skipinitialspace=False,
-                  escapechar=None,
+                  escapechar=None,  # bytes | str
                   bint doublequote=True,
                   quotechar=b'"',
-                  quoting=0,
-                  lineterminator=None,
+                  quoting=0,  # int
+                  lineterminator=None,  # bytes | str
                   comment=None,
-                  decimal=b'.',
-                  thousands=None,
+                  decimal=b'.',  # bytes | str
+                  thousands=None,  # bytes | str
                   dtype=None,
                   usecols=None,
                   bint error_bad_lines=True,
@@ -362,7 +364,7 @@ cdef class TextReader:
                   bint allow_leading_cols=True,
                   bint low_memory=False,
                   skiprows=None,
-                  skipfooter=0,
+                  skipfooter=0,  # int64_t
                   bint verbose=False,
                   bint mangle_dupe_cols=True,
                   float_precision=None,
@@ -518,7 +520,7 @@ cdef class TextReader:
             self.parser.header_end = -1
             self.parser.header = -1
             self.parser_start = 0
-            self.header = []
+            prelim_header = []
         else:
             if isinstance(header, list):
                 if len(header) > 1:
@@ -534,16 +536,19 @@ cdef class TextReader:
                 self.parser_start = header[-1] + 1
                 self.parser.header_start = header[0]
                 self.parser.header = header[0]
-                self.header = header
+                prelim_header = header
             else:
                 self.parser.header_start = header
                 self.parser.header_end = header
                 self.parser_start = header + 1
                 self.parser.header = header
-                self.header = [header]
+                prelim_header = [header]

         self.names = names
-        self.header, self.table_width, self.unnamed_cols = self._get_header()
+        header, table_width, unnamed_cols = self._get_header(prelim_header)
+        self.header = header
+        self.table_width = table_width
+        self.unnamed_cols = unnamed_cols

         if not self.table_width:
             raise EmptyDataError("No columns to parse from file")
@@ -561,7 +566,7 @@ cdef class TextReader:
         self.close()
         parser_del(self.parser)

-    def close(self):
+    def close(self) -> None:
         # also preemptively free all allocated memory
         parser_free(self.parser)
         if self.true_set:
@@ -571,10 +576,10 @@ cdef class TextReader:
             kh_destroy_str_starts(self.false_set)
             self.false_set = NULL

-    def set_error_bad_lines(self, int status):
+    def set_error_bad_lines(self, int status) -> None:
         self.parser.error_bad_lines = status

-    def _set_quoting(self, quote_char, quoting):
+    def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
         if not isinstance(quoting, int):
             raise TypeError('"quoting" must be an integer')

@@ -618,21 +623,21 @@ cdef class TextReader:
         self.parser.cb_io = &buffer_rd_bytes
         self.parser.cb_cleanup = &del_rd_source

-    cdef _get_header(self):
+    cdef _get_header(self, list prelim_header):
         # header is now a list of lists, so field_count should use header[0]

         cdef:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
             char *word
-            object name, old_name
+            str name, old_name
             uint64_t hr, data_line = 0
             list header = []
             set unnamed_cols = set()

         if self.parser.header_start >= 0:

             # Header is in the file
-            for level, hr in enumerate(self.header):
+            for level, hr in enumerate(prelim_header):

                 this_header = []

@@ -697,7 +702,7 @@ cdef class TextReader:
                 # If we have grabbed an extra line, but it's not in our
                 # format, save in the buffer, and create an blank extra
                 # line for the rest of the parsing code.
-                if hr == self.header[-1]:
+                if hr == prelim_header[-1]:
                     lc = len(this_header)
                     ic = (len(self.index_col) if self.index_col
                           is not None else 0)
@@ -764,7 +769,7 @@ cdef class TextReader:

         return header, field_count, unnamed_cols

-    def read(self, rows=None):
+    def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
         """
         rows=None --> read all rows
         """
@@ -777,6 +782,7 @@ cdef class TextReader:

         return columns

+    # -> dict[int, "ArrayLike"]
     cdef _read_low_memory(self, rows):
         cdef:
             size_t rows_read = 0
@@ -830,6 +836,7 @@ cdef class TextReader:
         if status < 0:
             raise_parser_error('Error tokenizing data', self.parser)

+    # -> dict[int, "ArrayLike"]
     cdef _read_rows(self, rows, bint trim):
         cdef:
             int64_t buffered_lines
@@ -889,13 +896,16 @@ cdef class TextReader:
             elapsed = time.time() - self.clocks.pop(-1)
             print(f'{what} took: {elapsed * 1000:.2f} ms')

-    def set_noconvert(self, i):
+    def set_noconvert(self, i: int) -> None:
         self.noconvert.add(i)

-    def remove_noconvert(self, i):
+    def remove_noconvert(self, i: int) -> None:
         self.noconvert.remove(i)

-    def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
+    # TODO: upcast_na only ever False, footer never passed
+    def _convert_column_data(
+        self, rows: int | None = None, upcast_na: bool = False, footer: int = 0
+    ) -> dict[int, "ArrayLike"]:
         cdef:
             int64_t i
             int nused
@@ -904,6 +914,7 @@
             object name, na_flist, col_dtype = None
             bint na_filter = 0
             int64_t num_cols
+            dict result

         start = self.parser_start

@@ -1020,6 +1031,7 @@ cdef class TextReader:

         return results

+    # -> tuple["ArrayLike", int]:
     cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
                                 object name, bint na_filter,
                                 kh_str_starts_t *na_hashset,
@@ -1181,13 +1193,14 @@ cdef class TextReader:
         else:
             raise TypeError(f"the dtype {dtype} is not supported for parsing")

+    # -> tuple[ndarray[object], int]
     cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                          bint na_filter, kh_str_starts_t *na_hashset):

         return _string_box_utf8(self.parser, i, start, end, na_filter,
                                 na_hashset, self.encoding_errors)

-    def _get_converter(self, i, name):
+    def _get_converter(self, i: int, name):
         if self.converters is None:
             return None

@@ -1197,7 +1210,7 @@ cdef class TextReader:
         # Converter for position, if any
         return self.converters.get(i)

-    cdef _get_na_list(self, i, name):
+    cdef _get_na_list(self, Py_ssize_t i, name):
         if self.na_values is None:
             return None, set()

@@ -1319,6 +1332,7 @@ def _maybe_upcast(arr):
 # Type conversions / inference support code


+# -> tuple[ndarray[object], int]
 cdef _string_box_utf8(parser_t *parser, int64_t col,
                       int64_t line_start, int64_t line_end,
                       bint na_filter, kh_str_starts_t *na_hashset,
@@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
     return np.asarray(codes), result, na_count


+# -> ndarray[f'|S{width}']
 cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                    int64_t line_end, int64_t width):
     cdef:
@@ -1473,6 +1488,7 @@ cdef:
     char *cneginfty = b'-Infinity'


+# -> tuple[ndarray[float64_t], int] | tuple[None, None]
 cdef _try_double(parser_t *parser, int64_t col,
                  int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
@@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col,
         float64_t *data
         float64_t NA = na_values[np.float64]
         kh_float64_t *na_fset
-        ndarray result
+        ndarray[float64_t] result
         bint use_na_flist = len(na_flist) > 0

     lines = line_end - line_start
@@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
     return 0


+# -> tuple[ndarray[bool], int]
 cdef _try_bool_flex(parser_t *parser, int64_t col,
                     int64_t line_start, int64_t line_end,
                     bint na_filter, const kh_str_starts_t *na_hashset,
@@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser):
     raise ParserError(message)


-def _concatenate_chunks(list chunks):
+# chunks: list[dict[int, "ArrayLike"]]
+# -> dict[int, "ArrayLike"]
+def _concatenate_chunks(list chunks) -> dict:
     cdef:
         list names = list(chunks[0].keys())
         object name
@@ -1964,6 +1983,7 @@ for k in list(na_values):
     na_values[np.dtype(k)] = na_values[k]


+# -> ArrayLike
 cdef _apply_converter(object f, parser_t *parser, int64_t col,
                       int64_t line_start, int64_t line_end):
     cdef:
@@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
     return lib.maybe_convert_objects(result)


-def _maybe_encode(values):
+cdef list _maybe_encode(list values):
     if values is None:
         return []
     return [x.encode('utf-8') if isinstance(x, str) else x for x in values]


+# TODO: only ever called with convert_empty=False
 def sanitize_objects(ndarray[object] values, set na_values,
-                     bint convert_empty=True):
+                     bint convert_empty=True) -> int:
     """
     Convert specified values, including the given set na_values and empty
     strings if convert_empty is True, to np.nan.
@@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values,
     values : ndarray[object]
     na_values : set
     convert_empty : bool, default True
+
+    Returns
+    -------
+    na_count : int
     """
     cdef:
         Py_ssize_t i, n
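
For orientation, the dict[int, "ArrayLike"] shape that recurs in the annotations above is a mapping from column position to a 1-D array, and _concatenate_chunks stitches a list of such per-chunk dicts back together key by key. The snippet below is a rough standalone sketch of that layout only: the names are made up, plain NumPy arrays stand in for ArrayLike, and it is not the pandas implementation, which additionally handles dtype coercion and DtypeWarning.

    import numpy as np

    def concatenate_chunks_sketch(chunks):
        # chunks: list[dict[int, np.ndarray]] -- each dict maps a column
        # position to the values parsed for that column in one row-chunk.
        keys = list(chunks[0].keys())
        # Concatenate each column across chunks, keyed by column position.
        return {k: np.concatenate([chunk[k] for chunk in chunks]) for k in keys}

    # Two row-chunks of the same two columns, combined into one dict.
    part1 = {0: np.array([1, 2]), 1: np.array(["a", "b"], dtype=object)}
    part2 = {0: np.array([3]), 1: np.array(["c"], dtype=object)}
    combined = concatenate_chunks_sketch([part1, part2])
    assert combined[0].tolist() == [1, 2, 3]
    assert combined[1].tolist() == ["a", "b", "c"]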