Skip to content

Commit cf73413

Browse files
jbrockmendel authored and yeshsurya committed
TYP: parsers.pyi (pandas-dev#40508)
1 parent 786fb90 commit cf73413

File tree

7 files changed: +172 / -50 lines changed

pandas/_libs/parsers.pyi

+77
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
from typing import (
2+
Hashable,
3+
Literal,
4+
)
5+
6+
import numpy as np
7+
8+
from pandas._typing import (
9+
ArrayLike,
10+
Dtype,
11+
)
12+
13+
STR_NA_VALUES: set[str]
14+
15+
16+
def sanitize_objects(
17+
values: np.ndarray, # ndarray[object]
18+
na_values: set,
19+
convert_empty: bool = ...,
20+
) -> int: ...
21+
22+
23+
class TextReader:
24+
unnamed_cols: set[str]
25+
table_width: int # int64_t
26+
leading_cols: int # int64_t
27+
header: list[list[int]] # non-negative integers
28+
29+
def __init__(
30+
self,
31+
source,
32+
delimiter: bytes | str = ..., # single-character only
33+
header=...,
34+
header_start=...,
35+
header_end=...,
36+
index_col=...,
37+
names=...,
38+
tokenize_chunksize: int = ..., # int64_t
39+
delim_whitespace: bool = ...,
40+
converters=...,
41+
skipinitialspace: bool = ...,
42+
escapechar: bytes | str | None = ..., # single-character only
43+
doublequote: bool = ...,
44+
quotechar: str | bytes | None = ..., # at most 1 character
45+
quoting: int = ...,
46+
lineterminator: bytes | str | None = ..., # at most 1 character
47+
comment=...,
48+
decimal: bytes | str = ..., # single-character only
49+
thousands: bytes | str | None = ..., # single-character only
50+
dtype: Dtype | dict[Hashable, Dtype] = ...,
51+
usecols=...,
52+
error_bad_lines: bool = ...,
53+
warn_bad_lines: bool = ...,
54+
na_filter: bool = ...,
55+
na_values=...,
56+
na_fvalues=...,
57+
keep_default_na: bool = ...,
58+
true_values=...,
59+
false_values=...,
60+
allow_leading_cols: bool = ...,
61+
low_memory: bool = ...,
62+
skiprows=...,
63+
skipfooter: int = ..., # int64_t
64+
verbose: bool = ...,
65+
mangle_dupe_cols: bool = ...,
66+
float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
67+
skip_blank_lines: bool = ...,
68+
encoding_errors: bytes | str = ...
69+
): ...
70+
71+
def set_error_bad_lines(self, status: int) -> None: ...
72+
def set_noconvert(self, i: int) -> None: ...
73+
def remove_noconvert(self, i: int) -> None: ...
74+
75+
def close(self) -> None: ...
76+
77+
def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...

pandas/_libs/parsers.pyx

+57 / -32
Original file line number | Diff line number | Diff line change
@@ -319,19 +319,21 @@ cdef class TextReader:
319319
int64_t leading_cols, table_width, skipfooter, buffer_lines
320320
bint allow_leading_cols, mangle_dupe_cols, low_memory
321321
bint delim_whitespace
322-
object delimiter, converters
322+
object delimiter # bytes or str
323+
object converters
323324
object na_values
324-
object header, orig_header, names, header_start, header_end
325+
object orig_header, names, header_start, header_end
326+
list header # list[list[non-negative integers]]
325327
object index_col
326328
object skiprows
327329
object dtype
328330
object usecols
329331
list dtype_cast_order # list[np.dtype]
330-
set unnamed_cols
331-
set noconvert
332+
set unnamed_cols # set[str]
333+
set noconvert # set[int]
332334

333335
def __cinit__(self, source,
334-
delimiter=b',',
336+
delimiter=b',', # bytes | str
335337
header=0,
336338
header_start=0,
337339
header_end=0,
@@ -341,14 +343,14 @@ cdef class TextReader:
341343
bint delim_whitespace=False,
342344
converters=None,
343345
bint skipinitialspace=False,
344-
escapechar=None,
346+
escapechar=None, # bytes | str
345347
bint doublequote=True,
346348
quotechar=b'"',
347-
quoting=0,
348-
lineterminator=None,
349+
quoting=0, # int
350+
lineterminator=None, # bytes | str
349351
comment=None,
350-
decimal=b'.',
351-
thousands=None,
352+
decimal=b'.', # bytes | str
353+
thousands=None, # bytes | str
352354
dtype=None,
353355
usecols=None,
354356
bint error_bad_lines=True,
@@ -362,7 +364,7 @@ cdef class TextReader:
362364
bint allow_leading_cols=True,
363365
bint low_memory=False,
364366
skiprows=None,
365-
skipfooter=0,
367+
skipfooter=0, # int64_t
366368
bint verbose=False,
367369
bint mangle_dupe_cols=True,
368370
float_precision=None,
@@ -518,7 +520,7 @@ cdef class TextReader:
518520
self.parser.header_end = -1
519521
self.parser.header = -1
520522
self.parser_start = 0
521-
self.header = []
523+
prelim_header = []
522524
else:
523525
if isinstance(header, list):
524526
if len(header) > 1:
@@ -534,16 +536,19 @@ cdef class TextReader:
534536
self.parser_start = header[-1] + 1
535537
self.parser.header_start = header[0]
536538
self.parser.header = header[0]
537-
self.header = header
539+
prelim_header = header
538540
else:
539541
self.parser.header_start = header
540542
self.parser.header_end = header
541543
self.parser_start = header + 1
542544
self.parser.header = header
543-
self.header = [ header ]
545+
prelim_header = [ header ]
544546

545547
self.names = names
546-
self.header, self.table_width, self.unnamed_cols = self._get_header()
548+
header, table_width, unnamed_cols = self._get_header(prelim_header)
549+
self.header = header
550+
self.table_width = table_width
551+
self.unnamed_cols = unnamed_cols
547552

548553
if not self.table_width:
549554
raise EmptyDataError("No columns to parse from file")
@@ -561,7 +566,7 @@ cdef class TextReader:
561566
self.close()
562567
parser_del(self.parser)
563568

564-
def close(self):
569+
def close(self) -> None:
565570
# also preemptively free all allocated memory
566571
parser_free(self.parser)
567572
if self.true_set:
@@ -571,10 +576,10 @@ cdef class TextReader:
571576
kh_destroy_str_starts(self.false_set)
572577
self.false_set = NULL
573578

574-
def set_error_bad_lines(self, int status):
579+
def set_error_bad_lines(self, int status) -> None:
575580
self.parser.error_bad_lines = status
576581

577-
def _set_quoting(self, quote_char, quoting):
582+
def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
578583
if not isinstance(quoting, int):
579584
raise TypeError('"quoting" must be an integer')
580585

@@ -618,21 +623,21 @@ cdef class TextReader:
618623
self.parser.cb_io = &buffer_rd_bytes
619624
self.parser.cb_cleanup = &del_rd_source
620625

621-
cdef _get_header(self):
626+
cdef _get_header(self, list prelim_header):
622627
# header is now a list of lists, so field_count should use header[0]
623628

624629
cdef:
625630
Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
626631
char *word
627-
object name, old_name
632+
str name, old_name
628633
uint64_t hr, data_line = 0
629634
list header = []
630635
set unnamed_cols = set()
631636

632637
if self.parser.header_start >= 0:
633638

634639
# Header is in the file
635-
for level, hr in enumerate(self.header):
640+
for level, hr in enumerate(prelim_header):
636641

637642
this_header = []
638643

@@ -697,7 +702,7 @@ cdef class TextReader:
697702
# If we have grabbed an extra line, but it's not in our
698703
# format, save in the buffer, and create an blank extra
699704
# line for the rest of the parsing code.
700-
if hr == self.header[-1]:
705+
if hr == prelim_header[-1]:
701706
lc = len(this_header)
702707
ic = (len(self.index_col) if self.index_col
703708
is not None else 0)
@@ -764,7 +769,7 @@ cdef class TextReader:
764769

765770
return header, field_count, unnamed_cols
766771

767-
def read(self, rows=None):
772+
def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
768773
"""
769774
rows=None --> read all rows
770775
"""
@@ -777,6 +782,7 @@ cdef class TextReader:
777782

778783
return columns
779784

785+
# -> dict[int, "ArrayLike"]
780786
cdef _read_low_memory(self, rows):
781787
cdef:
782788
size_t rows_read = 0
@@ -830,6 +836,7 @@ cdef class TextReader:
830836
if status < 0:
831837
raise_parser_error('Error tokenizing data', self.parser)
832838

839+
# -> dict[int, "ArrayLike"]
833840
cdef _read_rows(self, rows, bint trim):
834841
cdef:
835842
int64_t buffered_lines
@@ -889,13 +896,16 @@ cdef class TextReader:
889896
elapsed = time.time() - self.clocks.pop(-1)
890897
print(f'{what} took: {elapsed * 1000:.2f} ms')
891898

892-
def set_noconvert(self, i):
899+
def set_noconvert(self, i: int) -> None:
893900
self.noconvert.add(i)
894901

895-
def remove_noconvert(self, i):
902+
def remove_noconvert(self, i: int) -> None:
896903
self.noconvert.remove(i)
897904

898-
def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
905+
# TODO: upcast_na only ever False, footer never passed
906+
def _convert_column_data(
907+
self, rows: int | None = None, upcast_na: bool = False, footer: int = 0
908+
) -> dict[int, "ArrayLike"]:
899909
cdef:
900910
int64_t i
901911
int nused
@@ -904,6 +914,7 @@ cdef class TextReader:
904914
object name, na_flist, col_dtype = None
905915
bint na_filter = 0
906916
int64_t num_cols
917+
dict result
907918

908919
start = self.parser_start
909920

@@ -1020,6 +1031,7 @@ cdef class TextReader:
10201031

10211032
return results
10221033

1034+
# -> tuple["ArrayLike", int]:
10231035
cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end,
10241036
object name, bint na_filter,
10251037
kh_str_starts_t *na_hashset,
@@ -1181,13 +1193,14 @@ cdef class TextReader:
11811193
else:
11821194
raise TypeError(f"the dtype {dtype} is not supported for parsing")
11831195

1196+
# -> tuple[ndarray[object], int]
11841197
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
11851198
bint na_filter, kh_str_starts_t *na_hashset):
11861199

11871200
return _string_box_utf8(self.parser, i, start, end, na_filter,
11881201
na_hashset, self.encoding_errors)
11891202

1190-
def _get_converter(self, i, name):
1203+
def _get_converter(self, i: int, name):
11911204
if self.converters is None:
11921205
return None
11931206

@@ -1197,7 +1210,7 @@ cdef class TextReader:
11971210
# Converter for position, if any
11981211
return self.converters.get(i)
11991212

1200-
cdef _get_na_list(self, i, name):
1213+
cdef _get_na_list(self, Py_ssize_t i, name):
12011214
if self.na_values is None:
12021215
return None, set()
12031216

@@ -1319,6 +1332,7 @@ def _maybe_upcast(arr):
13191332
# Type conversions / inference support code
13201333

13211334

1335+
# -> tuple[ndarray[object], int]
13221336
cdef _string_box_utf8(parser_t *parser, int64_t col,
13231337
int64_t line_start, int64_t line_end,
13241338
bint na_filter, kh_str_starts_t *na_hashset,
@@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
14321446
return np.asarray(codes), result, na_count
14331447

14341448

1449+
# -> ndarray[f'|S{width}']
14351450
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
14361451
int64_t line_end, int64_t width):
14371452
cdef:
@@ -1473,6 +1488,7 @@ cdef:
14731488
char* cneginfty = b'-Infinity'
14741489

14751490

1491+
# -> tuple[ndarray[float64_t], int] | tuple[None, None]
14761492
cdef _try_double(parser_t *parser, int64_t col,
14771493
int64_t line_start, int64_t line_end,
14781494
bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
@@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col,
14821498
float64_t *data
14831499
float64_t NA = na_values[np.float64]
14841500
kh_float64_t *na_fset
1485-
ndarray result
1501+
ndarray[float64_t] result
14861502
bint use_na_flist = len(na_flist) > 0
14871503

14881504
lines = line_end - line_start
@@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
17121728
return 0
17131729

17141730

1731+
# -> tuple[ndarray[bool], int]
17151732
cdef _try_bool_flex(parser_t *parser, int64_t col,
17161733
int64_t line_start, int64_t line_end,
17171734
bint na_filter, const kh_str_starts_t *na_hashset,
@@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser):
18901907
raise ParserError(message)
18911908

18921909

1893-
def _concatenate_chunks(list chunks):
1910+
# chunks: list[dict[int, "ArrayLike"]]
1911+
# -> dict[int, "ArrayLike"]
1912+
def _concatenate_chunks(list chunks) -> dict:
18941913
cdef:
18951914
list names = list(chunks[0].keys())
18961915
object name
@@ -1964,6 +1983,7 @@ for k in list(na_values):
19641983
na_values[np.dtype(k)] = na_values[k]
19651984

19661985

1986+
# -> ArrayLike
19671987
cdef _apply_converter(object f, parser_t *parser, int64_t col,
19681988
int64_t line_start, int64_t line_end):
19691989
cdef:
@@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
19862006
return lib.maybe_convert_objects(result)
19872007

19882008

1989-
def _maybe_encode(values):
2009+
cdef list _maybe_encode(list values):
19902010
if values is None:
19912011
return []
19922012
return [x.encode('utf-8') if isinstance(x, str) else x for x in values]
19932013

19942014

2015+
# TODO: only ever called with convert_empty=False
19952016
def sanitize_objects(ndarray[object] values, set na_values,
1996-
bint convert_empty=True):
2017+
bint convert_empty=True) -> int:
19972018
"""
19982019
Convert specified values, including the given set na_values and empty
19992020
strings if convert_empty is True, to np.nan.
@@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values,
20032024
values : ndarray[object]
20042025
na_values : set
20052026
convert_empty : bool, default True
2027+
2028+
Returns
2029+
-------
2030+
na_count : int
20062031
"""
20072032
cdef:
20082033
Py_ssize_t i, n

0 commit comments

Comments
 (0)