diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi new file mode 100644 index 0000000000000..1051c319b769b --- /dev/null +++ b/pandas/_libs/parsers.pyi @@ -0,0 +1,77 @@ +from typing import ( + Hashable, + Literal, +) + +import numpy as np + +from pandas._typing import ( + ArrayLike, + Dtype, +) + +STR_NA_VALUES: set[str] + + +def sanitize_objects( + values: np.ndarray, # ndarray[object] + na_values: set, + convert_empty: bool = ..., +) -> int: ... + + +class TextReader: + unnamed_cols: set[str] + table_width: int # int64_t + leading_cols: int # int64_t + header: list[list[int]] # non-negative integers + + def __init__( + self, + source, + delimiter: bytes | str = ..., # single-character only + header=..., + header_start=..., + header_end=..., + index_col=..., + names=..., + tokenize_chunksize: int = ..., # int64_t + delim_whitespace: bool = ..., + converters=..., + skipinitialspace: bool = ..., + escapechar: bytes | str | None = ..., # single-character only + doublequote: bool = ..., + quotechar: str | bytes | None = ..., # at most 1 character + quoting: int = ..., + lineterminator: bytes | str | None = ..., # at most 1 character + comment=..., + decimal: bytes | str = ..., # single-character only + thousands: bytes | str | None = ..., # single-character only + dtype: Dtype | dict[Hashable, Dtype] = ..., + usecols=..., + error_bad_lines: bool = ..., + warn_bad_lines: bool = ..., + na_filter: bool = ..., + na_values=..., + na_fvalues=..., + keep_default_na: bool = ..., + true_values=..., + false_values=..., + allow_leading_cols: bool = ..., + low_memory: bool = ..., + skiprows=..., + skipfooter: int = ..., # int64_t + verbose: bool = ..., + mangle_dupe_cols: bool = ..., + float_precision: Literal["round_trip", "legacy", "high"] | None = ..., + skip_blank_lines: bool = ..., + encoding_errors: bytes | str = ... + ): ... + + def set_error_bad_lines(self, status: int) -> None: ... + def set_noconvert(self, i: int) -> None: ... + def remove_noconvert(self, i: int) -> None: ... + + def close(self) -> None: ... + + def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ... diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a11bf370412d2..153ac4b5f0893 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -319,19 +319,21 @@ cdef class TextReader: int64_t leading_cols, table_width, skipfooter, buffer_lines bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace - object delimiter, converters + object delimiter # bytes or str + object converters object na_values - object header, orig_header, names, header_start, header_end + object orig_header, names, header_start, header_end + list header # list[list[non-negative integers]] object index_col object skiprows object dtype object usecols list dtype_cast_order # list[np.dtype] - set unnamed_cols - set noconvert + set unnamed_cols # set[str] + set noconvert # set[int] def __cinit__(self, source, - delimiter=b',', + delimiter=b',', # bytes | str header=0, header_start=0, header_end=0, @@ -341,14 +343,14 @@ cdef class TextReader: bint delim_whitespace=False, converters=None, bint skipinitialspace=False, - escapechar=None, + escapechar=None, # bytes | str bint doublequote=True, quotechar=b'"', - quoting=0, - lineterminator=None, + quoting=0, # int + lineterminator=None, # bytes | str comment=None, - decimal=b'.', - thousands=None, + decimal=b'.', # bytes | str + thousands=None, # bytes | str dtype=None, usecols=None, bint error_bad_lines=True, @@ -362,7 +364,7 @@ cdef class TextReader: bint allow_leading_cols=True, bint low_memory=False, skiprows=None, - skipfooter=0, + skipfooter=0, # int64_t bint verbose=False, bint mangle_dupe_cols=True, float_precision=None, @@ -518,7 +520,7 @@ cdef class TextReader: self.parser.header_end = -1 self.parser.header = -1 self.parser_start = 0 - self.header = [] + prelim_header = [] else: if isinstance(header, list): if len(header) > 1: @@ -534,16 +536,19 @@ cdef class TextReader: self.parser_start = header[-1] + 1 self.parser.header_start = header[0] self.parser.header = header[0] - self.header = header + prelim_header = header else: self.parser.header_start = header self.parser.header_end = header self.parser_start = header + 1 self.parser.header = header - self.header = [ header ] + prelim_header = [ header ] self.names = names - self.header, self.table_width, self.unnamed_cols = self._get_header() + header, table_width, unnamed_cols = self._get_header(prelim_header) + self.header = header + self.table_width = table_width + self.unnamed_cols = unnamed_cols if not self.table_width: raise EmptyDataError("No columns to parse from file") @@ -561,7 +566,7 @@ cdef class TextReader: self.close() parser_del(self.parser) - def close(self): + def close(self) -> None: # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -571,10 +576,10 @@ cdef class TextReader: kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status): + def set_error_bad_lines(self, int status) -> None: self.parser.error_bad_lines = status - def _set_quoting(self, quote_char, quoting): + def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') @@ -618,13 +623,13 @@ cdef class TextReader: self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source - cdef _get_header(self): + cdef _get_header(self, list prelim_header): # header is now a list of lists, so field_count should use header[0] cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count, level char *word - object name, old_name + str name, old_name uint64_t hr, data_line = 0 list header = [] set unnamed_cols = set() @@ -632,7 +637,7 @@ cdef class TextReader: if self.parser.header_start >= 0: # Header is in the file - for level, hr in enumerate(self.header): + for level, hr in enumerate(prelim_header): this_header = [] @@ -697,7 +702,7 @@ cdef class TextReader: # If we have grabbed an extra line, but it's not in our # format, save in the buffer, and create an blank extra # line for the rest of the parsing code. - if hr == self.header[-1]: + if hr == prelim_header[-1]: lc = len(this_header) ic = (len(self.index_col) if self.index_col is not None else 0) @@ -764,7 +769,7 @@ cdef class TextReader: return header, field_count, unnamed_cols - def read(self, rows=None): + def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]: """ rows=None --> read all rows """ @@ -777,6 +782,7 @@ cdef class TextReader: return columns + # -> dict[int, "ArrayLike"] cdef _read_low_memory(self, rows): cdef: size_t rows_read = 0 @@ -830,6 +836,7 @@ cdef class TextReader: if status < 0: raise_parser_error('Error tokenizing data', self.parser) + # -> dict[int, "ArrayLike"] cdef _read_rows(self, rows, bint trim): cdef: int64_t buffered_lines @@ -889,13 +896,16 @@ cdef class TextReader: elapsed = time.time() - self.clocks.pop(-1) print(f'{what} took: {elapsed * 1000:.2f} ms') - def set_noconvert(self, i): + def set_noconvert(self, i: int) -> None: self.noconvert.add(i) - def remove_noconvert(self, i): + def remove_noconvert(self, i: int) -> None: self.noconvert.remove(i) - def _convert_column_data(self, rows=None, upcast_na=False, footer=0): + # TODO: upcast_na only ever False, footer never passed + def _convert_column_data( + self, rows: int | None = None, upcast_na: bool = False, footer: int = 0 + ) -> dict[int, "ArrayLike"]: cdef: int64_t i int nused @@ -904,6 +914,7 @@ cdef class TextReader: object name, na_flist, col_dtype = None bint na_filter = 0 int64_t num_cols + dict result start = self.parser_start @@ -1020,6 +1031,7 @@ cdef class TextReader: return results + # -> tuple["ArrayLike", int]: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_starts_t *na_hashset, @@ -1181,13 +1193,14 @@ cdef class TextReader: else: raise TypeError(f"the dtype {dtype} is not supported for parsing") + # -> tuple[ndarray[object], int] cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): return _string_box_utf8(self.parser, i, start, end, na_filter, na_hashset, self.encoding_errors) - def _get_converter(self, i, name): + def _get_converter(self, i: int, name): if self.converters is None: return None @@ -1197,7 +1210,7 @@ cdef class TextReader: # Converter for position, if any return self.converters.get(i) - cdef _get_na_list(self, i, name): + cdef _get_na_list(self, Py_ssize_t i, name): if self.na_values is None: return None, set() @@ -1319,6 +1332,7 @@ def _maybe_upcast(arr): # Type conversions / inference support code +# -> tuple[ndarray[object], int] cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, @@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col, return np.asarray(codes), result, na_count +# -> ndarray[f'|S{width}'] cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: @@ -1473,6 +1488,7 @@ cdef: char* cneginfty = b'-Infinity' +# -> tuple[ndarray[float64_t], int] | tuple[None, None] cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, object na_flist): @@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col, float64_t *data float64_t NA = na_values[np.float64] kh_float64_t *na_fset - ndarray result + ndarray[float64_t] result bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start @@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, return 0 +# -> tuple[ndarray[bool], int] cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, @@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser): raise ParserError(message) -def _concatenate_chunks(list chunks): +# chunks: list[dict[int, "ArrayLike"]] +# -> dict[int, "ArrayLike"] +def _concatenate_chunks(list chunks) -> dict: cdef: list names = list(chunks[0].keys()) object name @@ -1964,6 +1983,7 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] +# -> ArrayLike cdef _apply_converter(object f, parser_t *parser, int64_t col, int64_t line_start, int64_t line_end): cdef: @@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col, return lib.maybe_convert_objects(result) -def _maybe_encode(values): +cdef list _maybe_encode(list values): if values is None: return [] return [x.encode('utf-8') if isinstance(x, str) else x for x in values] +# TODO: only ever called with convert_empty=False def sanitize_objects(ndarray[object] values, set na_values, - bint convert_empty=True): + bint convert_empty=True) -> int: """ Convert specified values, including the given set na_values and empty strings if convert_empty is True, to np.nan. @@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values, values : ndarray[object] na_values : set convert_empty : bool, default True + + Returns + ------- + na_count : int """ cdef: Py_ssize_t i, n diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi new file mode 100644 index 0000000000000..a32fe2f0f8b03 --- /dev/null +++ b/pandas/_libs/window/indexers.pyi @@ -0,0 +1,13 @@ +import numpy as np + +def calculate_variable_window_bounds( + num_values: int, # int64_t + window_size: int, # int64_t + min_periods, + center: bool, + closed: str | None, + index: np.ndarray, # const int64_t[:] +) -> tuple[ + np.ndarray, # np.ndarray[np.int64] + np.ndarray, # np.ndarray[np.int64] +]: ... diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 5e2b137db64a6..d188770576e05 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -15,7 +15,7 @@ def calculate_variable_window_bounds( int64_t window_size, object min_periods, # unused but here to match get_window_bounds signature bint center, - object closed, + str closed, const int64_t[:] index ): """ diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 2cf68fc8995ee..1ad80b2e4c908 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -113,8 +113,17 @@ def get_window_bounds( closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: + # error: Argument 4 to "calculate_variable_window_bounds" has incompatible + # type "Optional[bool]"; expected "bool" + # error: Argument 6 to "calculate_variable_window_bounds" has incompatible + # type "Optional[ndarray]"; expected "ndarray" return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array + num_values, + self.window_size, + min_periods, + center, # type: ignore[arg-type] + closed, + self.index_array, # type: ignore[arg-type] ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 8ba38a44ecd2e..915a17fc702c3 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -12,6 +12,7 @@ Hashable, Iterator, Sequence, + cast, ) import numpy as np @@ -42,6 +43,8 @@ class CSVFormatter: + cols: np.ndarray + def __init__( self, formatter: DataFrameFormatter, @@ -136,9 +139,7 @@ def _initialize_quotechar(self, quotechar: str | None) -> str | None: def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - def _initialize_columns( - self, cols: Sequence[Hashable] | None - ) -> Sequence[Hashable]: + def _initialize_columns(self, cols: Sequence[Hashable] | None) -> np.ndarray: # validate mi options if self.has_mi_columns: if cols is not None: @@ -155,10 +156,7 @@ def _initialize_columns( # update columns to include possible multiplicity of dupes # and make sure cols is just a list of labels new_cols = self.obj.columns - if isinstance(new_cols, ABCIndex): - return new_cols._format_native_types(**self._number_format) - else: - return list(new_cols) + return new_cols._format_native_types(**self._number_format) def _initialize_chunksize(self, chunksize: int | None) -> int: if chunksize is None: @@ -214,7 +212,9 @@ def write_cols(self) -> Sequence[Hashable]: else: return self.header else: - return self.cols + # self.cols is an ndarray derived from Index._format_native_types, + # so its entries are strings, i.e. hashable + return cast(Sequence[Hashable], self.cols) @property def encoded_labels(self) -> list[Hashable]: @@ -308,12 +308,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: data = [res.iget_values(i) for i in range(len(res.items))] ix = self.data_index[slicer]._format_native_types(**self._number_format) - # error: Argument 4 to "write_csv_rows" has incompatible type - # "Sequence[Hashable]"; expected "ndarray" libwriters.write_csv_rows( data, ix, self.nlevels, - self.cols, # type: ignore[arg-type] + self.cols, self.writer, ) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 135e093cdc1e0..8305ff64c42c6 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -213,14 +213,14 @@ def read(self, nrows=None): names = self._maybe_dedup_names(names) # rename dict keys - data = sorted(data.items()) - data = {k: v for k, (i, v) in zip(names, data)} + data_tups = sorted(data.items()) + data = {k: v for k, (i, v) in zip(names, data_tups)} names, data = self._do_date_conversions(names, data) else: # rename dict keys - data = sorted(data.items()) + data_tups = sorted(data.items()) # ugh, mutation @@ -233,9 +233,9 @@ def read(self, nrows=None): names = self._filter_usecols(names) # columns as list - alldata = [x[1] for x in data] + alldata = [x[1] for x in data_tups] - data = {k: v for k, (i, v) in zip(names, data)} + data = {k: v for k, (i, v) in zip(names, data_tups)} names, data = self._do_date_conversions(names, data) index, names = self._make_index(data, alldata, names)