From bc296ce325369077773e2421dbe424b364c8f701 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Mar 2021 16:16:44 -0700 Subject: [PATCH 1/4] TYP: parsers.pyi --- pandas/_libs/parsers.pyi | 77 +++++++++++++++++++++++ pandas/_libs/parsers.pyx | 89 +++++++++++++++++---------- pandas/_libs/window/indexers.pyi | 13 ++++ pandas/_libs/writers.pyi | 24 ++++++++ pandas/_libs/writers.pyx | 6 +- pandas/core/window/indexers.py | 9 ++- pandas/io/formats/csvs.py | 16 ++--- pandas/io/parsers/c_parser_wrapper.py | 10 +-- pandas/io/pytables.py | 2 +- 9 files changed, 195 insertions(+), 51 deletions(-) create mode 100644 pandas/_libs/parsers.pyi create mode 100644 pandas/_libs/window/indexers.pyi create mode 100644 pandas/_libs/writers.pyi diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi new file mode 100644 index 0000000000000..208a76b70a153 --- /dev/null +++ b/pandas/_libs/parsers.pyi @@ -0,0 +1,77 @@ +from typing import ( + Any, + Literal, +) + +import numpy as np + +from pandas._typing import ( + ArrayLike, + Dtype, +) + +STR_NA_VALUES: set[str] + + +def sanitize_objects( + values: np.ndarray, # ndarray[object] + na_values: set, + convert_empty: bool = ..., +) -> int: ... 
+ + +class TextReader: + unnamed_cols: set[str] + table_width: int # int64_t + leading_cols: int # int64_t + header: list[list[int]] # non-negative integers + + def __init__( + self, + source, + delimiter: bytes | str = ..., # single-character only + header=0, + header_start=0, + header_end=0, + index_col=..., + names=..., + tokenize_chunksize: int = ..., # int64_t + delim_whitespace: bool = ..., + converters=..., + skipinitialspace: bool = ..., + escapechar: bytes | str | None = ..., # single-character only + doublequote: bool = ..., + quotechar: str | bytes | None = ..., # at most 1 character + quoting: int = ..., + lineterminator: bytes | str | None = ..., # at most 1 character + comment=..., + decimal: bytes | str = ..., # single-character only + thousands: bytes | str | None = ..., # single-character only + dtype: Dtype | dict[Any, Dtype] = ..., + usecols=..., + error_bad_lines: bool = ..., + warn_bad_lines: bool = ..., + na_filter: bool = ..., + na_values=..., + na_fvalues=..., + keep_default_na: bool = ..., + true_values=..., + false_values=..., + allow_leading_cols: bool = ..., + low_memory: bool = ..., + skiprows=..., + skipfooter: int = ..., # int64_t + verbose: bool = ..., + mangle_dupe_cols: bool = ..., + float_precision: Literal["round_trip", "legacy", "high"] | None = ..., + skip_blank_lines: bool = ..., + encoding_errors: bytes | str = ... + ): ... + + def set_error_bad_lines(self, status: int) -> None: ... + def set_noconvert(self, i: int) -> None: ... + def remove_noconvert(self, i: int) -> None: ... + + def close(self) -> None: ... + + def read(self, rows: int | None = None) -> dict[int, ArrayLike]: ... 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a11bf370412d2..153ac4b5f0893 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -319,19 +319,21 @@ cdef class TextReader: int64_t leading_cols, table_width, skipfooter, buffer_lines bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace - object delimiter, converters + object delimiter # bytes or str + object converters object na_values - object header, orig_header, names, header_start, header_end + object orig_header, names, header_start, header_end + list header # list[list[non-negative integers]] object index_col object skiprows object dtype object usecols list dtype_cast_order # list[np.dtype] - set unnamed_cols - set noconvert + set unnamed_cols # set[str] + set noconvert # set[int] def __cinit__(self, source, - delimiter=b',', + delimiter=b',', # bytes | str header=0, header_start=0, header_end=0, @@ -341,14 +343,14 @@ cdef class TextReader: bint delim_whitespace=False, converters=None, bint skipinitialspace=False, - escapechar=None, + escapechar=None, # bytes | str bint doublequote=True, quotechar=b'"', - quoting=0, - lineterminator=None, + quoting=0, # int + lineterminator=None, # bytes | str comment=None, - decimal=b'.', - thousands=None, + decimal=b'.', # bytes | str + thousands=None, # bytes | str dtype=None, usecols=None, bint error_bad_lines=True, @@ -362,7 +364,7 @@ cdef class TextReader: bint allow_leading_cols=True, bint low_memory=False, skiprows=None, - skipfooter=0, + skipfooter=0, # int64_t bint verbose=False, bint mangle_dupe_cols=True, float_precision=None, @@ -518,7 +520,7 @@ cdef class TextReader: self.parser.header_end = -1 self.parser.header = -1 self.parser_start = 0 - self.header = [] + prelim_header = [] else: if isinstance(header, list): if len(header) > 1: @@ -534,16 +536,19 @@ cdef class TextReader: self.parser_start = header[-1] + 1 self.parser.header_start = header[0] self.parser.header = header[0] - self.header = header 
+ prelim_header = header else: self.parser.header_start = header self.parser.header_end = header self.parser_start = header + 1 self.parser.header = header - self.header = [ header ] + prelim_header = [ header ] self.names = names - self.header, self.table_width, self.unnamed_cols = self._get_header() + header, table_width, unnamed_cols = self._get_header(prelim_header) + self.header = header + self.table_width = table_width + self.unnamed_cols = unnamed_cols if not self.table_width: raise EmptyDataError("No columns to parse from file") @@ -561,7 +566,7 @@ cdef class TextReader: self.close() parser_del(self.parser) - def close(self): + def close(self) -> None: # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -571,10 +576,10 @@ cdef class TextReader: kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status): + def set_error_bad_lines(self, int status) -> None: self.parser.error_bad_lines = status - def _set_quoting(self, quote_char, quoting): + def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') @@ -618,13 +623,13 @@ cdef class TextReader: self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source - cdef _get_header(self): + cdef _get_header(self, list prelim_header): # header is now a list of lists, so field_count should use header[0] cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count, level char *word - object name, old_name + str name, old_name uint64_t hr, data_line = 0 list header = [] set unnamed_cols = set() @@ -632,7 +637,7 @@ cdef class TextReader: if self.parser.header_start >= 0: # Header is in the file - for level, hr in enumerate(self.header): + for level, hr in enumerate(prelim_header): this_header = [] @@ -697,7 +702,7 @@ cdef class TextReader: # If we have grabbed an extra line, but it's not in our # format, save in the buffer, and 
create an blank extra # line for the rest of the parsing code. - if hr == self.header[-1]: + if hr == prelim_header[-1]: lc = len(this_header) ic = (len(self.index_col) if self.index_col is not None else 0) @@ -764,7 +769,7 @@ cdef class TextReader: return header, field_count, unnamed_cols - def read(self, rows=None): + def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]: """ rows=None --> read all rows """ @@ -777,6 +782,7 @@ cdef class TextReader: return columns + # -> dict[int, "ArrayLike"] cdef _read_low_memory(self, rows): cdef: size_t rows_read = 0 @@ -830,6 +836,7 @@ cdef class TextReader: if status < 0: raise_parser_error('Error tokenizing data', self.parser) + # -> dict[int, "ArrayLike"] cdef _read_rows(self, rows, bint trim): cdef: int64_t buffered_lines @@ -889,13 +896,16 @@ cdef class TextReader: elapsed = time.time() - self.clocks.pop(-1) print(f'{what} took: {elapsed * 1000:.2f} ms') - def set_noconvert(self, i): + def set_noconvert(self, i: int) -> None: self.noconvert.add(i) - def remove_noconvert(self, i): + def remove_noconvert(self, i: int) -> None: self.noconvert.remove(i) - def _convert_column_data(self, rows=None, upcast_na=False, footer=0): + # TODO: upcast_na only ever False, footer never passed + def _convert_column_data( + self, rows: int | None = None, upcast_na: bool = False, footer: int = 0 + ) -> dict[int, "ArrayLike"]: cdef: int64_t i int nused @@ -904,6 +914,7 @@ cdef class TextReader: object name, na_flist, col_dtype = None bint na_filter = 0 int64_t num_cols + dict result start = self.parser_start @@ -1020,6 +1031,7 @@ cdef class TextReader: return results + # -> tuple["ArrayLike", int] cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_starts_t *na_hashset, @@ -1181,13 +1193,14 @@ cdef class TextReader: else: raise TypeError(f"the dtype {dtype} is not supported for parsing") + # -> tuple[ndarray[object], int] cdef _string_convert(self, Py_ssize_t i, int64_t start, 
int64_t end, bint na_filter, kh_str_starts_t *na_hashset): return _string_box_utf8(self.parser, i, start, end, na_filter, na_hashset, self.encoding_errors) - def _get_converter(self, i, name): + def _get_converter(self, i: int, name): if self.converters is None: return None @@ -1197,7 +1210,7 @@ cdef class TextReader: # Converter for position, if any return self.converters.get(i) - cdef _get_na_list(self, i, name): + cdef _get_na_list(self, Py_ssize_t i, name): if self.na_values is None: return None, set() @@ -1319,6 +1332,7 @@ def _maybe_upcast(arr): # Type conversions / inference support code +# -> tuple[ndarray[object], int] cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, @@ -1432,6 +1446,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col, return np.asarray(codes), result, na_count +# -> ndarray[f'|S{width}'] cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: @@ -1473,6 +1488,7 @@ cdef: char* cneginfty = b'-Infinity' +# -> tuple[ndarray[float64_t], int] | tuple[None, None] cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, object na_flist): @@ -1482,7 +1498,7 @@ cdef _try_double(parser_t *parser, int64_t col, float64_t *data float64_t NA = na_values[np.float64] kh_float64_t *na_fset - ndarray result + ndarray[float64_t] result bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start @@ -1712,6 +1728,7 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, return 0 +# -> tuple[ndarray[bool], int] cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, @@ -1890,7 +1907,9 @@ cdef raise_parser_error(object base, parser_t *parser): raise ParserError(message) -def _concatenate_chunks(list chunks): +# chunks: list[dict[int, 
"ArrayLike"]] +# -> dict[int, "ArrayLike"] +def _concatenate_chunks(list chunks) -> dict: cdef: list names = list(chunks[0].keys()) object name @@ -1964,6 +1983,7 @@ for k in list(na_values): na_values[np.dtype(k)] = na_values[k] +# -> ArrayLike cdef _apply_converter(object f, parser_t *parser, int64_t col, int64_t line_start, int64_t line_end): cdef: @@ -1986,14 +2006,15 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col, return lib.maybe_convert_objects(result) -def _maybe_encode(values): +cdef list _maybe_encode(list values): if values is None: return [] return [x.encode('utf-8') if isinstance(x, str) else x for x in values] +# TODO: only ever called with convert_empty=False def sanitize_objects(ndarray[object] values, set na_values, - bint convert_empty=True): + bint convert_empty=True) -> int: """ Convert specified values, including the given set na_values and empty strings if convert_empty is True, to np.nan. @@ -2003,6 +2024,10 @@ def sanitize_objects(ndarray[object] values, set na_values, values : ndarray[object] na_values : set convert_empty : bool, default True + + Returns + ------- + na_count : int """ cdef: Py_ssize_t i, n diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi new file mode 100644 index 0000000000000..c3e0409e35df2 --- /dev/null +++ b/pandas/_libs/window/indexers.pyi @@ -0,0 +1,13 @@ +import numpy as np + +def calculate_variable_window_bounds( + num_values: int, # int64_t + window_size: int, # int64_t + min_periods: object, + center: object, + closed: object, + index: np.ndarray, # const int64_t[:] +) -> tuple[ + np.ndarray, # np.ndarray[np.int64] + np.ndarray, # np.ndarray[np.int64] +]: ... 
diff --git a/pandas/_libs/writers.pyi b/pandas/_libs/writers.pyi new file mode 100644 index 0000000000000..67f6059c2a825 --- /dev/null +++ b/pandas/_libs/writers.pyi @@ -0,0 +1,24 @@ +import numpy as np + +# TODO: can make this more specific +def write_csv_rows( + data: list, + data_index: np.ndarray, + nlevels: int, + cols: np.ndarray, + writer: object, # _csv.writer +) -> None: ... + +def convert_json_to_lines(arr: str) -> str: ... + +def max_len_string_array( + arr: np.ndarray, # pandas_string[:] +) -> int: ... + +def word_len(val: object) -> int: ... + +def string_array_replace_from_nan_rep( + arr: np.ndarray, # np.ndarray[object, ndim=1] + nan_rep: object, + replace: object = ..., +) -> None: ... diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 6adda1fe92044..5867a60a7cb64 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -23,7 +23,7 @@ def write_csv_rows( Py_ssize_t nlevels, ndarray cols, object writer -): +) -> None: """ Write the given data to the writer object, pre-allocating where possible for performance improvements. @@ -162,7 +162,7 @@ def string_array_replace_from_nan_rep( ndarray[object, ndim=1] arr, object nan_rep, object replace=np.nan -): +) -> None: """ Replace the values in the array with 'replacement' if they are 'nan_rep'. Return the same array. 
@@ -173,5 +173,3 @@ def string_array_replace_from_nan_rep( for i in range(length): if arr[i] == nan_rep: arr[i] = replace - - return arr diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index f8e2734b99e20..29ebba4c06175 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -113,8 +113,15 @@ def get_window_bounds( closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: + # error: Argument 6 to "calculate_variable_window_bounds" has incompatible + # type "Optional[ndarray]"; expected "ndarray" return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array + num_values, + self.window_size, + min_periods, + center, + closed, + self.index_array, # type: ignore[arg-type] ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index ca8340cfd0a24..ef45062a2f941 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -16,6 +16,7 @@ Optional, Sequence, Union, + cast, ) import numpy as np @@ -46,6 +47,8 @@ class CSVFormatter: + cols: np.ndarray + def __init__( self, formatter: DataFrameFormatter, @@ -140,9 +143,7 @@ def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - def _initialize_columns( - self, cols: Optional[Sequence[Hashable]] - ) -> Sequence[Hashable]: + def _initialize_columns(self, cols: Optional[Sequence[Hashable]]) -> np.ndarray: # validate mi options if self.has_mi_columns: if cols is not None: @@ -159,10 +160,7 @@ def _initialize_columns( # update columns to include possible multiplicity of dupes # and make sure cols is just a list of labels new_cols = self.obj.columns - if isinstance(new_cols, ABCIndex): - return new_cols._format_native_types(**self._number_format) - else: - return list(new_cols) + return new_cols._format_native_types(**self._number_format) def _initialize_chunksize(self, 
chunksize: Optional[int]) -> int: if chunksize is None: @@ -218,7 +216,9 @@ def write_cols(self) -> Sequence[Hashable]: else: return self.header else: - return self.cols + # self.cols is an ndarray derived from Index._format_native_types, + # so its entries are strings, i.e. hashable + return cast(Sequence[Hashable], self.cols) @property def encoded_labels(self) -> List[Hashable]: diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 135e093cdc1e0..8305ff64c42c6 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -213,14 +213,14 @@ def read(self, nrows=None): names = self._maybe_dedup_names(names) # rename dict keys - data = sorted(data.items()) - data = {k: v for k, (i, v) in zip(names, data)} + data_tups = sorted(data.items()) + data = {k: v for k, (i, v) in zip(names, data_tups)} names, data = self._do_date_conversions(names, data) else: # rename dict keys - data = sorted(data.items()) + data_tups = sorted(data.items()) # ugh, mutation @@ -233,9 +233,9 @@ def read(self, nrows=None): names = self._filter_usecols(names) # columns as list - alldata = [x[1] for x in data] + alldata = [x[1] for x in data_tups] - data = {k: v for k, (i, v) in zip(names, data)} + data = {k: v for k, (i, v) in zip(names, data_tups)} names, data = self._do_date_conversions(names, data) index, names = self._make_index(data, alldata, names) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 02a723902271e..d9896d5f3998c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5078,7 +5078,7 @@ def _unconvert_string_array( if nan_rep is None: nan_rep = "nan" - data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) + libwriters.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) From e127ce03669c0c40781b3e94253390a298ef15cc Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Mar 2021 07:52:19 -0700 Subject: [PATCH 2/4] Any->Hashable --- 
pandas/_libs/parsers.pyi | 12 ++++++------ pandas/_libs/window/indexers.pyi | 6 +++--- pandas/_libs/window/indexers.pyx | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 208a76b70a153..1051c319b769b 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -1,5 +1,5 @@ from typing import ( - Any, + Hashable, Literal, ) @@ -30,9 +30,9 @@ class TextReader: self, source, delimiter: bytes | str = ..., # single-character only - header=0, - header_start=0, - header_end=0, + header=..., + header_start=..., + header_end=..., index_col=..., names=..., tokenize_chunksize: int = ..., # int64_t @@ -47,7 +47,7 @@ class TextReader: comment=..., decimal: bytes | str = ..., # single-character only thousands: bytes | str | None = ..., # single-character only - dtype: Dtype | dict[Any, Dtype] = ..., + dtype: Dtype | dict[Hashable, Dtype] = ..., usecols=..., error_bad_lines: bool = ..., warn_bad_lines: bool = ..., @@ -74,4 +74,4 @@ class TextReader: def close(self) -> None: ... - def read(self, rows: int | None = None) -> dict[int, ArrayLike]: ... + def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ... 
diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi index c3e0409e35df2..658ccb78a3735 100644 --- a/pandas/_libs/window/indexers.pyi +++ b/pandas/_libs/window/indexers.pyi @@ -3,9 +3,9 @@ import numpy as np def calculate_variable_window_bounds( num_values: int, # int64_t window_size: int, # int64_t - min_periods: object, - center: object, - closed: object, + min_periods, + center, + closed: str | None, index: np.ndarray, # const int64_t[:] ) -> tuple[ np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 67b196b7cb179..9de4d424571a0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -15,7 +15,7 @@ def calculate_variable_window_bounds( int64_t window_size, object min_periods, # unused but here to match get_window_bounds signature object center, # unused but here to match get_window_bounds signature - object closed, + str closed, const int64_t[:] index ): """ From 429cb8654d5be5ac2a1f38481fcc71cb92503c6b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 8 Apr 2021 19:12:54 -0700 Subject: [PATCH 3/4] mypy fixup --- pandas/io/formats/csvs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 1130cb0cc110c..915a17fc702c3 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -308,12 +308,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: data = [res.iget_values(i) for i in range(len(res.items))] ix = self.data_index[slicer]._format_native_types(**self._number_format) - # error: Argument 4 to "write_csv_rows" has incompatible type - # "Sequence[Hashable]"; expected "ndarray" libwriters.write_csv_rows( data, ix, self.nlevels, - self.cols, # type: ignore[arg-type] + self.cols, self.writer, ) From 82acddce6c11512977b21aff2ca509c6d8a44d45 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 9 Apr 2021 12:35:57 -0700 Subject: [PATCH 4/4] mypy 
fixup --- pandas/core/window/indexers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 42be859c0ae55..1ad80b2e4c908 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -113,13 +113,15 @@ def get_window_bounds( closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: + # error: Argument 4 to "calculate_variable_window_bounds" has incompatible + # type "Optional[bool]"; expected "bool" # error: Argument 6 to "calculate_variable_window_bounds" has incompatible # type "Optional[ndarray]"; expected "ndarray" return calculate_variable_window_bounds( num_values, self.window_size, min_periods, - center, + center, # type: ignore[arg-type] closed, self.index_array, # type: ignore[arg-type] )