diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 13010bb2ef147..3e4780ec21378 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1407,6 +1407,7 @@ def _value_formatter( if float_format: def base_formatter(v): + assert float_format is not None # for mypy return float_format(value=v) if notna(v) else self.na_rep else: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 1df37da3da8d0..0089d7a32f723 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1511,7 +1511,10 @@ def from_custom_template(cls, searchpath, name): """ loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) - class MyStyler(cls): + # mypy doesnt like dynamically-defined class + # error: Variable "cls" is not valid as a type [valid-type] + # error: Invalid base class "cls" [misc] + class MyStyler(cls): # type:ignore[valid-type,misc] env = jinja2.Environment(loader=loader) template = env.get_template(name) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index f2ee642d8fd42..989036917b265 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -16,6 +16,7 @@ from collections import abc from datetime import datetime, timedelta import struct +from typing import IO, Any, Union import numpy as np @@ -62,12 +63,42 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: raise ValueError("unit must be 'd' or 's'") -class _subheader_pointer: - pass +class _SubheaderPointer: + offset: int + length: int + compression: int + ptype: int + def __init__(self, offset: int, length: int, compression: int, ptype: int): + self.offset = offset + self.length = length + self.compression = compression + self.ptype = ptype -class _column: - pass + +class _Column: + col_id: int + name: Union[str, bytes] + label: Union[str, bytes] + format: Union[str, bytes] # TODO: i think allowing bytes is from py2 days + ctype: bytes + length: int + + def __init__( + self, + col_id: int, + name: Union[str, bytes], + label: Union[str, bytes], + format: Union[str, bytes], + ctype: bytes, + length: int, + ): + self.col_id = col_id + self.name = name + self.label = label + self.format = format + self.ctype = ctype + self.length = length # SAS7BDAT represents a SAS data file in SAS7BDAT format. @@ -100,6 +131,8 @@ class SAS7BDATReader(ReaderBase, abc.Iterator): bytes. """ + _path_or_buf: IO[Any] + def __init__( self, path_or_buf, @@ -121,7 +154,7 @@ def __init__( self.convert_header_text = convert_header_text self.default_encoding = "latin-1" - self.compression = "" + self.compression = b"" self.column_names_strings = [] self.column_names = [] self.column_formats = [] @@ -137,10 +170,14 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer - if isinstance(self._path_or_buf, str): - self._path_or_buf = open(self._path_or_buf, "rb") - self.handle = self._path_or_buf + path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer + if isinstance(path_or_buf, str): + buf = open(path_or_buf, "rb") + self.handle = buf + else: + buf = path_or_buf + + self._path_or_buf: IO[Any] = buf try: self._get_properties() @@ -319,7 +356,7 @@ def _read_float(self, offset, width): return struct.unpack(self.byte_order + fd, buf)[0] # Read a single signed integer of the given width (1, 2, 4 or 8). - def _read_int(self, offset, width): + def _read_int(self, offset: int, width: int) -> int: if width not in (1, 2, 4, 8): self.close() raise ValueError("invalid int width") @@ -328,7 +365,7 @@ def _read_int(self, offset, width): iv = struct.unpack(self.byte_order + it, buf)[0] return iv - def _read_bytes(self, offset, length): + def _read_bytes(self, offset: int, length: int): if self._cached_page is None: self._path_or_buf.seek(offset) buf = self._path_or_buf.read(length) @@ -400,14 +437,14 @@ def _get_subheader_index(self, signature, compression, ptype): if index is None: f1 = (compression == const.compressed_subheader_id) or (compression == 0) f2 = ptype == const.compressed_subheader_type - if (self.compression != "") and f1 and f2: + if (self.compression != b"") and f1 and f2: index = const.SASIndex.data_subheader_index else: self.close() raise ValueError("Unknown subheader signature") return index - def _process_subheader_pointers(self, offset, subheader_pointer_index): + def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int): subheader_pointer_length = self._subheader_pointer_length total_offset = offset + subheader_pointer_length * subheader_pointer_index @@ -423,11 +460,9 @@ def _process_subheader_pointers(self, offset, subheader_pointer_index): subheader_type = self._read_int(total_offset, 1) - x = _subheader_pointer() - x.offset = subheader_offset - x.length = subheader_length - x.compression = subheader_compression - x.ptype = subheader_type + x = _SubheaderPointer( + subheader_offset, subheader_length, subheader_compression, subheader_type + ) return x @@ -519,7 +554,7 @@ def _process_columntext_subheader(self, offset, length): self.column_names_strings.append(cname) if len(self.column_names_strings) == 1: - compression_literal = "" + compression_literal = b"" for cl in const.compression_literals: if cl in cname_raw: compression_literal = cl @@ -532,7 +567,7 @@ def _process_columntext_subheader(self, offset, length): buf = self._read_bytes(offset1, self._lcp) compression_literal = buf.rstrip(b"\x00") - if compression_literal == "": + if compression_literal == b"": self._lcs = 0 offset1 = offset + 32 if self.U64: @@ -657,13 +692,14 @@ def _process_format_subheader(self, offset, length): column_format = format_names[format_start : format_start + format_len] current_column_number = len(self.columns) - col = _column() - col.col_id = current_column_number - col.name = self.column_names[current_column_number] - col.label = column_label - col.format = column_format - col.ctype = self._column_types[current_column_number] - col.length = self._column_data_lengths[current_column_number] + col = _Column( + current_column_number, + self.column_names[current_column_number], + column_label, + column_format, + self._column_types[current_column_number], + self._column_data_lengths[current_column_number], + ) self.column_formats.append(column_format) self.columns.append(col) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 9727ec930119b..2a48abe9fbd63 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -337,16 +337,16 @@ def _read_header(self): obs_length = 0 while len(fielddata) >= fieldnamelength: # pull data for one field - field, fielddata = ( + fieldbytes, fielddata = ( fielddata[:fieldnamelength], fielddata[fieldnamelength:], ) # rest at end gets ignored, so if field is short, pad out # to match struct pattern below - field = field.ljust(140) + fieldbytes = fieldbytes.ljust(140) - fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field) + fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes) field = dict(zip(_fieldkeys, fieldstruct)) del field["_"] field["ntype"] = types[field["ntype"]] @@ -408,8 +408,8 @@ def _record_count(self) -> int: return total_records_length // self.record_length self.filepath_or_buffer.seek(-80, 2) - last_card = self.filepath_or_buffer.read(80) - last_card = np.frombuffer(last_card, dtype=np.uint64) + last_card_bytes = self.filepath_or_buffer.read(80) + last_card = np.frombuffer(last_card_bytes, dtype=np.uint64) # 8 byte blank ix = np.flatnonzero(last_card == 2314885530818453536) @@ -483,7 +483,7 @@ def read(self, nrows=None): df[x] = v if self._index is None: - df.index = range(self._lines_read, self._lines_read + read_lines) + df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines)) else: df = df.set_index(self._index) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d36bd42e7da8d..55dde374048b6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,7 +16,18 @@ from pathlib import Path import struct import sys -from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, + cast, +) import warnings from dateutil.relativedelta import relativedelta @@ -1389,6 +1400,7 @@ def _setup_dtype(self) -> np.dtype: dtypes = [] # Convert struct data types to numpy data type for i, typ in enumerate(self.typlist): if typ in self.NUMPY_TYPE_MAP: + typ = cast(str, typ) # only strs in NUMPY_TYPE_MAP dtypes.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) else: dtypes.append(("s" + str(i), "S" + str(typ))) @@ -1699,6 +1711,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra if fmt not in self.VALID_RANGE: continue + fmt = cast(str, fmt) # only strs in VALID_RANGE nmin, nmax = self.VALID_RANGE[fmt] series = data[colname] missing = np.logical_or(series < nmin, series > nmax) diff --git a/setup.cfg b/setup.cfg index cd20249728062..836b3460f3896 100644 --- a/setup.cfg +++ b/setup.cfg @@ -235,21 +235,12 @@ check_untyped_defs=False [mypy-pandas.io.formats.format] check_untyped_defs=False -[mypy-pandas.io.formats.style] -check_untyped_defs=False - [mypy-pandas.io.parsers] check_untyped_defs=False [mypy-pandas.io.pytables] check_untyped_defs=False -[mypy-pandas.io.sas.sas_xport] -check_untyped_defs=False - -[mypy-pandas.io.sas.sas7bdat] -check_untyped_defs=False - [mypy-pandas.io.stata] check_untyped_defs=False