TYP: sas, stata, style (pandas-dev#36990)

jbrockmendel · Kevin D Smith · commit 31bb3c41c330 · 2020-11-02T08:51:46.000-06:00
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1407,6 +1407,7 @@ def _value_formatter(
         if float_format:
 
             def base_formatter(v):
+                assert float_format is not None  # for mypy
                 return float_format(value=v) if notna(v) else self.na_rep
 
         else:
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -1511,7 +1511,10 @@ def from_custom_template(cls, searchpath, name):
         """
         loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader])
 
-        class MyStyler(cls):
+        # mypy doesn't like dynamically-defined classes
+        # error: Variable "cls" is not valid as a type  [valid-type]
+        # error: Invalid base class "cls"  [misc]
+        class MyStyler(cls):  # type:ignore[valid-type,misc]
             env = jinja2.Environment(loader=loader)
             template = env.get_template(name)
 
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -16,6 +16,7 @@
 from collections import abc
 from datetime import datetime, timedelta
 import struct
+from typing import IO, Any, Union
 
 import numpy as np
 
@@ -62,12 +63,42 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
             raise ValueError("unit must be 'd' or 's'")
 
 
-class _subheader_pointer:
-    pass
+class _SubheaderPointer:
+    offset: int
+    length: int
+    compression: int
+    ptype: int
 
+    def __init__(self, offset: int, length: int, compression: int, ptype: int):
+        self.offset = offset
+        self.length = length
+        self.compression = compression
+        self.ptype = ptype
 
-class _column:
-    pass
+
+class _Column:
+    col_id: int
+    name: Union[str, bytes]
+    label: Union[str, bytes]
+    format: Union[str, bytes]  # TODO: I think allowing bytes is from py2 days
+    ctype: bytes
+    length: int
+
+    def __init__(
+        self,
+        col_id: int,
+        name: Union[str, bytes],
+        label: Union[str, bytes],
+        format: Union[str, bytes],
+        ctype: bytes,
+        length: int,
+    ):
+        self.col_id = col_id
+        self.name = name
+        self.label = label
+        self.format = format
+        self.ctype = ctype
+        self.length = length
 
 
 # SAS7BDAT represents a SAS data file in SAS7BDAT format.
@@ -100,6 +131,8 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
         bytes.
     """
 
+    _path_or_buf: IO[Any]
+
     def __init__(
         self,
         path_or_buf,
@@ -121,7 +154,7 @@ def __init__(
         self.convert_header_text = convert_header_text
 
         self.default_encoding = "latin-1"
-        self.compression = ""
+        self.compression = b""
         self.column_names_strings = []
         self.column_names = []
         self.column_formats = []
@@ -137,10 +170,14 @@ def __init__(
         self._current_row_on_page_index = 0
         self._current_row_in_file_index = 0
 
-        self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
-        if isinstance(self._path_or_buf, str):
-            self._path_or_buf = open(self._path_or_buf, "rb")
-            self.handle = self._path_or_buf
+        path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
+        if isinstance(path_or_buf, str):
+            buf = open(path_or_buf, "rb")
+            self.handle = buf
+        else:
+            buf = path_or_buf
+
+        self._path_or_buf: IO[Any] = buf
 
         try:
             self._get_properties()
@@ -319,7 +356,7 @@ def _read_float(self, offset, width):
         return struct.unpack(self.byte_order + fd, buf)[0]
 
     # Read a single signed integer of the given width (1, 2, 4 or 8).
-    def _read_int(self, offset, width):
+    def _read_int(self, offset: int, width: int) -> int:
         if width not in (1, 2, 4, 8):
             self.close()
             raise ValueError("invalid int width")
@@ -328,7 +365,7 @@ def _read_int(self, offset, width):
         iv = struct.unpack(self.byte_order + it, buf)[0]
         return iv
 
-    def _read_bytes(self, offset, length):
+    def _read_bytes(self, offset: int, length: int):
         if self._cached_page is None:
             self._path_or_buf.seek(offset)
             buf = self._path_or_buf.read(length)
@@ -400,14 +437,14 @@ def _get_subheader_index(self, signature, compression, ptype):
         if index is None:
             f1 = (compression == const.compressed_subheader_id) or (compression == 0)
             f2 = ptype == const.compressed_subheader_type
-            if (self.compression != "") and f1 and f2:
+            if (self.compression != b"") and f1 and f2:
                 index = const.SASIndex.data_subheader_index
             else:
                 self.close()
                 raise ValueError("Unknown subheader signature")
         return index
 
-    def _process_subheader_pointers(self, offset, subheader_pointer_index):
+    def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int):
 
         subheader_pointer_length = self._subheader_pointer_length
         total_offset = offset + subheader_pointer_length * subheader_pointer_index
@@ -423,11 +460,9 @@ def _process_subheader_pointers(self, offset, subheader_pointer_index):
 
         subheader_type = self._read_int(total_offset, 1)
 
-        x = _subheader_pointer()
-        x.offset = subheader_offset
-        x.length = subheader_length
-        x.compression = subheader_compression
-        x.ptype = subheader_type
+        x = _SubheaderPointer(
+            subheader_offset, subheader_length, subheader_compression, subheader_type
+        )
 
         return x
 
@@ -519,7 +554,7 @@ def _process_columntext_subheader(self, offset, length):
         self.column_names_strings.append(cname)
 
         if len(self.column_names_strings) == 1:
-            compression_literal = ""
+            compression_literal = b""
             for cl in const.compression_literals:
                 if cl in cname_raw:
                     compression_literal = cl
@@ -532,7 +567,7 @@ def _process_columntext_subheader(self, offset, length):
 
             buf = self._read_bytes(offset1, self._lcp)
             compression_literal = buf.rstrip(b"\x00")
-            if compression_literal == "":
+            if compression_literal == b"":
                 self._lcs = 0
                 offset1 = offset + 32
                 if self.U64:
@@ -657,13 +692,14 @@ def _process_format_subheader(self, offset, length):
         column_format = format_names[format_start : format_start + format_len]
         current_column_number = len(self.columns)
 
-        col = _column()
-        col.col_id = current_column_number
-        col.name = self.column_names[current_column_number]
-        col.label = column_label
-        col.format = column_format
-        col.ctype = self._column_types[current_column_number]
-        col.length = self._column_data_lengths[current_column_number]
+        col = _Column(
+            current_column_number,
+            self.column_names[current_column_number],
+            column_label,
+            column_format,
+            self._column_types[current_column_number],
+            self._column_data_lengths[current_column_number],
+        )
 
         self.column_formats.append(column_format)
         self.columns.append(col)
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
@@ -337,16 +337,16 @@ def _read_header(self):
         obs_length = 0
         while len(fielddata) >= fieldnamelength:
             # pull data for one field
-            field, fielddata = (
+            fieldbytes, fielddata = (
                 fielddata[:fieldnamelength],
                 fielddata[fieldnamelength:],
             )
 
             # rest at end gets ignored, so if field is short, pad out
             # to match struct pattern below
-            field = field.ljust(140)
+            fieldbytes = fieldbytes.ljust(140)
 
-            fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field)
+            fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
             field = dict(zip(_fieldkeys, fieldstruct))
             del field["_"]
             field["ntype"] = types[field["ntype"]]
@@ -408,8 +408,8 @@ def _record_count(self) -> int:
             return total_records_length // self.record_length
 
         self.filepath_or_buffer.seek(-80, 2)
-        last_card = self.filepath_or_buffer.read(80)
-        last_card = np.frombuffer(last_card, dtype=np.uint64)
+        last_card_bytes = self.filepath_or_buffer.read(80)
+        last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)
 
         # 8 byte blank
         ix = np.flatnonzero(last_card == 2314885530818453536)
@@ -483,7 +483,7 @@ def read(self, nrows=None):
             df[x] = v
 
         if self._index is None:
-            df.index = range(self._lines_read, self._lines_read + read_lines)
+            df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
         else:
             df = df.set_index(self._index)
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -16,7 +16,18 @@
 from pathlib import Path
 import struct
 import sys
-from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
+from typing import (
+    Any,
+    AnyStr,
+    BinaryIO,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    cast,
+)
 import warnings
 
 from dateutil.relativedelta import relativedelta
@@ -1389,6 +1400,7 @@ def _setup_dtype(self) -> np.dtype:
         dtypes = []  # Convert struct data types to numpy data type
         for i, typ in enumerate(self.typlist):
             if typ in self.NUMPY_TYPE_MAP:
+                typ = cast(str, typ)  # only strs in NUMPY_TYPE_MAP
                 dtypes.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ]))
             else:
                 dtypes.append(("s" + str(i), "S" + str(typ)))
@@ -1699,6 +1711,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
             if fmt not in self.VALID_RANGE:
                 continue
 
+            fmt = cast(str, fmt)  # only strs in VALID_RANGE
             nmin, nmax = self.VALID_RANGE[fmt]
             series = data[colname]
             missing = np.logical_or(series < nmin, series > nmax)
diff --git a/setup.cfg b/setup.cfg
@@ -226,21 +226,12 @@ check_untyped_defs=False
 [mypy-pandas.io.formats.format]
 check_untyped_defs=False
 
-[mypy-pandas.io.formats.style]
-check_untyped_defs=False
-
 [mypy-pandas.io.parsers]
 check_untyped_defs=False
 
 [mypy-pandas.io.pytables]
 check_untyped_defs=False
 
-[mypy-pandas.io.sas.sas_xport]
-check_untyped_defs=False
-
-[mypy-pandas.io.sas.sas7bdat]
-check_untyped_defs=False
-
 [mypy-pandas.io.stata]
 check_untyped_defs=False