Skip to content

TYP: sas, stata, style #36990

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,6 +1407,7 @@ def _value_formatter(
if float_format:

def base_formatter(v):
assert float_format is not None # for mypy
return float_format(value=v) if notna(v) else self.na_rep

else:
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/formats/style.py
Original file line number Diff line number Diff line change
Expand Up @@ -1511,7 +1511,10 @@ def from_custom_template(cls, searchpath, name):
"""
loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader])

class MyStyler(cls):
# mypy doesn't like dynamically-defined classes
# error: Variable "cls" is not valid as a type [valid-type]
# error: Invalid base class "cls" [misc]
class MyStyler(cls): # type:ignore[valid-type,misc]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this resolve if you use an appropriate constructor, e.g. types.new_class: https://docs.python.org/3.7/library/types.html (or the old type constructor)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't know that types.new_class was a thing. I'd rather keep the more-idiomatic usage even if mypy doesn't like it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

env = jinja2.Environment(loader=loader)
template = env.get_template(name)

Expand Down
90 changes: 63 additions & 27 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from collections import abc
from datetime import datetime, timedelta
import struct
from typing import IO, Any, Union

import numpy as np

Expand Down Expand Up @@ -62,12 +63,42 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
raise ValueError("unit must be 'd' or 's'")


class _subheader_pointer:
pass
class _SubheaderPointer:
offset: int
length: int
compression: int
ptype: int

def __init__(self, offset: int, length: int, compression: int, ptype: int):
self.offset = offset
self.length = length
self.compression = compression
self.ptype = ptype

class _column:
pass

class _Column:
col_id: int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be a dataclass to avoid this repetition

name: Union[str, bytes]
label: Union[str, bytes]
format: Union[str, bytes] # TODO: i think allowing bytes is from py2 days
ctype: bytes
length: int

def __init__(
self,
col_id: int,
name: Union[str, bytes],
label: Union[str, bytes],
format: Union[str, bytes],
ctype: bytes,
length: int,
):
self.col_id = col_id
self.name = name
self.label = label
self.format = format
self.ctype = ctype
self.length = length


# SAS7BDAT represents a SAS data file in SAS7BDAT format.
Expand Down Expand Up @@ -100,6 +131,8 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
bytes.
"""

_path_or_buf: IO[Any]

def __init__(
self,
path_or_buf,
Expand All @@ -121,7 +154,7 @@ def __init__(
self.convert_header_text = convert_header_text

self.default_encoding = "latin-1"
self.compression = ""
self.compression = b""
self.column_names_strings = []
self.column_names = []
self.column_formats = []
Expand All @@ -137,10 +170,14 @@ def __init__(
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0

self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
if isinstance(self._path_or_buf, str):
self._path_or_buf = open(self._path_or_buf, "rb")
self.handle = self._path_or_buf
path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
if isinstance(path_or_buf, str):
buf = open(path_or_buf, "rb")
self.handle = buf
else:
buf = path_or_buf

self._path_or_buf: IO[Any] = buf
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

define as a class attribute?


try:
self._get_properties()
Expand Down Expand Up @@ -319,7 +356,7 @@ def _read_float(self, offset, width):
return struct.unpack(self.byte_order + fd, buf)[0]

# Read a single signed integer of the given width (1, 2, 4 or 8).
def _read_int(self, offset, width):
def _read_int(self, offset: int, width: int) -> int:
if width not in (1, 2, 4, 8):
self.close()
raise ValueError("invalid int width")
Expand All @@ -328,7 +365,7 @@ def _read_int(self, offset, width):
iv = struct.unpack(self.byte_order + it, buf)[0]
return iv

def _read_bytes(self, offset, length):
def _read_bytes(self, offset: int, length: int):
if self._cached_page is None:
self._path_or_buf.seek(offset)
buf = self._path_or_buf.read(length)
Expand Down Expand Up @@ -400,14 +437,14 @@ def _get_subheader_index(self, signature, compression, ptype):
if index is None:
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
f2 = ptype == const.compressed_subheader_type
if (self.compression != "") and f1 and f2:
if (self.compression != b"") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
self.close()
raise ValueError("Unknown subheader signature")
return index

def _process_subheader_pointers(self, offset, subheader_pointer_index):
def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int):

subheader_pointer_length = self._subheader_pointer_length
total_offset = offset + subheader_pointer_length * subheader_pointer_index
Expand All @@ -423,11 +460,9 @@ def _process_subheader_pointers(self, offset, subheader_pointer_index):

subheader_type = self._read_int(total_offset, 1)

x = _subheader_pointer()
x.offset = subheader_offset
x.length = subheader_length
x.compression = subheader_compression
x.ptype = subheader_type
x = _SubheaderPointer(
subheader_offset, subheader_length, subheader_compression, subheader_type
)

return x

Expand Down Expand Up @@ -519,7 +554,7 @@ def _process_columntext_subheader(self, offset, length):
self.column_names_strings.append(cname)

if len(self.column_names_strings) == 1:
compression_literal = ""
compression_literal = b""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
Expand All @@ -532,7 +567,7 @@ def _process_columntext_subheader(self, offset, length):

buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == "":
if compression_literal == b"":
self._lcs = 0
offset1 = offset + 32
if self.U64:
Expand Down Expand Up @@ -657,13 +692,14 @@ def _process_format_subheader(self, offset, length):
column_format = format_names[format_start : format_start + format_len]
current_column_number = len(self.columns)

col = _column()
col.col_id = current_column_number
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self._column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]
col = _Column(
current_column_number,
self.column_names[current_column_number],
column_label,
column_format,
self._column_types[current_column_number],
self._column_data_lengths[current_column_number],
)

self.column_formats.append(column_format)
self.columns.append(col)
Expand Down
12 changes: 6 additions & 6 deletions pandas/io/sas/sas_xport.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,16 @@ def _read_header(self):
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
field, fielddata = (
fieldbytes, fielddata = (
fielddata[:fieldnamelength],
fielddata[fieldnamelength:],
)

# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
fieldbytes = fieldbytes.ljust(140)

fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field)
fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
field = dict(zip(_fieldkeys, fieldstruct))
del field["_"]
field["ntype"] = types[field["ntype"]]
Expand Down Expand Up @@ -408,8 +408,8 @@ def _record_count(self) -> int:
return total_records_length // self.record_length

self.filepath_or_buffer.seek(-80, 2)
last_card = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card, dtype=np.uint64)
last_card_bytes = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)

# 8 byte blank
ix = np.flatnonzero(last_card == 2314885530818453536)
Expand Down Expand Up @@ -483,7 +483,7 @@ def read(self, nrows=None):
df[x] = v

if self._index is None:
df.index = range(self._lines_read, self._lines_read + read_lines)
df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
else:
df = df.set_index(self._index)

Expand Down
15 changes: 14 additions & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@
from pathlib import Path
import struct
import sys
from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
from typing import (
Any,
AnyStr,
BinaryIO,
Dict,
List,
Optional,
Sequence,
Tuple,
Union,
cast,
)
import warnings

from dateutil.relativedelta import relativedelta
Expand Down Expand Up @@ -1389,6 +1400,7 @@ def _setup_dtype(self) -> np.dtype:
dtypes = [] # Convert struct data types to numpy data type
for i, typ in enumerate(self.typlist):
if typ in self.NUMPY_TYPE_MAP:
typ = cast(str, typ) # only strs in NUMPY_TYPE_MAP
dtypes.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ]))
else:
dtypes.append(("s" + str(i), "S" + str(typ)))
Expand Down Expand Up @@ -1699,6 +1711,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
if fmt not in self.VALID_RANGE:
continue

fmt = cast(str, fmt) # only strs in VALID_RANGE
nmin, nmax = self.VALID_RANGE[fmt]
series = data[colname]
missing = np.logical_or(series < nmin, series > nmax)
Expand Down
9 changes: 0 additions & 9 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -235,21 +235,12 @@ check_untyped_defs=False
[mypy-pandas.io.formats.format]
check_untyped_defs=False

[mypy-pandas.io.formats.style]
check_untyped_defs=False

[mypy-pandas.io.parsers]
check_untyped_defs=False

[mypy-pandas.io.pytables]
check_untyped_defs=False

[mypy-pandas.io.sas.sas_xport]
check_untyped_defs=False

[mypy-pandas.io.sas.sas7bdat]
check_untyped_defs=False

[mypy-pandas.io.stata]
check_untyped_defs=False

Expand Down