Skip to content

TYP: parsers #52993

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ def _parse_kwds(self):
encoding: str | None = self.kwds.get("encoding")
self.encoding = "utf-8" if encoding is None else encoding

self.usecols, self.usecols_dtype = self._validate_usecols_arg(
self.kwds["usecols"]
)
na_values = self.kwds["na_values"]
if isinstance(na_values, dict):
raise ValueError(
Expand Down Expand Up @@ -121,13 +118,15 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
# we only need the frame not the names
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
if self.index_col is not None:
index_to_set = self.index_col.copy()
for i, item in enumerate(self.index_col):
if is_integer(item):
self.index_col[i] = frame.columns[item]
index_to_set[i] = frame.columns[item]
# String case
elif item not in frame.columns:
raise ValueError(f"Index {item} invalid")
frame.set_index(self.index_col, drop=True, inplace=True)

frame.set_index(index_to_set, drop=True, inplace=True)
# Clear names if headerless and no name given
if self.header is None and not multi_index_named:
frame.index.names = [None] * len(frame.index.names)
Expand Down
36 changes: 29 additions & 7 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,17 @@ class BadLineHandleMethod(Enum):
WARN = 1
SKIP = 2

_implicit_index: bool = False
_implicit_index: bool
_first_chunk: bool
keep_default_na: bool
dayfirst: bool
cache_dates: bool
keep_date_col: bool
usecols_dtype: str | None

def __init__(self, kwds) -> None:
self._implicit_index = False

self.names = kwds.get("names")
self.orig_names: Sequence[Hashable] | None = None

Expand Down Expand Up @@ -155,15 +162,19 @@ def __init__(self, kwds) -> None:

# validate index_col that only contains integers
if self.index_col is not None:
if not (
# In this case we can pin down index_col as list[int]
if is_integer(self.index_col):
self.index_col = [self.index_col]
elif not (
is_list_like(self.index_col, allow_sets=False)
and all(map(is_integer, self.index_col))
or is_integer(self.index_col)
):
raise ValueError(
"index_col must only contain row numbers "
"when specifying a multi-index header"
)
else:
self.index_col = list(self.index_col)

self._name_processed = False

Expand Down Expand Up @@ -428,6 +439,7 @@ def _get_name(icol):

return index

@final
def _clean_mapping(self, mapping):
"""converts col numbers to names"""
if not isinstance(mapping, dict):
Expand Down Expand Up @@ -656,6 +668,7 @@ def _set(x) -> int:

return noconvert_columns

@final
def _infer_types(
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
) -> tuple[ArrayLike, int]:
Expand Down Expand Up @@ -760,6 +773,7 @@ def _infer_types(

return result, na_count

@final
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
"""
Cast values to specified type
Expand Down Expand Up @@ -847,6 +861,7 @@ def _do_date_conversions(
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
...

@final
def _do_date_conversions(
self,
names: Sequence[Hashable] | Index,
Expand All @@ -868,6 +883,7 @@ def _do_date_conversions(

return names, data

@final
def _check_data_length(
self,
columns: Sequence[Hashable],
Expand Down Expand Up @@ -911,6 +927,7 @@ def _evaluate_usecols(
) -> set[str]:
...

@final
def _evaluate_usecols(
self,
usecols: Callable[[Hashable], object] | set[str] | set[int],
Expand All @@ -927,6 +944,7 @@ def _evaluate_usecols(
return {i for i, name in enumerate(names) if usecols(name)}
return usecols

@final
def _validate_usecols_names(self, usecols, names: Sequence):
"""
Validates that all usecols are present in a given
Expand Down Expand Up @@ -958,6 +976,7 @@ def _validate_usecols_names(self, usecols, names: Sequence):

return usecols

@final
def _validate_usecols_arg(self, usecols):
"""
Validate the 'usecols' parameter.
Expand Down Expand Up @@ -1007,6 +1026,7 @@ def _validate_usecols_arg(self, usecols):
return usecols, usecols_dtype
return usecols, None

@final
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
if not is_index_col(index_col):
return None, columns, index_col
Expand Down Expand Up @@ -1044,11 +1064,13 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis

return index_names, columns, index_col

def _get_empty_meta(
self, columns, index_col, index_names, dtype: DtypeArg | None = None
):
@final
def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
columns = list(columns)

index_col = self.index_col
index_names = self.index_names

# Convert `dtype` to a defaultdict of some kind.
# This will enable us to write `dtype[col_name]`
# without worrying about KeyError issues later on.
Expand Down Expand Up @@ -1319,7 +1341,7 @@ def _try_convert_dates(
return new_name, new_col, colnames


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
"""
Get the NaN values for a given column.

Expand Down
15 changes: 1 addition & 14 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,7 @@ def read(
)
index, columns, col_dict = self._get_empty_meta(
names,
self.index_col,
self.index_names,
dtype=self.kwds.get("dtype"),
dtype=self.dtype,
)
columns = self._maybe_make_multi_index_columns(columns, self.col_names)

Expand Down Expand Up @@ -344,17 +342,6 @@ def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
]
return names

def _get_index_names(self):
names = list(self._reader.header[0])
idx_names = None

if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names, self.index_col) = self._clean_index_names(
names, self.index_col
)

return names, idx_names

def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
if try_parse_dates and self._should_parse_dates(index):
values = self._date_conv(
Expand Down
34 changes: 14 additions & 20 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
EmptyDataError,
ParserError,
)
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -65,6 +66,8 @@


class PythonParser(ParserBase):
_no_thousands_columns: set[int]

def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
"""
Workhorse function for processing nested list into DataFrame
Expand Down Expand Up @@ -97,8 +100,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
self.quoting = kwds["quoting"]
self.skip_blank_lines = kwds["skip_blank_lines"]

self.names_passed = kwds["names"] or None

self.has_index_names = False
if "has_index_names" in kwds:
self.has_index_names = kwds["has_index_names"]
Expand All @@ -116,7 +117,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
self.data = cast(Iterator[str], f)
else:
assert hasattr(f, "readline")
self._make_reader(f)
self.data = self._make_reader(f)

# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
Expand Down Expand Up @@ -148,9 +149,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
# multiple date column thing turning into a real spaghetti factory

if not self._has_complex_date_col:
(index_names, self.orig_names, self.columns) = self._get_index_name(
self.columns
)
(index_names, self.orig_names, self.columns) = self._get_index_name()
self._name_processed = True
if self.index_names is None:
self.index_names = index_names
Expand All @@ -164,6 +163,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
if len(self.decimal) != 1:
raise ValueError("Only length-1 decimal markers supported")

@cache_readonly
def num(self) -> re.Pattern:
decimal = re.escape(self.decimal)
if self.thousands is None:
regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
Expand All @@ -173,9 +174,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
rf"([0-9]?(E|e)\-?[0-9]+)?$"
)
self.num = re.compile(regex)
return re.compile(regex)

def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]):
sep = self.delimiter

if sep is None or len(sep) == 1:
Expand Down Expand Up @@ -237,10 +238,7 @@ def _read():

reader = _read()

# error: Incompatible types in assignment (expression has type "_reader",
# variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
# TextIOWrapper, mmap, None]")
self.data = reader # type: ignore[assignment]
return reader

def read(
self, rows: int | None = None
Expand Down Expand Up @@ -270,11 +268,8 @@ def read(
self.index_col, # type: ignore[has-type]
),
)
# error: Cannot determine type of 'index_col'
index, columns, col_dict = self._get_empty_meta(
names,
self.index_col, # type: ignore[has-type]
self.index_names,
self.dtype,
)
conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
Expand Down Expand Up @@ -908,10 +903,8 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
def _clear_buffer(self) -> None:
self.buf = []

_implicit_index = False

def _get_index_name(
self, columns: Sequence[Hashable]
self,
) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
"""
Try several cases to get lines:
Expand All @@ -924,6 +917,7 @@ def _get_index_name(
1 lists index columns and row 0 lists normal columns.
2) Get index from the columns if it was listed.
"""
columns: Sequence[Hashable] = self.orig_names
orig_names = list(columns)
columns = list(columns)

Expand Down Expand Up @@ -1317,8 +1311,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
self.infer_nrows = kwds.pop("infer_nrows")
PythonParser.__init__(self, f, **kwds)

def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
self.data = FixedWidthReader(
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader:
return FixedWidthReader(
f,
self.colspecs,
self.delimiter,
Expand Down