Skip to content

Type parts of C parser #44677

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Dec 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
DefaultDict,
Hashable,
Iterable,
List,
Mapping,
Sequence,
Tuple,
cast,
final,
overload,
Expand Down Expand Up @@ -441,10 +443,15 @@ def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
return names

@final
def _maybe_make_multi_index_columns(self, columns, col_names=None):
def _maybe_make_multi_index_columns(
self,
columns: Sequence[Hashable],
col_names: Sequence[Hashable] | None = None,
) -> Sequence[Hashable] | MultiIndex:
# possibly create a column mi here
if _is_potential_multi_index(columns):
columns = MultiIndex.from_tuples(columns, names=col_names)
list_columns = cast(List[Tuple], columns)
return MultiIndex.from_tuples(list_columns, names=col_names)
return columns

@final
Expand Down Expand Up @@ -923,7 +930,25 @@ def _check_data_length(
stacklevel=find_stack_level(),
)

def _evaluate_usecols(self, usecols, names):
@overload
def _evaluate_usecols(
self,
usecols: set[int] | Callable[[Hashable], object],
names: Sequence[Hashable],
) -> set[int]:
...

@overload
def _evaluate_usecols(
self, usecols: set[str], names: Sequence[Hashable]
) -> set[str]:
...

def _evaluate_usecols(
self,
usecols: Callable[[Hashable], object] | set[str] | set[int],
names: Sequence[Hashable],
) -> set[str] | set[int]:
"""
Check whether or not the 'usecols' parameter
is a callable. If so, enumerates the 'names'
Expand Down Expand Up @@ -1289,7 +1314,8 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na):


def _is_potential_multi_index(
columns, index_col: bool | Sequence[int] | None = None
columns: Sequence[Hashable] | MultiIndex,
index_col: bool | Sequence[int] | None = None,
) -> bool:
"""
Check whether or not the `columns` parameter
Expand Down
36 changes: 28 additions & 8 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
from __future__ import annotations

from typing import (
Hashable,
Mapping,
Sequence,
)
import warnings

import numpy as np

import pandas._libs.parsers as parsers
from pandas._typing import (
ArrayLike,
DtypeArg,
DtypeObj,
FilePath,
ReadCsvBuffer,
)
Expand All @@ -20,6 +27,10 @@
from pandas.core.dtypes.concat import union_categoricals
from pandas.core.dtypes.dtypes import ExtensionDtype

from pandas import (
Index,
MultiIndex,
)
from pandas.core.indexes.api import ensure_index_from_sequences

from pandas.io.parsers.base_parser import (
Expand Down Expand Up @@ -193,7 +204,7 @@ def close(self) -> None:
except ValueError:
pass

def _set_noconvert_columns(self):
def _set_noconvert_columns(self) -> None:
"""
Set the columns that should not undergo dtype conversions.

Expand All @@ -214,7 +225,14 @@ def _set_noconvert_columns(self):
for col in noconvert_columns:
self._reader.set_noconvert(col)

def read(self, nrows=None):
def read(
self,
nrows: int | None = None,
) -> tuple[
Index | MultiIndex | None,
Sequence[Hashable] | MultiIndex,
Mapping[Hashable, ArrayLike],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Return types should be as concrete as possible. If you know it is a list/dict, it probably shouldn't be Sequence/Mapping.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic issue here is that if we type some function parameters as Sequence, we also have to type the return types as Sequence, because most of the time there is one code branch that just passes the inputs along (for example, _do_date_conversion). If we want to use lists, we run into a bunch of other issues. I'm not 100% sure what to do there, so I went with Sequence for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I usually use list/tuple/whatever on the theory that "well, it's accurate and more specific than Sequence!" and then @simonjayhawkins tells me to use Sequence anyway.

]:
try:
if self.low_memory:
chunks = self._reader.read_low_memory(nrows)
Expand Down Expand Up @@ -306,11 +324,11 @@ def read(self, nrows=None):
index, names = self._make_index(date_data, alldata, names)

# maybe create a mi on the columns
names = self._maybe_make_multi_index_columns(names, self.col_names)
conv_names = self._maybe_make_multi_index_columns(names, self.col_names)

return index, names, date_data
return index, conv_names, date_data

def _filter_usecols(self, names):
def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# hackish
usecols = self._evaluate_usecols(self.usecols, names)
if usecols is not None and len(names) != len(usecols):
Expand Down Expand Up @@ -395,13 +413,15 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
return result


def ensure_dtype_objs(dtype):
def ensure_dtype_objs(
dtype: DtypeArg | dict[Hashable, DtypeArg] | None
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
"""
Ensure we have either None, a dtype object, or a dictionary mapping to
dtype objects.
"""
if isinstance(dtype, dict):
dtype = {k: pandas_dtype(dtype[k]) for k in dtype}
return {k: pandas_dtype(dtype[k]) for k in dtype}
elif dtype is not None:
dtype = pandas_dtype(dtype)
return pandas_dtype(dtype)
return dtype
5 changes: 3 additions & 2 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ def read(self, rows: int | None = None):
self.index_names,
self.dtype,
)
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
return index, columns, col_dict
conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
return index, conv_columns, col_dict

# handle new style for names in index
count_empty_content_vals = count_empty_vals(content[0])
Expand Down Expand Up @@ -560,6 +560,7 @@ def _handle_usecols(

usecols_key is used if there are string usecols.
"""
col_indices: set[int] | list[int]
if self.usecols is not None:
if callable(self.usecols):
col_indices = self._evaluate_usecols(self.usecols, usecols_key)
Expand Down