Skip to content

Commit bc0022e

Browse files
jbrockmendel authored; Yi Wei committed
TYP: parsers (pandas-dev#52993)
* REF: avoid modifying self.index_col. * REF: remove unused; do less in init. * REF: simplify parsers calls. * Use orig_names. * Simplify. * Stronger typing (sorta). * TYP/REF: make parsers less stateful. * Final.
1 parent: 88b6396 · commit: bc0022e

File tree

4 files changed

+48
-46
lines changed

4 files changed

+48
-46
lines changed

pandas/io/parsers/arrow_parser_wrapper.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@ def _parse_kwds(self):
3636
encoding: str | None = self.kwds.get("encoding")
3737
self.encoding = "utf-8" if encoding is None else encoding
3838

39-
self.usecols, self.usecols_dtype = self._validate_usecols_arg(
40-
self.kwds["usecols"]
41-
)
4239
na_values = self.kwds["na_values"]
4340
if isinstance(na_values, dict):
4441
raise ValueError(
@@ -121,13 +118,15 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
121118
# we only need the frame not the names
122119
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
123120
if self.index_col is not None:
121+
index_to_set = self.index_col.copy()
124122
for i, item in enumerate(self.index_col):
125123
if is_integer(item):
126-
self.index_col[i] = frame.columns[item]
124+
index_to_set[i] = frame.columns[item]
127125
# String case
128126
elif item not in frame.columns:
129127
raise ValueError(f"Index {item} invalid")
130-
frame.set_index(self.index_col, drop=True, inplace=True)
128+
129+
frame.set_index(index_to_set, drop=True, inplace=True)
131130
# Clear names if headerless and no name given
132131
if self.header is None and not multi_index_named:
133132
frame.index.names = [None] * len(frame.index.names)

pandas/io/parsers/base_parser.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,17 @@ class BadLineHandleMethod(Enum):
102102
WARN = 1
103103
SKIP = 2
104104

105-
_implicit_index: bool = False
105+
_implicit_index: bool
106106
_first_chunk: bool
107+
keep_default_na: bool
108+
dayfirst: bool
109+
cache_dates: bool
110+
keep_date_col: bool
111+
usecols_dtype: str | None
107112

108113
def __init__(self, kwds) -> None:
114+
self._implicit_index = False
115+
109116
self.names = kwds.get("names")
110117
self.orig_names: Sequence[Hashable] | None = None
111118

@@ -155,15 +162,19 @@ def __init__(self, kwds) -> None:
155162

156163
# validate index_col that only contains integers
157164
if self.index_col is not None:
158-
if not (
165+
# In this case we can pin down index_col as list[int]
166+
if is_integer(self.index_col):
167+
self.index_col = [self.index_col]
168+
elif not (
159169
is_list_like(self.index_col, allow_sets=False)
160170
and all(map(is_integer, self.index_col))
161-
or is_integer(self.index_col)
162171
):
163172
raise ValueError(
164173
"index_col must only contain row numbers "
165174
"when specifying a multi-index header"
166175
)
176+
else:
177+
self.index_col = list(self.index_col)
167178

168179
self._name_processed = False
169180

@@ -428,6 +439,7 @@ def _get_name(icol):
428439

429440
return index
430441

442+
@final
431443
def _clean_mapping(self, mapping):
432444
"""converts col numbers to names"""
433445
if not isinstance(mapping, dict):
@@ -656,6 +668,7 @@ def _set(x) -> int:
656668

657669
return noconvert_columns
658670

671+
@final
659672
def _infer_types(
660673
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
661674
) -> tuple[ArrayLike, int]:
@@ -760,6 +773,7 @@ def _infer_types(
760773

761774
return result, na_count
762775

776+
@final
763777
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
764778
"""
765779
Cast values to specified type
@@ -847,6 +861,7 @@ def _do_date_conversions(
847861
) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
848862
...
849863

864+
@final
850865
def _do_date_conversions(
851866
self,
852867
names: Sequence[Hashable] | Index,
@@ -868,6 +883,7 @@ def _do_date_conversions(
868883

869884
return names, data
870885

886+
@final
871887
def _check_data_length(
872888
self,
873889
columns: Sequence[Hashable],
@@ -911,6 +927,7 @@ def _evaluate_usecols(
911927
) -> set[str]:
912928
...
913929

930+
@final
914931
def _evaluate_usecols(
915932
self,
916933
usecols: Callable[[Hashable], object] | set[str] | set[int],
@@ -927,6 +944,7 @@ def _evaluate_usecols(
927944
return {i for i, name in enumerate(names) if usecols(name)}
928945
return usecols
929946

947+
@final
930948
def _validate_usecols_names(self, usecols, names: Sequence):
931949
"""
932950
Validates that all usecols are present in a given
@@ -958,6 +976,7 @@ def _validate_usecols_names(self, usecols, names: Sequence):
958976

959977
return usecols
960978

979+
@final
961980
def _validate_usecols_arg(self, usecols):
962981
"""
963982
Validate the 'usecols' parameter.
@@ -1007,6 +1026,7 @@ def _validate_usecols_arg(self, usecols):
10071026
return usecols, usecols_dtype
10081027
return usecols, None
10091028

1029+
@final
10101030
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
10111031
if not is_index_col(index_col):
10121032
return None, columns, index_col
@@ -1044,11 +1064,13 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis
10441064

10451065
return index_names, columns, index_col
10461066

1047-
def _get_empty_meta(
1048-
self, columns, index_col, index_names, dtype: DtypeArg | None = None
1049-
):
1067+
@final
1068+
def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
10501069
columns = list(columns)
10511070

1071+
index_col = self.index_col
1072+
index_names = self.index_names
1073+
10521074
# Convert `dtype` to a defaultdict of some kind.
10531075
# This will enable us to write `dtype[col_name]`
10541076
# without worrying about KeyError issues later on.
@@ -1319,7 +1341,7 @@ def _try_convert_dates(
13191341
return new_name, new_col, colnames
13201342

13211343

1322-
def _get_na_values(col, na_values, na_fvalues, keep_default_na):
1344+
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
13231345
"""
13241346
Get the NaN values for a given column.
13251347

pandas/io/parsers/c_parser_wrapper.py

+1-14
Original file line numberDiff line numberDiff line change
@@ -245,9 +245,7 @@ def read(
245245
)
246246
index, columns, col_dict = self._get_empty_meta(
247247
names,
248-
self.index_col,
249-
self.index_names,
250-
dtype=self.kwds.get("dtype"),
248+
dtype=self.dtype,
251249
)
252250
columns = self._maybe_make_multi_index_columns(columns, self.col_names)
253251

@@ -344,17 +342,6 @@ def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
344342
]
345343
return names
346344

347-
def _get_index_names(self):
348-
names = list(self._reader.header[0])
349-
idx_names = None
350-
351-
if self._reader.leading_cols == 0 and self.index_col is not None:
352-
(idx_names, names, self.index_col) = self._clean_index_names(
353-
names, self.index_col
354-
)
355-
356-
return names, idx_names
357-
358345
def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
359346
if try_parse_dates and self._should_parse_dates(index):
360347
values = self._date_conv(

pandas/io/parsers/python_parser.py

+14-20
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
EmptyDataError,
2929
ParserError,
3030
)
31+
from pandas.util._decorators import cache_readonly
3132

3233
from pandas.core.dtypes.common import (
3334
is_bool_dtype,
@@ -65,6 +66,8 @@
6566

6667

6768
class PythonParser(ParserBase):
69+
_no_thousands_columns: set[int]
70+
6871
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
6972
"""
7073
Workhorse function for processing nested list into DataFrame
@@ -97,8 +100,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
97100
self.quoting = kwds["quoting"]
98101
self.skip_blank_lines = kwds["skip_blank_lines"]
99102

100-
self.names_passed = kwds["names"] or None
101-
102103
self.has_index_names = False
103104
if "has_index_names" in kwds:
104105
self.has_index_names = kwds["has_index_names"]
@@ -116,7 +117,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
116117
self.data = cast(Iterator[str], f)
117118
else:
118119
assert hasattr(f, "readline")
119-
self._make_reader(f)
120+
self.data = self._make_reader(f)
120121

121122
# Get columns in two steps: infer from data, then
122123
# infer column indices from self.usecols if it is specified.
@@ -148,9 +149,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
148149
# multiple date column thing turning into a real spaghetti factory
149150

150151
if not self._has_complex_date_col:
151-
(index_names, self.orig_names, self.columns) = self._get_index_name(
152-
self.columns
153-
)
152+
(index_names, self.orig_names, self.columns) = self._get_index_name()
154153
self._name_processed = True
155154
if self.index_names is None:
156155
self.index_names = index_names
@@ -164,6 +163,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
164163
if len(self.decimal) != 1:
165164
raise ValueError("Only length-1 decimal markers supported")
166165

166+
@cache_readonly
167+
def num(self) -> re.Pattern:
167168
decimal = re.escape(self.decimal)
168169
if self.thousands is None:
169170
regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
@@ -173,9 +174,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
173174
rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
174175
rf"([0-9]?(E|e)\-?[0-9]+)?$"
175176
)
176-
self.num = re.compile(regex)
177+
return re.compile(regex)
177178

178-
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
179+
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]):
179180
sep = self.delimiter
180181

181182
if sep is None or len(sep) == 1:
@@ -237,10 +238,7 @@ def _read():
237238

238239
reader = _read()
239240

240-
# error: Incompatible types in assignment (expression has type "_reader",
241-
# variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
242-
# TextIOWrapper, mmap, None]")
243-
self.data = reader # type: ignore[assignment]
241+
return reader
244242

245243
def read(
246244
self, rows: int | None = None
@@ -270,11 +268,8 @@ def read(
270268
self.index_col, # type: ignore[has-type]
271269
),
272270
)
273-
# error: Cannot determine type of 'index_col'
274271
index, columns, col_dict = self._get_empty_meta(
275272
names,
276-
self.index_col, # type: ignore[has-type]
277-
self.index_names,
278273
self.dtype,
279274
)
280275
conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
@@ -908,10 +903,8 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
908903
def _clear_buffer(self) -> None:
909904
self.buf = []
910905

911-
_implicit_index = False
912-
913906
def _get_index_name(
914-
self, columns: Sequence[Hashable]
907+
self,
915908
) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
916909
"""
917910
Try several cases to get lines:
@@ -924,6 +917,7 @@ def _get_index_name(
924917
1 lists index columns and row 0 lists normal columns.
925918
2) Get index from the columns if it was listed.
926919
"""
920+
columns: Sequence[Hashable] = self.orig_names
927921
orig_names = list(columns)
928922
columns = list(columns)
929923

@@ -1317,8 +1311,8 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
13171311
self.infer_nrows = kwds.pop("infer_nrows")
13181312
PythonParser.__init__(self, f, **kwds)
13191313

1320-
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
1321-
self.data = FixedWidthReader(
1314+
def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader:
1315+
return FixedWidthReader(
13221316
f,
13231317
self.colspecs,
13241318
self.delimiter,

0 commit comments

Comments (0)